diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index c22b27abdbf6c..6e747dfee644a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -955,6 +955,10 @@ bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS, } unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const { + if (auto *Arg = dyn_cast<Argument>(V); + Arg && AMDGPU::isKernelCC(Arg->getParent()) && !Arg->hasByRefAttr()) + return AMDGPUAS::GLOBAL_ADDRESS; + const auto *LD = dyn_cast<LoadInst>(V); if (!LD) // TODO: Handle invariant load like constant. return AMDGPUAS::UNKNOWN_ADDRESS_SPACE; diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index a72c1d329e199..5e52f48b1ec50 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -12593,29 +12593,18 @@ struct AAAddressSpaceImpl : public AAAddressSpace { } ChangeStatus updateImpl(Attributor &A) override { - unsigned FlatAS = A.getInfoCache().getFlatAddressSpace().value(); uint32_t OldAddressSpace = AssumedAddressSpace; auto CheckAddressSpace = [&](Value &Obj) { if (isa<UndefValue>(&Obj)) return true; - // If an argument in flat address space only has addrspace cast uses, and - // those casts are same, then we take the dst addrspace. 
if (auto *Arg = dyn_cast<Argument>(&Obj)) { - if (Arg->getType()->getPointerAddressSpace() == FlatAS) { - unsigned CastAddrSpace = FlatAS; - for (auto *U : Arg->users()) { - auto *ASCI = dyn_cast<AddrSpaceCastInst>(U); - if (!ASCI) - return takeAddressSpace(Obj.getType()->getPointerAddressSpace()); - if (CastAddrSpace != FlatAS && - CastAddrSpace != ASCI->getDestAddressSpace()) - return false; - CastAddrSpace = ASCI->getDestAddressSpace(); - } - if (CastAddrSpace != FlatAS) - return takeAddressSpace(CastAddrSpace); - } + auto *TTI = + A.getInfoCache().getAnalysisResultForFunction<TargetIRAnalysis>( + *Arg->getParent()); + unsigned AssumedAS = TTI->getAssumedAddrSpace(Arg); + if (AssumedAS != ~0U) + return takeAddressSpace(AssumedAS); } return takeAddressSpace(Obj.getType()->getPointerAddressSpace()); }; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll index aeb301939e986..0347d9e5c3110 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll @@ -946,7 +946,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -964,7 +964,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -974,52 +974,38 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_dec_ret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: 
s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: flat_atomic_dec v2, v[0:1], v2 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v0, v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_dec_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 42 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: flat_atomic_dec v2, v[0:1], v2 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_atomic_dec v0, v1, v0, s[2:3] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_dec_ret_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: 
flat_atomic_dec_u32 v2, v[0:1], v2 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm %result = atomicrmw udec_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4 store i32 %result, ptr %out, align 4 @@ -1040,7 +1026,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -1060,7 +1046,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1070,54 +1056,38 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v0, v1, v0, s[2:3] offset:16 glc +; GFX9-NEXT: 
s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 42 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s2, s2, 16 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: flat_atomic_dec v2, v[0:1], v2 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_atomic_dec v0, v1, v0, s[2:3] offset:16 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_dec_ret_i32_offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 offset:16 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; 
GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4 @@ -1139,7 +1109,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -1159,7 +1129,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1169,54 +1139,38 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v0, v1, v0, s[2:3] offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 
0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 42 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s2, s2, 16 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: flat_atomic_dec v2, v[0:1], v2 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_atomic_dec v0, v1, v0, s[2:3] offset:16 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 offset:16 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 %result = atomicrmw udec_wrap ptr %gep, i32 42 seq_cst, align 4 @@ -1236,7 +1190,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_atomic_dec v[0:1], v2 -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; @@ -1251,37 +1205,28 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_atomic_dec v[0:1], v2 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_dec_noret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_atomic_dec v[0:1], v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_dec_noret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 42 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_atomic_dec v[0:1], v2 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_atomic_dec v1, v0, s[0:1] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -1290,11 +1235,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; GFX11-LABEL: flat_atomic_dec_noret_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 
v2, 42 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 +; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_dec_u32 v1, v0, s[0:1] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -1317,7 +1260,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_atomic_dec v[0:1], v2 -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; @@ -1334,39 +1277,28 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_atomic_dec v[0:1], v2 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_atomic_dec v[0:1], v2 offset:16 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v1, v0, s[0:1] offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 
s[0:1], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 42 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s0, s0, 16 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_atomic_dec v[0:1], v2 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_atomic_dec v1, v0, s[0:1] offset:16 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -1375,11 +1307,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; GFX11-LABEL: flat_atomic_dec_noret_i32_offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:16 +; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_dec_u32 v1, v0, s[0:1] offset:16 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -1403,7 +1333,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_atomic_dec v[0:1], v2 -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; @@ -1420,39 +1350,28 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_atomic_dec v[0:1], v2 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-NEXT: s_add_u32 
flat_scratch_lo, s12, s17 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_atomic_dec v[0:1], v2 offset:16 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v1, v0, s[0:1] offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 42 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s0, s0, 16 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_atomic_dec v[0:1], v2 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_atomic_dec v1, v0, s[0:1] offset:16 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -1461,11 +1380,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; GFX11-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:16 +; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_dec_u32 v1, v0, s[0:1] offset:16 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: 
buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -1492,7 +1409,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -1517,7 +1434,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1529,72 +1446,40 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 42 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: flat_atomic_dec v3, v[0:1], v3 offset:20 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v1, v0, v1, s[2:3] offset:20 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: flat_store_dword v[0:1], v3 +; GFX9-NEXT: global_store_dword v0, v1, 
s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, 42 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 20 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: flat_atomic_dec v3, v[0:1], v3 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_atomic_dec v1, v0, v1, s[2:3] offset:20 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: flat_store_dword v[0:1], v3 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, 
null, 0, v1, vcc_lo -; GFX11-NEXT: flat_atomic_dec_u32 v3, v[0:1], v3 offset:20 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_dec_u32 v1, v0, v1, s[2:3] offset:20 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: flat_store_b32 v[0:1], v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, ptr %ptr, i32 %id @@ -1622,7 +1507,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: flat_atomic_dec v[0:1], v2 -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; @@ -1642,45 +1527,28 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: flat_atomic_dec v[0:1], v2 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 
v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 42 -; GFX9-NEXT: flat_atomic_dec v[0:1], v2 offset:20 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] offset:20 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v2, 42 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 20 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: flat_atomic_dec v[0:1], v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_atomic_dec v0, v1, s[0:1] offset:20 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -1689,17 +1557,11 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; GFX11-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_mov_b32_e32 v2, 42 -; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:20 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_dec_u32 v0, v1, s[0:1] offset:20 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -1724,16 +1586,11 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: s_add_u32 s0, s0, 4 -; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_mov_b32_e32 v4, s0 -; CI-NEXT: flat_store_dword v[2:3], v0 -; CI-NEXT: flat_store_dword v[4:5], v1 +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_ret_i64: @@ -1748,70 +1605,51 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: s_add_u32 s0, s0, 4 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: flat_store_dword v[2:3], v0 -; VI-NEXT: flat_store_dword v[4:5], v1 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_dec_ret_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: 
s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_dec_ret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[2:3] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_dec_ret_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 -; GFX11-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], 
v[0:1] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 store i64 %result, ptr %out, align 4 @@ -1833,16 +1671,11 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: s_add_u32 s0, s0, 4 -; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_mov_b32_e32 v4, s0 -; CI-NEXT: flat_store_dword v[2:3], v0 -; CI-NEXT: flat_store_dword v[4:5], v1 +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_ret_i64_offset: @@ -1859,72 +1692,51 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: s_add_u32 s0, s0, 4 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: flat_store_dword v[2:3], v0 -; VI-NEXT: flat_store_dword v[4:5], v1 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_dec_ret_i64_offset: ; GFX9: ; %bb.0: ; 
GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] offset:32 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_dec_ret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s2, s2, 32 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_dec_ret_i64_offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: 
v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 -; GFX11-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] offset:32 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 @@ -1945,7 +1757,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; @@ -1961,39 +1773,30 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_dec_noret_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: 
global_atomic_dec_x2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_dec_noret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -2003,11 +1806,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -2031,7 +1832,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; @@ -2049,41 +1850,30 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; 
VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s0, s0, 32 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -2093,11 +1883,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: 
v_dual_mov_b32 v2, s0 -; GFX11-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -2122,7 +1910,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; @@ -2140,41 +1928,30 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], 
s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s0, s0, 32 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -2184,11 +1961,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -2216,16 +1991,13 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: v_add_i32_e32 v4, vcc, 4, v2 -; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc -; CI-NEXT: flat_store_dword v[2:3], v0 -; CI-NEXT: flat_store_dword v[4:5], v1 +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_ret_i64_offset_addr64: @@ -2245,90 +2017,55 @@ 
define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v2 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc -; VI-NEXT: flat_store_dword v[2:3], v0 -; VI-NEXT: flat_store_dword v[4:5], v1 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[3:4], v[1:2] offset:40 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v3, v[1:2], s[2:3] offset:40 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 
s13, s13, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 42 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, v4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 40 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX10-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_atomic_dec_x2 v[0:1], v3, v[1:2], s[2:3] offset:40 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 
v0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:40 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[2:3] offset:40 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr %ptr, i32 %id @@ -2357,7 +2094,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; @@ -2378,47 +2115,30 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: v_add_co_u32_e32 v3, 
vcc, v3, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: flat_atomic_dec_x2 v[3:4], v[1:2] offset:40 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v0, v[1:2], s[0:1] offset:40 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 40 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX10-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v1, 42 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_atomic_dec_x2 v0, v[1:2], s[0:1] offset:40 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -2427,17 +2147,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; GFX11-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 
3, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] offset:40 +; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:40 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll index cbadd1eb431fc..fc940bbe09b34 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll @@ -2326,7 +2326,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -2344,7 +2344,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -2354,65 +2354,49 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_inc_ret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX9-NEXT: s_add_u32 
flat_scratch_lo, s12, s17 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: flat_atomic_inc v2, v[0:1], v2 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc v0, v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 42 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: flat_atomic_inc v2, v[0:1], v2 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_atomic_inc v0, v1, v0, s[2:3] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_inc_ret_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 glc -; GFX11-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: flat_atomic_inc_ret_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, 42 +; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4 store i32 %result, ptr %out, align 4 @@ -2433,7 +2417,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -2453,7 +2437,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ 
-2463,67 +2447,49 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: flat_atomic_inc v2, v[0:1], v2 offset:16 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc v0, v1, v0, s[2:3] offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 42 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s2, s2, 16 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: flat_atomic_inc v2, v[0:1], v2 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_atomic_inc v0, v1, v0, s[2:3] offset:16 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: 
flat_atomic_inc_ret_i32_offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: flat_atomic_inc_ret_i32_offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, 42 +; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4 @@ -2545,7 +2511,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: v_mov_b32_e32 v0, s0 ; 
CI-NEXT: v_mov_b32_e32 v1, s1 @@ -2565,7 +2531,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -2575,69 +2541,51 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: flat_atomic_inc v2, v[0:1], v2 offset:16 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc v0, v1, v0, s[2:3] offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 42 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s2, s2, 16 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: flat_atomic_inc v2, 
v[0:1], v2 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_atomic_inc v0, v1, v0, s[2:3] offset:16 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, 42 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_store_b32 v[0:1], v2 +; 
GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4 @@ -2657,7 +2605,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_atomic_inc v[0:1], v2 -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; @@ -2672,37 +2620,28 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_atomic_inc v[0:1], v2 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_noret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_atomic_inc v[0:1], v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_noret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 42 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_atomic_inc v[0:1], v2 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 
v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_atomic_inc v1, v0, s[0:1] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -2711,11 +2650,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; GFX11-LABEL: flat_atomic_inc_noret_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 +; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -2724,11 +2661,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; GFX12-LABEL: flat_atomic_inc_noret_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, 42 +; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_atomic_inc_u32 v[0:1], v2 scope:SCOPE_DEV -; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4 @@ -2749,7 +2685,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_atomic_inc v[0:1], v2 -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; @@ -2766,39 +2702,28 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; VI-NEXT: v_mov_b32_e32 
v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_atomic_inc v[0:1], v2 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_atomic_inc v[0:1], v2 offset:16 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc v1, v0, s[0:1] offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 42 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s0, s0, 16 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_atomic_inc v[0:1], v2 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_atomic_inc v1, v0, s[0:1] offset:16 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -2807,11 +2732,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; GFX11-LABEL: flat_atomic_inc_noret_i32_offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16 +; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] offset:16 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -2820,11 +2743,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; GFX12-LABEL: flat_atomic_inc_noret_i32_offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, 42 +; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16 scope:SCOPE_DEV -; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 @@ -2846,7 +2768,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_atomic_inc v[0:1], v2 -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; @@ -2863,39 +2785,28 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_atomic_inc v[0:1], v2 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-NEXT: v_mov_b32_e32 
v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_atomic_inc v[0:1], v2 offset:16 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc v1, v0, s[0:1] offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 42 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s0, s0, 16 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_atomic_inc v[0:1], v2 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_atomic_inc v1, v0, s[0:1] offset:16 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -2904,11 +2815,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; GFX11-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16 +; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] offset:16 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -2917,13 +2826,12 @@ define amdgpu_kernel void 
@flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; GFX12-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, 42 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16 scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] offset:16 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 @@ -2948,7 +2856,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -2973,7 +2881,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -2985,93 +2893,53 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 42 +; 
GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: flat_atomic_inc v3, v[0:1], v3 offset:20 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc v1, v0, v1, s[2:3] offset:20 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: flat_store_dword v[0:1], v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, 42 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 20 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: flat_atomic_inc v3, v[0:1], v3 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_atomic_inc v1, v0, v1, s[2:3] offset:20 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 
v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: flat_store_dword v[0:1], v3 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: flat_atomic_inc_u32 v3, v[0:1], v3 offset:20 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_inc_u32 v1, v0, v1, s[2:3] offset:20 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: flat_store_b32 v[0:1], v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX12-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: 
s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: flat_atomic_inc_u32 v3, v[0:1], v3 offset:20 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_atomic_inc_u32 v1, v0, v1, s[2:3] offset:20 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: flat_store_b32 v[0:1], v3 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, ptr %ptr, i32 %id @@ -3099,7 +2967,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: flat_atomic_inc v[0:1], v2 -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; @@ -3119,45 +2987,28 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: flat_atomic_inc v[0:1], v2 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: 
v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 42 -; GFX9-NEXT: flat_atomic_inc v[0:1], v2 offset:20 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] offset:20 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v2, 42 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 20 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: flat_atomic_inc v[0:1], v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_atomic_inc v0, v1, s[0:1] offset:20 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -3166,17 +3017,11 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; GFX11-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 
s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_mov_b32_e32 v2, 42 -; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:20 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_inc_u32 v0, v1, s[0:1] offset:20 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -3185,17 +3030,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; GFX12-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_mov_b32_e32 v2, 42 -; GFX12-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:20 scope:SCOPE_DEV -; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u32 v0, v1, s[0:1] offset:20 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -3340,16 +3180,11 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
+; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: s_add_u32 s0, s0, 4 -; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_mov_b32_e32 v4, s0 -; CI-NEXT: flat_store_dword v[2:3], v0 -; CI-NEXT: flat_store_dword v[4:5], v1 +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_ret_i64: @@ -3364,84 +3199,63 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: s_add_u32 s0, s0, 4 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: flat_store_dword v[2:3], v0 -; VI-NEXT: flat_store_dword v[4:5], v1 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_ret_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_ret_i64: ; GFX10: ; 
%bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_inc_ret_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 -; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: flat_atomic_inc_ret_i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 
-; GFX12-NEXT: v_mov_b32_e32 v3, s3 -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 store i64 %result, ptr %out, align 4 @@ -3463,16 +3277,11 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: s_add_u32 s0, s0, 4 -; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_mov_b32_e32 v4, s0 -; CI-NEXT: flat_store_dword v[2:3], v0 -; CI-NEXT: flat_store_dword v[4:5], v1 +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_ret_i64_offset: @@ -3489,86 +3298,63 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: s_add_u32 s0, s0, 4 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: flat_store_dword v[2:3], v0 -; VI-NEXT: flat_store_dword v[4:5], v1 +; VI-NEXT: 
flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:32 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s2, s2, 32 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; 
GFX11-LABEL: flat_atomic_inc_ret_i64_offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 -; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: flat_atomic_inc_ret_i64_offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 -; GFX12-NEXT: v_mov_b32_e32 v3, s3 -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 @@ -3591,16 +3377,11 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 
v[0:1], v[2:3], v[0:1] glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: s_add_u32 s0, s0, 4 -; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_mov_b32_e32 v4, s0 -; CI-NEXT: flat_store_dword v[2:3], v0 -; CI-NEXT: flat_store_dword v[4:5], v1 +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_ret_i64_offset_system: @@ -3617,88 +3398,65 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: s_add_u32 s0, s0, 4 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: flat_store_dword v[2:3], v0 -; VI-NEXT: flat_store_dword v[4:5], v1 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:32 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; 
GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s2, s2, 32 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 -; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: 
flat_atomic_inc_ret_i64_offset_system: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 42 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 -; GFX12-NEXT: v_mov_b32_e32 v3, s3 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0 @@ -3719,7 +3477,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; @@ -3735,39 +3493,30 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_noret_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; 
GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_noret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -3777,11 +3526,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -3791,11 +3538,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 42 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: v_dual_mov_b32 
v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] scope:SCOPE_DEV -; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 @@ -3817,7 +3563,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; @@ -3835,41 +3581,30 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; GFX10-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s0, s0, 32 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -3879,11 +3614,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -3893,11 +3626,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 42 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV -; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; 
GFX12-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 @@ -3920,7 +3652,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; @@ -3938,41 +3670,30 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s0, s0, 32 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: 
v_mov_b32_e32 v2, s0 -; GFX10-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -3982,11 +3703,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -3996,13 +3715,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 42 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 @@ -4028,16 +3746,13 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc 
-; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: v_add_i32_e32 v4, vcc, 4, v2 -; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc -; CI-NEXT: flat_store_dword v[2:3], v0 -; CI-NEXT: flat_store_dword v[4:5], v1 +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_ret_i64_offset_addr64: @@ -4057,112 +3772,69 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v2 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc -; VI-NEXT: flat_store_dword v[2:3], v0 -; VI-NEXT: flat_store_dword v[4:5], v1 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[3:4], v[1:2] offset:40 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX9-NEXT: global_atomic_inc_x2 v[0:1], v3, v[1:2], s[2:3] offset:40 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 42 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, v4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 40 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v3, v[1:2], s[2:3] offset:40 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 
0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:40 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:40 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v2, 42 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX12-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 3, v2 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: 
v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:40 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:40 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr %ptr, i32 %id @@ -4191,7 +3863,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; @@ -4212,47 +3884,30 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: 
v_mov_b32_e32 v3, s0 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: flat_atomic_inc_x2 v[3:4], v[1:2] offset:40 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc_x2 v0, v[1:2], s[0:1] offset:40 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s12, s12, s17 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 40 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX10-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v1, 42 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_atomic_inc_x2 v0, v[1:2], s[0:1] offset:40 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -4261,17 +3916,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; GFX11-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; 
GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:40 +; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:40 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -4280,17 +3930,13 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; GFX12-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v2, 42 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX12-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 3, v2 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:40 scope:SCOPE_DEV -; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:40 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm %id = call i32 
@llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx942.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx942.ll index 6792612ded368..acf1a754c5a61 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx942.ll @@ -1,34 +1,30 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s -check-prefix=GFX942 -define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { +define void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX942-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX942: ; %bb.0: -; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, 4.0 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: s_endpgm +; GFX942-NEXT: s_setpc_b64 s[30:31] %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0 ret void } -define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { +define void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; GFX942-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: ; GFX942: ; %bb.0: -; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, 4.0 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: s_endpgm +; GFX942-NEXT: s_setpc_b64 s[30:31] %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory 
!0 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index 63009bdc2643f..a4b2d5613df60 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -1334,92 +1334,82 @@ main_body: ret void } -define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { +define void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: s_endpgm +; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_atomic_fadd_f64_noret_pat: ; GFX942: ; %bb.0: ; %main_body -; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: s_endpgm +; GFX942-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } -define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { +define void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent: ; GFX90A: ; 
%bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: s_endpgm +; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_atomic_fadd_f64_noret_pat_agent: ; GFX942: ; %bb.0: ; %main_body -; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: s_endpgm +; GFX942-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } -define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { +define void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_system: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: s_endpgm +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX942-LABEL: flat_atomic_fadd_f64_noret_pat_system: ; GFX942: ; %bb.0: ; %main_body -; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: s_endpgm +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void @@ -1506,30 +1496,26 @@ main_body: ret double %ret } -define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { +define void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: s_endpgm +; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX942: ; %bb.0: ; %main_body -; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: s_endpgm 
+; GFX942-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll index 86766e2904619..fa07f0a331456 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll @@ -124,50 +124,28 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 { ; GFX8V4-LABEL: llvm_amdgcn_is_shared: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x40 -; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 -; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0 -; GFX8V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V4-NEXT: v_mov_b32_e32 v0, 0 ; GFX8V4-NEXT: flat_store_dword v[0:1], v0 ; GFX8V4-NEXT: s_waitcnt vmcnt(0) ; GFX8V4-NEXT: s_endpgm ; ; GFX8V5-LABEL: llvm_amdgcn_is_shared: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0xcc -; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 -; GFX8V5-NEXT: s_cselect_b32 s0, 1, 0 -; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V5-NEXT: v_mov_b32_e32 v0, 0 ; GFX8V5-NEXT: flat_store_dword v[0:1], v0 ; GFX8V5-NEXT: s_waitcnt vmcnt(0) ; GFX8V5-NEXT: s_endpgm ; ; GFX9V4-LABEL: llvm_amdgcn_is_shared: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9V4-NEXT: s_mov_b64 s[2:3], src_shared_base -; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V4-NEXT: s_cmp_eq_u32 s1, s3 -; GFX9V4-NEXT: s_cselect_b32 s0, 1, 0 -; GFX9V4-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX9V4-NEXT: v_mov_b32_e32 v0, 0 ; GFX9V4-NEXT: global_store_dword v[0:1], v0, off ; GFX9V4-NEXT: s_waitcnt vmcnt(0) ; GFX9V4-NEXT: s_endpgm ; ; GFX9V5-LABEL: llvm_amdgcn_is_shared: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9V5-NEXT: s_mov_b64 s[2:3], src_shared_base -; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V5-NEXT: s_cmp_eq_u32 s1, s3 -; GFX9V5-NEXT: s_cselect_b32 s0, 1, 0 -; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V5-NEXT: v_mov_b32_e32 v0, 0 ; GFX9V5-NEXT: global_store_dword v[0:1], v0, off ; GFX9V5-NEXT: s_waitcnt vmcnt(0) ; GFX9V5-NEXT: s_endpgm @@ -180,50 +158,28 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 { define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 { ; GFX8V4-LABEL: llvm_amdgcn_is_private: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x44 -; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 -; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0 -; GFX8V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V4-NEXT: v_mov_b32_e32 v0, 0 ; GFX8V4-NEXT: flat_store_dword v[0:1], v0 ; GFX8V4-NEXT: s_waitcnt vmcnt(0) ; GFX8V4-NEXT: s_endpgm ; ; GFX8V5-LABEL: llvm_amdgcn_is_private: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0xc8 -; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 -; GFX8V5-NEXT: s_cselect_b32 s0, 1, 0 -; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V5-NEXT: v_mov_b32_e32 v0, 0 ; GFX8V5-NEXT: flat_store_dword v[0:1], v0 ; GFX8V5-NEXT: s_waitcnt vmcnt(0) ; GFX8V5-NEXT: s_endpgm ; ; GFX9V4-LABEL: llvm_amdgcn_is_private: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base -; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V4-NEXT: s_cmp_eq_u32 s1, s3 -; GFX9V4-NEXT: s_cselect_b32 s0, 
1, 0 -; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V4-NEXT: v_mov_b32_e32 v0, 0 ; GFX9V4-NEXT: global_store_dword v[0:1], v0, off ; GFX9V4-NEXT: s_waitcnt vmcnt(0) ; GFX9V4-NEXT: s_endpgm ; ; GFX9V5-LABEL: llvm_amdgcn_is_private: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base -; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V5-NEXT: s_cmp_eq_u32 s1, s3 -; GFX9V5-NEXT: s_cselect_b32 s0, 1, 0 -; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V5-NEXT: v_mov_b32_e32 v0, 0 ; GFX9V5-NEXT: global_store_dword v[0:1], v0, off ; GFX9V5-NEXT: s_waitcnt vmcnt(0) ; GFX9V5-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll index c862335764dd4..9f9a2f37eacdd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -629,29 +629,22 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 -; GFX1030-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; GFX1030-NEXT: v_mov_b32_e32 v5, 0x40400000 -; GFX1030-NEXT: v_mov_b32_e32 v6, 4.0 -; GFX1030-NEXT: v_mov_b32_e32 v7, 0x40a00000 -; GFX1030-NEXT: v_mov_b32_e32 v8, 0x40c00000 -; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40e00000 -; GFX1030-NEXT: v_mov_b32_e32 v10, 0x41000000 +; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX1030-NEXT: v_mov_b32_e32 v3, 0 +; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX1030-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX1030-NEXT: v_mov_b32_e32 v7, 4.0 +; GFX1030-NEXT: v_mov_b32_e32 v8, 0x40a00000 +; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40c00000 +; GFX1030-NEXT: v_mov_b32_e32 v10, 0x40e00000 +; GFX1030-NEXT: v_mov_b32_e32 v11, 0x41000000 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX1030-NEXT: v_mov_b32_e32 v0, s0 -; GFX1030-NEXT: v_mov_b32_e32 v1, s1 -; GFX1030-NEXT: v_mov_b32_e32 v2, s2 -; GFX1030-NEXT: v_mov_b32_e32 v3, s3 -; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 -; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1030-NEXT: flat_load_dword v0, v[0:1] -; GFX1030-NEXT: flat_load_dword v1, v[2:3] -; GFX1030-NEXT: v_mov_b32_e32 v2, 0 -; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0 -; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 -; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[4:7] +; GFX1030-NEXT: s_clause 0x1 +; GFX1030-NEXT: global_load_dword v1, v0, s[0:1] +; GFX1030-NEXT: global_load_dword v2, v0, s[2:3] +; GFX1030-NEXT: s_waitcnt vmcnt(0) +; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[1:11], s[4:7] ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1030-NEXT: s_endpgm @@ -659,28 +652,21 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: ; GFX1013-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0 +; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 +; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000 +; GFX1013-NEXT: v_mov_b32_e32 v6, 4.0 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000 ; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40c00000 ; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40e00000 ; GFX1013-NEXT: v_mov_b32_e32 v10, 0x41000000 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, s8 -; GFX1013-NEXT: v_mov_b32_e32 v1, s9 -; GFX1013-NEXT: v_mov_b32_e32 v2, s10 -; GFX1013-NEXT: v_mov_b32_e32 v3, s11 -; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6 -; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo -; 
GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 -; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX1013-NEXT: v_mov_b32_e32 v6, 4.0 -; GFX1013-NEXT: flat_load_dword v0, v[4:5] -; GFX1013-NEXT: flat_load_dword v1, v[2:3] +; GFX1013-NEXT: s_clause 0x1 +; GFX1013-NEXT: global_load_dword v0, v2, s[8:9] +; GFX1013-NEXT: global_load_dword v1, v2, s[10:11] ; GFX1013-NEXT: v_mov_b32_e32 v2, 0 -; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0 -; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 -; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000 -; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[12:15] ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] @@ -694,29 +680,23 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX11-NEXT: s_mov_b32 s9, 4.0 ; GFX11-NEXT: s_mov_b32 s8, 0x40400000 ; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_mov_b32 s14, 0x41000000 ; GFX11-NEXT: s_mov_b32 s13, 0x40e00000 ; GFX11-NEXT: v_mov_b32_e32 v6, s12 -; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v7, s13 +; GFX11-NEXT: v_mov_b32_e32 v4, s9 +; GFX11-NEXT: v_mov_b32_e32 v8, s14 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: s_mov_b32 s2, 2.0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v9, v0, s[0:1] +; GFX11-NEXT: global_load_b32 v10, v0, s[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_mov_b32 s2, 2.0 ; GFX11-NEXT: s_mov_b32 s1, 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: flat_load_b32 v9, v[0:1] -; GFX11-NEXT: flat_load_b32 v10, v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s10 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v7, s13 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[4:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] @@ -744,26 +724,19 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 -; GFX1030-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; GFX1030-NEXT: v_mov_b32_e32 v5, 0x44004200 -; GFX1030-NEXT: v_mov_b32_e32 v6, 0x46004500 -; GFX1030-NEXT: v_mov_b32_e32 v7, 0x48004700 +; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX1030-NEXT: v_mov_b32_e32 v3, 0 +; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX1030-NEXT: v_mov_b32_e32 v6, 0x44004200 +; GFX1030-NEXT: v_mov_b32_e32 v7, 0x46004500 +; GFX1030-NEXT: v_mov_b32_e32 v8, 0x48004700 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s0 -; GFX1030-NEXT: v_mov_b32_e32 v1, s1 -; GFX1030-NEXT: v_mov_b32_e32 v2, s2 -; GFX1030-NEXT: v_mov_b32_e32 v3, s3 -; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 -; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1030-NEXT: flat_load_dword v0, v[0:1] -; GFX1030-NEXT: flat_load_dword v1, v[2:3] -; GFX1030-NEXT: v_mov_b32_e32 v2, 0 -; GFX1030-NEXT: v_mov_b32_e32 v3, 
1.0 -; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 -; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16 +; GFX1030-NEXT: s_clause 0x1 +; GFX1030-NEXT: global_load_dword v1, v0, s[0:1] +; GFX1030-NEXT: global_load_dword v2, v0, s[2:3] +; GFX1030-NEXT: s_waitcnt vmcnt(0) +; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[1:8], s[4:7] a16 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1030-NEXT: s_endpgm @@ -771,25 +744,18 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: ; GFX1013-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; GFX1013-NEXT: v_mov_b32_e32 v7, 0x48004700 -; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, s8 -; GFX1013-NEXT: v_mov_b32_e32 v1, s9 -; GFX1013-NEXT: v_mov_b32_e32 v2, s10 -; GFX1013-NEXT: v_mov_b32_e32 v3, s11 -; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6 -; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo -; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 -; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX1013-NEXT: v_mov_b32_e32 v6, 0x46004500 -; GFX1013-NEXT: flat_load_dword v0, v[4:5] -; GFX1013-NEXT: flat_load_dword v1, v[2:3] -; GFX1013-NEXT: v_mov_b32_e32 v2, 0 +; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 0x44004200 -; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1013-NEXT: v_mov_b32_e32 v6, 0x46004500 +; GFX1013-NEXT: v_mov_b32_e32 v7, 0x48004700 +; GFX1013-NEXT: s_waitcnt lgkmcnt(0) +; GFX1013-NEXT: s_clause 0x1 +; GFX1013-NEXT: global_load_dword v0, v2, s[8:9] +; GFX1013-NEXT: global_load_dword v1, v2, s[10:11] +; GFX1013-NEXT: v_mov_b32_e32 v2, 0 +; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: 
image_bvh_intersect_ray v[0:3], v[0:7], s[12:15] a16 ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] @@ -803,25 +769,18 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX11-NEXT: s_mov_b32 s9, 0x44004700 ; GFX11-NEXT: s_mov_b32 s10, 0x45004800 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, s9 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: s_mov_b32 s2, 2.0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v6, v0, s[0:1] +; GFX11-NEXT: global_load_b32 v7, v0, s[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_mov_b32 s2, 2.0 ; GFX11-NEXT: s_mov_b32 s1, 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: flat_load_b32 v6, v[0:1] -; GFX11-NEXT: flat_load_b32 v7, v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s10 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[4:7] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] @@ -846,77 +805,44 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ } define amdgpu_kernel void 
@image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) { -; GFX1030-LABEL: image_bvh64_intersect_ray_nsa_reassign: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 -; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX1030-NEXT: v_mov_b32_e32 v3, 0 -; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0 -; GFX1030-NEXT: v_mov_b32_e32 v6, 0x40400000 -; GFX1030-NEXT: v_mov_b32_e32 v7, 4.0 -; GFX1030-NEXT: v_mov_b32_e32 v8, 0x40a00000 -; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40c00000 -; GFX1030-NEXT: v_mov_b32_e32 v10, 0x40e00000 -; GFX1030-NEXT: v_mov_b32_e32 v11, 0x41000000 -; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s6 -; GFX1030-NEXT: v_mov_b32_e32 v1, s7 -; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX1030-NEXT: flat_load_dword v2, v[0:1] -; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c7 -; GFX1030-NEXT: v_bfrev_b32_e32 v1, 4.0 -; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3] -; GFX1030-NEXT: s_waitcnt vmcnt(0) -; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] -; GFX1030-NEXT: s_endpgm -; -; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign: -; GFX1013: ; %bb.0: -; GFX1013-NEXT: s_clause 0x1 -; GFX1013-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 -; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX1013-NEXT: v_mov_b32_e32 v3, 0 -; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0 -; GFX1013-NEXT: v_mov_b32_e32 v6, 0x40400000 -; GFX1013-NEXT: v_mov_b32_e32 v7, 4.0 -; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40a00000 -; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40c00000 -; GFX1013-NEXT: v_mov_b32_e32 v10, 0x40e00000 -; GFX1013-NEXT: v_mov_b32_e32 v11, 0x41000000 -; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX1013-NEXT: v_mov_b32_e32 v0, s6 -; GFX1013-NEXT: v_mov_b32_e32 v1, s7 -; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX1013-NEXT: flat_load_dword v2, v[0:1] -; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c7 -; GFX1013-NEXT: v_bfrev_b32_e32 v1, 4.0 -; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3] -; GFX1013-NEXT: s_waitcnt vmcnt(0) -; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] -; GFX1013-NEXT: s_endpgm +; GFX10-LABEL: image_bvh64_intersect_ray_nsa_reassign: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_bfrev_b32_e32 v1, 4.0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX10-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX10-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX10-NEXT: v_mov_b32_e32 v7, 4.0 +; GFX10-NEXT: v_mov_b32_e32 v8, 0x40a00000 +; GFX10-NEXT: v_mov_b32_e32 v9, 0x40c00000 +; GFX10-NEXT: v_mov_b32_e32 v10, 0x40e00000 +; GFX10-NEXT: v_mov_b32_e32 v11, 0x41000000 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-NEXT: v_mov_b32_e32 v0, 0xb36211c7 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: flat_store_dwordx4 v[0:1], v[0:3] +; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s5, 1.0 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v9, 0xb36211c7 ; GFX11-NEXT: s_mov_b32 s8, 0x40400000 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 
; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 ; GFX11-NEXT: s_mov_b32 s10, 0x40a00000 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_mov_b32 s9, 4.0 ; GFX11-NEXT: s_mov_b32 s14, 0x41000000 ; GFX11-NEXT: s_mov_b32 s13, 0x40e00000 @@ -924,17 +850,13 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX11-NEXT: v_bfrev_b32_e32 v10, 4.0 ; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v3, s8 ; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v7, s13 +; GFX11-NEXT: v_mov_b32_e32 v5, s10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: v_mov_b32_e32 v1, s7 +; GFX11-NEXT: global_load_b32 v11, v0, s[6:7] ; GFX11-NEXT: s_mov_b32 s6, 2.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: flat_load_b32 v11, v[0:1] ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_mov_b32_e32 v2, s6 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[0:2], v[3:5], v[6:8]], s[0:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] @@ -957,84 +879,50 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 } define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) { -; GFX1030-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 -; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX1030-NEXT: v_mov_b32_e32 v3, 0 -; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0 -; GFX1030-NEXT: v_mov_b32_e32 v6, 0x44004200 -; GFX1030-NEXT: 
v_mov_b32_e32 v7, 0x46004500 -; GFX1030-NEXT: v_mov_b32_e32 v8, 0x48004700 -; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s6 -; GFX1030-NEXT: v_mov_b32_e32 v1, s7 -; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX1030-NEXT: flat_load_dword v2, v[0:1] -; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c6 -; GFX1030-NEXT: v_bfrev_b32_e32 v1, 4.0 -; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16 -; GFX1030-NEXT: s_waitcnt vmcnt(0) -; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] -; GFX1030-NEXT: s_endpgm -; -; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: -; GFX1013: ; %bb.0: -; GFX1013-NEXT: s_clause 0x1 -; GFX1013-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 -; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX1013-NEXT: v_mov_b32_e32 v3, 0 -; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0 -; GFX1013-NEXT: v_mov_b32_e32 v6, 0x44004200 -; GFX1013-NEXT: v_mov_b32_e32 v7, 0x46004500 -; GFX1013-NEXT: v_mov_b32_e32 v8, 0x48004700 -; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, s6 -; GFX1013-NEXT: v_mov_b32_e32 v1, s7 -; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX1013-NEXT: flat_load_dword v2, v[0:1] -; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6 -; GFX1013-NEXT: v_bfrev_b32_e32 v1, 4.0 -; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16 -; GFX1013-NEXT: s_waitcnt vmcnt(0) -; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] -; GFX1013-NEXT: s_endpgm +; GFX10-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX10-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_bfrev_b32_e32 v1, 4.0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX10-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX10-NEXT: v_mov_b32_e32 v6, 0x44004200 +; GFX10-NEXT: v_mov_b32_e32 v7, 0x46004500 +; GFX10-NEXT: v_mov_b32_e32 v8, 0x48004700 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-NEXT: v_mov_b32_e32 v0, 0xb36211c6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: flat_store_dwordx4 v[0:1], v[0:3] +; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s5, 1.0 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s8, 0x42004600 ; GFX11-NEXT: s_mov_b32 s9, 0x44004700 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_mov_b32 s10, 0x45004800 -; GFX11-NEXT: v_mov_b32_e32 v6, 0xb36211c6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: v_dual_mov_b32 v6, 0xb36211c6 :: v_dual_mov_b32 v5, s10 ; GFX11-NEXT: v_bfrev_b32_e32 v7, 4.0 -; GFX11-NEXT: v_mov_b32_e32 v3, s8 -; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s9 +; GFX11-NEXT: v_mov_b32_e32 v4, s9 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: global_load_b32 v8, v0, s[6:7] ; GFX11-NEXT: s_mov_b32 s6, 2.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: flat_load_b32 v8, v[0:1] ; GFX11-NEXT: v_dual_mov_b32 v0, s4 
:: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_mov_b32_e32 v2, s6 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[0:3] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] diff --git a/llvm/test/CodeGen/AMDGPU/aa-as-infer.ll b/llvm/test/CodeGen/AMDGPU/aa-as-infer.ll index d1a6414fe49ae..cc2c80060231c 100644 --- a/llvm/test/CodeGen/AMDGPU/aa-as-infer.ll +++ b/llvm/test/CodeGen/AMDGPU/aa-as-infer.ll @@ -246,8 +246,7 @@ define void @foo(ptr addrspace(3) %val) { define void @kernel_argument_promotion_pattern_intra_procedure(ptr %p, i32 %val) { ; CHECK-LABEL: define void @kernel_argument_promotion_pattern_intra_procedure( ; CHECK-SAME: ptr [[P:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[P_CAST_0:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1) -; CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[P_CAST_0]], align 4 +; CHECK-NEXT: store i32 [[VAL]], ptr [[P]], align 4 ; CHECK-NEXT: ret void ; %p.cast.0 = addrspacecast ptr %p to ptr addrspace(1) @@ -259,8 +258,7 @@ define void @kernel_argument_promotion_pattern_intra_procedure(ptr %p, i32 %val) define internal void @use_argument_after_promotion(ptr %p, i32 %val) { ; CHECK-LABEL: define internal void @use_argument_after_promotion( ; CHECK-SAME: ptr [[P:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1) -; CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[TMP1]], align 4 +; CHECK-NEXT: store i32 [[VAL]], ptr [[P]], align 4 ; CHECK-NEXT: ret void ; store i32 %val, ptr %p diff --git a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll index c31b2ceed6688..a0b3964756eb0 100644 --- a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll +++ b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll @@ -47,11 +47,10 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) { ; GFX9-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; 
%else ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB2_2: ; %then ; GFX9-NEXT: s_endpgm @@ -64,11 +63,10 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) { ; GFX10-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX10-NEXT: ; %bb.1: ; %else ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-NEXT: s_endpgm ; GFX10-NEXT: .LBB2_2: ; %then ; GFX10-NEXT: s_endpgm @@ -81,10 +79,9 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) { ; GFX11-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX11-NEXT: ; %bb.1: ; %else ; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB2_2: ; %then ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll index e74fd21365c9d..e753d81a7ab61 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll @@ -63,10 +63,10 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; 
GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-SDAG-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: flat_atomic_cond_sub_rtn_u32: @@ -77,10 +77,10 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %addr, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll index 986dd8a046424..ab7e1bc195f97 100644 --- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll +++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll @@ -31,88 +31,62 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali ; GFX9-LABEL: cluster_load_cluster_store: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dword v2, v[0:1] -; GFX9-NEXT: flat_load_dword v3, v[0:1] offset:8 -; 
GFX9-NEXT: flat_load_dword v4, v[0:1] offset:16 -; GFX9-NEXT: flat_load_dword v5, v[0:1] offset:24 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_store_dword v[0:1], v2 -; GFX9-NEXT: flat_store_dword v[0:1], v3 offset:8 -; GFX9-NEXT: flat_store_dword v[0:1], v4 offset:16 -; GFX9-NEXT: flat_store_dword v[0:1], v5 offset:24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s5, s[0:1], 0x8 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x18 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v2, s[2:3] offset:8 +; GFX9-NEXT: global_store_dword v0, v3, s[2:3] offset:16 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] offset:24 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: cluster_load_cluster_store: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s4, s0, 8 -; GFX10-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-NEXT: s_add_u32 s6, s0, 16 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_addc_u32 s7, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: s_add_u32 s0, s0, 24 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: v_mov_b32_e32 v4, s6 -; GFX10-NEXT: v_mov_b32_e32 v5, s7 -; GFX10-NEXT: v_mov_b32_e32 v7, s1 -; GFX10-NEXT: v_mov_b32_e32 v6, s0 ; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: flat_load_dword v8, v[0:1] -; GFX10-NEXT: flat_load_dword v9, v[2:3] -; GFX10-NEXT: flat_load_dword v10, v[4:5] -; GFX10-NEXT: flat_load_dword v11, v[6:7] -; GFX10-NEXT: s_add_u32 s0, s2, 8 -; GFX10-NEXT: s_addc_u32 s1, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 
-; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_add_u32 s0, s2, 16 -; GFX10-NEXT: s_addc_u32 s1, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: s_add_u32 s2, s2, 24 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, s1 -; GFX10-NEXT: v_mov_b32_e32 v4, s0 -; GFX10-NEXT: v_mov_b32_e32 v7, s3 -; GFX10-NEXT: v_mov_b32_e32 v6, s2 -; GFX10-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; GFX10-NEXT: flat_store_dword v[0:1], v8 -; GFX10-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) -; GFX10-NEXT: flat_store_dword v[2:3], v9 -; GFX10-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) -; GFX10-NEXT: flat_store_dword v[4:5], v10 -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) -; GFX10-NEXT: flat_store_dword v[6:7], v11 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s5, s[0:1], 0x8 +; GFX10-NEXT: s_load_dword s6, s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s7, s[0:1], 0x18 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-NEXT: v_mov_b32_e32 v4, s7 +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-NEXT: global_store_dword v0, v2, s[2:3] offset:8 +; GFX10-NEXT: global_store_dword v0, v3, s[2:3] offset:16 +; GFX10-NEXT: global_store_dword v0, v4, s[2:3] offset:24 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: cluster_load_cluster_store: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:8 -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:16 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] offset:24 -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; GFX11-NEXT: flat_store_b32 v[0:1], v2 -; GFX11-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) -; GFX11-NEXT: 
flat_store_b32 v[0:1], v3 offset:8 -; GFX11-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) -; GFX11-NEXT: flat_store_b32 v[0:1], v4 offset:16 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) -; GFX11-NEXT: flat_store_b32 v[0:1], v5 offset:24 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x8 +; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x18 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6 +; GFX11-NEXT: v_mov_b32_e32 v4, s0 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: global_store_b32 v0, v2, s[2:3] offset:8 +; GFX11-NEXT: global_store_b32 v0, v3, s[2:3] offset:16 +; GFX11-NEXT: global_store_b32 v0, v4, s[2:3] offset:24 ; GFX11-NEXT: s_endpgm bb: %ld0 = load i32, ptr %lb @@ -156,92 +130,66 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr ; GFX9-LABEL: cluster_load_valu_cluster_store: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s5, s[0:1], 0x8 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x18 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_add_i32 s1, s5, 1 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dword v2, v[0:1] -; GFX9-NEXT: flat_load_dword v3, v[0:1] offset:8 -; GFX9-NEXT: flat_load_dword v4, v[0:1] offset:16 -; GFX9-NEXT: flat_load_dword v5, v[0:1] offset:24 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_store_dword v[0:1], v2 -; GFX9-NEXT: v_add_u32_e32 v2, 1, v3 -; 
GFX9-NEXT: flat_store_dword v[0:1], v4 offset:16 -; GFX9-NEXT: flat_store_dword v[0:1], v2 offset:8 -; GFX9-NEXT: flat_store_dword v[0:1], v5 offset:24 +; GFX9-NEXT: global_store_dword v0, v2, s[2:3] offset:16 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] offset:8 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] offset:24 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: cluster_load_valu_cluster_store: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s4, s0, 8 -; GFX10-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_add_u32 s6, s0, 16 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_addc_u32 s7, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: s_add_u32 s0, s0, 24 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s6 -; GFX10-NEXT: v_mov_b32_e32 v5, s7 -; GFX10-NEXT: flat_load_dword v6, v[2:3] -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: flat_load_dword v8, v[0:1] -; GFX10-NEXT: flat_load_dword v9, v[4:5] -; GFX10-NEXT: flat_load_dword v10, v[2:3] -; GFX10-NEXT: s_add_u32 s0, s2, 8 -; GFX10-NEXT: s_addc_u32 s1, s3, 0 -; GFX10-NEXT: s_add_u32 s4, s2, 16 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: s_addc_u32 s5, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_add_u32 s0, s2, 24 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_addc_u32 s1, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 -; GFX10-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v6 -; GFX10-NEXT: v_mov_b32_e32 v7, s1 -; GFX10-NEXT: v_mov_b32_e32 v6, s0 -; GFX10-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; GFX10-NEXT: flat_store_dword v[0:1], v8 -; GFX10-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2) 
-; GFX10-NEXT: flat_store_dword v[4:5], v9 -; GFX10-NEXT: flat_store_dword v[2:3], v11 -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) -; GFX10-NEXT: flat_store_dword v[6:7], v10 +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s5, s[0:1], 0x8 +; GFX10-NEXT: s_load_dword s6, s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s7, s[0:1], 0x18 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_add_i32 s0, s5, 1 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, s7 +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-NEXT: global_store_dword v0, v2, s[2:3] offset:16 +; GFX10-NEXT: global_store_dword v0, v3, s[2:3] offset:8 +; GFX10-NEXT: global_store_dword v0, v4, s[2:3] offset:24 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: cluster_load_valu_cluster_store: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: flat_load_b32 v2, v[0:1] offset:8 -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:16 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] offset:24 -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; GFX11-NEXT: v_add_nc_u32_e32 v2, 1, v2 -; GFX11-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: flat_store_b32 v[0:1], v3 -; GFX11-NEXT: flat_store_b32 v[0:1], v2 offset:8 -; GFX11-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) -; GFX11-NEXT: flat_store_b32 v[0:1], v4 offset:16 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) -; GFX11-NEXT: flat_store_b32 v[0:1], v5 offset:24 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x8 +; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x18 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_i32 s1, s4, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s1 +; GFX11-NEXT: v_dual_mov_b32 v3, s6 :: v_dual_mov_b32 v4, s0 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: global_store_b32 v0, v2, s[2:3] offset:8 +; GFX11-NEXT: global_store_b32 v0, v3, s[2:3] offset:16 +; GFX11-NEXT: global_store_b32 v0, v4, s[2:3] offset:24 ; GFX11-NEXT: s_endpgm bb: %ld0 = load i32, ptr %lb diff --git a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll index 93b5f155fc81e..d357e32b09c0c 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll @@ -5,14 +5,11 @@ define amdgpu_kernel void @vectorLoadCombine(ptr %in, ptr %out) { ; GCN-LABEL: vectorLoadCombine: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: flat_load_dword v2, v[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: global_load_dword v1, v0, s[0:1] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: global_store_dword v0, v1, s[2:3] ; GCN-NEXT: s_endpgm entry: %0 = load <4 x i8>, ptr %in, align 4 @@ -38,16 +35,21 @@ define amdgpu_kernel void @vectorLoadShuffle(ptr %in, ptr %out) { ; GCN-LABEL: vectorLoadShuffle: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: flat_load_dword v2, v[0:1] -; GCN-NEXT: s_mov_b32 s0, 0x7050604 -; 
GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_perm_b32 v2, v2, v2, s0 -; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_bfe_u32 s4, s0, 0x80010 +; GCN-NEXT: s_and_b32 s1, s0, 0xff +; GCN-NEXT: s_lshl_b32 s5, s0, 8 +; GCN-NEXT: s_lshl_b32 s4, s4, 8 +; GCN-NEXT: s_and_b32 s5, s5, 0xff0000 +; GCN-NEXT: s_or_b32 s1, s4, s1 +; GCN-NEXT: s_or_b32 s1, s1, s5 +; GCN-NEXT: s_and_b32 s0, s0, 0xff000000 +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: global_store_dword v0, v1, s[2:3] ; GCN-NEXT: s_endpgm entry: %0 = load <4 x i8>, ptr %in, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence.ll index cdf4a88814dfc..2f4f910238a80 100644 --- a/llvm/test/CodeGen/AMDGPU/dag-divergence.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-divergence.ll @@ -36,20 +36,18 @@ define amdgpu_kernel void @flat_load_maybe_divergent(ptr addrspace(4) %k, ptr %f ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-NEXT: s_add_i32 s12, s12, s17 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: flat_load_dword v0, v[0:1] -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc -; GCN-NEXT: flat_load_dword v0, v[0:1] -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_load_dword s2, s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_ashr_i32 s3, s2, 31 +; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GCN-NEXT: s_add_u32 s0, s0, s2 +; GCN-NEXT: s_addc_u32 s1, s1, s3 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 +; GCN-NEXT: s_waitcnt 
lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: flat_store_dword v[0:1], v0 ; GCN-NEXT: s_endpgm %load = load i32, ptr %flat, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll index fc17d9288bf40..0a28bcb7fe64b 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll @@ -6,8 +6,6 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, ptr %arg6, ptr %arg7, ptr %arg8, ptr %arg9) { ; CHECK-LABEL: eggs: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_load_dword s0, s[8:9], 0x0 ; CHECK-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x8 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 @@ -35,31 +33,17 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr ; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: v_mov_b32_e32 v7, 0 ; CHECK-NEXT: .LBB0_3: ; %bb41 +; CHECK-NEXT: v_mov_b32_e32 v8, 0 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x48 -; CHECK-NEXT: v_mov_b32_e32 v8, s14 -; CHECK-NEXT: v_mov_b32_e32 v9, s15 -; CHECK-NEXT: v_mov_b32_e32 v10, s16 -; CHECK-NEXT: v_mov_b32_e32 v11, s17 -; CHECK-NEXT: v_mov_b32_e32 v12, s18 -; CHECK-NEXT: v_mov_b32_e32 v13, s19 -; CHECK-NEXT: v_mov_b32_e32 v14, s20 -; CHECK-NEXT: v_mov_b32_e32 v15, s21 -; CHECK-NEXT: v_mov_b32_e32 v16, s22 -; CHECK-NEXT: v_mov_b32_e32 v17, s23 -; CHECK-NEXT: v_mov_b32_e32 v18, s24 -; CHECK-NEXT: v_mov_b32_e32 v19, s25 -; CHECK-NEXT: v_mov_b32_e32 v20, s26 -; CHECK-NEXT: v_mov_b32_e32 v21, s27 -; CHECK-NEXT: flat_store_byte v[8:9], v7 -; CHECK-NEXT: flat_store_byte v[10:11], v6 -; CHECK-NEXT: flat_store_byte v[12:13], v5 -; CHECK-NEXT: flat_store_byte v[14:15], v4 -; CHECK-NEXT: flat_store_byte v[16:17], v3 -; CHECK-NEXT: flat_store_byte 
v[18:19], v2 -; CHECK-NEXT: flat_store_byte v[20:21], v1 +; CHECK-NEXT: global_store_byte v8, v7, s[14:15] +; CHECK-NEXT: global_store_byte v8, v6, s[16:17] +; CHECK-NEXT: global_store_byte v8, v5, s[18:19] +; CHECK-NEXT: global_store_byte v8, v4, s[20:21] +; CHECK-NEXT: global_store_byte v8, v3, s[22:23] +; CHECK-NEXT: global_store_byte v8, v2, s[24:25] +; CHECK-NEXT: global_store_byte v8, v1, s[26:27] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] -; CHECK-NEXT: flat_store_byte v[2:3], v0 +; CHECK-NEXT: global_store_byte v8, v0, s[0:1] ; CHECK-NEXT: s_endpgm bb: br i1 %arg, label %bb10, label %bb41 diff --git a/llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll b/llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll index 1732dd0521e5f..e63e341768cf2 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll @@ -81,46 +81,36 @@ define amdgpu_kernel void @load_i16_lo(ptr %arg, ptr %out) { ; GFX9-LABEL: load_i16_lo: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_short_d16 v2, v[0:1] offset:8 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_add_u16 v2, v2, v2 -; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: global_load_short_d16 v0, v0, s[0:1] offset:8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, v0, v0 +; GFX9-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: load_i16_lo: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 
s0, s0, 8 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_load_short_d16 v2, v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_add_u16 v2, v2, v2 -; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: global_load_short_d16 v0, v0, s[0:1] offset:8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_u16 v0, v0, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: load_i16_lo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: flat_load_d16_b16 v2, v[0:1] offset:8 -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_add_u16 v2, v2, v2 -; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: global_load_d16_b16 v0, v0, s[0:1] offset:8 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, v0, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_endpgm %gep = getelementptr inbounds i16, ptr %arg, i32 4 %ld = load i16, ptr %gep, align 2 @@ -134,46 +124,36 @@ define amdgpu_kernel void @load_i16_hi(ptr %arg, ptr %out) { ; GFX9-LABEL: load_i16_hi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_short_d16_hi v2, v[0:1] offset:8 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_add_u16 v2, v2, v2 -; GFX9-NEXT: 
flat_store_dword v[0:1], v2 +; GFX9-NEXT: global_load_short_d16_hi v0, v0, s[0:1] offset:8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, v0, v0 +; GFX9-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: load_i16_hi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s0, s0, 8 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_load_short_d16_hi v2, v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_add_u16 v2, v2, v2 -; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: global_load_short_d16_hi v0, v0, s[0:1] offset:8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_u16 v0, v0, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: load_i16_hi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: flat_load_d16_hi_b16 v2, v[0:1] offset:8 -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_add_u16 v2, v2, v2 -; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: global_load_d16_hi_b16 v0, v0, s[0:1] offset:8 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, v0, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_endpgm %gep = getelementptr inbounds i16, ptr %arg, i32 4 %ld = load i16, ptr %gep, align 2 @@ -187,46 +167,36 @@ define amdgpu_kernel void @load_half_lo(ptr %arg, ptr %out) { ; GFX9-LABEL: 
load_half_lo: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_short_d16 v2, v[0:1] offset:8 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_add_f16 v2, v2, v2 -; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: global_load_short_d16 v0, v0, s[0:1] offset:8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_f16 v0, v0, v0 +; GFX9-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: load_half_lo: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s0, s0, 8 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_load_short_d16 v2, v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v2, v2, v2 -; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: global_load_short_d16 v0, v0, s[0:1] offset:8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v0, v0, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: load_half_lo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: flat_load_d16_b16 v2, v[0:1] offset:8 -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v2, v2, v2 -; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: global_load_d16_b16 v0, v0, s[0:1] offset:8 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v0, v0, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_endpgm %gep = getelementptr inbounds half, ptr %arg, i32 4 %ld = load half, ptr %gep, align 2 @@ -240,46 +210,36 @@ define amdgpu_kernel void @load_half_hi(ptr %arg, ptr %out) { ; GFX9-LABEL: load_half_hi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_short_d16_hi v2, v[0:1] offset:8 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_add_f16 v2, v2, v2 -; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: global_load_short_d16_hi v0, v0, s[0:1] offset:8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_f16 v0, v0, v0 +; GFX9-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: load_half_hi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s0, s0, 8 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_load_short_d16_hi v2, v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v2, v2, v2 -; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: global_load_short_d16_hi v0, v0, s[0:1] offset:8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v0, v0, v0 +; 
GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: load_half_hi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: flat_load_d16_hi_b16 v2, v[0:1] offset:8 -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v2, v2, v2 -; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: global_load_d16_hi_b16 v0, v0, s[0:1] offset:8 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v0, v0, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_endpgm %gep = getelementptr inbounds half, ptr %arg, i32 4 %ld = load half, ptr %gep, align 2 @@ -293,43 +253,34 @@ define amdgpu_kernel void @load_float_lo(ptr %arg, ptr %out) { ; GFX9-LABEL: load_float_lo: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dword v2, v[0:1] offset:16 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_f32_e32 v2, v2, v2 -; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x10 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_f32_e64 v1, s0, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: load_float_lo: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s0, s0, 16 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_load_dword v2, v[0:1] 
-; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_f32_e32 v2, v2, v2 -; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x10 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_add_f32_e64 v1, s0, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: load_float_lo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: flat_load_b32 v2, v[0:1] offset:16 -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v2 -; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_f32_e64 v1, s0, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-NEXT: s_endpgm %gep = getelementptr inbounds float, ptr %arg, i32 4 %ld = load float, ptr %gep, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll index e674b57aae3ef..9459fa3bda4da 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll @@ -6,45 +6,72 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_add_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_mov_b64 s[0:1], exec +; GCN1-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GCN1-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN1-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN1-NEXT: s_cbranch_execz .LBB0_2 +; GCN1-NEXT: ; %bb.1: ; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GCN1-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GCN1-NEXT: 
s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_mul_i32 s0, s2, s0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: flat_atomic_add v[0:1], v2 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_atomic_add v0, off, s[4:7], 0 offset:16 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: .LBB0_2: ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_add_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN2-NEXT: s_mov_b64 s[0:1], exec +; GCN2-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GCN2-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN2-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN2-NEXT: s_cbranch_execz .LBB0_2 +; GCN2-NEXT: ; %bb.1: +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: s_add_u32 s2, s2, 16 +; GCN2-NEXT: s_addc_u32 s3, s3, 0 +; GCN2-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GCN2-NEXT: s_mul_i32 s0, s4, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: .LBB0_2: ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_add_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN3-NEXT: s_mov_b64 s[0:1], exec +; GCN3-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GCN3-NEXT: 
v_mbcnt_hi_u32_b32 v0, s1, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN3-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN3-NEXT: s_cbranch_execz .LBB0_2 +; GCN3-NEXT: ; %bb.1: +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN3-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:16 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_mul_i32 s0, s6, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s0 +; GCN3-NEXT: global_atomic_add v0, v1, s[2:3] offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: .LBB0_2: ; GCN3-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -157,18 +184,20 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_add v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_add_i32_ret_offset: @@ -197,12 +226,11 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; 
GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_add v0, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -288,12 +316,14 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s8 -; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_add v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_add_i32_ret_addr64_offset: @@ -330,12 +360,11 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 -; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_add v0, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index @@ -394,16 +423,18 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr %out, ptr %out2, i32 %in) { 
; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_add v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_add_i32_ret: @@ -430,12 +461,11 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_add v0, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %val = atomicrmw volatile add ptr %out, i32 %in syncscope("agent") seq_cst @@ -513,12 +543,14 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s8 -; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_add v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 
s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_add_i32_ret_addr64: @@ -553,12 +585,11 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 -; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_add v0, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index @@ -621,18 +652,20 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_and_i32_ret_offset: @@ -661,12 +694,11 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; 
GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -752,12 +784,14 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s8 -; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_and_i32_ret_addr64_offset: @@ -794,12 +828,11 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 -; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index @@ -858,16 +891,18 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1: ; %bb.0: ; %entry ; 
GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_and_i32_ret: @@ -894,12 +929,11 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %val = atomicrmw volatile and ptr %out, i32 %in syncscope("agent") seq_cst @@ -977,12 +1011,14 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s8 -; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: 
s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_and_i32_ret_addr64: @@ -1017,12 +1053,11 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 -; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index @@ -1085,18 +1120,20 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_sub_i32_ret_offset: @@ -1125,12 +1162,11 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: 
flat_atomic_sub v2, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -1216,12 +1252,14 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s8 -; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_sub_i32_ret_addr64_offset: @@ -1258,12 +1296,11 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 -; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index @@ -1322,16 +1359,18 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_sub_i32_ret: @@ -1358,12 +1397,11 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %val = atomicrmw volatile sub ptr %out, i32 %in syncscope("agent") seq_cst @@ -1441,12 +1479,14 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s8 -; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 
+; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_sub_i32_ret_addr64: @@ -1481,12 +1521,11 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 -; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index @@ -1546,18 +1585,20 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_smax v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_ret_offset: @@ -1586,12 +1627,11 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 
offset:16 glc +; GCN3-NEXT: flat_atomic_smax v0, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 ; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -1674,12 +1714,14 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s8 -; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_smax v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_ret_addr64_offset: @@ -1716,12 +1758,11 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 -; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_smax v0, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 ; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index @@ -1777,16 +1818,18 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: 
s_load_dword s4, s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_smax v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_ret: @@ -1813,12 +1856,11 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_smax v0, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 ; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %val = atomicrmw volatile max ptr %out, i32 %in syncscope("workgroup") seq_cst @@ -1893,12 +1935,14 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s8 -; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_smax v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: buffer_store_dword 
v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_ret_addr64: @@ -1933,12 +1977,11 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 -; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_smax v0, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 ; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index @@ -1998,18 +2041,20 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 % ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_umax v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_ret_offset: @@ -2038,12 +2083,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 % ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_umax v0, v[0:1], v2 offset:16 
glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 ; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -2126,12 +2170,14 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s8 -; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_umax v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset: @@ -2168,12 +2214,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 -; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_umax v0, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 ; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index @@ -2229,16 +2274,18 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; 
GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_umax v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_ret: @@ -2265,12 +2312,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_umax v0, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 ; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %val = atomicrmw volatile umax ptr %out, i32 %in syncscope("workgroup") seq_cst @@ -2345,12 +2391,14 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s8 -; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_umax v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: 
atomic_umax_i32_ret_addr64: @@ -2385,12 +2433,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 -; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_umax v0, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 ; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index @@ -2450,18 +2497,20 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_smin v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_ret_offset: @@ -2490,12 +2539,11 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_smin v0, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: 
v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 ; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -2578,12 +2626,14 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s8 -; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_smin v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_ret_addr64_offset: @@ -2620,12 +2670,11 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 -; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_smin v0, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 ; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index @@ -2681,16 +2730,18 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_smin v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_ret: @@ -2717,12 +2768,11 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_smin v0, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 ; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %val = atomicrmw volatile min ptr %out, i32 %in syncscope("workgroup") seq_cst @@ -2797,12 +2847,14 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s8 -; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_smin v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_ret_addr64: @@ -2837,12 +2889,11 @@ define 
amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 -; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_smin v0, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 ; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index @@ -2902,18 +2953,20 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 % ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_umin v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32_ret_offset: @@ -2942,12 +2995,11 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 % ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_umin v0, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; 
GCN3-NEXT: v_mov_b32_e32 v1, 0 ; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3030,12 +3082,14 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s8 -; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_umin v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32_ret_addr64_offset: @@ -3072,12 +3126,11 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 -; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_umin v0, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 ; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index @@ -3133,16 +3186,18 @@ define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: 
v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_umin v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32_ret: @@ -3169,12 +3224,11 @@ define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_umin v0, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 ; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %val = atomicrmw volatile umin ptr %out, i32 %in syncscope("workgroup") seq_cst @@ -3249,12 +3303,14 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s8 -; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_umin v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32_ret_addr64: @@ -3289,12 +3345,11 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, 
ptr %out2, i32 % ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 -; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_umin v0, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 ; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index @@ -3357,18 +3412,20 @@ define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_or_i32_ret_offset: @@ -3397,12 +3454,11 @@ define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: 
flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3488,12 +3544,14 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s8 -; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_or_i32_ret_addr64_offset: @@ -3530,12 +3588,11 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 -; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index @@ -3594,16 +3651,18 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 
v2, s4 -; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_or_i32_ret: @@ -3630,12 +3689,11 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %val = atomicrmw volatile or ptr %out, i32 %in syncscope("agent") seq_cst @@ -3713,12 +3771,14 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s8 -; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_or_i32_ret_addr64: @@ -3753,12 +3813,11 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in ; GCN3-NEXT: v_mov_b32_e32 
v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 -; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index @@ -3870,18 +3929,20 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr %out, ptr %out2, i32 % ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i32_ret_offset: @@ -3910,12 +3971,11 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr %out, ptr %out2, i32 % ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; 
GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4001,12 +4061,14 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s8 -; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i32_ret_addr64_offset: @@ -4043,12 +4105,11 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 -; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index @@ -4107,16 +4168,18 @@ define amdgpu_kernel void @atomic_xchg_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; 
GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i32_ret: @@ -4143,12 +4206,11 @@ define amdgpu_kernel void @atomic_xchg_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst @@ -4226,12 +4288,14 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s8 -; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i32_ret_addr64: @@ -4266,12 +4330,11 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN3-NEXT: 
v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 -; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index @@ -4336,6 +4399,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3 ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -4343,12 +4408,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_cmpxchg_i32_ret_offset: @@ -4379,12 +4444,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4479,12 +4543,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s9 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: @@ -4525,12 +4591,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v1, s9 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] offset:16 glc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index @@ -4590,17 +4655,19 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN1-NEXT: 
s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_cmpxchg_i32_ret: @@ -4629,12 +4696,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, ; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %val = cmpxchg volatile ptr %out, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst @@ -4721,12 +4787,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s9 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: 
s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64: @@ -4765,12 +4833,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v1, s9 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index @@ -4834,18 +4901,20 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xor_i32_ret_offset: @@ -4874,12 +4943,11 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; 
GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4965,12 +5033,14 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s8 -; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xor_i32_ret_addr64_offset: @@ -5007,12 +5077,11 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 -; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index @@ -5071,16 +5140,18 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1: ; %bb.0: ; %entry ; 
GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xor_i32_ret: @@ -5107,12 +5178,11 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %val = atomicrmw volatile xor ptr %out, i32 %in syncscope("agent") seq_cst @@ -5190,12 +5260,14 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s8 -; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: 
s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xor_i32_ret_addr64: @@ -5230,12 +5302,11 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 -; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index @@ -5248,17 +5319,17 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i32_offset: ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v2, v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: s_mov_b32 s2, s6 +; GCN1-NEXT: s_mov_b32 s3, s7 +; GCN1-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:16 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_load_i32_offset: @@ -5270,7 +5341,7 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) { ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; 
GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v2, v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 @@ -5280,15 +5351,12 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) { ; GCN3-LABEL: atomic_load_i32_offset: ; GCN3: ; %bb.0: ; %entry ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_store_dword v0, v1, s[2:3] ; GCN3-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %in, i32 4 @@ -5301,15 +5369,17 @@ define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i32: ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v2, v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_load_i32: @@ -5319,7 +5389,7 @@ define 
amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) { ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v2, v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 @@ -5329,15 +5399,12 @@ define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) { ; GCN3-LABEL: atomic_load_i32: ; GCN3: ; %bb.0: ; %entry ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v2, v[0:1] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: global_load_dword v1, v0, s[0:1] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_store_dword v0, v1, s[2:3] ; GCN3-NEXT: s_endpgm entry: %val = load atomic i32, ptr %in seq_cst, align 4 @@ -5348,22 +5415,22 @@ entry: define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v2, v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 +; GCN1-NEXT: 
v_mov_b32_e32 v0, s8 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: s_mov_b32 s2, 0 +; GCN1-NEXT: s_mov_b32 s3, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s9 +; GCN1-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:16 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_load_i32_addr64_offset: @@ -5379,7 +5446,7 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v2, v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 @@ -5390,18 +5457,15 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64 ; GCN3: ; %bb.0: ; %entry ; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; GCN3-NEXT: s_add_u32 s0, s0, s4 ; GCN3-NEXT: s_addc_u32 s1, s1, s5 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_store_dword v0, v1, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %in, i64 %index @@ -5414,20 +5478,22 @@ entry: define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: 
atomic_load_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v2, v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 +; GCN1-NEXT: v_mov_b32_e32 v0, s8 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: s_mov_b32 s2, 0 +; GCN1-NEXT: s_mov_b32 s3, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s9 +; GCN1-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_load_i32_addr64: @@ -5441,7 +5507,7 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v2, v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 @@ -5452,18 +5518,15 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index) ; GCN3: ; %bb.0: ; %entry ; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; GCN3-NEXT: s_add_u32 s0, s0, s4 ; GCN3-NEXT: s_addc_u32 s1, s1, s5 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; 
GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v2, v[0:1] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: global_load_dword v1, v0, s[0:1] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_store_dword v0, v1, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %in, i64 %index @@ -5475,15 +5538,13 @@ entry: define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i32_offset: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb -; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i32_offset: @@ -5501,13 +5562,12 @@ define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr %out) { ; ; GCN3-LABEL: atomic_store_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 +; GCN3-NEXT: v_mov_b32_e32 v1, s2 +; GCN3-NEXT: global_store_dword v0, v1, s[0:1] offset:16 ; GCN3-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5518,13 +5578,13 @@ entry: define amdgpu_kernel 
void @atomic_store_i32(i32 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i32: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb -; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i32: @@ -5540,13 +5600,12 @@ define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr %out) { ; ; GCN3-LABEL: atomic_store_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, s2 +; GCN3-NEXT: global_store_dword v0, v1, s[0:1] ; GCN3-NEXT: s_endpgm entry: store atomic i32 %in, ptr %out seq_cst, align 4 @@ -5557,17 +5616,15 @@ define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr %out, i64 ; GCN1-LABEL: atomic_store_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GCN1-NEXT: s_load_dword s4, s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GCN1-NEXT: s_add_u32 s0, s0, s2 -; GCN1-NEXT: s_addc_u32 s1, s1, s3 -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_lshl_b64 
s[4:5], s[2:3], 2 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:16 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i32_addr64_offset: @@ -5590,14 +5647,13 @@ define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr %out, i64 ; GCN3: ; %bb.0: ; %entry ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GCN3-NEXT: s_load_dword s6, s[4:5], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s0, s2 ; GCN3-NEXT: s_addc_u32 s1, s1, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 +; GCN3-NEXT: v_mov_b32_e32 v1, s6 +; GCN3-NEXT: global_store_dword v0, v1, s[0:1] offset:16 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index @@ -5610,15 +5666,16 @@ define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr %out, i64 %index ; GCN1-LABEL: atomic_store_i32_addr64: ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GCN1-NEXT: s_load_dword s4, s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GCN1-NEXT: s_add_u32 s0, s0, s2 -; GCN1-NEXT: s_addc_u32 s1, s1, s3 +; GCN1-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i32_addr64: @@ -5639,14 +5696,13 @@ define 
amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr %out, i64 %index ; GCN3: ; %bb.0: ; %entry ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GCN3-NEXT: s_load_dword s6, s[4:5], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s0, s2 ; GCN3-NEXT: s_addc_u32 s1, s1, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, s6 +; GCN3-NEXT: global_store_dword v0, v1, s[0:1] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index @@ -5658,17 +5714,17 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f32_offset: ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v2, v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: s_mov_b32 s2, s6 +; GCN1-NEXT: s_mov_b32 s3, s7 +; GCN1-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:16 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_load_f32_offset: @@ -5680,7 +5736,7 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) { ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v2, v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: 
s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 @@ -5690,15 +5746,12 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) { ; GCN3-LABEL: atomic_load_f32_offset: ; GCN3: ; %bb.0: ; %entry ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_store_dword v0, v1, s[2:3] ; GCN3-NEXT: s_endpgm entry: %gep = getelementptr float, ptr %in, i32 4 @@ -5711,15 +5764,17 @@ define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f32: ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v2, v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_load_f32: @@ -5729,7 +5784,7 @@ define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) { ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; 
GCN2-NEXT: flat_load_dword v2, v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 @@ -5739,15 +5794,12 @@ define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) { ; GCN3-LABEL: atomic_load_f32: ; GCN3: ; %bb.0: ; %entry ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v2, v[0:1] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: global_load_dword v1, v0, s[0:1] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_store_dword v0, v1, s[2:3] ; GCN3-NEXT: s_endpgm entry: %val = load atomic float, ptr %in seq_cst, align 4 @@ -5758,22 +5810,22 @@ entry: define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_f32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v2, v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 +; GCN1-NEXT: v_mov_b32_e32 v0, s8 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: s_mov_b32 s2, 0 +; GCN1-NEXT: s_mov_b32 s3, s7 +; GCN1-NEXT: 
v_mov_b32_e32 v1, s9 +; GCN1-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:16 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_load_f32_addr64_offset: @@ -5789,7 +5841,7 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v2, v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 @@ -5800,18 +5852,15 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64 ; GCN3: ; %bb.0: ; %entry ; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; GCN3-NEXT: s_add_u32 s0, s0, s4 ; GCN3-NEXT: s_addc_u32 s1, s1, s5 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_store_dword v0, v1, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr float, ptr %in, i64 %index @@ -5824,20 +5873,22 @@ entry: define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_f32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 
+; GCN1-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v2, v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 +; GCN1-NEXT: v_mov_b32_e32 v0, s8 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: s_mov_b32 s2, 0 +; GCN1-NEXT: s_mov_b32 s3, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s9 +; GCN1-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_load_f32_addr64: @@ -5851,7 +5902,7 @@ define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v2, v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 @@ -5862,18 +5913,15 @@ define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index) ; GCN3: ; %bb.0: ; %entry ; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; GCN3-NEXT: s_add_u32 s0, s0, s4 ; GCN3-NEXT: s_addc_u32 s1, s1, s5 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v2, v[0:1] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: 
global_load_dword v1, v0, s[0:1] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_store_dword v0, v1, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr float, ptr %in, i64 %index @@ -5885,15 +5933,13 @@ entry: define amdgpu_kernel void @atomic_store_f32_offset(float %in, ptr %out) { ; GCN1-LABEL: atomic_store_f32_offset: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb -; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f32_offset: @@ -5911,13 +5957,12 @@ define amdgpu_kernel void @atomic_store_f32_offset(float %in, ptr %out) { ; ; GCN3-LABEL: atomic_store_f32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 +; GCN3-NEXT: v_mov_b32_e32 v1, s2 +; GCN3-NEXT: global_store_dword v0, v1, s[0:1] offset:16 ; GCN3-NEXT: s_endpgm entry: %gep = getelementptr float, ptr %out, i32 4 @@ -5928,13 +5973,13 @@ entry: define amdgpu_kernel void @atomic_store_f32(float %in, ptr %out) { ; GCN1-LABEL: atomic_store_f32: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: 
s_load_dword s6, s[4:5], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb -; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f32: @@ -5950,13 +5995,12 @@ define amdgpu_kernel void @atomic_store_f32(float %in, ptr %out) { ; ; GCN3-LABEL: atomic_store_f32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, s2 +; GCN3-NEXT: global_store_dword v0, v1, s[0:1] ; GCN3-NEXT: s_endpgm entry: store atomic float %in, ptr %out seq_cst, align 4 @@ -5967,17 +6011,15 @@ define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr %out, i ; GCN1-LABEL: atomic_store_f32_addr64_offset: ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GCN1-NEXT: s_load_dword s4, s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GCN1-NEXT: s_add_u32 s0, s0, s2 -; GCN1-NEXT: s_addc_u32 s1, s1, s3 -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[2:3], 2 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, 0 
+; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:16 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f32_addr64_offset: @@ -6000,14 +6042,13 @@ define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr %out, i ; GCN3: ; %bb.0: ; %entry ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GCN3-NEXT: s_load_dword s6, s[4:5], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s0, s2 ; GCN3-NEXT: s_addc_u32 s1, s1, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 +; GCN3-NEXT: v_mov_b32_e32 v1, s6 +; GCN3-NEXT: global_store_dword v0, v1, s[0:1] offset:16 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr float, ptr %out, i64 %index @@ -6020,15 +6061,16 @@ define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr %out, i64 %ind ; GCN1-LABEL: atomic_store_f32_addr64: ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GCN1-NEXT: s_load_dword s4, s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GCN1-NEXT: s_add_u32 s0, s0, s2 -; GCN1-NEXT: s_addc_u32 s1, s1, s3 +; GCN1-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f32_addr64: @@ -6049,14 +6091,13 @@ define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr %out, i64 %ind ; GCN3: ; %bb.0: ; %entry ; GCN3-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GCN3-NEXT: s_load_dword s6, s[4:5], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s0, s2 ; GCN3-NEXT: s_addc_u32 s1, s1, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, s6 +; GCN3-NEXT: global_store_dword v0, v1, s[0:1] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr float, ptr %out, i64 %index @@ -6068,17 +6109,17 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i8_offset: ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_ubyte v2, v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: s_mov_b32 s2, s6 +; GCN1-NEXT: s_mov_b32 s3, s7 +; GCN1-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:16 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_byte v[0:1], v2 +; GCN1-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_load_i8_offset: @@ -6090,7 +6131,7 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) { ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_ubyte v2, v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 @@ 
-6100,15 +6141,12 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) { ; GCN3-LABEL: atomic_load_i8_offset: ; GCN3: ; %bb.0: ; %entry ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_ubyte v2, v[0:1] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: global_load_ubyte v1, v0, s[0:1] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_byte v[0:1], v2 +; GCN3-NEXT: global_store_byte v0, v1, s[2:3] ; GCN3-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr %in, i64 16 @@ -6121,15 +6159,17 @@ define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i8: ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_ubyte v2, v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_byte v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_load_i8: @@ -6139,7 +6179,7 @@ define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) { ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_ubyte v2, v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: 
buffer_wbinvl1_vol ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 @@ -6149,15 +6189,12 @@ define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) { ; GCN3-LABEL: atomic_load_i8: ; GCN3: ; %bb.0: ; %entry ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_ubyte v2, v[0:1] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: global_load_ubyte v1, v0, s[0:1] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_byte v[0:1], v2 +; GCN3-NEXT: global_store_byte v0, v1, s[2:3] ; GCN3-NEXT: s_endpgm entry: %val = load atomic i8, ptr %in seq_cst, align 1 @@ -6169,20 +6206,20 @@ define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 % ; GCN1-LABEL: atomic_load_i8_addr64_offset: ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_ubyte v2, v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: v_mov_b32_e32 v0, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s9 +; GCN1-NEXT: s_mov_b32 s2, 0 +; GCN1-NEXT: s_mov_b32 s3, s7 +; GCN1-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:16 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: 
flat_store_byte v[0:1], v2 +; GCN1-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_load_i8_addr64_offset: @@ -6197,7 +6234,7 @@ define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 % ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_ubyte v2, v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 @@ -6208,17 +6245,14 @@ define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 % ; GCN3: ; %bb.0: ; %entry ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s0, s0, s6 ; GCN3-NEXT: s_addc_u32 s1, s1, s7 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_ubyte v2, v[0:1] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: global_load_ubyte v1, v0, s[0:1] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_byte v[0:1], v2 +; GCN3-NEXT: global_store_byte v0, v1, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i8, ptr %in, i64 %index @@ -6231,15 +6265,13 @@ entry: define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i8_offset: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb -; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 
v2, s2 -; GCN1-NEXT: flat_store_byte v[0:1], v2 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:16 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i8_offset: @@ -6257,13 +6289,12 @@ define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr %out) { ; ; GCN3-LABEL: atomic_store_i8_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: flat_store_byte v[0:1], v2 offset:16 +; GCN3-NEXT: v_mov_b32_e32 v1, s2 +; GCN3-NEXT: global_store_byte v0, v1, s[0:1] offset:16 ; GCN3-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr %out, i64 16 @@ -6274,13 +6305,13 @@ entry: define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i8: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb -; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: flat_store_byte v[0:1], v2 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i8: @@ -6296,13 +6327,12 @@ define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr %out) { ; ; GCN3-LABEL: atomic_store_i8: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; 
GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: flat_store_byte v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, s2 +; GCN3-NEXT: global_store_byte v0, v1, s[0:1] ; GCN3-NEXT: s_endpgm entry: store atomic i8 %in, ptr %out seq_cst, align 1 @@ -6315,14 +6345,12 @@ define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, i64 % ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, s2 -; GCN1-NEXT: s_addc_u32 s1, s1, s3 -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, 0 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: flat_store_byte v[0:1], v2 +; GCN1-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:16 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i8_addr64_offset: @@ -6344,13 +6372,12 @@ define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, i64 % ; GCN3: ; %bb.0: ; %entry ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GCN3-NEXT: s_load_dword s6, s[4:5], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s0, s0, s2 ; GCN3-NEXT: s_addc_u32 s1, s1, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_store_byte v[0:1], v2 offset:16 +; GCN3-NEXT: v_mov_b32_e32 v1, s6 +; GCN3-NEXT: global_store_byte v0, v1, s[0:1] offset:16 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i8, ptr %out, i64 %index @@ -6363,17 +6390,17 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i16_offset: ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: 
s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: s_mov_b32 s2, s6 +; GCN1-NEXT: s_mov_b32 s3, s7 +; GCN1-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:16 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_short v[0:1], v2 +; GCN1-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_load_i16_offset: @@ -6385,7 +6412,7 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) { ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 @@ -6395,15 +6422,12 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) { ; GCN3-LABEL: atomic_load_i16_offset: ; GCN3: ; %bb.0: ; %entry ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_short v[0:1], v2 +; GCN3-NEXT: global_store_short v0, v1, s[2:3] ; GCN3-NEXT: s_endpgm entry: %gep = getelementptr i16, ptr %in, i64 8 @@ -6416,15 +6440,17 @@ define amdgpu_kernel void 
@atomic_load_i16(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i16: ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_short v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_load_i16: @@ -6434,7 +6460,7 @@ define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) { ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 @@ -6444,15 +6470,12 @@ define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) { ; GCN3-LABEL: atomic_load_i16: ; GCN3: ; %bb.0: ; %entry ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: global_load_ushort v1, v0, s[0:1] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_short v[0:1], v2 +; GCN3-NEXT: global_store_short v0, v1, s[2:3] ; GCN3-NEXT: s_endpgm entry: %val = load atomic i16, ptr 
%in seq_cst, align 2 @@ -6463,22 +6486,22 @@ entry: define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i16_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 1 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 +; GCN1-NEXT: v_mov_b32_e32 v0, s8 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: s_mov_b32 s2, 0 +; GCN1-NEXT: s_mov_b32 s3, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s9 +; GCN1-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 offset:16 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_short v[0:1], v2 +; GCN1-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_load_i16_addr64_offset: @@ -6494,7 +6517,7 @@ define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 @@ -6505,18 +6528,15 @@ define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 ; GCN3: ; %bb.0: ; %entry ; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN3-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 1 ; GCN3-NEXT: s_add_u32 s0, s0, s4 ; GCN3-NEXT: s_addc_u32 s1, s1, s5 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_short v[0:1], v2 +; GCN3-NEXT: global_store_short v0, v1, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i16, ptr %in, i64 %index @@ -6529,15 +6549,13 @@ entry: define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i16_offset: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb -; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: flat_store_short v[0:1], v2 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:16 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i16_offset: @@ -6555,13 +6573,12 @@ define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr %out) { ; ; GCN3-LABEL: atomic_store_i16_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 
v2, s2 -; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16 +; GCN3-NEXT: v_mov_b32_e32 v1, s2 +; GCN3-NEXT: global_store_short v0, v1, s[0:1] offset:16 ; GCN3-NEXT: s_endpgm entry: %gep = getelementptr i16, ptr %out, i64 8 @@ -6572,13 +6589,13 @@ entry: define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i16: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb -; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: flat_store_short v[0:1], v2 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i16: @@ -6594,13 +6611,12 @@ define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr %out) { ; ; GCN3-LABEL: atomic_store_i16: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: flat_store_short v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, s2 +; GCN3-NEXT: global_store_short v0, v1, s[0:1] ; GCN3-NEXT: s_endpgm entry: store atomic i16 %in, ptr %out seq_cst, align 2 @@ -6611,17 +6627,15 @@ define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 ; GCN1-LABEL: atomic_store_i16_addr64_offset: ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GCN1-NEXT: s_load_dword s4, s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GCN1-NEXT: s_add_u32 s0, s0, s2 -; 
GCN1-NEXT: s_addc_u32 s1, s1, s3 -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: flat_store_short v[0:1], v2 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[2:3], 1 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 offset:16 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i16_addr64_offset: @@ -6644,14 +6658,13 @@ define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 ; GCN3: ; %bb.0: ; %entry ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GCN3-NEXT: s_load_dword s6, s[4:5], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GCN3-NEXT: s_add_u32 s0, s0, s2 ; GCN3-NEXT: s_addc_u32 s1, s1, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16 +; GCN3-NEXT: v_mov_b32_e32 v1, s6 +; GCN3-NEXT: global_store_short v0, v1, s[0:1] offset:16 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i16, ptr %out, i64 %index @@ -6663,15 +6676,13 @@ entry: define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr %out) { ; GCN1-LABEL: atomic_store_f16_offset: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb -; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: flat_store_short v[0:1], v2 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; 
GCN1-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:16 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f16_offset: @@ -6689,13 +6700,12 @@ define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr %out) { ; ; GCN3-LABEL: atomic_store_f16_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16 +; GCN3-NEXT: v_mov_b32_e32 v1, s2 +; GCN3-NEXT: global_store_short v0, v1, s[0:1] offset:16 ; GCN3-NEXT: s_endpgm entry: %gep = getelementptr half, ptr %out, i64 8 @@ -6706,13 +6716,13 @@ entry: define amdgpu_kernel void @atomic_store_f16(half %in, ptr %out) { ; GCN1-LABEL: atomic_store_f16: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb -; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: flat_store_short v[0:1], v2 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f16: @@ -6728,13 +6738,12 @@ define amdgpu_kernel void @atomic_store_f16(half %in, ptr %out) { ; ; GCN3-LABEL: atomic_store_f16: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: flat_store_short 
v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, s2 +; GCN3-NEXT: global_store_short v0, v1, s[0:1] ; GCN3-NEXT: s_endpgm entry: store atomic half %in, ptr %out seq_cst, align 2 @@ -6744,13 +6753,13 @@ entry: define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) { ; GCN1-LABEL: atomic_store_bf16_offset: ; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb -; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: flat_store_short v[0:1], v2 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_bf16_offset: @@ -6766,13 +6775,12 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) { ; ; GCN3-LABEL: atomic_store_bf16_offset: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: flat_store_short v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, s2 +; GCN3-NEXT: global_store_short v0, v1, s[0:1] ; GCN3-NEXT: s_endpgm %gep = getelementptr bfloat, ptr %out, i64 8 store atomic bfloat %in, ptr %out seq_cst, align 2 @@ -6782,13 +6790,13 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) { define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr %out) { ; GCN1-LABEL: atomic_store_bf16: ; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb -; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 
s2, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: flat_store_short v[0:1], v2 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_bf16: @@ -6804,13 +6812,12 @@ define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr %out) { ; ; GCN3-LABEL: atomic_store_bf16: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: flat_store_short v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, s2 +; GCN3-NEXT: global_store_short v0, v1, s[0:1] ; GCN3-NEXT: s_endpgm store atomic bfloat %in, ptr %out seq_cst, align 2 ret void @@ -6970,18 +6977,20 @@ define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_inc_i32_ret_offset: @@ -7010,12 +7019,11 @@ define amdgpu_kernel 
void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7101,12 +7109,14 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s8 -; GCN1-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_inc_i32_ret_incr64_offset: @@ -7143,12 +7153,11 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 -; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = 
getelementptr i32, ptr %out, i64 %index @@ -7207,16 +7216,18 @@ define amdgpu_kernel void @atomic_inc_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_inc_i32_ret: @@ -7243,12 +7254,11 @@ define amdgpu_kernel void @atomic_inc_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %val = atomicrmw volatile uinc_wrap ptr %out, i32 %in syncscope("agent") seq_cst @@ -7326,12 +7336,14 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s8 -; GCN1-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: 
v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_inc_i32_ret_incr64: @@ -7366,12 +7378,11 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 -; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index @@ -7534,18 +7545,20 @@ define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i32_ret_offset: @@ -7574,12 +7587,11 @@ define amdgpu_kernel void 
@atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7665,12 +7677,14 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s8 -; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i32_ret_decr64_offset: @@ -7707,12 +7721,11 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 -; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = 
getelementptr i32, ptr %out, i64 %index @@ -7771,16 +7784,18 @@ define amdgpu_kernel void @atomic_dec_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i32_ret: @@ -7807,12 +7822,11 @@ define amdgpu_kernel void @atomic_dec_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %val = atomicrmw volatile udec_wrap ptr %out, i32 %in syncscope("agent") seq_cst @@ -7890,12 +7904,14 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s8 -; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: 
v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i32_ret_decr64: @@ -7930,12 +7946,11 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 -; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index @@ -7948,17 +7963,17 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f16_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: s_mov_b32 s2, s6 +; GCN1-NEXT: s_mov_b32 s3, s7 +; GCN1-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:16 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_short v[0:1], v2 +; GCN1-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; 
; GCN2-LABEL: atomic_load_f16_offset: @@ -7970,7 +7985,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 @@ -7980,15 +7995,12 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { ; GCN3-LABEL: atomic_load_f16_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_short v[0:1], v2 +; GCN3-NEXT: global_store_short v0, v1, s[2:3] ; GCN3-NEXT: s_endpgm %gep = getelementptr half, ptr %in, i64 8 %val = load atomic half, ptr %gep seq_cst, align 2 @@ -8000,15 +8012,17 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f16: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_short v[0:1], v2 
+; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_load_f16: @@ -8018,7 +8032,7 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 @@ -8028,15 +8042,12 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { ; GCN3-LABEL: atomic_load_f16: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: global_load_ushort v1, v0, s[0:1] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_short v[0:1], v2 +; GCN3-NEXT: global_store_short v0, v1, s[2:3] ; GCN3-NEXT: s_endpgm %val = load atomic half, ptr %in seq_cst, align 2 store half %val, ptr %out @@ -8047,17 +8058,17 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_bf16_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: s_mov_b32 s2, s6 +; 
GCN1-NEXT: s_mov_b32 s3, s7 +; GCN1-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:16 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_short v[0:1], v2 +; GCN1-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_load_bf16_offset: @@ -8069,7 +8080,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 @@ -8079,15 +8090,12 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { ; GCN3-LABEL: atomic_load_bf16_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_short v[0:1], v2 +; GCN3-NEXT: global_store_short v0, v1, s[2:3] ; GCN3-NEXT: s_endpgm %gep = getelementptr bfloat, ptr %in, i64 8 %val = load atomic bfloat, ptr %gep seq_cst, align 2 @@ -8099,15 +8107,17 @@ define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_bf16: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: 
flat_load_ushort v2, v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_short v[0:1], v2 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_load_bf16: @@ -8117,7 +8127,7 @@ define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) { ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 @@ -8127,15 +8137,12 @@ define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) { ; GCN3-LABEL: atomic_load_bf16: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: global_load_ushort v1, v0, s[0:1] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_short v[0:1], v2 +; GCN3-NEXT: global_store_short v0, v1, s[2:3] ; GCN3-NEXT: s_endpgm %val = load atomic bfloat, ptr %in seq_cst, align 2 store bfloat %val, ptr %out diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll index 1311560715ddd..e1c04dd6233f2 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll @@ 
-3823,90 +3823,110 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_max_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN1-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN1-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN1-NEXT: s_cbranch_execz .LBB88_3 +; GCN1-NEXT: ; %bb.1: ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s5, s3, 31 ; GCN1-NEXT: s_mov_b32 s4, s3 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_add_u32 s4, s0, s4 +; GCN1-NEXT: s_addc_u32 s5, s1, s5 +; GCN1-NEXT: s_load_dword s3, s[4:5], 0x4 ; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: .LBB88_1: ; %atomicrmw.start +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: .LBB88_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_i32_e32 v2, s2, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_max_i32_e32 v0, s2, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_mov_b32_e32 
v1, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB88_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_cbranch_execnz .LBB88_2 +; GCN1-NEXT: .LBB88_3: ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN2-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN2-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN2-NEXT: s_cbranch_execz .LBB88_3 +; GCN2-NEXT: ; %bb.1: ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s5, s3, 31 ; GCN2-NEXT: s_mov_b32 s4, s3 ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_add_u32 s4, s0, s4 +; GCN2-NEXT: s_addc_u32 s5, s1, s5 +; GCN2-NEXT: s_load_dword s3, s[4:5], 0x10 +; GCN2-NEXT: s_add_u32 s4, s4, 16 +; GCN2-NEXT: s_addc_u32 s5, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: .LBB88_1: ; %atomicrmw.start +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: .LBB88_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_max_i32_e32 v2, s2, v3 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB88_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_cbranch_execnz .LBB88_2 
+; GCN2-NEXT: .LBB88_3: ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry +; GCN3-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN3-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN3-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN3-NEXT: s_cbranch_execz .LBB88_3 +; GCN3-NEXT: ; %bb.1: ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v2, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_ashr_i32 s5, s3, 31 ; GCN3-NEXT: s_mov_b32 s4, s3 ; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN3-NEXT: s_add_u32 s0, s0, s4 ; GCN3-NEXT: s_addc_u32 s1, s1, s5 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start +; GCN3-NEXT: s_load_dword s3, s[0:1], 0x10 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: .LBB88_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_max_i32_e32 v0, s2, v1 +; GCN3-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB88_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB88_2 +; GCN3-NEXT: .LBB88_3: ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, 
ptr %out, i32 %index @@ -3918,105 +3938,153 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN1-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN1-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN1-NEXT: ; implicit-def: $vgpr0 +; GCN1-NEXT: s_and_saveexec_b64 s[10:11], vcc +; GCN1-NEXT: s_cbranch_execz .LBB89_4 +; GCN1-NEXT: ; %bb.1: ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_ashr_i32 s5, s7, 31 -; GCN1-NEXT: s_mov_b32 s4, s7 +; GCN1-NEXT: s_ashr_i32 s5, s9, 31 +; GCN1-NEXT: s_mov_b32 s4, s9 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s4, s0, s4 +; GCN1-NEXT: s_addc_u32 s5, s1, s5 +; GCN1-NEXT: s_load_dword s0, s[4:5], 0x4 +; GCN1-NEXT: s_mov_b64 s[12:13], 0 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v2, v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: .LBB89_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB89_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: v_max_i32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB89_1 -; GCN1-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_max_i32_e32 v1, s8, v2 +; GCN1-NEXT: v_mov_b32_e32 v0, v1 +; GCN1-NEXT: v_mov_b32_e32 v1, v2 +; GCN1-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e64 s[0:1], v0, v2 +; GCN1-NEXT: s_or_b64 s[12:13], s[0:1], s[12:13] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[12:13] +; GCN1-NEXT: s_cbranch_execnz .LBB89_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[12:13] +; GCN1-NEXT: .LBB89_4: ; %Flow6 +; GCN1-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_readfirstlane_b32 s0, v0 +; GCN1-NEXT: v_mov_b32_e32 v0, s8 +; GCN1-NEXT: v_bfrev_b32_e32 v1, 1 +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: v_max_i32_e32 v0, s0, v0 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GCN2-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN2-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN2-NEXT: s_cbranch_execz .LBB89_4 +; GCN2-NEXT: ; %bb.1: ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_ashr_i32 s5, s7, 31 -; GCN2-NEXT: s_mov_b32 s4, s7 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_ashr_i32 s9, s5, 31 +; GCN2-NEXT: 
s_mov_b32 s8, s5 +; GCN2-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s8 +; GCN2-NEXT: s_addc_u32 s1, s1, s9 +; GCN2-NEXT: s_load_dword s5, s[0:1], 0x10 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_mov_b64 s[8:9], 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v2, v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: .LBB89_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB89_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN2-NEXT: v_max_i32_e32 v2, s4, v3 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB89_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v3 +; GCN2-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_cbranch_execnz .LBB89_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: .LBB89_4: ; %Flow5 +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_readfirstlane_b32 s0, v2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_bfrev_b32_e32 v3, 1 +; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_max_i32_e32 v2, s0, v2 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry 
-; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN3-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN3-NEXT: ; implicit-def: $vgpr0 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_cbranch_execz .LBB89_4 +; GCN3-NEXT: ; %bb.1: +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_ashr_i32 s9, s7, 31 +; GCN3-NEXT: s_mov_b32 s8, s7 +; GCN3-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 +; GCN3-NEXT: s_add_u32 s8, s0, s8 +; GCN3-NEXT: s_addc_u32 s9, s1, s9 +; GCN3-NEXT: s_load_dword s0, s[8:9], 0x10 +; GCN3-NEXT: s_mov_b64 s[10:11], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s5, s7, 31 -; GCN3-NEXT: s_mov_b32 s4, s7 -; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN3-NEXT: s_add_u32 s0, s0, s4 -; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB89_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v3, v0 ; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB89_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[8:9] offset:16 glc 
+; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e64 s[0:1], v0, v3 +; GCN3-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN3-NEXT: s_cbranch_execnz .LBB89_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN3-NEXT: .LBB89_4: ; %Flow6 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: v_readfirstlane_b32 s0, v0 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_bfrev_b32_e32 v2, 1 +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: v_max_i32_e32 v0, s0, v0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index @@ -4029,86 +4097,108 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_max_i32_addr64: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN1-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN1-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN1-NEXT: s_cbranch_execz .LBB90_3 +; GCN1-NEXT: ; %bb.1: ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s5, s3, 31 ; GCN1-NEXT: s_mov_b32 s4, s3 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_add_u32 s4, s0, s4 +; GCN1-NEXT: s_addc_u32 s5, s1, s5 +; GCN1-NEXT: s_load_dword s3, s[4:5], 0x0 ; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: .LBB90_1: ; %atomicrmw.start +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: .LBB90_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop 
Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_i32_e32 v2, s2, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_max_i32_e32 v0, s2, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_mov_b32_e32 v1, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB90_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_cbranch_execnz .LBB90_2 +; GCN1-NEXT: .LBB90_3: ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_addr64: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN2-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN2-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN2-NEXT: s_cbranch_execz .LBB90_3 +; GCN2-NEXT: ; %bb.1: ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s5, s3, 31 ; GCN2-NEXT: s_mov_b32 s4, s3 ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_add_u32 s4, s0, s4 +; GCN2-NEXT: s_addc_u32 s5, s1, s5 +; GCN2-NEXT: s_load_dword s3, s[4:5], 0x0 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: .LBB90_1: ; %atomicrmw.start +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: .LBB90_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GCN2-NEXT: v_max_i32_e32 v2, s2, v3 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB90_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_cbranch_execnz .LBB90_2 +; GCN2-NEXT: .LBB90_3: ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_addr64: ; GCN3: ; %bb.0: ; %entry +; GCN3-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN3-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN3-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN3-NEXT: s_cbranch_execz .LBB90_3 +; GCN3-NEXT: ; %bb.1: ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v2, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_ashr_i32 s5, s3, 31 ; GCN3-NEXT: s_mov_b32 s4, s3 ; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN3-NEXT: s_add_u32 s0, s0, s4 ; GCN3-NEXT: s_addc_u32 s1, s1, s5 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v3, v[0:1] -; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start +; GCN3-NEXT: s_load_dword s3, s[0:1], 0x0 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: .LBB90_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_max_i32_e32 v0, s2, v1 +; GCN3-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; 
GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB90_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB90_2 +; GCN3-NEXT: .LBB90_3: ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index @@ -4119,101 +4209,151 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_max_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN1-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN1-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN1-NEXT: ; implicit-def: $vgpr0 +; GCN1-NEXT: s_and_saveexec_b64 s[10:11], vcc +; GCN1-NEXT: s_cbranch_execz .LBB91_4 +; GCN1-NEXT: ; %bb.1: ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_ashr_i32 s5, s7, 31 -; GCN1-NEXT: s_mov_b32 s4, s7 +; GCN1-NEXT: s_ashr_i32 s5, s9, 31 +; GCN1-NEXT: s_mov_b32 s4, s9 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_add_u32 s4, s0, s4 +; GCN1-NEXT: s_addc_u32 s5, s1, s5 +; GCN1-NEXT: s_load_dword s0, s[4:5], 0x0 +; GCN1-NEXT: s_mov_b64 s[12:13], 0 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v2, v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: .LBB91_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB91_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; 
GCN1-NEXT: v_max_i32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB91_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_max_i32_e32 v1, s8, v2 +; GCN1-NEXT: v_mov_b32_e32 v0, v1 +; GCN1-NEXT: v_mov_b32_e32 v1, v2 +; GCN1-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e64 s[0:1], v0, v2 +; GCN1-NEXT: s_or_b64 s[12:13], s[0:1], s[12:13] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[12:13] +; GCN1-NEXT: s_cbranch_execnz .LBB91_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[12:13] +; GCN1-NEXT: .LBB91_4: ; %Flow5 +; GCN1-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_readfirstlane_b32 s0, v0 +; GCN1-NEXT: v_mov_b32_e32 v0, s8 +; GCN1-NEXT: v_bfrev_b32_e32 v1, 1 +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: v_max_i32_e32 v0, s0, v0 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GCN2-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN2-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: 
s_and_saveexec_b64 s[6:7], vcc +; GCN2-NEXT: s_cbranch_execz .LBB91_4 +; GCN2-NEXT: ; %bb.1: ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_ashr_i32 s5, s7, 31 -; GCN2-NEXT: s_mov_b32 s4, s7 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_ashr_i32 s9, s5, 31 +; GCN2-NEXT: s_mov_b32 s8, s5 +; GCN2-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s8 +; GCN2-NEXT: s_addc_u32 s1, s1, s9 +; GCN2-NEXT: s_load_dword s5, s[0:1], 0x0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_mov_b64 s[8:9], 0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v2, v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: .LBB91_1: ; %atomicrmw.start +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v2, s5 +; GCN2-NEXT: .LBB91_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN2-NEXT: v_max_i32_e32 v2, s4, v3 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB91_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v3 +; GCN2-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_cbranch_execnz .LBB91_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: .LBB91_4: ; %Flow5 +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_readfirstlane_b32 s0, v2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_bfrev_b32_e32 v3, 1 +; 
GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_max_i32_e32 v2, s0, v2 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN3-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN3-NEXT: ; implicit-def: $vgpr0 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_cbranch_execz .LBB91_4 +; GCN3-NEXT: ; %bb.1: +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_ashr_i32 s9, s7, 31 +; GCN3-NEXT: s_mov_b32 s8, s7 +; GCN3-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 +; GCN3-NEXT: s_add_u32 s8, s0, s8 +; GCN3-NEXT: s_addc_u32 s9, s1, s9 +; GCN3-NEXT: s_load_dword s0, s[8:9], 0x0 +; GCN3-NEXT: s_mov_b64 s[10:11], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s5, s7, 31 -; GCN3-NEXT: s_mov_b32 s4, s7 -; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN3-NEXT: s_add_u32 s0, s0, s4 -; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v2, v[0:1] -; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB91_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v3, v0 ; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB91_1 -; 
GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[8:9] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e64 s[0:1], v0, v3 +; GCN3-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN3-NEXT: s_cbranch_execnz .LBB91_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN3-NEXT: .LBB91_4: ; %Flow5 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: v_readfirstlane_b32 s0, v0 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_bfrev_b32_e32 v2, 1 +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: v_max_i32_e32 v0, s0, v0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index @@ -4966,90 +5106,110 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_umax_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN1-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN1-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN1-NEXT: s_cbranch_execz .LBB102_3 +; GCN1-NEXT: ; %bb.1: ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s5, s3, 31 ; GCN1-NEXT: s_mov_b32 s4, s3 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 
-; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_add_u32 s4, s0, s4 +; GCN1-NEXT: s_addc_u32 s5, s1, s5 +; GCN1-NEXT: s_load_dword s3, s[4:5], 0x4 ; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: .LBB102_1: ; %atomicrmw.start +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: .LBB102_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_u32_e32 v2, s2, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_max_u32_e32 v0, s2, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_mov_b32_e32 v1, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB102_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_cbranch_execnz .LBB102_2 +; GCN1-NEXT: .LBB102_3: ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN2-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN2-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN2-NEXT: s_cbranch_execz .LBB102_3 +; GCN2-NEXT: ; %bb.1: ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s5, s3, 31 ; GCN2-NEXT: s_mov_b32 s4, s3 ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; 
GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_add_u32 s4, s0, s4 +; GCN2-NEXT: s_addc_u32 s5, s1, s5 +; GCN2-NEXT: s_load_dword s3, s[4:5], 0x10 +; GCN2-NEXT: s_add_u32 s4, s4, 16 +; GCN2-NEXT: s_addc_u32 s5, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: .LBB102_1: ; %atomicrmw.start +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: .LBB102_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_max_u32_e32 v2, s2, v3 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB102_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_cbranch_execnz .LBB102_2 +; GCN2-NEXT: .LBB102_3: ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry +; GCN3-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN3-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN3-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN3-NEXT: s_cbranch_execz .LBB102_3 +; GCN3-NEXT: ; %bb.1: ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v2, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_ashr_i32 s5, s3, 31 ; GCN3-NEXT: s_mov_b32 s4, s3 ; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN3-NEXT: s_add_u32 s0, s0, s4 ; GCN3-NEXT: s_addc_u32 s1, s1, s5 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start +; GCN3-NEXT: s_load_dword s3, s[0:1], 0x10 +; GCN3-NEXT: s_mov_b64 s[4:5], 
0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: .LBB102_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_u32_e32 v2, s2, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_max_u32_e32 v0, s2, v1 +; GCN3-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB102_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB102_2 +; GCN3-NEXT: .LBB102_3: ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index @@ -5061,105 +5221,150 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN1-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN1-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN1-NEXT: ; implicit-def: $vgpr0 +; GCN1-NEXT: s_and_saveexec_b64 s[10:11], vcc +; GCN1-NEXT: s_cbranch_execz .LBB103_4 +; GCN1-NEXT: ; %bb.1: ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_ashr_i32 s5, s7, 31 -; GCN1-NEXT: s_mov_b32 s4, s7 +; GCN1-NEXT: s_ashr_i32 s5, s9, 31 +; GCN1-NEXT: s_mov_b32 s4, s9 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; 
GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s4, s0, s4 +; GCN1-NEXT: s_addc_u32 s5, s1, s5 +; GCN1-NEXT: s_load_dword s0, s[4:5], 0x4 +; GCN1-NEXT: s_mov_b64 s[12:13], 0 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v2, v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: .LBB103_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB103_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: v_max_u32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB103_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_max_u32_e32 v1, s8, v2 +; GCN1-NEXT: v_mov_b32_e32 v0, v1 +; GCN1-NEXT: v_mov_b32_e32 v1, v2 +; GCN1-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e64 s[0:1], v0, v2 +; GCN1-NEXT: s_or_b64 s[12:13], s[0:1], s[12:13] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[12:13] +; GCN1-NEXT: s_cbranch_execnz .LBB103_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[12:13] +; GCN1-NEXT: .LBB103_4: ; %Flow6 +; GCN1-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_readfirstlane_b32 s0, v0 +; GCN1-NEXT: v_mov_b32_e32 v0, s8 +; GCN1-NEXT: 
v_cndmask_b32_e64 v0, v0, 0, vcc +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: v_max_u32_e32 v0, s0, v0 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GCN2-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN2-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN2-NEXT: s_cbranch_execz .LBB103_4 +; GCN2-NEXT: ; %bb.1: ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_ashr_i32 s5, s7, 31 -; GCN2-NEXT: s_mov_b32 s4, s7 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_ashr_i32 s9, s5, 31 +; GCN2-NEXT: s_mov_b32 s8, s5 +; GCN2-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s8 +; GCN2-NEXT: s_addc_u32 s1, s1, s9 +; GCN2-NEXT: s_load_dword s5, s[0:1], 0x10 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_mov_b64 s[8:9], 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v2, v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: .LBB103_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB103_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: v_max_u32_e32 v2, s6, v3 +; GCN2-NEXT: v_max_u32_e32 v2, s4, v3 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 
vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB103_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v3 +; GCN2-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_cbranch_execnz .LBB103_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: .LBB103_4: ; %Flow5 +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_readfirstlane_b32 s0, v2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_max_u32_e32 v2, s0, v2 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN3-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN3-NEXT: ; implicit-def: $vgpr0 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_cbranch_execz .LBB103_4 +; GCN3-NEXT: ; %bb.1: +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_ashr_i32 s9, s7, 31 +; GCN3-NEXT: s_mov_b32 s8, s7 +; GCN3-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 +; GCN3-NEXT: s_add_u32 s8, s0, s8 +; GCN3-NEXT: s_addc_u32 s9, s1, s9 +; GCN3-NEXT: s_load_dword s0, s[8:9], 0x10 +; GCN3-NEXT: s_mov_b64 s[10:11], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s5, s7, 31 -; GCN3-NEXT: s_mov_b32 s4, s7 -; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN3-NEXT: s_add_u32 s0, s0, s4 -; GCN3-NEXT: s_addc_u32 s1, s1, s5 
; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB103_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v3, v0 ; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB103_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[8:9] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e64 s[0:1], v0, v3 +; GCN3-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN3-NEXT: s_cbranch_execnz .LBB103_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN3-NEXT: .LBB103_4: ; %Flow6 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: v_readfirstlane_b32 s0, v0 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: v_max_u32_e32 v0, s0, v0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index @@ -5172,101 +5377,148 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_umax_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: 
s_load_dwordx2 s[6:7], s[4:5], 0xd ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN1-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN1-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN1-NEXT: ; implicit-def: $vgpr0 +; GCN1-NEXT: s_and_saveexec_b64 s[10:11], vcc +; GCN1-NEXT: s_cbranch_execz .LBB104_4 +; GCN1-NEXT: ; %bb.1: ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_ashr_i32 s5, s7, 31 -; GCN1-NEXT: s_mov_b32 s4, s7 +; GCN1-NEXT: s_ashr_i32 s5, s9, 31 +; GCN1-NEXT: s_mov_b32 s4, s9 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_add_u32 s4, s0, s4 +; GCN1-NEXT: s_addc_u32 s5, s1, s5 +; GCN1-NEXT: s_load_dword s0, s[4:5], 0x0 +; GCN1-NEXT: s_mov_b64 s[12:13], 0 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v2, v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: .LBB104_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB104_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: v_max_u32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB104_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_max_u32_e32 v1, s8, v2 +; GCN1-NEXT: v_mov_b32_e32 v0, v1 +; GCN1-NEXT: v_mov_b32_e32 v1, 
v2 +; GCN1-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e64 s[0:1], v0, v2 +; GCN1-NEXT: s_or_b64 s[12:13], s[0:1], s[12:13] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[12:13] +; GCN1-NEXT: s_cbranch_execnz .LBB104_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[12:13] +; GCN1-NEXT: .LBB104_4: ; %Flow5 +; GCN1-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_readfirstlane_b32 s0, v0 +; GCN1-NEXT: v_mov_b32_e32 v0, s8 +; GCN1-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: v_max_u32_e32 v0, s0, v0 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GCN2-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN2-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN2-NEXT: s_cbranch_execz .LBB104_4 +; GCN2-NEXT: ; %bb.1: ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_ashr_i32 s5, s7, 31 -; GCN2-NEXT: s_mov_b32 s4, s7 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_ashr_i32 s9, s5, 31 +; GCN2-NEXT: s_mov_b32 s8, s5 +; GCN2-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s8 +; GCN2-NEXT: s_addc_u32 s1, s1, s9 +; GCN2-NEXT: s_load_dword s5, s[0:1], 0x0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_mov_b64 s[8:9], 0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v2, v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; 
GCN2-NEXT: .LBB104_1: ; %atomicrmw.start +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v2, s5 +; GCN2-NEXT: .LBB104_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: v_max_u32_e32 v2, s6, v3 +; GCN2-NEXT: v_max_u32_e32 v2, s4, v3 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB104_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v3 +; GCN2-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_cbranch_execnz .LBB104_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: .LBB104_4: ; %Flow5 +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_readfirstlane_b32 s0, v2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_max_u32_e32 v2, s0, v2 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN3-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN3-NEXT: ; implicit-def: $vgpr0 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_cbranch_execz .LBB104_4 +; GCN3-NEXT: ; %bb.1: +; GCN3-NEXT: s_waitcnt 
lgkmcnt(0) +; GCN3-NEXT: s_ashr_i32 s9, s7, 31 +; GCN3-NEXT: s_mov_b32 s8, s7 +; GCN3-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 +; GCN3-NEXT: s_add_u32 s8, s0, s8 +; GCN3-NEXT: s_addc_u32 s9, s1, s9 +; GCN3-NEXT: s_load_dword s0, s[8:9], 0x0 +; GCN3-NEXT: s_mov_b64 s[10:11], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s5, s7, 31 -; GCN3-NEXT: s_mov_b32 s4, s7 -; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN3-NEXT: s_add_u32 s0, s0, s4 -; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v2, v[0:1] -; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB104_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v3, v0 ; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB104_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[8:9] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e64 s[0:1], v0, v3 +; GCN3-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN3-NEXT: s_cbranch_execnz .LBB104_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN3-NEXT: .LBB104_4: ; %Flow5 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: v_readfirstlane_b32 s0, v0 +; 
GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: v_max_u32_e32 v0, s0, v0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index @@ -6760,90 +7012,110 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_min_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN1-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN1-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN1-NEXT: s_cbranch_execz .LBB125_3 +; GCN1-NEXT: ; %bb.1: ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s5, s3, 31 ; GCN1-NEXT: s_mov_b32 s4, s3 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_add_u32 s4, s0, s4 +; GCN1-NEXT: s_addc_u32 s5, s1, s5 +; GCN1-NEXT: s_load_dword s3, s[4:5], 0x4 ; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: .LBB125_1: ; %atomicrmw.start +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: .LBB125_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_min_i32_e32 v2, s2, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_min_i32_e32 v0, s2, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: buffer_atomic_cmpswap v[2:3], off, 
s[4:7], 0 offset:16 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_mov_b32_e32 v1, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB125_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_cbranch_execnz .LBB125_2 +; GCN1-NEXT: .LBB125_3: ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN2-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN2-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN2-NEXT: s_cbranch_execz .LBB125_3 +; GCN2-NEXT: ; %bb.1: ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s5, s3, 31 ; GCN2-NEXT: s_mov_b32 s4, s3 ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_add_u32 s4, s0, s4 +; GCN2-NEXT: s_addc_u32 s5, s1, s5 +; GCN2-NEXT: s_load_dword s3, s[4:5], 0x10 +; GCN2-NEXT: s_add_u32 s4, s4, 16 +; GCN2-NEXT: s_addc_u32 s5, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: .LBB125_1: ; %atomicrmw.start +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: .LBB125_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_i32_e32 v2, s2, v3 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: 
buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB125_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_cbranch_execnz .LBB125_2 +; GCN2-NEXT: .LBB125_3: ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry +; GCN3-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN3-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN3-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN3-NEXT: s_cbranch_execz .LBB125_3 +; GCN3-NEXT: ; %bb.1: ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v2, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_ashr_i32 s5, s3, 31 ; GCN3-NEXT: s_mov_b32 s4, s3 ; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN3-NEXT: s_add_u32 s0, s0, s4 ; GCN3-NEXT: s_addc_u32 s1, s1, s5 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB125_1: ; %atomicrmw.start +; GCN3-NEXT: s_load_dword s3, s[0:1], 0x10 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: .LBB125_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_min_i32_e32 v0, s2, v1 +; GCN3-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB125_1 -; GCN3-NEXT: ; 
%bb.2: ; %atomicrmw.end +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB125_2 +; GCN3-NEXT: .LBB125_3: ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index @@ -6855,105 +7127,153 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN1-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN1-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN1-NEXT: ; implicit-def: $vgpr0 +; GCN1-NEXT: s_and_saveexec_b64 s[10:11], vcc +; GCN1-NEXT: s_cbranch_execz .LBB126_4 +; GCN1-NEXT: ; %bb.1: ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_ashr_i32 s5, s7, 31 -; GCN1-NEXT: s_mov_b32 s4, s7 +; GCN1-NEXT: s_ashr_i32 s5, s9, 31 +; GCN1-NEXT: s_mov_b32 s4, s9 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s4, s0, s4 +; GCN1-NEXT: s_addc_u32 s5, s1, s5 +; GCN1-NEXT: s_load_dword s0, s[4:5], 0x4 +; GCN1-NEXT: s_mov_b64 s[12:13], 0 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v2, v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: .LBB126_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB126_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: v_min_i32_e32 v2, s6, v3 -; GCN1-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB126_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_min_i32_e32 v1, s8, v2 +; GCN1-NEXT: v_mov_b32_e32 v0, v1 +; GCN1-NEXT: v_mov_b32_e32 v1, v2 +; GCN1-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e64 s[0:1], v0, v2 +; GCN1-NEXT: s_or_b64 s[12:13], s[0:1], s[12:13] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[12:13] +; GCN1-NEXT: s_cbranch_execnz .LBB126_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[12:13] +; GCN1-NEXT: .LBB126_4: ; %Flow6 +; GCN1-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_readfirstlane_b32 s0, v0 +; GCN1-NEXT: v_mov_b32_e32 v0, s8 +; GCN1-NEXT: v_bfrev_b32_e32 v1, -2 +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: v_min_i32_e32 v0, s0, v0 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GCN2-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN2-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_and_saveexec_b64 s[6:7], vcc +; 
GCN2-NEXT: s_cbranch_execz .LBB126_4 +; GCN2-NEXT: ; %bb.1: ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_ashr_i32 s5, s7, 31 -; GCN2-NEXT: s_mov_b32 s4, s7 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_ashr_i32 s9, s5, 31 +; GCN2-NEXT: s_mov_b32 s8, s5 +; GCN2-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s8 +; GCN2-NEXT: s_addc_u32 s1, s1, s9 +; GCN2-NEXT: s_load_dword s5, s[0:1], 0x10 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_mov_b64 s[8:9], 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v2, v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: .LBB126_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB126_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: v_min_i32_e32 v2, s6, v3 +; GCN2-NEXT: v_min_i32_e32 v2, s4, v3 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB126_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v3 +; GCN2-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_cbranch_execnz .LBB126_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: .LBB126_4: ; %Flow5 +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_readfirstlane_b32 s0, v2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 
+; GCN2-NEXT: v_bfrev_b32_e32 v3, -2 +; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_min_i32_e32 v2, s0, v2 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN3-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN3-NEXT: ; implicit-def: $vgpr0 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_cbranch_execz .LBB126_4 +; GCN3-NEXT: ; %bb.1: +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_ashr_i32 s9, s7, 31 +; GCN3-NEXT: s_mov_b32 s8, s7 +; GCN3-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 +; GCN3-NEXT: s_add_u32 s8, s0, s8 +; GCN3-NEXT: s_addc_u32 s9, s1, s9 +; GCN3-NEXT: s_load_dword s0, s[8:9], 0x10 +; GCN3-NEXT: s_mov_b64 s[10:11], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s5, s7, 31 -; GCN3-NEXT: s_mov_b32 s4, s7 -; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN3-NEXT: s_add_u32 s0, s0, s4 -; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB126_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v3, v0 ; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: 
s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB126_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[8:9] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e64 s[0:1], v0, v3 +; GCN3-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN3-NEXT: s_cbranch_execnz .LBB126_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN3-NEXT: .LBB126_4: ; %Flow6 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: v_readfirstlane_b32 s0, v0 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_bfrev_b32_e32 v2, -2 +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: v_min_i32_e32 v0, s0, v0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index @@ -6966,74 +7286,96 @@ entry: define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_min_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x9 -; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb -; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN1-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN1-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN1-NEXT: s_cbranch_execz .LBB127_3 +; GCN1-NEXT: ; %bb.1: +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s6, s[4:5], 0xb +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: flat_load_dword v3, v[0:1] -; GCN1-NEXT: .LBB127_1: ; 
%atomicrmw.start +; GCN1-NEXT: s_load_dword s2, s[0:1], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, s2 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: .LBB127_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_min_i32_e32 v2, s2, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB127_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v1, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB127_2 +; GCN1-NEXT: .LBB127_3: ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN2-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN2-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN2-NEXT: s_cbranch_execz .LBB127_3 +; GCN2-NEXT: ; %bb.1: ; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_load_dword s3, s[6:7], 0x0 ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: flat_load_dword v3, v[0:1] -; GCN2-NEXT: .LBB127_1: ; %atomicrmw.start +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: .LBB127_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_i32_e32 v2, s2, v3 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB127_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_cbranch_execnz .LBB127_2 +; GCN2-NEXT: .LBB127_3: ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN3-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN3-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN3-NEXT: s_cbranch_execz .LBB127_3 +; GCN3-NEXT: ; %bb.1: +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c +; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: flat_load_dword v3, v[0:1] -; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v1, s4 +; GCN3-NEXT: .LBB127_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB127_1 -; 
GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN3-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_cbranch_execnz .LBB127_2 +; GCN3-NEXT: .LBB127_3: ; GCN3-NEXT: s_endpgm entry: %tmp0 = atomicrmw min ptr %out, i32 %in seq_cst @@ -7043,101 +7385,151 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_min_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN1-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN1-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN1-NEXT: ; implicit-def: $vgpr0 +; GCN1-NEXT: s_and_saveexec_b64 s[10:11], vcc +; GCN1-NEXT: s_cbranch_execz .LBB128_4 +; GCN1-NEXT: ; %bb.1: ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_ashr_i32 s5, s7, 31 -; GCN1-NEXT: s_mov_b32 s4, s7 +; GCN1-NEXT: s_ashr_i32 s5, s9, 31 +; GCN1-NEXT: s_mov_b32 s4, s9 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_add_u32 s4, s0, s4 +; GCN1-NEXT: s_addc_u32 s5, s1, s5 +; GCN1-NEXT: s_load_dword s0, s[4:5], 0x0 +; GCN1-NEXT: s_mov_b64 s[12:13], 0 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v2, v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: .LBB128_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB128_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: v_min_i32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB128_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_min_i32_e32 v1, s8, v2 +; GCN1-NEXT: v_mov_b32_e32 v0, v1 +; GCN1-NEXT: v_mov_b32_e32 v1, v2 +; GCN1-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e64 s[0:1], v0, v2 +; GCN1-NEXT: s_or_b64 s[12:13], s[0:1], s[12:13] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[12:13] +; GCN1-NEXT: s_cbranch_execnz .LBB128_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[12:13] +; GCN1-NEXT: .LBB128_4: ; %Flow5 +; GCN1-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_readfirstlane_b32 s0, v0 +; GCN1-NEXT: v_mov_b32_e32 v0, s8 +; GCN1-NEXT: v_bfrev_b32_e32 v1, -2 +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: v_min_i32_e32 v0, s0, v0 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GCN2-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN2-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; 
GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN2-NEXT: s_cbranch_execz .LBB128_4 +; GCN2-NEXT: ; %bb.1: ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_ashr_i32 s5, s7, 31 -; GCN2-NEXT: s_mov_b32 s4, s7 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_ashr_i32 s9, s5, 31 +; GCN2-NEXT: s_mov_b32 s8, s5 +; GCN2-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s8 +; GCN2-NEXT: s_addc_u32 s1, s1, s9 +; GCN2-NEXT: s_load_dword s5, s[0:1], 0x0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_mov_b64 s[8:9], 0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v2, v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: .LBB128_1: ; %atomicrmw.start +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v2, s5 +; GCN2-NEXT: .LBB128_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: v_min_i32_e32 v2, s6, v3 +; GCN2-NEXT: v_min_i32_e32 v2, s4, v3 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB128_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v3 +; GCN2-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_cbranch_execnz .LBB128_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: .LBB128_4: ; %Flow5 +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_readfirstlane_b32 s0, v2 +; GCN2-NEXT: 
v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_bfrev_b32_e32 v3, -2 +; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_min_i32_e32 v2, s0, v2 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN3-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN3-NEXT: ; implicit-def: $vgpr0 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_cbranch_execz .LBB128_4 +; GCN3-NEXT: ; %bb.1: +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_ashr_i32 s9, s7, 31 +; GCN3-NEXT: s_mov_b32 s8, s7 +; GCN3-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 +; GCN3-NEXT: s_add_u32 s8, s0, s8 +; GCN3-NEXT: s_addc_u32 s9, s1, s9 +; GCN3-NEXT: s_load_dword s0, s[8:9], 0x0 +; GCN3-NEXT: s_mov_b64 s[10:11], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s5, s7, 31 -; GCN3-NEXT: s_mov_b32 s4, s7 -; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN3-NEXT: s_add_u32 s0, s0, s4 -; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v2, v[0:1] -; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB128_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v3, v0 ; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: 
s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB128_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[8:9] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e64 s[0:1], v0, v3 +; GCN3-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN3-NEXT: s_cbranch_execnz .LBB128_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN3-NEXT: .LBB128_4: ; %Flow5 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: v_readfirstlane_b32 s0, v0 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_bfrev_b32_e32 v2, -2 +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v1, 0 +; GCN3-NEXT: v_min_i32_e32 v0, s0, v0 +; GCN3-NEXT: global_store_dword v1, v0, s[2:3] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll index ffe0596a95e33..5053641967525 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -190,10 +190,12 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen -; GCN1-NEXT: .LBB1_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB1_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_add_i64_ret_offset: @@ -241,7 +243,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB1_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB1_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -279,9 +281,9 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s5, v1, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB1_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB1_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -493,10 +495,12 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB3_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB3_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: s_mov_b32 s1, s11 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_add_i64_ret_addr64_offset: @@ -546,7 +550,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, 
; GCN2-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB3_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB3_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -585,9 +589,9 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s5, v1, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB3_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB3_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -776,10 +780,12 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen -; GCN1-NEXT: .LBB5_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB5_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_add_i64_ret: @@ -825,7 +831,7 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB5_4: ; %atomicrmw.end +; GCN2-NEXT: 
.LBB5_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -862,9 +868,9 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s5, v1, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB5_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB5_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile add ptr %out, i64 %in syncscope("agent") seq_cst @@ -1066,10 +1072,12 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB7_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB7_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: s_mov_b32 s1, s11 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_add_i64_ret_addr64: @@ -1117,7 +1125,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB7_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB7_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1155,9 +1163,9 @@ define 
amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s5, v1, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB7_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB7_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -1349,10 +1357,12 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: v_and_b32_e32 v5, s5, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen -; GCN1-NEXT: .LBB9_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB9_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_and_i64_ret_offset: @@ -1399,7 +1409,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_and_b32_e32 v5, s5, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB9_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB9_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1436,9 +1446,9 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: v_and_b32_e32 v3, s5, v1 ; GFX12-NEXT: v_and_b32_e32 v2, s4, v0 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB9_4: ; 
%atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB9_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -1646,10 +1656,12 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_and_b32_e32 v5, s13, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB11_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB11_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: s_mov_b32 s1, s11 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_and_i64_ret_addr64_offset: @@ -1698,7 +1710,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_and_b32_e32 v5, s13, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB11_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB11_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1736,9 +1748,9 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: v_and_b32_e32 v3, s5, v1 ; GFX12-NEXT: v_and_b32_e32 v2, s4, v0 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB11_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB11_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], 
s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -1923,10 +1935,12 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: v_and_b32_e32 v5, s5, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen -; GCN1-NEXT: .LBB13_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB13_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_and_i64_ret: @@ -1971,7 +1985,7 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: v_and_b32_e32 v5, s5, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB13_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB13_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -2007,9 +2021,9 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: v_and_b32_e32 v3, s5, v1 ; GFX12-NEXT: v_and_b32_e32 v2, s4, v0 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB13_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB13_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst @@ -2207,10 +2221,12 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: v_and_b32_e32 v5, s13, v1 ; 
GCN1-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB15_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB15_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: s_mov_b32 s1, s11 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_and_i64_ret_addr64: @@ -2257,7 +2273,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_and_b32_e32 v5, s13, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB15_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB15_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -2294,9 +2310,9 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: v_and_b32_e32 v3, s5, v1 ; GFX12-NEXT: v_and_b32_e32 v2, s4, v0 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB15_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB15_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -2492,10 +2508,12 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen -; GCN1-NEXT: .LBB17_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 -; 
GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB17_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_sub_i64_ret_offset: @@ -2543,7 +2561,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB17_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB17_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -2581,9 +2599,9 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_subrev_co_ci_u32_e64 v3, null, s5, v1, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB17_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB17_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -2795,10 +2813,12 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB19_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB19_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: 
s_mov_b32 s1, s11 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_sub_i64_ret_addr64_offset: @@ -2848,7 +2868,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB19_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB19_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -2887,9 +2907,9 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_subrev_co_ci_u32_e64 v3, null, s5, v1, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB19_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB19_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -3078,10 +3098,12 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen -; GCN1-NEXT: .LBB21_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB21_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_sub_i64_ret: @@ -3127,7 +3149,7 @@ define amdgpu_kernel void 
@atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB21_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB21_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -3164,9 +3186,9 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_subrev_co_ci_u32_e64 v3, null, s5, v1, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB21_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB21_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst @@ -3368,10 +3390,12 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB23_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB23_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: s_mov_b32 s1, s11 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_sub_i64_ret_addr64: @@ -3419,7 +3443,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; 
GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB23_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB23_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -3457,9 +3481,9 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_subrev_co_ci_u32_e64 v3, null, s5, v1, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB23_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB23_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -3655,11 +3679,13 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen -; GCN1-NEXT: .LBB25_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: .LBB25_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i64_ret_offset: @@ -3707,7 +3733,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB25_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB25_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, 
s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) @@ -3746,9 +3772,9 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo ; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB25_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB25_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -3960,11 +3986,13 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB27_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 +; GCN1-NEXT: .LBB27_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: s_mov_b32 s1, s11 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i64_ret_addr64_offset: @@ -4014,7 +4042,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB27_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB27_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: s_waitcnt vmcnt(0) @@ -4054,9 +4082,9 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: 
v_cndmask_b32_e32 v3, s5, v1, vcc_lo ; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB27_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB27_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -4245,11 +4273,13 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen -; GCN1-NEXT: .LBB29_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: .LBB29_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i64_ret: @@ -4295,7 +4325,7 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB29_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB29_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) @@ -4333,9 +4363,9 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo ; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB29_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 
v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB29_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst @@ -4537,11 +4567,13 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB31_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 +; GCN1-NEXT: .LBB31_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: s_mov_b32 s1, s11 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i64_ret_addr64: @@ -4589,7 +4621,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB31_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB31_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: s_waitcnt vmcnt(0) @@ -4628,9 +4660,9 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo ; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB31_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB31_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -4826,11 +4858,13 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen -; GCN1-NEXT: .LBB33_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: .LBB33_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i64_ret_offset: @@ -4878,7 +4912,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB33_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB33_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) @@ -4917,9 +4951,9 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo ; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB33_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB33_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -5131,11 +5165,13 @@ define amdgpu_kernel void 
@atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB35_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 +; GCN1-NEXT: .LBB35_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: s_mov_b32 s1, s11 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i64_ret_addr64_offset: @@ -5185,7 +5221,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB35_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB35_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: s_waitcnt vmcnt(0) @@ -5225,9 +5261,9 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo ; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB35_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB35_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -5416,11 +5452,13 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: 
buffer_store_dword v4, v3, s[12:15], 0 offen -; GCN1-NEXT: .LBB37_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: .LBB37_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i64_ret: @@ -5466,7 +5504,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB37_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB37_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) @@ -5504,9 +5542,9 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo ; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB37_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB37_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst @@ -5708,11 +5746,13 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB39_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 +; 
GCN1-NEXT: .LBB39_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: s_mov_b32 s1, s11 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i64_ret_addr64: @@ -5760,7 +5800,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB39_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB39_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: s_waitcnt vmcnt(0) @@ -5799,9 +5839,9 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo ; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB39_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB39_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -5997,11 +6037,13 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen -; GCN1-NEXT: .LBB41_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: .LBB41_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 ; GCN1-NEXT: s_waitcnt 
vmcnt(0) -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i64_ret_offset: @@ -6049,7 +6091,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB41_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB41_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) @@ -6088,9 +6130,9 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo ; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB41_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB41_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -6302,11 +6344,13 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB43_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 +; GCN1-NEXT: .LBB43_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: s_mov_b32 s1, s11 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i64_ret_addr64_offset: @@ 
-6356,7 +6400,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB43_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB43_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: s_waitcnt vmcnt(0) @@ -6396,9 +6440,9 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo ; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB43_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB43_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -6587,11 +6631,13 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen -; GCN1-NEXT: .LBB45_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: .LBB45_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i64_ret: @@ -6637,7 +6683,7 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen 
; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB45_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB45_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) @@ -6675,9 +6721,9 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo ; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB45_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB45_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst @@ -6879,11 +6925,13 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB47_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 +; GCN1-NEXT: .LBB47_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: s_mov_b32 s1, s11 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i64_ret_addr64: @@ -6931,7 +6979,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB47_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB47_4: ; %atomicrmw.phi ; GCN2-NEXT: 
v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: s_waitcnt vmcnt(0) @@ -6970,9 +7018,9 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo ; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB47_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB47_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -7168,11 +7216,13 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen -; GCN1-NEXT: .LBB49_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: .LBB49_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i64_ret_offset: @@ -7220,7 +7270,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB49_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB49_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) @@ -7259,9 +7309,9 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % 
; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo ; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB49_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB49_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -7473,11 +7523,13 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB51_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 +; GCN1-NEXT: .LBB51_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: s_mov_b32 s1, s11 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i64_ret_addr64_offset: @@ -7527,7 +7579,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB51_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB51_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: s_waitcnt vmcnt(0) @@ -7567,9 +7619,9 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo ; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB51_4: ; 
%atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB51_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -7758,11 +7810,13 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen -; GCN1-NEXT: .LBB53_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: .LBB53_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i64_ret: @@ -7808,7 +7862,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB53_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB53_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) @@ -7846,9 +7900,9 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo ; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB53_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB53_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst @@ -8050,11 +8104,13 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB55_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 +; GCN1-NEXT: .LBB55_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: s_mov_b32 s1, s11 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i64_ret_addr64: @@ -8102,7 +8158,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB55_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB55_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: s_waitcnt vmcnt(0) @@ -8141,9 +8197,9 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo ; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB55_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB55_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -8335,10 +8391,12 @@ 
define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GCN1-NEXT: v_or_b32_e32 v5, s5, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen -; GCN1-NEXT: .LBB57_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB57_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_or_i64_ret_offset: @@ -8385,7 +8443,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GCN2-NEXT: v_or_b32_e32 v5, s5, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB57_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB57_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -8422,9 +8480,9 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GFX12-NEXT: v_or_b32_e32 v3, s5, v1 ; GFX12-NEXT: v_or_b32_e32 v2, s4, v0 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB57_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB57_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -8632,10 +8690,12 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_or_b32_e32 v5, s13, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB59_4: ; 
%atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB59_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: s_mov_b32 s1, s11 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_or_i64_ret_addr64_offset: @@ -8684,7 +8744,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_or_b32_e32 v5, s13, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB59_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB59_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -8722,9 +8782,9 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: v_or_b32_e32 v3, s5, v1 ; GFX12-NEXT: v_or_b32_e32 v2, s4, v0 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB59_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB59_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -8909,10 +8969,12 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: v_or_b32_e32 v5, s5, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen -; GCN1-NEXT: .LBB61_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB61_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; 
GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_or_i64_ret: @@ -8957,7 +9019,7 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: v_or_b32_e32 v5, s5, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB61_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB61_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -8993,9 +9055,9 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: v_or_b32_e32 v3, s5, v1 ; GFX12-NEXT: v_or_b32_e32 v2, s4, v0 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB61_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB61_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst @@ -9193,10 +9255,12 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; GCN1-NEXT: v_or_b32_e32 v5, s13, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB63_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB63_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: s_mov_b32 s1, s11 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_or_i64_ret_addr64: @@ -9243,7 +9307,7 @@ define amdgpu_kernel void 
@atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; GCN2-NEXT: v_or_b32_e32 v5, s13, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB63_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB63_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -9280,9 +9344,9 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; GFX12-NEXT: v_or_b32_e32 v3, s5, v1 ; GFX12-NEXT: v_or_b32_e32 v2, s4, v0 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB63_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB63_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -9713,11 +9777,13 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen ; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: buffer_store_dword v2, v3, s[12:15], 0 offen -; GCN1-NEXT: .LBB67_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: .LBB67_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 ; GCN1-NEXT: s_waitcnt vmcnt(2) -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i64_ret_offset: @@ -9762,7 +9828,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: buffer_store_dword v2, v3, s[88:91], 0 offen -; GCN2-NEXT: 
.LBB67_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB67_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(2) @@ -9798,10 +9864,10 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB67_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: .LBB67_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -9996,11 +10062,13 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen ; GCN1-NEXT: v_mov_b32_e32 v2, s13 ; GCN1-NEXT: buffer_store_dword v2, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB69_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 +; GCN1-NEXT: .LBB69_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: s_mov_b32 s1, s11 ; GCN1-NEXT: s_waitcnt vmcnt(2) -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i64_ret_addr64_offset: @@ -10047,7 +10115,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: v_mov_b32_e32 v2, s13 ; GCN2-NEXT: buffer_store_dword v2, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB69_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB69_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: s_waitcnt vmcnt(2) @@ -10084,10 
+10152,10 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB69_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: .LBB69_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -10259,11 +10327,13 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen ; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: buffer_store_dword v2, v3, s[12:15], 0 offen -; GCN1-NEXT: .LBB71_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: .LBB71_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 ; GCN1-NEXT: s_waitcnt vmcnt(2) -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i64_ret: @@ -10306,7 +10376,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: buffer_store_dword v2, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB71_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB71_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(2) @@ -10341,10 +10411,10 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: 
scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB71_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: .LBB71_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile xchg ptr %out, i64 %in syncscope("agent") seq_cst @@ -10529,11 +10599,13 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen ; GCN1-NEXT: v_mov_b32_e32 v2, s13 ; GCN1-NEXT: buffer_store_dword v2, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB73_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 +; GCN1-NEXT: .LBB73_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: s_mov_b32 s1, s11 ; GCN1-NEXT: s_waitcnt vmcnt(2) -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i64_ret_addr64: @@ -10578,7 +10650,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: v_mov_b32_e32 v2, s13 ; GCN2-NEXT: buffer_store_dword v2, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB73_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB73_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: s_waitcnt vmcnt(2) @@ -10614,10 +10686,10 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB73_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: 
.LBB73_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -10809,10 +10881,12 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: v_xor_b32_e32 v5, s5, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen -; GCN1-NEXT: .LBB75_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB75_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xor_i64_ret_offset: @@ -10859,7 +10933,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_xor_b32_e32 v5, s5, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB75_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB75_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -10896,9 +10970,9 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: v_xor_b32_e32 v3, s5, v1 ; GFX12-NEXT: v_xor_b32_e32 v2, s4, v0 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB75_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB75_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, 
ptr %out, i64 4 @@ -11106,10 +11180,12 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_xor_b32_e32 v5, s13, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB77_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB77_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: s_mov_b32 s1, s11 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xor_i64_ret_addr64_offset: @@ -11158,7 +11234,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_xor_b32_e32 v5, s13, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB77_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB77_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -11196,9 +11272,9 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: v_xor_b32_e32 v3, s5, v1 ; GFX12-NEXT: v_xor_b32_e32 v2, s4, v0 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB77_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB77_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -11383,10 +11459,12 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: v_xor_b32_e32 v5, s5, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen ; GCN1-NEXT: 
buffer_store_dword v5, v3, s[12:15], 0 offen -; GCN1-NEXT: .LBB79_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB79_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xor_i64_ret: @@ -11431,7 +11509,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: v_xor_b32_e32 v5, s5, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB79_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB79_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -11467,9 +11545,9 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: v_xor_b32_e32 v3, s5, v1 ; GFX12-NEXT: v_xor_b32_e32 v2, s4, v0 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB79_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB79_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst @@ -11667,10 +11745,12 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: v_xor_b32_e32 v5, s13, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB81_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB81_4: 
; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: s_mov_b32 s1, s11 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xor_i64_ret_addr64: @@ -11717,7 +11797,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_xor_b32_e32 v5, s13, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB81_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB81_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -11754,9 +11834,9 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: v_xor_b32_e32 v3, s5, v1 ; GFX12-NEXT: v_xor_b32_e32 v2, s4, v0 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB81_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB81_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -11769,17 +11849,17 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i64_offset: ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: s_mov_b32 s2, s6 +; GCN1-NEXT: s_mov_b32 s3, s7 +; GCN1-NEXT: 
buffer_load_dwordx2 v[0:1], off, s[0:3], 0 offset:32 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_load_i64_offset: @@ -11791,7 +11871,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 @@ -11801,13 +11881,12 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; GFX12-LABEL: atomic_load_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %in, i64 4 @@ -11820,15 +11899,17 @@ define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i64: ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; 
GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_load_i64: @@ -11838,7 +11919,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) { ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 @@ -11848,13 +11929,12 @@ define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) { ; GFX12-LABEL: atomic_load_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %val = load atomic i64, ptr %in syncscope("agent") seq_cst, align 8 @@ -11865,22 +11945,22 @@ entry: define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GCN1-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_lshl_b64 s[8:9], s[8:9], 3 +; GCN1-NEXT: v_mov_b32_e32 v0, s8 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: s_mov_b32 s2, 0 +; GCN1-NEXT: s_mov_b32 s3, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s9 +; GCN1-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 offset:32 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_load_i64_addr64_offset: @@ -11896,7 +11976,7 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 @@ -11908,16 +11988,15 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; 
GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %in, i64 %index @@ -11930,20 +12009,22 @@ entry: define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_lshl_b64 s[8:9], s[8:9], 3 +; GCN1-NEXT: v_mov_b32_e32 v0, s8 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: s_mov_b32 s2, 0 +; GCN1-NEXT: s_mov_b32 s3, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s9 +; GCN1-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_load_i64_addr64: @@ -11957,7 +12038,7 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, 
i64 %index) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 @@ -11969,16 +12050,15 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %in, i64 %index @@ -11991,14 +12071,14 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i64_offset: ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: s_add_u32 s0, s2, 32 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dwordx2 
v[0:1], off, s[4:7], 0 offset:32 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i64_offset: @@ -12017,12 +12097,12 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) { ; GFX12-LABEL: atomic_store_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -12034,12 +12114,14 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i64: ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i64: @@ -12056,12 +12138,12 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) { ; GFX12-LABEL: atomic_store_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS +; GFX12-NEXT: 
global_store_b64 v2, v[0:1], s[2:3] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: store atomic i64 %in, ptr %out seq_cst, align 8 @@ -12073,17 +12155,16 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 -; GCN1-NEXT: s_add_u32 s0, s2, s0 -; GCN1-NEXT: s_addc_u32 s1, s3, s1 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: s_mov_b64 s[4:5], s[2:3] ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i64_addr64_offset: @@ -12106,17 +12187,17 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 ; GFX12-LABEL: atomic_store_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 
v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -12129,16 +12210,17 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index ; GCN1-LABEL: atomic_store_i64_addr64: ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 -; GCN1-NEXT: s_add_u32 s0, s2, s0 -; GCN1-NEXT: s_addc_u32 s1, s3, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[8:9], 3 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: s_mov_b64 s[4:5], s[2:3] ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i64_addr64: @@ -12159,17 +12241,17 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index ; GFX12-LABEL: atomic_store_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS +; 
GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -12528,10 +12610,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB92_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB92_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: s_mov_b32 s1, s11 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_cmpxchg_i64_ret_offset: @@ -12581,7 +12665,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB92_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB92_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -12618,9 +12702,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GFX12-NEXT: v_cndmask_b32_e64 v3, v1, s5, vcc_lo ; GFX12-NEXT: v_cndmask_b32_e64 v2, v0, s4, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB92_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB92_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -12840,10 +12924,12 @@ define amdgpu_kernel void 
@atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GCN1-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB94_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB94_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: s_mov_b32 s1, s11 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: @@ -12897,7 +12983,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GCN2-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB94_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB94_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -12939,9 +13025,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GFX12-NEXT: v_cndmask_b32_e64 v3, v1, s13, vcc_lo ; GFX12-NEXT: v_cndmask_b32_e64 v2, v0, s12, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 -; GFX12-NEXT: .LBB94_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB94_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[10:11] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -13144,10 +13230,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, ; GCN1-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen ; GCN1-NEXT: 
buffer_store_dword v4, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB96_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB96_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: s_mov_b32 s1, s11 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_cmpxchg_i64_ret: @@ -13195,7 +13283,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, ; GCN2-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB96_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB96_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -13231,9 +13319,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, ; GFX12-NEXT: v_cndmask_b32_e64 v3, v1, s5, vcc_lo ; GFX12-NEXT: v_cndmask_b32_e64 v2, v0, s4, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB96_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB96_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst @@ -13444,10 +13532,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB98_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 -; 
GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB98_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: s_mov_b32 s1, s11 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_cmpxchg_i64_ret_addr64: @@ -13499,7 +13589,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB98_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB98_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -13540,9 +13630,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GFX12-NEXT: v_cndmask_b32_e64 v3, v1, s13, vcc_lo ; GFX12-NEXT: v_cndmask_b32_e64 v2, v0, s12, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 -; GFX12-NEXT: .LBB98_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB98_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[10:11] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -13556,17 +13646,17 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f64_offset: ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s2 
+; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: s_mov_b32 s2, s6 +; GCN1-NEXT: s_mov_b32 s3, s7 +; GCN1-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 offset:32 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_load_f64_offset: @@ -13578,7 +13668,7 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 @@ -13588,13 +13678,12 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; GFX12-LABEL: atomic_load_f64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr double, ptr %in, i64 4 @@ -13607,15 +13696,17 @@ define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f64: ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: 
v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_load_f64: @@ -13625,7 +13716,7 @@ define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) { ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 @@ -13635,13 +13726,12 @@ define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) { ; GFX12-LABEL: atomic_load_f64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %val = load atomic double, ptr %in syncscope("agent") seq_cst, align 8 @@ -13652,22 +13742,22 @@ entry: define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: 
atomic_load_f64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_lshl_b64 s[8:9], s[8:9], 3 +; GCN1-NEXT: v_mov_b32_e32 v0, s8 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: s_mov_b32 s2, 0 +; GCN1-NEXT: s_mov_b32 s3, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s9 +; GCN1-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 offset:32 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_load_f64_addr64_offset: @@ -13683,7 +13773,7 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 @@ -13695,16 +13785,15 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 
s[4:5], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr double, ptr %in, i64 %index @@ -13717,20 +13806,22 @@ entry: define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_f64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_lshl_b64 s[8:9], s[8:9], 3 +; GCN1-NEXT: v_mov_b32_e32 v0, s8 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: s_mov_b32 s2, 0 +; GCN1-NEXT: s_mov_b32 s3, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s9 +; GCN1-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: 
s_endpgm ; ; GCN2-LABEL: atomic_load_f64_addr64: @@ -13744,7 +13835,7 @@ define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 @@ -13756,16 +13847,15 @@ define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr double, ptr %in, i64 %index @@ -13778,14 +13868,14 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { ; GCN1-LABEL: atomic_store_f64_offset: ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: s_add_u32 s0, s2, 32 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: s_addc_u32 s1, s3, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; 
GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f64_offset: @@ -13804,12 +13894,12 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { ; GFX12-LABEL: atomic_store_f64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr double, ptr %out, i64 4 @@ -13821,12 +13911,14 @@ define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) { ; GCN1-LABEL: atomic_store_f64: ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f64: @@ -13843,12 +13935,12 @@ define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) { ; GFX12-LABEL: atomic_store_f64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; 
GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: store atomic double %in, ptr %out seq_cst, align 8 @@ -13860,17 +13952,16 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 -; GCN1-NEXT: s_add_u32 s0, s2, s0 -; GCN1-NEXT: s_addc_u32 s1, s3, s1 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: s_mov_b64 s[4:5], s[2:3] ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f64_addr64_offset: @@ -13893,17 +13984,17 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, ; GFX12-LABEL: atomic_store_f64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] ; GFX12-NEXT: global_wb 
scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr double, ptr %out, i64 %index @@ -13916,16 +14007,17 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %in ; GCN1-LABEL: atomic_store_f64_addr64: ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 -; GCN1-NEXT: s_add_u32 s0, s2, s0 -; GCN1-NEXT: s_addc_u32 s1, s3, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[8:9], 3 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: s_mov_b64 s[4:5], s[2:3] ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f64_addr64: @@ -13946,17 +14038,17 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %in ; GFX12-LABEL: atomic_store_f64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], 
s[2:3], s[0:1] ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr double, ptr %out, i64 %index @@ -14160,10 +14252,12 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc ; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen -; GCN1-NEXT: .LBB108_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB108_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_inc_i64_ret_offset: @@ -14213,7 +14307,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB108_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB108_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -14254,9 +14348,9 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB108_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB108_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: 
global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -14477,10 +14571,12 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc ; GCN1-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB110_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB110_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: s_mov_b32 s1, s11 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_inc_i64_ret_incr64_offset: @@ -14532,7 +14628,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB110_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB110_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -14574,9 +14670,9 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB110_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB110_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -14774,10 +14870,12 @@ define amdgpu_kernel void 
@atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc ; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen -; GCN1-NEXT: .LBB112_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB112_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s2 +; GCN1-NEXT: s_mov_b32 s5, s3 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_inc_i64_ret: @@ -14825,7 +14923,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB112_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB112_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -14865,9 +14963,9 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB112_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB112_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst @@ -15078,10 +15176,12 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc ; GCN1-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen ; GCN1-NEXT: 
buffer_store_dword v5, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB114_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB114_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: s_mov_b32 s1, s11 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_inc_i64_ret_incr64: @@ -15131,7 +15231,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB114_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB114_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -15172,9 +15272,9 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 -; GFX12-NEXT: .LBB114_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB114_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -15392,10 +15492,12 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen -; GCN1-NEXT: .LBB116_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 -; GCN1-NEXT: 
flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB116_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: s_mov_b32 s1, s11 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i64_ret_offset: @@ -15449,7 +15551,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB116_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB116_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -15493,9 +15595,9 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0 ; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6 -; GFX12-NEXT: .LBB116_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB116_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -15729,10 +15831,12 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB118_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB118_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: s_mov_b32 s1, s11 
+; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i64_ret_decr64_offset: @@ -15788,7 +15892,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB118_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB118_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -15833,9 +15937,9 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0 ; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6 -; GFX12-NEXT: .LBB118_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB118_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -16046,10 +16150,12 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen -; GCN1-NEXT: .LBB120_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB120_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: s_mov_b32 s1, s11 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i64_ret: @@ -16101,7 +16207,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, 
ptr %out2, i64 %in) { ; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB120_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB120_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -16144,9 +16250,9 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0 ; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6 -; GFX12-NEXT: .LBB120_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB120_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst @@ -16370,10 +16476,12 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB122_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB122_4: ; %atomicrmw.phi +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_mov_b32 s0, s10 +; GCN1-NEXT: s_mov_b32 s1, s11 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i64_ret_decr64: @@ -16427,7 +16535,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, 
s[88:91], 0 offen -; GCN2-NEXT: .LBB122_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB122_4: ; %atomicrmw.phi ; GCN2-NEXT: v_mov_b32_e32 v2, s10 ; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -16471,9 +16579,9 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0 ; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6 -; GFX12-NEXT: .LBB122_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: .LBB122_4: ; %atomicrmw.phi +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll index 3c1bc95cc38f6..ed42c8422f69b 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll @@ -65,9 +65,11 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_add_i64_ret_offset: @@ -100,8 +102,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV 
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -188,9 +190,11 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_add_i64_ret_addr64_offset: @@ -226,8 +230,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -284,6 +288,8 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -292,9 +298,9 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_add_i64_ret: @@ -325,8 +331,8 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile add ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 @@ -405,9 +411,11 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_add_i64_ret_addr64: @@ -441,8 +449,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, 
v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -513,9 +521,11 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_and_i64_ret_offset: @@ -548,8 +558,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -636,9 +646,11 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_and_i64_ret_addr64_offset: @@ -674,8 +686,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_atomic_and_b64 
v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -732,6 +744,8 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -740,9 +754,9 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_and_i64_ret: @@ -773,8 +787,8 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 @@ -853,9 +867,11 @@ define amdgpu_kernel void 
@atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_and_i64_ret_addr64: @@ -889,8 +905,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -961,9 +977,11 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_sub_i64_ret_offset: @@ -996,8 +1014,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; 
GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -1084,9 +1102,11 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_sub_i64_ret_addr64_offset: @@ -1122,8 +1142,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -1180,6 +1200,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -1188,9 +1210,9 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_sub_i64_ret: @@ -1221,8 +1243,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 @@ -1301,9 +1323,11 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_sub_i64_ret_addr64: @@ -1337,8 +1361,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: 
global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -1406,10 +1430,12 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_max_i64_ret_offset: @@ -1442,8 +1468,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -1527,10 +1553,12 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_max_i64_ret_addr64_offset: @@ -1566,8 +1594,8 @@ define amdgpu_kernel void 
@atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -1622,6 +1650,8 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -1629,10 +1659,10 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_max_i64_ret: @@ -1663,8 +1693,8 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile max ptr %out, i64 %in 
syncscope("workgroup") seq_cst, !noalias.addrspace !0 @@ -1740,10 +1770,12 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_max_i64_ret_addr64: @@ -1777,8 +1809,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -1846,10 +1878,12 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umax_i64_ret_offset: @@ -1882,8 +1916,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: 
flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -1967,10 +2001,12 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umax_i64_ret_addr64_offset: @@ -2006,8 +2042,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -2062,6 +2098,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: 
v_mov_b32_e32 v1, s1 @@ -2069,10 +2107,10 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umax_i64_ret: @@ -2103,8 +2141,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 @@ -2180,10 +2218,12 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umax_i64_ret_addr64: @@ -2217,8 +2257,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN 
scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -2286,10 +2326,12 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_min_i64_ret_offset: @@ -2322,8 +2364,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -2407,10 +2449,12 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 ; GFX7-NEXT: 
s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_min_i64_ret_addr64_offset: @@ -2446,8 +2490,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -2502,6 +2546,8 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -2509,10 +2555,10 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_min_i64_ret: @@ -2543,8 +2589,8 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_dual_mov_b32 
v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 @@ -2620,10 +2666,12 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_min_i64_ret_addr64: @@ -2657,8 +2705,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -2726,10 +2774,12 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umin_i64_ret_offset: @@ -2762,8 +2812,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -2847,10 +2897,12 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umin_i64_ret_addr64_offset: @@ -2886,8 +2938,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -2942,6 +2994,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -2949,10 +3003,10 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umin_i64_ret: @@ -2983,8 +3037,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 @@ -3060,10 +3114,12 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: 
s_endpgm ; ; GFX8-LABEL: atomic_umin_i64_ret_addr64: @@ -3097,8 +3153,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -3169,9 +3225,11 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_or_i64_ret_offset: @@ -3204,8 +3262,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -3292,9 +3350,11 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; 
GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_or_i64_ret_addr64_offset: @@ -3330,8 +3390,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -3388,6 +3448,8 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -3396,9 +3458,9 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_or_i64_ret: @@ -3429,8 +3491,8 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 @@ -3509,9 +3571,11 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_or_i64_ret_addr64: @@ -3545,8 +3609,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -3711,9 +3775,11 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: 
s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_xchg_i64_ret_offset: @@ -3746,8 +3812,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -3834,9 +3900,11 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_xchg_i64_ret_addr64_offset: @@ -3872,8 +3940,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -3930,6 +3998,8 @@ define amdgpu_kernel void 
@atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -3938,9 +4008,9 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_xchg_i64_ret: @@ -3971,8 +4041,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile xchg ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 @@ -4051,9 +4121,11 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 
v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_xchg_i64_ret_addr64: @@ -4087,8 +4159,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -4159,9 +4231,11 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_xor_i64_ret_offset: @@ -4194,8 +4268,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -4282,9 +4356,11 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_xor_i64_ret_addr64_offset: @@ -4320,8 +4396,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -4378,6 +4454,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -4386,9 +4464,9 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_xor_i64_ret: @@ -4419,8 +4497,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: 
flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 @@ -4499,9 +4577,11 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_xor_i64_ret_addr64: @@ -4535,8 +4615,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -4549,17 +4629,17 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; GFX7-LABEL: atomic_load_i64_offset: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 32 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: 
v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: s_mov_b32 s2, s6 +; GFX7-NEXT: s_mov_b32 s3, s7 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 offset:32 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_load_i64_offset: @@ -4571,7 +4651,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 @@ -4581,13 +4661,12 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; GFX12-LABEL: atomic_load_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %in, i64 4 @@ -4600,15 +4679,17 @@ define amdgpu_kernel void @atomic_load_i64(ptr 
%in, ptr %out) { ; GFX7-LABEL: atomic_load_i64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s0 +; GFX7-NEXT: s_mov_b32 s5, s1 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_load_i64: @@ -4618,7 +4699,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 @@ -4628,13 +4709,12 @@ define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) { ; GFX12-LABEL: atomic_load_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: 
%val = load atomic i64, ptr %in syncscope("agent") seq_cst, align 8 @@ -4645,22 +4725,22 @@ entry: define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GFX7-LABEL: atomic_load_i64_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s4 -; GFX7-NEXT: s_addc_u32 s1, s1, s5 -; GFX7-NEXT: s_add_u32 s0, s0, 32 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_lshl_b64 s[8:9], s[8:9], 3 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, s7 +; GFX7-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 offset:32 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_load_i64_addr64_offset: @@ -4676,7 +4756,7 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 @@ -4688,16 +4768,15 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr 
%in, ptr %out, i64 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %in, i64 %index @@ -4710,20 +4789,22 @@ entry: define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) { ; GFX7-LABEL: atomic_load_i64_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s4 -; GFX7-NEXT: s_addc_u32 s1, s1, s5 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_lshl_b64 s[8:9], s[8:9], 3 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, s7 +; GFX7-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 
glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_load_i64_addr64: @@ -4737,7 +4818,7 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 @@ -4749,16 +4830,15 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %in, i64 %index @@ -4771,14 +4851,14 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) { ; GFX7-LABEL: atomic_store_i64_offset: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 
-1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: s_add_u32 s0, s2, 32 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_addc_u32 s1, s3, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_store_i64_offset: @@ -4797,12 +4877,12 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) { ; GFX12-LABEL: atomic_store_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -4814,12 +4894,14 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) { ; GFX7-LABEL: atomic_store_i64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_store_i64: @@ -4836,12 +4918,12 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) { ; GFX12-LABEL: atomic_store_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: 
s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: store atomic i64 %in, ptr %out seq_cst, align 8 @@ -4853,17 +4935,16 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 -; GFX7-NEXT: s_add_u32 s0, s2, s0 -; GFX7-NEXT: s_addc_u32 s1, s3, s1 -; GFX7-NEXT: s_add_u32 s0, s0, 32 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_store_i64_addr64_offset: @@ -4886,17 +4967,17 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 ; GFX12-LABEL: atomic_store_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -4909,16 +4990,17 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index ; GFX7-LABEL: atomic_store_i64_addr64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 -; GFX7-NEXT: s_add_u32 s0, s2, s0 -; GFX7-NEXT: s_addc_u32 s1, s3, s1 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[8:9], 3 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_store_i64_addr64: @@ -4939,17 +5021,17 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index ; GFX12-LABEL: atomic_store_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] -; GFX12-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -5085,9 +5167,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_cmpxchg_i64_ret_offset: @@ -5120,8 +5204,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -5212,9 +5296,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-NEXT: v_mov_b32_e32 v3, s11 -; 
GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s0, s10 +; GFX7-NEXT: s_mov_b32 s1, s11 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: @@ -5255,8 +5341,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[10:11] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -5332,9 +5418,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_cmpxchg_i64_ret: @@ -5365,8 +5453,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %val = cmpxchg 
volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 @@ -5449,9 +5537,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-NEXT: v_mov_b32_e32 v3, s11 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s0, s10 +; GFX7-NEXT: s_mov_b32 s1, s11 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_cmpxchg_i64_ret_addr64: @@ -5490,8 +5580,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[10:11] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -5505,17 +5595,17 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; GFX7-LABEL: atomic_load_f64_offset: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 32 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: s_mov_b32 s2, s6 +; GFX7-NEXT: s_mov_b32 s3, s7 +; GFX7-NEXT: 
buffer_load_dwordx2 v[0:1], off, s[0:3], 0 offset:32 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_load_f64_offset: @@ -5527,7 +5617,7 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 @@ -5537,13 +5627,12 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; GFX12-LABEL: atomic_load_f64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr double, ptr %in, i64 4 @@ -5556,15 +5645,17 @@ define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) { ; GFX7-LABEL: atomic_load_f64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; 
GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s0 +; GFX7-NEXT: s_mov_b32 s5, s1 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_load_f64: @@ -5574,7 +5665,7 @@ define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 @@ -5584,13 +5675,12 @@ define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) { ; GFX12-LABEL: atomic_load_f64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %val = load atomic double, ptr %in syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 @@ -5601,22 +5691,22 @@ entry: define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GFX7-LABEL: atomic_load_f64_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; 
GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s4 -; GFX7-NEXT: s_addc_u32 s1, s1, s5 -; GFX7-NEXT: s_add_u32 s0, s0, 32 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_lshl_b64 s[8:9], s[8:9], 3 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, s7 +; GFX7-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 offset:32 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_load_f64_addr64_offset: @@ -5632,7 +5722,7 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 @@ -5644,16 +5734,15 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr double, ptr %in, i64 %index @@ -5666,20 +5755,22 @@ entry: define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) { ; GFX7-LABEL: atomic_load_f64_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s4 -; GFX7-NEXT: s_addc_u32 s1, s1, s5 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_lshl_b64 s[8:9], s[8:9], 3 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, s7 +; GFX7-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_load_f64_addr64: @@ 
-5693,7 +5784,7 @@ define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 @@ -5705,16 +5796,15 @@ define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr double, ptr %in, i64 %index @@ -5727,14 +5817,14 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { ; GFX7-LABEL: atomic_store_f64_offset: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: s_add_u32 s0, s2, 32 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_addc_u32 s1, s3, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; 
GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_store_f64_offset: @@ -5753,12 +5843,12 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { ; GFX12-LABEL: atomic_store_f64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr double, ptr %out, i64 4 @@ -5770,12 +5860,14 @@ define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) { ; GFX7-LABEL: atomic_store_f64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_store_f64: @@ -5792,12 +5884,12 @@ define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) { ; GFX12-LABEL: atomic_store_f64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: 
s_wait_storecnt 0x0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: store atomic double %in, ptr %out seq_cst, align 8, !noalias.addrspace !0 @@ -5809,17 +5901,16 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 -; GFX7-NEXT: s_add_u32 s0, s2, s0 -; GFX7-NEXT: s_addc_u32 s1, s3, s1 -; GFX7-NEXT: s_add_u32 s0, s0, 32 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_store_f64_addr64_offset: @@ -5842,17 +5933,17 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, ; GFX12-LABEL: atomic_store_f64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: 
s_wait_storecnt 0x0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr double, ptr %out, i64 %index @@ -5865,16 +5956,17 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %in ; GFX7-LABEL: atomic_store_f64_addr64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 -; GFX7-NEXT: s_add_u32 s0, s2, s0 -; GFX7-NEXT: s_addc_u32 s1, s3, s1 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[8:9], 3 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_store_f64_addr64: @@ -5895,17 +5987,17 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %in ; GFX12-LABEL: atomic_store_f64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] ; GFX12-NEXT: global_wb 
scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr double, ptr %out, i64 %index @@ -5975,9 +6067,11 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_inc_i64_ret_offset: @@ -6010,8 +6104,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -6098,9 +6192,11 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: 
atomic_inc_i64_ret_incr64_offset: @@ -6136,8 +6232,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -6194,6 +6290,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -6202,9 +6300,9 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_inc_i64_ret: @@ -6235,8 +6333,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm 
entry: %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 @@ -6315,9 +6413,11 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_inc_i64_ret_incr64: @@ -6351,8 +6451,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -6423,9 +6523,11 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_dec_i64_ret_offset: @@ -6458,8 +6560,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: 
flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -6546,9 +6648,11 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_dec_i64_ret_decr64_offset: @@ -6584,8 +6688,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -6642,6 +6746,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ 
-6650,9 +6756,9 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_dec_i64_ret: @@ -6683,8 +6789,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 @@ -6763,9 +6869,11 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_dec_i64_ret_decr64: @@ -6799,8 +6907,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv 
scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll index 23dfe2f70fa7e..d7694d2db2307 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll @@ -12168,211 +12168,127 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN1-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN1-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN1-NEXT: s_cbranch_execz .LBB88_3 +; GCN1-NEXT: ; %bb.1: ; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s11 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s8 -; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], -1 -; GCN1-NEXT: s_cbranch_vccnz .LBB88_3 -; GCN1-NEXT: ; %bb.1: ; %Flow6 -; GCN1-NEXT: s_and_b64 vcc, exec, s[4:5] -; GCN1-NEXT: s_cbranch_vccnz .LBB88_6 -; 
GCN1-NEXT: .LBB88_2: ; %atomicrmw.phi -; GCN1-NEXT: s_endpgm -; GCN1-NEXT: .LBB88_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s3 -; GCN1-NEXT: v_mov_b32_e32 v7, s2 -; GCN1-NEXT: .LBB88_4: ; %atomicrmw.start +; GCN1-NEXT: s_add_u32 s4, s0, s4 +; GCN1-NEXT: s_addc_u32 s5, s1, s5 +; GCN1-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: .LBB88_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB88_4 -; GCN1-NEXT: ; %bb.5: ; %Flow -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_branch .LBB88_2 -; GCN1-NEXT: .LBB88_6: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s2 -; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN1-NEXT: s_cselect_b32 s0, s0, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: s_add_i32 s0, s0, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: 
v_mov_b32_e32 v4, s3 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v9, v3 +; GCN1-NEXT: v_mov_b32_e32 v8, v2 +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] -; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v2, v6 +; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v3, v7 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN1-NEXT: s_cbranch_execnz .LBB88_2 +; GCN1-NEXT: .LBB88_3: ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN2-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN2-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN2-NEXT: s_cbranch_execz .LBB88_3 +; GCN2-NEXT: ; %bb.1: ; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc -; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s11 -; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s6 +; GCN2-NEXT: s_addc_u32 s1, s1, s7 +; GCN2-NEXT: s_load_dwordx2 s[6:7], 
s[0:1], 0x20 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s8 -; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], -1 -; GCN2-NEXT: s_cbranch_vccnz .LBB88_3 -; GCN2-NEXT: ; %bb.1: ; %Flow6 -; GCN2-NEXT: s_and_b64 vcc, exec, s[4:5] -; GCN2-NEXT: s_cbranch_vccnz .LBB88_6 -; GCN2-NEXT: .LBB88_2: ; %atomicrmw.phi -; GCN2-NEXT: s_endpgm -; GCN2-NEXT: .LBB88_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s3 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 -; GCN2-NEXT: .LBB88_4: ; %atomicrmw.start +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: .LBB88_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB88_4 -; GCN2-NEXT: ; %bb.5: ; %Flow -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_branch .LBB88_2 -; GCN2-NEXT: .LBB88_6: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN2-NEXT: s_cselect_b32 s0, s0, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: s_add_i32 s0, s0, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s0 -; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen -; GCN2-NEXT: 
buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s2 -; GCN2-NEXT: v_mov_b32_e32 v4, s3 -; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] -; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen -; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_cbranch_execnz .LBB88_2 +; GCN2-NEXT: .LBB88_3: ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i64_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN3-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN3-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN3-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN3-NEXT: s_cbranch_execz .LBB88_3 +; GCN3-NEXT: ; %bb.1: ; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GCN3-NEXT: s_mov_b32 s14, -1 -; GCN3-NEXT: s_mov_b32 s15, 0xe00000 -; GCN3-NEXT: s_add_u32 s12, s12, s11 -; GCN3-NEXT: s_addc_u32 s13, s13, 0 +; GCN3-NEXT: v_mov_b32_e32 v6, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN3-NEXT: s_add_u32 s0, s0, s6 -; GCN3-NEXT: s_addc_u32 s1, s1, s7 -; GCN3-NEXT: s_add_u32 s0, s0, 32 -; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base -; GCN3-NEXT: s_addc_u32 s1, s1, 0 -; GCN3-NEXT: s_cmp_eq_u32 s1, s5 -; GCN3-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GCN3-NEXT: s_mov_b64 s[4:5], -1 -; GCN3-NEXT: s_cbranch_vccnz .LBB88_3 -; GCN3-NEXT: ; %bb.1: ; %Flow6 -; GCN3-NEXT: s_and_b64 vcc, exec, s[4:5] -; GCN3-NEXT: s_cbranch_vccnz .LBB88_6 -; GCN3-NEXT: .LBB88_2: ; %atomicrmw.phi -; GCN3-NEXT: s_endpgm -; GCN3-NEXT: .LBB88_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; 
GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s3 -; GCN3-NEXT: v_mov_b32_e32 v7, s2 -; GCN3-NEXT: .LBB88_4: ; %atomicrmw.start +; GCN3-NEXT: v_mov_b32_e32 v4, s3 +; GCN3-NEXT: v_mov_b32_e32 v5, s2 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: .LBB88_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GCN3-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB88_4 -; GCN3-NEXT: ; %bb.5: ; %Flow -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_branch .LBB88_2 -; GCN3-NEXT: .LBB88_6: ; %atomicrmw.private -; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN3-NEXT: s_cselect_b32 s0, s0, -1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN3-NEXT: buffer_load_dword v1, v2, s[12:15], 0 offen offset:4 -; GCN3-NEXT: v_mov_b32_e32 v4, s2 -; GCN3-NEXT: v_mov_b32_e32 v3, s3 -; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] -; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; 
GCN3-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen -; GCN3-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen offset:4 +; GCN3-NEXT: s_cbranch_execnz .LBB88_2 +; GCN3-NEXT: .LBB88_3: ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -12384,211 +12300,185 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s18, -1 -; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 -; GCN1-NEXT: s_add_u32 s16, s16, s11 -; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 -; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 -; GCN1-NEXT: s_addc_u32 s17, s17, 0 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; GCN1-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN1-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GCN1-NEXT: s_cbranch_execz .LBB89_4 +; GCN1-NEXT: ; %bb.1: ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GCN1-NEXT: s_add_u32 s0, s8, s0 -; GCN1-NEXT: s_addc_u32 s1, s9, s1 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_cbranch_vccz .LBB89_4 -; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s13 -; GCN1-NEXT: v_mov_b32_e32 v5, s12 +; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s8, s0, s6 +; GCN1-NEXT: s_addc_u32 s9, s1, s7 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 +; GCN1-NEXT: s_mov_b64 
s[6:7], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s5 +; GCN1-NEXT: v_mov_b32_e32 v5, s4 +; GCN1-NEXT: s_mov_b32 s11, 0xf000 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: s_mov_b32 s10, -1 ; GCN1-NEXT: .LBB89_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 ; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], v[8:9] +; GCN1-NEXT: v_cndmask_b32_e64 v6, v5, v8, s[0:1] +; GCN1-NEXT: v_cndmask_b32_e64 v7, v4, v9, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v0, v6 +; GCN1-NEXT: v_mov_b32_e32 v1, v7 +; GCN1-NEXT: v_mov_b32_e32 v2, v8 +; GCN1-NEXT: v_mov_b32_e32 v3, v9 +; GCN1-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 offset:32 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GCN1-NEXT: s_cbranch_execnz .LBB89_2 ; GCN1-NEXT: ; %bb.3: ; %Flow -; GCN1-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN1-NEXT: s_branch .LBB89_6 -; GCN1-NEXT: .LBB89_4: -; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: s_cbranch_execz .LBB89_6 -; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s12 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: .LBB89_4: ; %Flow6 +; GCN1-NEXT: s_or_b64 
exec, exec, s[12:13] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_readfirstlane_b32 s1, v1 +; GCN1-NEXT: v_readfirstlane_b32 s0, v0 +; GCN1-NEXT: v_mov_b32_e32 v0, s5 +; GCN1-NEXT: v_bfrev_b32_e32 v1, 1 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v2, s1 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: s_add_i32 s0, s0, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s13 -; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] -; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen -; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB89_6: ; %atomicrmw.phi -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s11, 0xf000 +; GCN1-NEXT: s_mov_b32 s10, -1 +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s11 -; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 -; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; GCN2-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN2-NEXT: v_mbcnt_hi_u32_b32 v0, 
exec_hi, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GCN2-NEXT: s_cbranch_execz .LBB89_4 +; GCN2-NEXT: ; %bb.1: ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GCN2-NEXT: s_add_u32 s0, s8, s0 -; GCN2-NEXT: s_addc_u32 s1, s9, s1 +; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s6 +; GCN2-NEXT: s_addc_u32 s1, s1, s7 +; GCN2-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x20 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_cbranch_vccz .LBB89_4 -; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s10 +; GCN2-NEXT: v_mov_b32_e32 v4, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s11 +; GCN2-NEXT: v_mov_b32_e32 v5, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s13 -; GCN2-NEXT: v_mov_b32_e32 v5, s12 ; GCN2-NEXT: .LBB89_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 ; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN2-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], v[8:9] +; GCN2-NEXT: v_cndmask_b32_e64 v7, v4, v9, s[0:1] +; GCN2-NEXT: v_cndmask_b32_e64 v6, v5, v8, s[0:1] ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; 
GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GCN2-NEXT: s_cbranch_execnz .LBB89_2 ; GCN2-NEXT: ; %bb.3: ; %Flow -; GCN2-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN2-NEXT: s_branch .LBB89_6 -; GCN2-NEXT: .LBB89_4: -; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: s_cbranch_execz .LBB89_6 -; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN2-NEXT: s_cselect_b32 s0, s0, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: s_add_i32 s0, s0, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s0 -; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen -; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s12 -; GCN2-NEXT: v_mov_b32_e32 v4, s13 -; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] -; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen -; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB89_6: ; %atomicrmw.phi -; GCN2-NEXT: v_mov_b32_e32 v2, s10 -; GCN2-NEXT: v_mov_b32_e32 v3, s11 +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: .LBB89_4: ; %Flow5 +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_readfirstlane_b32 s1, v1 +; GCN2-NEXT: v_readfirstlane_b32 s0, v0 +; GCN2-NEXT: v_mov_b32_e32 v0, s5 +; GCN2-NEXT: v_bfrev_b32_e32 v1, 1 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GCN2-NEXT: 
flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i64_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 -; GCN3-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; GCN3-NEXT: s_mov_b32 s18, -1 -; GCN3-NEXT: s_mov_b32 s19, 0xe00000 -; GCN3-NEXT: s_add_u32 s16, s16, s11 ; GCN3-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GCN3-NEXT: s_addc_u32 s17, s17, 0 -; GCN3-NEXT: s_mov_b64 s[2:3], src_private_base +; GCN3-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN3-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN3-NEXT: s_cbranch_execz .LBB89_4 +; GCN3-NEXT: ; %bb.1: ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GCN3-NEXT: s_add_u32 s0, s8, s0 -; GCN3-NEXT: s_addc_u32 s1, s9, s1 -; GCN3-NEXT: s_add_u32 s0, s0, 32 -; GCN3-NEXT: s_addc_u32 s1, s1, 0 -; GCN3-NEXT: s_cmp_eq_u32 s1, s3 -; GCN3-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN3-NEXT: s_cbranch_vccz .LBB89_4 -; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s13 -; GCN3-NEXT: v_mov_b32_e32 v5, s12 +; GCN3-NEXT: s_add_u32 s4, s8, s0 +; GCN3-NEXT: s_addc_u32 s5, s9, s1 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20 +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s13 +; GCN3-NEXT: v_mov_b32_e32 v3, s12 +; GCN3-NEXT: v_mov_b32_e32 v4, 0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: .LBB89_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: 
v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v8, v1 +; GCN3-NEXT: v_mov_b32_e32 v7, v0 +; GCN3-NEXT: v_cmp_lt_i64_e64 s[0:1], s[12:13], v[7:8] +; GCN3-NEXT: v_cndmask_b32_e64 v6, v2, v8, s[0:1] +; GCN3-NEXT: v_cndmask_b32_e64 v5, v3, v7, s[0:1] +; GCN3-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[4:5] offset:32 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[7:8] +; GCN3-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GCN3-NEXT: s_cbranch_execnz .LBB89_2 ; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: .LBB89_4: ; %Flow6 ; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN3-NEXT: s_branch .LBB89_6 -; GCN3-NEXT: .LBB89_4: -; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: s_cbranch_execz .LBB89_6 -; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private -; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN3-NEXT: s_cselect_b32 s0, s0, -1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen -; GCN3-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen offset:4 -; GCN3-NEXT: v_mov_b32_e32 v4, s12 -; GCN3-NEXT: v_mov_b32_e32 v3, s13 -; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] -; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GCN3-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen -; GCN3-NEXT: buffer_store_dword v3, v2, s[16:19], 0 offen offset:4 -; GCN3-NEXT: .LBB89_6: ; %atomicrmw.phi -; GCN3-NEXT: v_mov_b32_e32 v2, s10 -; GCN3-NEXT: v_mov_b32_e32 v3, s11 -; 
GCN3-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: v_readfirstlane_b32 s1, v1 +; GCN3-NEXT: v_readfirstlane_b32 s0, v0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v0, s13 +; GCN3-NEXT: v_bfrev_b32_e32 v1, 1 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s12 +; GCN3-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN3-NEXT: v_mov_b32_e32 v3, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, 0 +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GCN3-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -12601,205 +12491,125 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN1-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN1-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN1-NEXT: s_cbranch_execz .LBB90_3 +; GCN1-NEXT: ; %bb.1: ; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s11 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: s_cmp_eq_u32 s1, s8 -; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], -1 -; GCN1-NEXT: s_cbranch_vccnz .LBB90_3 -; GCN1-NEXT: ; %bb.1: ; %Flow6 -; GCN1-NEXT: s_and_b64 vcc, exec, s[4:5] -; GCN1-NEXT: 
s_cbranch_vccnz .LBB90_6 -; GCN1-NEXT: .LBB90_2: ; %atomicrmw.phi -; GCN1-NEXT: s_endpgm -; GCN1-NEXT: .LBB90_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s3 -; GCN1-NEXT: v_mov_b32_e32 v7, s2 -; GCN1-NEXT: .LBB90_4: ; %atomicrmw.start +; GCN1-NEXT: s_add_u32 s4, s0, s4 +; GCN1-NEXT: s_addc_u32 s5, s1, s5 +; GCN1-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: .LBB90_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB90_4 -; GCN1-NEXT: ; %bb.5: ; %Flow -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_branch .LBB90_2 -; GCN1-NEXT: .LBB90_6: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s2 -; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN1-NEXT: s_cselect_b32 s0, s0, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: s_add_i32 s0, s0, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 
offen -; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v9, v3 +; GCN1-NEXT: v_mov_b32_e32 v8, v2 +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] -; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v2, v6 +; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v3, v7 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN1-NEXT: s_cbranch_execnz .LBB90_2 +; GCN1-NEXT: .LBB90_3: ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN2-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN2-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN2-NEXT: s_cbranch_execz .LBB90_3 +; GCN2-NEXT: ; %bb.1: ; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc -; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s11 -; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 -; GCN2-NEXT: s_cmp_eq_u32 s1, s8 -; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], -1 -; GCN2-NEXT: s_cbranch_vccnz 
.LBB90_3 -; GCN2-NEXT: ; %bb.1: ; %Flow6 -; GCN2-NEXT: s_and_b64 vcc, exec, s[4:5] -; GCN2-NEXT: s_cbranch_vccnz .LBB90_6 -; GCN2-NEXT: .LBB90_2: ; %atomicrmw.phi -; GCN2-NEXT: s_endpgm -; GCN2-NEXT: .LBB90_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: s_add_u32 s4, s0, s4 +; GCN2-NEXT: s_addc_u32 s5, s1, s5 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s3 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 -; GCN2-NEXT: .LBB90_4: ; %atomicrmw.start +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: .LBB90_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB90_4 -; GCN2-NEXT: ; %bb.5: ; %Flow -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_branch .LBB90_2 -; GCN2-NEXT: .LBB90_6: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN2-NEXT: s_cselect_b32 s0, s0, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: s_add_i32 s0, s0, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s0 -; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen -; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 
0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s2 -; GCN2-NEXT: v_mov_b32_e32 v4, s3 -; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] -; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen -; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN2-NEXT: s_cbranch_execnz .LBB90_2 +; GCN2-NEXT: .LBB90_3: ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i64_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN3-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN3-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN3-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN3-NEXT: s_cbranch_execz .LBB90_3 +; GCN3-NEXT: ; %bb.1: ; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GCN3-NEXT: s_mov_b32 s14, -1 -; GCN3-NEXT: s_mov_b32 s15, 0xe00000 -; GCN3-NEXT: s_add_u32 s12, s12, s11 -; GCN3-NEXT: s_addc_u32 s13, s13, 0 +; GCN3-NEXT: v_mov_b32_e32 v6, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN3-NEXT: s_add_u32 s0, s0, s6 -; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base -; GCN3-NEXT: s_addc_u32 s1, s1, s7 -; GCN3-NEXT: s_cmp_eq_u32 s1, s5 -; GCN3-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GCN3-NEXT: s_mov_b64 s[4:5], -1 -; GCN3-NEXT: s_cbranch_vccnz .LBB90_3 -; GCN3-NEXT: ; %bb.1: ; %Flow6 -; GCN3-NEXT: s_and_b64 vcc, exec, s[4:5] -; GCN3-NEXT: s_cbranch_vccnz .LBB90_6 -; GCN3-NEXT: .LBB90_2: ; %atomicrmw.phi -; GCN3-NEXT: s_endpgm -; GCN3-NEXT: .LBB90_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN3-NEXT: s_add_u32 s0, s0, s4 
+; GCN3-NEXT: s_addc_u32 s1, s1, s5 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s3 -; GCN3-NEXT: v_mov_b32_e32 v7, s2 -; GCN3-NEXT: .LBB90_4: ; %atomicrmw.start +; GCN3-NEXT: v_mov_b32_e32 v4, s3 +; GCN3-NEXT: v_mov_b32_e32 v5, s2 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: .LBB90_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GCN3-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB90_4 -; GCN3-NEXT: ; %bb.5: ; %Flow -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_branch .LBB90_2 -; GCN3-NEXT: .LBB90_6: ; %atomicrmw.private -; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN3-NEXT: s_cselect_b32 s0, s0, -1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN3-NEXT: buffer_load_dword v1, v2, s[12:15], 0 offen offset:4 -; GCN3-NEXT: v_mov_b32_e32 v4, s2 -; GCN3-NEXT: v_mov_b32_e32 v3, s3 -; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] -; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GCN3-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen -; GCN3-NEXT: buffer_store_dword 
v1, v2, s[12:15], 0 offen offset:4 +; GCN3-NEXT: s_cbranch_execnz .LBB90_2 +; GCN3-NEXT: .LBB90_3: ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -12810,205 +12620,183 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s18, -1 -; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 -; GCN1-NEXT: s_add_u32 s16, s16, s11 -; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 -; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 -; GCN1-NEXT: s_addc_u32 s17, s17, 0 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; GCN1-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN1-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GCN1-NEXT: s_cbranch_execz .LBB91_4 +; GCN1-NEXT: ; %bb.1: ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GCN1-NEXT: s_add_u32 s0, s8, s0 -; GCN1-NEXT: s_addc_u32 s1, s9, s1 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_cbranch_vccz .LBB91_4 -; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s13 -; GCN1-NEXT: v_mov_b32_e32 v5, s12 +; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s8, s0, s6 +; GCN1-NEXT: s_addc_u32 s9, s1, s7 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s5 +; GCN1-NEXT: v_mov_b32_e32 v5, s4 +; GCN1-NEXT: s_mov_b32 s11, 0xf000 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 
v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: s_mov_b32 s10, -1 ; GCN1-NEXT: .LBB91_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 ; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], v[8:9] +; GCN1-NEXT: v_cndmask_b32_e64 v6, v5, v8, s[0:1] +; GCN1-NEXT: v_cndmask_b32_e64 v7, v4, v9, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v0, v6 +; GCN1-NEXT: v_mov_b32_e32 v1, v7 +; GCN1-NEXT: v_mov_b32_e32 v2, v8 +; GCN1-NEXT: v_mov_b32_e32 v3, v9 +; GCN1-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GCN1-NEXT: s_cbranch_execnz .LBB91_2 ; GCN1-NEXT: ; %bb.3: ; %Flow -; GCN1-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN1-NEXT: s_branch .LBB91_6 -; GCN1-NEXT: .LBB91_4: -; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: s_cbranch_execz .LBB91_6 -; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s12 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: .LBB91_4: ; %Flow5 +; GCN1-NEXT: s_or_b64 exec, exec, s[12:13] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_readfirstlane_b32 s1, v1 +; GCN1-NEXT: v_readfirstlane_b32 s0, v0 +; GCN1-NEXT: v_mov_b32_e32 v0, s5 +; GCN1-NEXT: 
v_bfrev_b32_e32 v1, 1 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v2, s1 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: s_add_i32 s0, s0, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s13 -; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] -; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen -; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB91_6: ; %atomicrmw.phi -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s11, 0xf000 +; GCN1-NEXT: s_mov_b32 s10, -1 +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s11 -; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 -; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; GCN2-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN2-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GCN2-NEXT: s_cbranch_execz .LBB91_4 +; GCN2-NEXT: 
; %bb.1: ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GCN2-NEXT: s_add_u32 s0, s8, s0 -; GCN2-NEXT: s_addc_u32 s1, s9, s1 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_cbranch_vccz .LBB91_4 -; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s6 +; GCN2-NEXT: s_addc_u32 s1, s1, s7 +; GCN2-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s5 +; GCN2-NEXT: v_mov_b32_e32 v5, s4 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s10 +; GCN2-NEXT: v_mov_b32_e32 v1, s11 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s13 -; GCN2-NEXT: v_mov_b32_e32 v5, s12 ; GCN2-NEXT: .LBB91_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 ; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN2-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], v[8:9] +; GCN2-NEXT: v_cndmask_b32_e64 v7, v4, v9, s[0:1] +; GCN2-NEXT: v_cndmask_b32_e64 v6, v5, v8, s[0:1] ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GCN2-NEXT: s_cbranch_execnz .LBB91_2 ; GCN2-NEXT: ; %bb.3: ; %Flow -; 
GCN2-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN2-NEXT: s_branch .LBB91_6 -; GCN2-NEXT: .LBB91_4: -; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: s_cbranch_execz .LBB91_6 -; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN2-NEXT: s_cselect_b32 s0, s0, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: s_add_i32 s0, s0, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s0 -; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen -; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s12 -; GCN2-NEXT: v_mov_b32_e32 v4, s13 -; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] -; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen -; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB91_6: ; %atomicrmw.phi -; GCN2-NEXT: v_mov_b32_e32 v2, s10 -; GCN2-NEXT: v_mov_b32_e32 v3, s11 +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: .LBB91_4: ; %Flow5 +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_readfirstlane_b32 s1, v1 +; GCN2-NEXT: v_readfirstlane_b32 s0, v0 +; GCN2-NEXT: v_mov_b32_e32 v0, s5 +; GCN2-NEXT: v_bfrev_b32_e32 v1, 1 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i64_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 -; GCN3-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; GCN3-NEXT: s_mov_b32 s18, -1 -; 
GCN3-NEXT: s_mov_b32 s19, 0xe00000 -; GCN3-NEXT: s_add_u32 s16, s16, s11 ; GCN3-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GCN3-NEXT: s_addc_u32 s17, s17, 0 -; GCN3-NEXT: s_mov_b64 s[2:3], src_private_base +; GCN3-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN3-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN3-NEXT: s_cbranch_execz .LBB91_4 +; GCN3-NEXT: ; %bb.1: ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GCN3-NEXT: s_add_u32 s0, s8, s0 -; GCN3-NEXT: s_addc_u32 s1, s9, s1 -; GCN3-NEXT: s_cmp_eq_u32 s1, s3 -; GCN3-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN3-NEXT: s_cbranch_vccz .LBB91_4 -; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s13 -; GCN3-NEXT: v_mov_b32_e32 v5, s12 +; GCN3-NEXT: s_add_u32 s4, s8, s0 +; GCN3-NEXT: s_addc_u32 s5, s9, s1 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s13 +; GCN3-NEXT: v_mov_b32_e32 v3, s12 +; GCN3-NEXT: v_mov_b32_e32 v4, 0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: .LBB91_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v8, v1 +; GCN3-NEXT: v_mov_b32_e32 v7, v0 +; GCN3-NEXT: v_cmp_lt_i64_e64 
s[0:1], s[12:13], v[7:8] +; GCN3-NEXT: v_cndmask_b32_e64 v6, v2, v8, s[0:1] +; GCN3-NEXT: v_cndmask_b32_e64 v5, v3, v7, s[0:1] +; GCN3-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[4:5] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[7:8] +; GCN3-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GCN3-NEXT: s_cbranch_execnz .LBB91_2 ; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: .LBB91_4: ; %Flow5 ; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN3-NEXT: s_branch .LBB91_6 -; GCN3-NEXT: .LBB91_4: -; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: s_cbranch_execz .LBB91_6 -; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private -; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN3-NEXT: s_cselect_b32 s0, s0, -1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen -; GCN3-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen offset:4 -; GCN3-NEXT: v_mov_b32_e32 v4, s12 -; GCN3-NEXT: v_mov_b32_e32 v3, s13 -; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] -; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GCN3-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen -; GCN3-NEXT: buffer_store_dword v3, v2, s[16:19], 0 offen offset:4 -; GCN3-NEXT: .LBB91_6: ; %atomicrmw.phi -; GCN3-NEXT: v_mov_b32_e32 v2, s10 -; GCN3-NEXT: v_mov_b32_e32 v3, s11 -; GCN3-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: v_readfirstlane_b32 s1, v1 +; GCN3-NEXT: v_readfirstlane_b32 s0, v0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v0, s13 +; GCN3-NEXT: v_bfrev_b32_e32 v1, 1 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s12 +; GCN3-NEXT: v_cndmask_b32_e64 v0, 
v0, 0, vcc +; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN3-NEXT: v_mov_b32_e32 v3, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, 0 +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GCN3-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -14875,211 +14663,127 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN1-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN1-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN1-NEXT: s_cbranch_execz .LBB102_3 +; GCN1-NEXT: ; %bb.1: ; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s11 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s8 -; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], -1 -; GCN1-NEXT: s_cbranch_vccnz .LBB102_3 -; GCN1-NEXT: ; %bb.1: ; %Flow6 -; GCN1-NEXT: s_and_b64 vcc, exec, s[4:5] -; GCN1-NEXT: s_cbranch_vccnz .LBB102_6 -; GCN1-NEXT: .LBB102_2: ; %atomicrmw.phi -; GCN1-NEXT: s_endpgm -; GCN1-NEXT: .LBB102_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: 
v_mov_b32_e32 v4, s0 -; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s3 -; GCN1-NEXT: v_mov_b32_e32 v7, s2 -; GCN1-NEXT: .LBB102_4: ; %atomicrmw.start +; GCN1-NEXT: s_add_u32 s4, s0, s4 +; GCN1-NEXT: s_addc_u32 s5, s1, s5 +; GCN1-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: .LBB102_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB102_4 -; GCN1-NEXT: ; %bb.5: ; %Flow -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_branch .LBB102_2 -; GCN1-NEXT: .LBB102_6: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s2 -; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN1-NEXT: s_cselect_b32 s0, s0, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: s_add_i32 s0, s0, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v9, v3 +; 
GCN1-NEXT: v_mov_b32_e32 v8, v2 +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v2, v6 +; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v3, v7 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN1-NEXT: s_cbranch_execnz .LBB102_2 +; GCN1-NEXT: .LBB102_3: ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN2-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN2-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN2-NEXT: s_cbranch_execz .LBB102_3 +; GCN2-NEXT: ; %bb.1: ; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc -; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s11 -; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s6 +; GCN2-NEXT: s_addc_u32 s1, s1, s7 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s8 -; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 
-; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], -1 -; GCN2-NEXT: s_cbranch_vccnz .LBB102_3 -; GCN2-NEXT: ; %bb.1: ; %Flow6 -; GCN2-NEXT: s_and_b64 vcc, exec, s[4:5] -; GCN2-NEXT: s_cbranch_vccnz .LBB102_6 -; GCN2-NEXT: .LBB102_2: ; %atomicrmw.phi -; GCN2-NEXT: s_endpgm -; GCN2-NEXT: .LBB102_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s3 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 -; GCN2-NEXT: .LBB102_4: ; %atomicrmw.start +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: .LBB102_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB102_4 -; GCN2-NEXT: ; %bb.5: ; %Flow -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_branch .LBB102_2 -; GCN2-NEXT: .LBB102_6: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN2-NEXT: s_cselect_b32 s0, s0, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: s_add_i32 s0, s0, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s0 -; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen -; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s2 -; GCN2-NEXT: v_mov_b32_e32 v4, s3 -; GCN2-NEXT: s_waitcnt vmcnt(0) -; 
GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen -; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_cbranch_execnz .LBB102_2 +; GCN2-NEXT: .LBB102_3: ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i64_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN3-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN3-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN3-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN3-NEXT: s_cbranch_execz .LBB102_3 +; GCN3-NEXT: ; %bb.1: ; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GCN3-NEXT: s_mov_b32 s14, -1 -; GCN3-NEXT: s_mov_b32 s15, 0xe00000 -; GCN3-NEXT: s_add_u32 s12, s12, s11 -; GCN3-NEXT: s_addc_u32 s13, s13, 0 +; GCN3-NEXT: v_mov_b32_e32 v6, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN3-NEXT: s_add_u32 s0, s0, s6 -; GCN3-NEXT: s_addc_u32 s1, s1, s7 -; GCN3-NEXT: s_add_u32 s0, s0, 32 -; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base -; GCN3-NEXT: s_addc_u32 s1, s1, 0 -; GCN3-NEXT: s_cmp_eq_u32 s1, s5 -; GCN3-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GCN3-NEXT: s_mov_b64 s[4:5], -1 -; GCN3-NEXT: s_cbranch_vccnz .LBB102_3 -; GCN3-NEXT: ; %bb.1: ; %Flow6 -; GCN3-NEXT: s_and_b64 vcc, exec, s[4:5] -; GCN3-NEXT: s_cbranch_vccnz .LBB102_6 -; GCN3-NEXT: .LBB102_2: ; %atomicrmw.phi -; GCN3-NEXT: s_endpgm -; GCN3-NEXT: .LBB102_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 +; GCN3-NEXT: s_load_dwordx2 s[6:7], 
s[0:1], 0x20 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s3 -; GCN3-NEXT: v_mov_b32_e32 v7, s2 -; GCN3-NEXT: .LBB102_4: ; %atomicrmw.start +; GCN3-NEXT: v_mov_b32_e32 v4, s3 +; GCN3-NEXT: v_mov_b32_e32 v5, s2 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: .LBB102_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GCN3-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB102_4 -; GCN3-NEXT: ; %bb.5: ; %Flow -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_branch .LBB102_2 -; GCN3-NEXT: .LBB102_6: ; %atomicrmw.private -; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN3-NEXT: s_cselect_b32 s0, s0, -1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN3-NEXT: buffer_load_dword v1, v2, s[12:15], 0 offen offset:4 -; GCN3-NEXT: v_mov_b32_e32 v4, s2 -; GCN3-NEXT: v_mov_b32_e32 v3, s3 -; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GCN3-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen -; GCN3-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen offset:4 +; GCN3-NEXT: 
s_cbranch_execnz .LBB102_2 +; GCN3-NEXT: .LBB102_3: ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -15091,211 +14795,182 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s18, -1 -; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 -; GCN1-NEXT: s_add_u32 s16, s16, s11 -; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 -; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 -; GCN1-NEXT: s_addc_u32 s17, s17, 0 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; GCN1-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN1-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GCN1-NEXT: s_cbranch_execz .LBB103_4 +; GCN1-NEXT: ; %bb.1: ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GCN1-NEXT: s_add_u32 s0, s8, s0 -; GCN1-NEXT: s_addc_u32 s1, s9, s1 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_cbranch_vccz .LBB103_4 -; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s13 -; GCN1-NEXT: v_mov_b32_e32 v5, s12 +; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s8, s0, s6 +; GCN1-NEXT: s_addc_u32 s9, s1, s7 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s5 +; GCN1-NEXT: v_mov_b32_e32 v5, s4 +; GCN1-NEXT: s_mov_b32 s11, 0xf000 +; GCN1-NEXT: 
s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: s_mov_b32 s10, -1 ; GCN1-NEXT: .LBB103_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 ; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[8:9] +; GCN1-NEXT: v_cndmask_b32_e64 v6, v5, v8, s[0:1] +; GCN1-NEXT: v_cndmask_b32_e64 v7, v4, v9, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v0, v6 +; GCN1-NEXT: v_mov_b32_e32 v1, v7 +; GCN1-NEXT: v_mov_b32_e32 v2, v8 +; GCN1-NEXT: v_mov_b32_e32 v3, v9 +; GCN1-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 offset:32 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GCN1-NEXT: s_cbranch_execnz .LBB103_2 ; GCN1-NEXT: ; %bb.3: ; %Flow -; GCN1-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN1-NEXT: s_branch .LBB103_6 -; GCN1-NEXT: .LBB103_4: -; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: s_cbranch_execz .LBB103_6 -; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s12 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: .LBB103_4: ; %Flow6 +; GCN1-NEXT: s_or_b64 exec, exec, s[12:13] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_readfirstlane_b32 s0, v0 +; GCN1-NEXT: v_mov_b32_e32 v0, 
s5 +; GCN1-NEXT: v_readfirstlane_b32 s1, v1 +; GCN1-NEXT: v_cndmask_b32_e64 v1, v0, 0, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v2, s1 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: s_add_i32 s0, s0, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s13 -; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] -; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen -; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB103_6: ; %atomicrmw.phi -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s11, 0xf000 +; GCN1-NEXT: s_mov_b32 s10, -1 +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s11 -; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 -; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; GCN2-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN2-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GCN2-NEXT: 
s_cbranch_execz .LBB103_4 +; GCN2-NEXT: ; %bb.1: ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GCN2-NEXT: s_add_u32 s0, s8, s0 -; GCN2-NEXT: s_addc_u32 s1, s9, s1 +; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s6 +; GCN2-NEXT: s_addc_u32 s1, s1, s7 +; GCN2-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x20 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_cbranch_vccz .LBB103_4 -; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s10 +; GCN2-NEXT: v_mov_b32_e32 v4, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s11 +; GCN2-NEXT: v_mov_b32_e32 v5, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s13 -; GCN2-NEXT: v_mov_b32_e32 v5, s12 ; GCN2-NEXT: .LBB103_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 ; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[8:9] +; GCN2-NEXT: v_cndmask_b32_e64 v7, v4, v9, s[0:1] +; GCN2-NEXT: v_cndmask_b32_e64 v6, v5, v8, s[0:1] ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; 
GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GCN2-NEXT: s_cbranch_execnz .LBB103_2 ; GCN2-NEXT: ; %bb.3: ; %Flow -; GCN2-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN2-NEXT: s_branch .LBB103_6 -; GCN2-NEXT: .LBB103_4: -; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: s_cbranch_execz .LBB103_6 -; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN2-NEXT: s_cselect_b32 s0, s0, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: s_add_i32 s0, s0, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s0 -; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen -; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s12 -; GCN2-NEXT: v_mov_b32_e32 v4, s13 -; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] -; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen -; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB103_6: ; %atomicrmw.phi -; GCN2-NEXT: v_mov_b32_e32 v2, s10 -; GCN2-NEXT: v_mov_b32_e32 v3, s11 +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: .LBB103_4: ; %Flow5 +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_readfirstlane_b32 s0, v0 +; GCN2-NEXT: v_mov_b32_e32 v0, s5 +; GCN2-NEXT: v_readfirstlane_b32 s1, v1 +; GCN2-NEXT: v_cndmask_b32_e64 v1, v0, 0, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i64_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_mov_b32 s16, 
SCRATCH_RSRC_DWORD0 -; GCN3-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; GCN3-NEXT: s_mov_b32 s18, -1 -; GCN3-NEXT: s_mov_b32 s19, 0xe00000 -; GCN3-NEXT: s_add_u32 s16, s16, s11 ; GCN3-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GCN3-NEXT: s_addc_u32 s17, s17, 0 -; GCN3-NEXT: s_mov_b64 s[2:3], src_private_base +; GCN3-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN3-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN3-NEXT: s_cbranch_execz .LBB103_4 +; GCN3-NEXT: ; %bb.1: ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GCN3-NEXT: s_add_u32 s0, s8, s0 -; GCN3-NEXT: s_addc_u32 s1, s9, s1 -; GCN3-NEXT: s_add_u32 s0, s0, 32 -; GCN3-NEXT: s_addc_u32 s1, s1, 0 -; GCN3-NEXT: s_cmp_eq_u32 s1, s3 -; GCN3-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN3-NEXT: s_cbranch_vccz .LBB103_4 -; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s13 -; GCN3-NEXT: v_mov_b32_e32 v5, s12 +; GCN3-NEXT: s_add_u32 s4, s8, s0 +; GCN3-NEXT: s_addc_u32 s5, s9, s1 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20 +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s13 +; GCN3-NEXT: v_mov_b32_e32 v3, s12 +; GCN3-NEXT: v_mov_b32_e32 v4, 0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: .LBB103_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: 
flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v8, v1 +; GCN3-NEXT: v_mov_b32_e32 v7, v0 +; GCN3-NEXT: v_cmp_lt_u64_e64 s[0:1], s[12:13], v[7:8] +; GCN3-NEXT: v_cndmask_b32_e64 v6, v2, v8, s[0:1] +; GCN3-NEXT: v_cndmask_b32_e64 v5, v3, v7, s[0:1] +; GCN3-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[4:5] offset:32 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[7:8] +; GCN3-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GCN3-NEXT: s_cbranch_execnz .LBB103_2 ; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: .LBB103_4: ; %Flow6 ; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN3-NEXT: s_branch .LBB103_6 -; GCN3-NEXT: .LBB103_4: -; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: s_cbranch_execz .LBB103_6 -; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private -; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN3-NEXT: s_cselect_b32 s0, s0, -1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen -; GCN3-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen offset:4 -; GCN3-NEXT: v_mov_b32_e32 v4, s12 -; GCN3-NEXT: v_mov_b32_e32 v3, s13 -; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] -; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GCN3-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen -; GCN3-NEXT: buffer_store_dword v3, v2, s[16:19], 0 offen offset:4 -; GCN3-NEXT: .LBB103_6: ; %atomicrmw.phi -; GCN3-NEXT: v_mov_b32_e32 v2, s10 -; GCN3-NEXT: v_mov_b32_e32 v3, s11 -; GCN3-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: v_readfirstlane_b32 s0, v0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: 
v_mov_b32_e32 v0, s13 +; GCN3-NEXT: v_readfirstlane_b32 s1, v1 +; GCN3-NEXT: v_cndmask_b32_e64 v1, v0, 0, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s12 +; GCN3-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN3-NEXT: v_mov_b32_e32 v3, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, 0 +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GCN3-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -15308,205 +14983,180 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s18, -1 -; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 -; GCN1-NEXT: s_add_u32 s16, s16, s11 -; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 -; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 -; GCN1-NEXT: s_addc_u32 s17, s17, 0 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; GCN1-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN1-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GCN1-NEXT: s_cbranch_execz .LBB104_4 +; GCN1-NEXT: ; %bb.1: ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GCN1-NEXT: s_add_u32 s0, s8, s0 -; GCN1-NEXT: s_addc_u32 s1, s9, s1 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_cbranch_vccz .LBB104_4 -; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, 
s13 -; GCN1-NEXT: v_mov_b32_e32 v5, s12 +; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s8, s0, s6 +; GCN1-NEXT: s_addc_u32 s9, s1, s7 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s5 +; GCN1-NEXT: v_mov_b32_e32 v5, s4 +; GCN1-NEXT: s_mov_b32 s11, 0xf000 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: s_mov_b32 s10, -1 ; GCN1-NEXT: .LBB104_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 ; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[8:9] +; GCN1-NEXT: v_cndmask_b32_e64 v6, v5, v8, s[0:1] +; GCN1-NEXT: v_cndmask_b32_e64 v7, v4, v9, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v0, v6 +; GCN1-NEXT: v_mov_b32_e32 v1, v7 +; GCN1-NEXT: v_mov_b32_e32 v2, v8 +; GCN1-NEXT: v_mov_b32_e32 v3, v9 +; GCN1-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GCN1-NEXT: s_cbranch_execnz .LBB104_2 ; GCN1-NEXT: ; %bb.3: ; %Flow -; GCN1-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN1-NEXT: s_branch .LBB104_6 -; GCN1-NEXT: .LBB104_4: -; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: s_cbranch_execz .LBB104_6 -; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 
-; GCN1-NEXT: v_mov_b32_e32 v5, s12 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: .LBB104_4: ; %Flow5 +; GCN1-NEXT: s_or_b64 exec, exec, s[12:13] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_readfirstlane_b32 s0, v0 +; GCN1-NEXT: v_mov_b32_e32 v0, s5 +; GCN1-NEXT: v_readfirstlane_b32 s1, v1 +; GCN1-NEXT: v_cndmask_b32_e64 v1, v0, 0, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v2, s1 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: s_add_i32 s0, s0, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s13 -; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] -; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen -; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB104_6: ; %atomicrmw.phi -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s11, 0xf000 +; GCN1-NEXT: s_mov_b32 s10, -1 +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s11 -; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GCN2-NEXT: s_load_dword s2, s[4:5], 
0x104 -; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; GCN2-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN2-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GCN2-NEXT: s_cbranch_execz .LBB104_4 +; GCN2-NEXT: ; %bb.1: ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GCN2-NEXT: s_add_u32 s0, s8, s0 -; GCN2-NEXT: s_addc_u32 s1, s9, s1 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_cbranch_vccz .LBB104_4 -; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s6 +; GCN2-NEXT: s_addc_u32 s1, s1, s7 +; GCN2-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s5 +; GCN2-NEXT: v_mov_b32_e32 v5, s4 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s10 +; GCN2-NEXT: v_mov_b32_e32 v1, s11 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s13 -; GCN2-NEXT: v_mov_b32_e32 v5, s12 ; GCN2-NEXT: .LBB104_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 ; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[8:9] +; GCN2-NEXT: v_cndmask_b32_e64 v7, v4, v9, s[0:1] +; GCN2-NEXT: v_cndmask_b32_e64 v6, v5, v8, s[0:1] ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: 
buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GCN2-NEXT: s_cbranch_execnz .LBB104_2 ; GCN2-NEXT: ; %bb.3: ; %Flow -; GCN2-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN2-NEXT: s_branch .LBB104_6 -; GCN2-NEXT: .LBB104_4: -; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: s_cbranch_execz .LBB104_6 -; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN2-NEXT: s_cselect_b32 s0, s0, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: s_add_i32 s0, s0, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s0 -; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen -; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s12 -; GCN2-NEXT: v_mov_b32_e32 v4, s13 -; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] -; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen -; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB104_6: ; %atomicrmw.phi -; GCN2-NEXT: v_mov_b32_e32 v2, s10 -; GCN2-NEXT: v_mov_b32_e32 v3, s11 +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: .LBB104_4: ; %Flow5 +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_readfirstlane_b32 s0, v0 +; GCN2-NEXT: v_mov_b32_e32 v0, s5 +; GCN2-NEXT: v_readfirstlane_b32 s1, v1 +; GCN2-NEXT: v_cndmask_b32_e64 v1, v0, 0, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: 
v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i64_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 -; GCN3-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; GCN3-NEXT: s_mov_b32 s18, -1 -; GCN3-NEXT: s_mov_b32 s19, 0xe00000 -; GCN3-NEXT: s_add_u32 s16, s16, s11 ; GCN3-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GCN3-NEXT: s_addc_u32 s17, s17, 0 -; GCN3-NEXT: s_mov_b64 s[2:3], src_private_base +; GCN3-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN3-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN3-NEXT: s_cbranch_execz .LBB104_4 +; GCN3-NEXT: ; %bb.1: ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GCN3-NEXT: s_add_u32 s0, s8, s0 -; GCN3-NEXT: s_addc_u32 s1, s9, s1 -; GCN3-NEXT: s_cmp_eq_u32 s1, s3 -; GCN3-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN3-NEXT: s_cbranch_vccz .LBB104_4 -; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s13 -; GCN3-NEXT: v_mov_b32_e32 v5, s12 +; GCN3-NEXT: s_add_u32 s4, s8, s0 +; GCN3-NEXT: s_addc_u32 s5, s9, s1 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s13 +; GCN3-NEXT: v_mov_b32_e32 v3, s12 +; GCN3-NEXT: v_mov_b32_e32 v4, 0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: .LBB104_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: 
v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v8, v1 +; GCN3-NEXT: v_mov_b32_e32 v7, v0 +; GCN3-NEXT: v_cmp_lt_u64_e64 s[0:1], s[12:13], v[7:8] +; GCN3-NEXT: v_cndmask_b32_e64 v6, v2, v8, s[0:1] +; GCN3-NEXT: v_cndmask_b32_e64 v5, v3, v7, s[0:1] +; GCN3-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[4:5] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[7:8] +; GCN3-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GCN3-NEXT: s_cbranch_execnz .LBB104_2 ; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: .LBB104_4: ; %Flow5 ; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN3-NEXT: s_branch .LBB104_6 -; GCN3-NEXT: .LBB104_4: -; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: s_cbranch_execz .LBB104_6 -; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private -; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN3-NEXT: s_cselect_b32 s0, s0, -1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen -; GCN3-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen offset:4 -; GCN3-NEXT: v_mov_b32_e32 v4, s12 -; GCN3-NEXT: v_mov_b32_e32 v3, s13 -; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] -; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GCN3-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen -; GCN3-NEXT: buffer_store_dword v3, v2, s[16:19], 0 offen offset:4 -; GCN3-NEXT: .LBB104_6: ; %atomicrmw.phi -; GCN3-NEXT: v_mov_b32_e32 v2, s10 -; GCN3-NEXT: v_mov_b32_e32 v3, s11 -; 
GCN3-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: v_readfirstlane_b32 s0, v0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v0, s13 +; GCN3-NEXT: v_readfirstlane_b32 s1, v1 +; GCN3-NEXT: v_cndmask_b32_e64 v1, v0, 0, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s12 +; GCN3-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN3-NEXT: v_mov_b32_e32 v3, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, 0 +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GCN3-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -19228,211 +18878,127 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN1-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN1-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN1-NEXT: s_cbranch_execz .LBB125_3 +; GCN1-NEXT: ; %bb.1: ; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s11 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s8 -; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], -1 -; 
GCN1-NEXT: s_cbranch_vccnz .LBB125_3 -; GCN1-NEXT: ; %bb.1: ; %Flow6 -; GCN1-NEXT: s_and_b64 vcc, exec, s[4:5] -; GCN1-NEXT: s_cbranch_vccnz .LBB125_6 -; GCN1-NEXT: .LBB125_2: ; %atomicrmw.phi -; GCN1-NEXT: s_endpgm -; GCN1-NEXT: .LBB125_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s3 -; GCN1-NEXT: v_mov_b32_e32 v7, s2 -; GCN1-NEXT: .LBB125_4: ; %atomicrmw.start +; GCN1-NEXT: s_add_u32 s4, s0, s4 +; GCN1-NEXT: s_addc_u32 s5, s1, s5 +; GCN1-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: .LBB125_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB125_4 -; GCN1-NEXT: ; %bb.5: ; %Flow -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_branch .LBB125_2 -; GCN1-NEXT: .LBB125_6: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s2 -; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN1-NEXT: s_cselect_b32 s0, s0, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: s_add_i32 s0, s0, 4 -; GCN1-NEXT: 
v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v9, v3 +; GCN1-NEXT: v_mov_b32_e32 v8, v2 +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1] -; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v2, v6 +; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v3, v7 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN1-NEXT: s_cbranch_execnz .LBB125_2 +; GCN1-NEXT: .LBB125_3: ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN2-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN2-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN2-NEXT: s_cbranch_execz .LBB125_3 +; GCN2-NEXT: ; %bb.1: ; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc -; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s11 -; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; 
GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s6 +; GCN2-NEXT: s_addc_u32 s1, s1, s7 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s8 -; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], -1 -; GCN2-NEXT: s_cbranch_vccnz .LBB125_3 -; GCN2-NEXT: ; %bb.1: ; %Flow6 -; GCN2-NEXT: s_and_b64 vcc, exec, s[4:5] -; GCN2-NEXT: s_cbranch_vccnz .LBB125_6 -; GCN2-NEXT: .LBB125_2: ; %atomicrmw.phi -; GCN2-NEXT: s_endpgm -; GCN2-NEXT: .LBB125_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s3 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 -; GCN2-NEXT: .LBB125_4: ; %atomicrmw.start +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: .LBB125_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB125_4 -; GCN2-NEXT: ; %bb.5: ; %Flow -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_branch .LBB125_2 -; GCN2-NEXT: .LBB125_6: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN2-NEXT: s_cselect_b32 s0, s0, -1 -; GCN2-NEXT: 
v_mov_b32_e32 v2, s0 -; GCN2-NEXT: s_add_i32 s0, s0, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s0 -; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen -; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s2 -; GCN2-NEXT: v_mov_b32_e32 v4, s3 -; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1] -; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen -; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_cbranch_execnz .LBB125_2 +; GCN2-NEXT: .LBB125_3: ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i64_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN3-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN3-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN3-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN3-NEXT: s_cbranch_execz .LBB125_3 +; GCN3-NEXT: ; %bb.1: ; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GCN3-NEXT: s_mov_b32 s14, -1 -; GCN3-NEXT: s_mov_b32 s15, 0xe00000 -; GCN3-NEXT: s_add_u32 s12, s12, s11 -; GCN3-NEXT: s_addc_u32 s13, s13, 0 +; GCN3-NEXT: v_mov_b32_e32 v6, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN3-NEXT: s_add_u32 s0, s0, s6 -; GCN3-NEXT: s_addc_u32 s1, s1, s7 -; GCN3-NEXT: s_add_u32 s0, s0, 32 -; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base -; GCN3-NEXT: s_addc_u32 s1, s1, 0 -; GCN3-NEXT: s_cmp_eq_u32 s1, s5 -; GCN3-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GCN3-NEXT: s_mov_b64 s[4:5], -1 -; GCN3-NEXT: s_cbranch_vccnz .LBB125_3 -; GCN3-NEXT: ; %bb.1: ; %Flow6 -; GCN3-NEXT: s_and_b64 vcc, exec, s[4:5] -; GCN3-NEXT: s_cbranch_vccnz .LBB125_6 -; GCN3-NEXT: .LBB125_2: ; %atomicrmw.phi -; GCN3-NEXT: s_endpgm -; 
GCN3-NEXT: .LBB125_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s3 -; GCN3-NEXT: v_mov_b32_e32 v7, s2 -; GCN3-NEXT: .LBB125_4: ; %atomicrmw.start +; GCN3-NEXT: v_mov_b32_e32 v4, s3 +; GCN3-NEXT: v_mov_b32_e32 v5, s2 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: .LBB125_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GCN3-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB125_4 -; GCN3-NEXT: ; %bb.5: ; %Flow -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_branch .LBB125_2 -; GCN3-NEXT: .LBB125_6: ; %atomicrmw.private -; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN3-NEXT: s_cselect_b32 s0, s0, -1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN3-NEXT: buffer_load_dword v1, v2, s[12:15], 0 offen offset:4 -; GCN3-NEXT: v_mov_b32_e32 v4, s2 -; GCN3-NEXT: v_mov_b32_e32 v3, s3 -; GCN3-NEXT: 
s_waitcnt vmcnt(0) -; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1] -; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GCN3-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen -; GCN3-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen offset:4 +; GCN3-NEXT: s_cbranch_execnz .LBB125_2 +; GCN3-NEXT: .LBB125_3: ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -19444,211 +19010,185 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s18, -1 -; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 -; GCN1-NEXT: s_add_u32 s16, s16, s11 -; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 -; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 -; GCN1-NEXT: s_addc_u32 s17, s17, 0 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; GCN1-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN1-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GCN1-NEXT: s_cbranch_execz .LBB126_4 +; GCN1-NEXT: ; %bb.1: ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GCN1-NEXT: s_add_u32 s0, s8, s0 -; GCN1-NEXT: s_addc_u32 s1, s9, s1 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_cbranch_vccz .LBB126_4 -; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s13 -; GCN1-NEXT: v_mov_b32_e32 v5, s12 +; GCN1-NEXT: 
s_lshl_b64 s[6:7], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s8, s0, s6 +; GCN1-NEXT: s_addc_u32 s9, s1, s7 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s5 +; GCN1-NEXT: v_mov_b32_e32 v5, s4 +; GCN1-NEXT: s_mov_b32 s11, 0xf000 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: s_mov_b32 s10, -1 ; GCN1-NEXT: .LBB126_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 ; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_cmp_ge_i64_e64 s[0:1], s[4:5], v[8:9] +; GCN1-NEXT: v_cndmask_b32_e64 v6, v5, v8, s[0:1] +; GCN1-NEXT: v_cndmask_b32_e64 v7, v4, v9, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v0, v6 +; GCN1-NEXT: v_mov_b32_e32 v1, v7 +; GCN1-NEXT: v_mov_b32_e32 v2, v8 +; GCN1-NEXT: v_mov_b32_e32 v3, v9 +; GCN1-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 offset:32 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GCN1-NEXT: s_cbranch_execnz .LBB126_2 ; GCN1-NEXT: ; %bb.3: ; %Flow -; GCN1-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN1-NEXT: s_branch .LBB126_6 -; GCN1-NEXT: .LBB126_4: -; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: s_cbranch_execz .LBB126_6 -; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s12 -; 
GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: .LBB126_4: ; %Flow6 +; GCN1-NEXT: s_or_b64 exec, exec, s[12:13] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_readfirstlane_b32 s1, v1 +; GCN1-NEXT: v_readfirstlane_b32 s0, v0 +; GCN1-NEXT: v_mov_b32_e32 v0, s5 +; GCN1-NEXT: v_bfrev_b32_e32 v1, -2 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v2, s1 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: s_add_i32 s0, s0, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s13 -; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[0:1] -; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen -; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB126_6: ; %atomicrmw.phi -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s11, 0xf000 +; GCN1-NEXT: s_mov_b32 s10, -1 +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s11 -; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GCN2-NEXT: s_load_dword s2, 
s[4:5], 0x104 -; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; GCN2-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN2-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GCN2-NEXT: s_cbranch_execz .LBB126_4 +; GCN2-NEXT: ; %bb.1: ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GCN2-NEXT: s_add_u32 s0, s8, s0 -; GCN2-NEXT: s_addc_u32 s1, s9, s1 +; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s6 +; GCN2-NEXT: s_addc_u32 s1, s1, s7 +; GCN2-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x20 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_cbranch_vccz .LBB126_4 -; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s10 +; GCN2-NEXT: v_mov_b32_e32 v4, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s11 +; GCN2-NEXT: v_mov_b32_e32 v5, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s13 -; GCN2-NEXT: v_mov_b32_e32 v5, s12 ; GCN2-NEXT: .LBB126_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 ; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN2-NEXT: v_cmp_ge_i64_e64 s[0:1], s[4:5], v[8:9] +; GCN2-NEXT: v_cndmask_b32_e64 v7, v4, v9, s[0:1] +; GCN2-NEXT: v_cndmask_b32_e64 v6, v5, v8, s[0:1] ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc -; GCN2-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GCN2-NEXT: s_cbranch_execnz .LBB126_2 ; GCN2-NEXT: ; %bb.3: ; %Flow -; GCN2-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN2-NEXT: s_branch .LBB126_6 -; GCN2-NEXT: .LBB126_4: -; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: s_cbranch_execz .LBB126_6 -; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN2-NEXT: s_cselect_b32 s0, s0, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: s_add_i32 s0, s0, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s0 -; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen -; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s12 -; GCN2-NEXT: v_mov_b32_e32 v4, s13 -; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[0:1] -; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen -; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB126_6: ; %atomicrmw.phi -; GCN2-NEXT: v_mov_b32_e32 v2, s10 -; GCN2-NEXT: v_mov_b32_e32 v3, s11 +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: .LBB126_4: ; %Flow5 +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_readfirstlane_b32 s1, v1 +; GCN2-NEXT: v_readfirstlane_b32 s0, v0 +; GCN2-NEXT: v_mov_b32_e32 v0, s5 +; GCN2-NEXT: v_bfrev_b32_e32 v1, -2 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: 
v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i64_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 -; GCN3-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; GCN3-NEXT: s_mov_b32 s18, -1 -; GCN3-NEXT: s_mov_b32 s19, 0xe00000 -; GCN3-NEXT: s_add_u32 s16, s16, s11 ; GCN3-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GCN3-NEXT: s_addc_u32 s17, s17, 0 -; GCN3-NEXT: s_mov_b64 s[2:3], src_private_base +; GCN3-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN3-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN3-NEXT: s_cbranch_execz .LBB126_4 +; GCN3-NEXT: ; %bb.1: ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GCN3-NEXT: s_add_u32 s0, s8, s0 -; GCN3-NEXT: s_addc_u32 s1, s9, s1 -; GCN3-NEXT: s_add_u32 s0, s0, 32 -; GCN3-NEXT: s_addc_u32 s1, s1, 0 -; GCN3-NEXT: s_cmp_eq_u32 s1, s3 -; GCN3-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN3-NEXT: s_cbranch_vccz .LBB126_4 -; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s13 -; GCN3-NEXT: v_mov_b32_e32 v5, s12 +; GCN3-NEXT: s_add_u32 s4, s8, s0 +; GCN3-NEXT: s_addc_u32 s5, s9, s1 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20 +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s13 +; GCN3-NEXT: v_mov_b32_e32 v3, s12 +; GCN3-NEXT: v_mov_b32_e32 v4, 0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: .LBB126_2: ; 
%atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v8, v1 +; GCN3-NEXT: v_mov_b32_e32 v7, v0 +; GCN3-NEXT: v_cmp_ge_i64_e64 s[0:1], s[12:13], v[7:8] +; GCN3-NEXT: v_cndmask_b32_e64 v6, v2, v8, s[0:1] +; GCN3-NEXT: v_cndmask_b32_e64 v5, v3, v7, s[0:1] +; GCN3-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[4:5] offset:32 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[7:8] +; GCN3-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GCN3-NEXT: s_cbranch_execnz .LBB126_2 ; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: .LBB126_4: ; %Flow6 ; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN3-NEXT: s_branch .LBB126_6 -; GCN3-NEXT: .LBB126_4: -; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: s_cbranch_execz .LBB126_6 -; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private -; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN3-NEXT: s_cselect_b32 s0, s0, -1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen -; GCN3-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen offset:4 -; GCN3-NEXT: v_mov_b32_e32 v4, s12 -; GCN3-NEXT: v_mov_b32_e32 v3, s13 -; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[0:1] -; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GCN3-NEXT: buffer_store_dword v4, v2, 
s[16:19], 0 offen -; GCN3-NEXT: buffer_store_dword v3, v2, s[16:19], 0 offen offset:4 -; GCN3-NEXT: .LBB126_6: ; %atomicrmw.phi -; GCN3-NEXT: v_mov_b32_e32 v2, s10 -; GCN3-NEXT: v_mov_b32_e32 v3, s11 -; GCN3-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: v_readfirstlane_b32 s1, v1 +; GCN3-NEXT: v_readfirstlane_b32 s0, v0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v0, s13 +; GCN3-NEXT: v_bfrev_b32_e32 v1, -2 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s12 +; GCN3-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN3-NEXT: v_mov_b32_e32 v3, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, 0 +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GCN3-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -19661,193 +19201,115 @@ entry: define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_min_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN1-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN1-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN1-NEXT: s_cbranch_execz .LBB127_3 +; GCN1-NEXT: ; %bb.1: ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN1-NEXT: s_load_dword s4, s[4:5], 0x3d -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s11 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_mov_b64 s[8:9], 0 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s1, s4 -; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], -1 -; GCN1-NEXT: 
s_cbranch_vccnz .LBB127_3 -; GCN1-NEXT: ; %bb.1: ; %Flow5 -; GCN1-NEXT: s_and_b64 vcc, exec, s[4:5] -; GCN1-NEXT: s_cbranch_vccnz .LBB127_6 -; GCN1-NEXT: .LBB127_2: ; %atomicrmw.phi -; GCN1-NEXT: s_endpgm -; GCN1-NEXT: .LBB127_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s3 -; GCN1-NEXT: v_mov_b32_e32 v7, s2 -; GCN1-NEXT: .LBB127_4: ; %atomicrmw.start +; GCN1-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x0 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 +; GCN1-NEXT: .LBB127_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB127_4 -; GCN1-NEXT: ; %bb.5: ; %Flow -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_branch .LBB127_2 -; GCN1-NEXT: .LBB127_6: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s2 -; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN1-NEXT: s_cselect_b32 s0, s0, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: s_add_i32 s0, s0, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_load_dword v1, 
v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v9, v3 +; GCN1-NEXT: v_mov_b32_e32 v8, v2 +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1] -; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v2, v6 +; GCN1-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN1-NEXT: v_mov_b32_e32 v3, v7 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_cbranch_execnz .LBB127_2 +; GCN1-NEXT: .LBB127_3: ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN2-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN2-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN2-NEXT: s_cbranch_execz .LBB127_3 +; GCN2-NEXT: ; %bb.1: ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GCN2-NEXT: s_load_dword s4, s[4:5], 0xf4 -; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s11 -; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s1, s4 -; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], -1 -; GCN2-NEXT: s_cbranch_vccnz .LBB127_3 -; GCN2-NEXT: ; %bb.1: ; %Flow5 -; GCN2-NEXT: s_and_b64 vcc, exec, s[4:5] -; GCN2-NEXT: s_cbranch_vccnz 
.LBB127_6 -; GCN2-NEXT: .LBB127_2: ; %atomicrmw.phi -; GCN2-NEXT: s_endpgm -; GCN2-NEXT: .LBB127_3: ; %atomicrmw.global +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s3 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 -; GCN2-NEXT: .LBB127_4: ; %atomicrmw.start +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: .LBB127_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB127_4 -; GCN2-NEXT: ; %bb.5: ; %Flow -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_branch .LBB127_2 -; GCN2-NEXT: .LBB127_6: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN2-NEXT: s_cselect_b32 s0, s0, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: s_add_i32 s0, s0, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s0 -; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen -; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s2 -; GCN2-NEXT: v_mov_b32_e32 v4, s3 -; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1] -; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GCN2-NEXT: buffer_store_dword 
v0, v2, s[88:91], 0 offen -; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_cbranch_execnz .LBB127_2 +; GCN2-NEXT: .LBB127_3: ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN3-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN3-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN3-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN3-NEXT: s_cbranch_execz .LBB127_3 +; GCN3-NEXT: ; %bb.1: ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GCN3-NEXT: s_mov_b32 s14, -1 -; GCN3-NEXT: s_mov_b32 s15, 0xe00000 -; GCN3-NEXT: s_add_u32 s12, s12, s11 -; GCN3-NEXT: s_addc_u32 s13, s13, 0 -; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base -; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_cmp_eq_u32 s1, s5 -; GCN3-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GCN3-NEXT: s_mov_b64 s[4:5], -1 -; GCN3-NEXT: s_cbranch_vccnz .LBB127_3 -; GCN3-NEXT: ; %bb.1: ; %Flow5 -; GCN3-NEXT: s_and_b64 vcc, exec, s[4:5] -; GCN3-NEXT: s_cbranch_vccnz .LBB127_6 -; GCN3-NEXT: .LBB127_2: ; %atomicrmw.phi -; GCN3-NEXT: s_endpgm -; GCN3-NEXT: .LBB127_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s3 -; GCN3-NEXT: v_mov_b32_e32 v7, s2 -; GCN3-NEXT: .LBB127_4: ; %atomicrmw.start +; GCN3-NEXT: v_mov_b32_e32 v4, 0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GCN3-NEXT: v_mov_b32_e32 v5, s3 +; GCN3-NEXT: v_mov_b32_e32 v6, s2 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: .LBB127_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 
vcc, s[2:3], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB127_4 -; GCN3-NEXT: ; %bb.5: ; %Flow -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_branch .LBB127_2 -; GCN3-NEXT: .LBB127_6: ; %atomicrmw.private -; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN3-NEXT: s_cselect_b32 s0, s0, -1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN3-NEXT: buffer_load_dword v1, v2, s[12:15], 0 offen offset:4 -; GCN3-NEXT: v_mov_b32_e32 v4, s2 -; GCN3-NEXT: v_mov_b32_e32 v3, s3 -; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1] -; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GCN3-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen -; GCN3-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen offset:4 +; GCN3-NEXT: s_cbranch_execnz .LBB127_2 +; GCN3-NEXT: .LBB127_3: ; GCN3-NEXT: s_endpgm entry: %tmp0 = atomicrmw min ptr %out, i64 %in seq_cst @@ -19857,205 +19319,183 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s18, -1 -; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 -; GCN1-NEXT: s_add_u32 
s16, s16, s11 -; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 -; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 -; GCN1-NEXT: s_addc_u32 s17, s17, 0 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; GCN1-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN1-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GCN1-NEXT: s_cbranch_execz .LBB128_4 +; GCN1-NEXT: ; %bb.1: ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GCN1-NEXT: s_add_u32 s0, s8, s0 -; GCN1-NEXT: s_addc_u32 s1, s9, s1 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_cbranch_vccz .LBB128_4 -; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s13 -; GCN1-NEXT: v_mov_b32_e32 v5, s12 +; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s8, s0, s6 +; GCN1-NEXT: s_addc_u32 s9, s1, s7 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s5 +; GCN1-NEXT: v_mov_b32_e32 v5, s4 +; GCN1-NEXT: s_mov_b32 s11, 0xf000 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: s_mov_b32 s10, -1 ; GCN1-NEXT: .LBB128_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 ; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: 
v_cmp_ge_i64_e64 s[0:1], s[4:5], v[8:9] +; GCN1-NEXT: v_cndmask_b32_e64 v6, v5, v8, s[0:1] +; GCN1-NEXT: v_cndmask_b32_e64 v7, v4, v9, s[0:1] +; GCN1-NEXT: v_mov_b32_e32 v0, v6 +; GCN1-NEXT: v_mov_b32_e32 v1, v7 +; GCN1-NEXT: v_mov_b32_e32 v2, v8 +; GCN1-NEXT: v_mov_b32_e32 v3, v9 +; GCN1-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GCN1-NEXT: s_cbranch_execnz .LBB128_2 ; GCN1-NEXT: ; %bb.3: ; %Flow -; GCN1-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN1-NEXT: s_branch .LBB128_6 -; GCN1-NEXT: .LBB128_4: -; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: s_cbranch_execz .LBB128_6 -; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s12 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: .LBB128_4: ; %Flow5 +; GCN1-NEXT: s_or_b64 exec, exec, s[12:13] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_readfirstlane_b32 s1, v1 +; GCN1-NEXT: v_readfirstlane_b32 s0, v0 +; GCN1-NEXT: v_mov_b32_e32 v0, s5 +; GCN1-NEXT: v_bfrev_b32_e32 v1, -2 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v2, s1 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: s_add_i32 s0, s0, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s13 -; 
GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[0:1] -; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen -; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen -; GCN1-NEXT: .LBB128_6: ; %atomicrmw.phi -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 -; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s11, 0xf000 +; GCN1-NEXT: s_mov_b32 s10, -1 +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s11 -; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 -; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; GCN2-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN2-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GCN2-NEXT: s_cbranch_execz .LBB128_4 +; GCN2-NEXT: ; %bb.1: ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GCN2-NEXT: s_add_u32 s0, s8, s0 -; GCN2-NEXT: s_addc_u32 s1, s9, s1 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_cbranch_vccz .LBB128_4 -; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s6 +; GCN2-NEXT: s_addc_u32 s1, s1, s7 +; GCN2-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x0 ; GCN2-NEXT: 
v_mov_b32_e32 v3, s1 +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s5 +; GCN2-NEXT: v_mov_b32_e32 v5, s4 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s10 +; GCN2-NEXT: v_mov_b32_e32 v1, s11 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s13 -; GCN2-NEXT: v_mov_b32_e32 v5, s12 ; GCN2-NEXT: .LBB128_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 ; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GCN2-NEXT: v_cmp_ge_i64_e64 s[0:1], s[4:5], v[8:9] +; GCN2-NEXT: v_cndmask_b32_e64 v7, v4, v9, s[0:1] +; GCN2-NEXT: v_cndmask_b32_e64 v6, v5, v8, s[0:1] ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GCN2-NEXT: s_cbranch_execnz .LBB128_2 ; GCN2-NEXT: ; %bb.3: ; %Flow -; GCN2-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN2-NEXT: s_branch .LBB128_6 -; GCN2-NEXT: .LBB128_4: -; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: s_cbranch_execz .LBB128_6 -; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN2-NEXT: s_cselect_b32 s0, s0, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: s_add_i32 s0, s0, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s0 -; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen -; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: 
v_mov_b32_e32 v5, s12 -; GCN2-NEXT: v_mov_b32_e32 v4, s13 -; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[0:1] -; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen -; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen -; GCN2-NEXT: .LBB128_6: ; %atomicrmw.phi -; GCN2-NEXT: v_mov_b32_e32 v2, s10 -; GCN2-NEXT: v_mov_b32_e32 v3, s11 +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: .LBB128_4: ; %Flow5 +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_readfirstlane_b32 s1, v1 +; GCN2-NEXT: v_readfirstlane_b32 s0, v0 +; GCN2-NEXT: v_mov_b32_e32 v0, s5 +; GCN2-NEXT: v_bfrev_b32_e32 v1, -2 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i64_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 -; GCN3-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; GCN3-NEXT: s_mov_b32 s18, -1 -; GCN3-NEXT: s_mov_b32 s19, 0xe00000 -; GCN3-NEXT: s_add_u32 s16, s16, s11 ; GCN3-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GCN3-NEXT: s_addc_u32 s17, s17, 0 -; GCN3-NEXT: s_mov_b64 s[2:3], src_private_base +; GCN3-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GCN3-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN3-NEXT: s_cbranch_execz .LBB128_4 +; GCN3-NEXT: ; %bb.1: ; GCN3-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GCN3-NEXT: s_add_u32 s0, s8, s0 -; GCN3-NEXT: s_addc_u32 s1, s9, s1 -; GCN3-NEXT: s_cmp_eq_u32 s1, s3 -; GCN3-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN3-NEXT: s_cbranch_vccz .LBB128_4 -; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s13 -; GCN3-NEXT: v_mov_b32_e32 v5, s12 +; GCN3-NEXT: s_add_u32 s4, s8, s0 +; GCN3-NEXT: s_addc_u32 s5, s9, s1 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s13 +; GCN3-NEXT: v_mov_b32_e32 v3, s12 +; GCN3-NEXT: v_mov_b32_e32 v4, 0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: .LBB128_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v8, v1 +; GCN3-NEXT: v_mov_b32_e32 v7, v0 +; GCN3-NEXT: v_cmp_ge_i64_e64 s[0:1], s[12:13], v[7:8] +; GCN3-NEXT: v_cndmask_b32_e64 v6, v2, v8, s[0:1] +; GCN3-NEXT: v_cndmask_b32_e64 v5, v3, v7, s[0:1] +; GCN3-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[4:5] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[7:8] +; GCN3-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GCN3-NEXT: 
s_andn2_b64 exec, exec, s[6:7] ; GCN3-NEXT: s_cbranch_execnz .LBB128_2 ; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: .LBB128_4: ; %Flow5 ; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN3-NEXT: s_branch .LBB128_6 -; GCN3-NEXT: .LBB128_4: -; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: s_cbranch_execz .LBB128_6 -; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private -; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN3-NEXT: s_cselect_b32 s0, s0, -1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen -; GCN3-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen offset:4 -; GCN3-NEXT: v_mov_b32_e32 v4, s12 -; GCN3-NEXT: v_mov_b32_e32 v3, s13 -; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[0:1] -; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GCN3-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen -; GCN3-NEXT: buffer_store_dword v3, v2, s[16:19], 0 offen offset:4 -; GCN3-NEXT: .LBB128_6: ; %atomicrmw.phi -; GCN3-NEXT: v_mov_b32_e32 v2, s10 -; GCN3-NEXT: v_mov_b32_e32 v3, s11 -; GCN3-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: v_readfirstlane_b32 s1, v1 +; GCN3-NEXT: v_readfirstlane_b32 s0, v0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v0, s13 +; GCN3-NEXT: v_bfrev_b32_e32 v1, -2 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s12 +; GCN3-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN3-NEXT: v_mov_b32_e32 v3, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, 0 +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GCN3-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll 
b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll index fe47461ebf956..8cf333b51f42d 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll @@ -4258,102 +4258,127 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_max_i64_addr64_offset: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7-NEXT: s_cbranch_execz .LBB88_3 +; GFX7-NEXT: ; %bb.1: ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s4 -; GFX7-NEXT: s_addc_u32 s1, s1, s5 -; GFX7-NEXT: s_add_u32 s0, s0, 32 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_add_u32 s4, s0, s4 +; GFX7-NEXT: s_addc_u32 s5, s1, s5 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 -; GFX7-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v4, s3 +; GFX7-NEXT: v_mov_b32_e32 v5, s2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: v_mov_b32_e32 v3, s9 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: .LBB88_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX7-NEXT: 
flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, v6 ; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7-NEXT: s_cbranch_execnz .LBB88_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_cbranch_execnz .LBB88_2 +; GFX7-NEXT: .LBB88_3: ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_max_i64_addr64_offset: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_cbranch_execz .LBB88_3 +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s4 -; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 -; GFX8-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: .LBB88_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_cbranch_execnz .LBB88_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB88_2 +; GFX8-NEXT: .LBB88_3: ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_cbranch_execz .LBB88_3 +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_mov_b32_e32 v7, s2 -; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX9-NEXT: 
s_load_dwordx2 s[6:7], s[0:1], 0x20 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: .LBB88_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB88_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB88_2 +; GFX9-NEXT: .LBB88_3: ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -4366,110 +4391,184 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GFX7-NEXT: s_cbranch_execz .LBB89_4 +; GFX7-NEXT: ; %bb.1: ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshl_b64 s[6:7], 
s[6:7], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s6 -; GFX7-NEXT: s_addc_u32 s1, s1, s7 -; GFX7-NEXT: s_add_u32 s0, s0, 32 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_add_u32 s8, s0, s6 +; GFX7-NEXT: s_addc_u32 s9, s1, s7 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: .LBB89_1: ; %atomicrmw.start +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: .LBB89_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7-NEXT: s_cbranch_execnz .LBB89_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v5, v8, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v4, v9, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, v6 +; GFX7-NEXT: v_mov_b32_e32 v1, v7 +; GFX7-NEXT: v_mov_b32_e32 v2, v8 +; GFX7-NEXT: v_mov_b32_e32 v3, v9 +; GFX7-NEXT: 
buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 offset:32 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB89_2 +; GFX7-NEXT: ; %bb.3: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: .LBB89_4: ; %Flow6 +; GFX7-NEXT: s_or_b64 exec, exec, s[12:13] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s8, s2 +; GFX7-NEXT: s_mov_b32 s9, s3 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX8-NEXT: s_cbranch_execz .LBB89_4 +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s6 ; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x20 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], 
v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s11 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: .LBB89_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: .LBB89_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_cbranch_execnz .LBB89_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v4, v9, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v5, v8, s[0:1] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB89_2 +; GFX8-NEXT: ; %bb.3: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: .LBB89_4: ; %Flow5 +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s1, 
v1 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB89_4 +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GFX9-NEXT: s_add_u32 s0, s8, s0 -; GFX9-NEXT: s_addc_u32 s1, s9, s1 +; GFX9-NEXT: s_add_u32 s4, s8, s0 +; GFX9-NEXT: s_addc_u32 s5, s9, s1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20 +; GFX9-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: v_mov_b32_e32 v5, s12 -; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB89_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] -; 
GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB89_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[12:13], v[7:8] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v2, v8, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v3, v7, s[0:1] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[4:5] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[7:8] +; GFX9-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_cbranch_execnz .LBB89_2 +; GFX9-NEXT: ; %bb.3: ; %Flow +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: .LBB89_4: ; %Flow6 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm entry: %ptr = 
getelementptr i64, ptr %out, i64 %index @@ -4482,98 +4581,125 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_max_i64_addr64: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7-NEXT: s_cbranch_execz .LBB90_3 +; GFX7-NEXT: ; %bb.1: ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s4 -; GFX7-NEXT: s_addc_u32 s1, s1, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_add_u32 s4, s0, s4 +; GFX7-NEXT: s_addc_u32 s5, s1, s5 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 -; GFX7-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v4, s3 +; GFX7-NEXT: v_mov_b32_e32 v5, s2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: v_mov_b32_e32 v3, s9 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: .LBB90_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: 
buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, v6 ; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7-NEXT: s_cbranch_execnz .LBB90_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_cbranch_execnz .LBB90_2 +; GFX7-NEXT: .LBB90_3: ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_max_i64_addr64: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_cbranch_execz .LBB90_3 +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s4 -; GFX8-NEXT: s_addc_u32 s1, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_add_u32 s4, s0, s4 +; GFX8-NEXT: s_addc_u32 s5, s1, s5 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 -; GFX8-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .LBB90_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, 
vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_cbranch_execnz .LBB90_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_cbranch_execnz .LBB90_2 +; GFX8-NEXT: .LBB90_3: ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i64_addr64: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_cbranch_execz .LBB90_3 +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_mov_b32_e32 v7, s2 -; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: .LBB90_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] 
glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB90_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB90_2 +; GFX9-NEXT: .LBB90_3: ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -4585,106 +4711,182 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7-LABEL: atomic_max_i64_ret_addr64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GFX7-NEXT: s_cbranch_execz .LBB91_4 +; GFX7-NEXT: ; %bb.1: ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s6 -; GFX7-NEXT: s_addc_u32 s1, s1, s7 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_add_u32 s8, s0, s6 +; GFX7-NEXT: s_addc_u32 s9, s1, s7 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: .LBB91_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7-NEXT: s_cbranch_execnz .LBB91_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v5, v8, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v4, v9, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, v6 +; GFX7-NEXT: v_mov_b32_e32 v1, v7 +; GFX7-NEXT: v_mov_b32_e32 v2, v8 +; GFX7-NEXT: v_mov_b32_e32 v3, v9 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB91_2 +; GFX7-NEXT: ; %bb.3: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: .LBB91_4: ; %Flow5 +; GFX7-NEXT: s_or_b64 exec, exec, s[12:13] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; 
GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s8, s2 +; GFX7-NEXT: s_mov_b32 s9, s3 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_max_i64_ret_addr64: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX8-NEXT: s_cbranch_execz .LBB91_4 +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s6 ; GFX8-NEXT: s_addc_u32 s1, s1, s7 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: .LBB91_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc 
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_cbranch_execnz .LBB91_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v4, v9, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v5, v8, s[0:1] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB91_2 +; GFX8-NEXT: ; %bb.3: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: .LBB91_4: ; %Flow5 +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; 
GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB91_4 +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GFX9-NEXT: s_add_u32 s0, s8, s0 -; GFX9-NEXT: s_addc_u32 s1, s9, s1 +; GFX9-NEXT: s_add_u32 s4, s8, s0 +; GFX9-NEXT: s_addc_u32 s5, s9, s1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: v_mov_b32_e32 v5, s12 -; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB91_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB91_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[12:13], v[7:8] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v2, v8, s[0:1] 
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v3, v7, s[0:1] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[7:8] +; GFX9-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_cbranch_execnz .LBB91_2 +; GFX9-NEXT: ; %bb.3: ; %Flow +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: .LBB91_4: ; %Flow5 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -5640,102 +5842,127 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_umax_i64_addr64_offset: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7-NEXT: s_cbranch_execz .LBB102_3 +; GFX7-NEXT: ; %bb.1: ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s4 -; GFX7-NEXT: s_addc_u32 s1, s1, s5 -; 
GFX7-NEXT: s_add_u32 s0, s0, 32 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_add_u32 s4, s0, s4 +; GFX7-NEXT: s_addc_u32 s5, s1, s5 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 -; GFX7-NEXT: .LBB102_1: ; %atomicrmw.start +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v4, s3 +; GFX7-NEXT: v_mov_b32_e32 v5, s2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: v_mov_b32_e32 v3, s9 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: .LBB102_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, v6 ; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7-NEXT: s_cbranch_execnz .LBB102_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_cbranch_execnz .LBB102_2 +; GFX7-NEXT: .LBB102_3: ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: 
atomic_umax_i64_addr64_offset: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_cbranch_execz .LBB102_3 +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s4 -; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 -; GFX8-NEXT: .LBB102_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: .LBB102_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_cbranch_execnz .LBB102_1 -; GFX8-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB102_2 +; GFX8-NEXT: .LBB102_3: ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_cbranch_execz .LBB102_3 +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_mov_b32_e32 v7, s2 -; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: .LBB102_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 
v3, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB102_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB102_2 +; GFX9-NEXT: .LBB102_3: ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -5748,110 +5975,181 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX7-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GFX7-NEXT: s_cbranch_execz .LBB103_4 +; GFX7-NEXT: ; %bb.1: ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s6 -; GFX7-NEXT: s_addc_u32 s1, s1, s7 -; GFX7-NEXT: s_add_u32 s0, s0, 32 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_add_u32 s8, s0, s6 +; GFX7-NEXT: s_addc_u32 s9, s1, s7 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: .LBB103_1: ; %atomicrmw.start +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: .LBB103_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: 
v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7-NEXT: s_cbranch_execnz .LBB103_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v5, v8, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v4, v9, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, v6 +; GFX7-NEXT: v_mov_b32_e32 v1, v7 +; GFX7-NEXT: v_mov_b32_e32 v2, v8 +; GFX7-NEXT: v_mov_b32_e32 v3, v9 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 offset:32 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB103_2 +; GFX7-NEXT: ; %bb.3: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: .LBB103_4: ; %Flow6 +; GFX7-NEXT: s_or_b64 exec, exec, s[12:13] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, 0, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: 
s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s8, s2 +; GFX7-NEXT: s_mov_b32 s9, s3 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX8-NEXT: s_cbranch_execz .LBB103_4 +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s6 ; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x20 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s11 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: .LBB103_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: .LBB103_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] 
-; GFX8-NEXT: s_cbranch_execnz .LBB103_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v4, v9, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v5, v8, s[0:1] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB103_2 +; GFX8-NEXT: ; %bb.3: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: .LBB103_4: ; %Flow5 +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v0, 0, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB103_4 +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GFX9-NEXT: s_add_u32 s0, s8, s0 -; GFX9-NEXT: s_addc_u32 s1, s9, s1 +; GFX9-NEXT: s_add_u32 s4, s8, s0 +; GFX9-NEXT: s_addc_u32 s5, s9, s1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20 +; GFX9-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: v_mov_b32_e32 v5, s12 -; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB103_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB103_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_lt_u64_e64 s[0:1], s[12:13], v[7:8] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v2, v8, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v3, v7, s[0:1] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[4:5] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: 
v_cmp_eq_u64_e64 s[0:1], v[0:1], v[7:8] +; GFX9-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_cbranch_execnz .LBB103_2 +; GFX9-NEXT: ; %bb.3: ; %Flow +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: .LBB103_4: ; %Flow6 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v0, 0, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -5865,106 +6163,179 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX7-LABEL: atomic_umax_i64_ret_addr64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GFX7-NEXT: s_cbranch_execz .LBB104_4 +; GFX7-NEXT: ; %bb.1: ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s6 -; GFX7-NEXT: s_addc_u32 s1, s1, s7 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_add_u32 s8, s0, s6 +; GFX7-NEXT: s_addc_u32 s9, s1, s7 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; 
GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: .LBB104_1: ; %atomicrmw.start +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: .LBB104_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7-NEXT: s_cbranch_execnz .LBB104_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v5, v8, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v4, v9, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, v6 +; GFX7-NEXT: v_mov_b32_e32 v1, v7 +; GFX7-NEXT: v_mov_b32_e32 v2, v8 +; GFX7-NEXT: v_mov_b32_e32 v3, v9 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB104_2 +; GFX7-NEXT: ; %bb.3: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: .LBB104_4: ; %Flow5 +; GFX7-NEXT: s_or_b64 exec, exec, s[12:13] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, 0, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s8, s2 +; GFX7-NEXT: s_mov_b32 s9, s3 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umax_i64_ret_addr64: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX8-NEXT: s_cbranch_execz .LBB104_4 +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s6 ; GFX8-NEXT: s_addc_u32 s1, s1, s7 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: .LBB104_1: ; %atomicrmw.start +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: .LBB104_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, 
s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_cbranch_execnz .LBB104_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v4, v9, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v5, v8, s[0:1] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB104_2 +; GFX8-NEXT: ; %bb.3: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: .LBB104_4: ; %Flow5 +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v0, 0, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: 
atomic_umax_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB104_4 +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GFX9-NEXT: s_add_u32 s0, s8, s0 -; GFX9-NEXT: s_addc_u32 s1, s9, s1 +; GFX9-NEXT: s_add_u32 s4, s8, s0 +; GFX9-NEXT: s_addc_u32 s5, s9, s1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: v_mov_b32_e32 v5, s12 -; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB104_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB104_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: 
v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_lt_u64_e64 s[0:1], s[12:13], v[7:8] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v2, v8, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v3, v7, s[0:1] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[7:8] +; GFX9-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_cbranch_execnz .LBB104_2 +; GFX9-NEXT: ; %bb.3: ; %Flow +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: .LBB104_4: ; %Flow5 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v0, 0, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -7864,102 +8235,127 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_min_i64_addr64_offset: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7-NEXT: s_cbranch_execz .LBB125_3 +; GFX7-NEXT: ; %bb.1: ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s4 -; GFX7-NEXT: s_addc_u32 s1, s1, s5 -; GFX7-NEXT: s_add_u32 s0, s0, 32 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_add_u32 s4, s0, s4 +; GFX7-NEXT: s_addc_u32 s5, s1, s5 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 -; GFX7-NEXT: .LBB125_1: ; %atomicrmw.start +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v4, s3 +; GFX7-NEXT: v_mov_b32_e32 v5, s2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: v_mov_b32_e32 v3, s9 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: .LBB125_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, v6 ; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7-NEXT: s_cbranch_execnz .LBB125_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX7-NEXT: s_cbranch_execnz .LBB125_2 +; GFX7-NEXT: .LBB125_3: ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_min_i64_addr64_offset: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_cbranch_execz .LBB125_3 +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s4 -; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 -; GFX8-NEXT: .LBB125_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: .LBB125_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: 
s_andn2_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_cbranch_execnz .LBB125_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB125_2 +; GFX8-NEXT: .LBB125_3: ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_cbranch_execz .LBB125_3 +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_mov_b32_e32 v7, s2 -; GFX9-NEXT: .LBB125_1: ; %atomicrmw.start +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: .LBB125_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: 
buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB125_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB125_2 +; GFX9-NEXT: .LBB125_3: ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -7972,110 +8368,184 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GFX7-NEXT: s_cbranch_execz .LBB126_4 +; GFX7-NEXT: ; %bb.1: ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s6 -; GFX7-NEXT: s_addc_u32 s1, s1, s7 -; GFX7-NEXT: s_add_u32 s0, s0, 32 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_add_u32 s8, s0, s6 +; GFX7-NEXT: s_addc_u32 s9, s1, s7 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: .LBB126_1: ; %atomicrmw.start +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: .LBB126_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7-NEXT: s_cbranch_execnz .LBB126_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_ge_i64_e64 s[0:1], s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v5, v8, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v4, v9, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, v6 +; GFX7-NEXT: v_mov_b32_e32 v1, v7 +; GFX7-NEXT: v_mov_b32_e32 v2, v8 +; GFX7-NEXT: v_mov_b32_e32 v3, v9 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 offset:32 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB126_2 +; GFX7-NEXT: ; %bb.3: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: .LBB126_4: ; %Flow6 +; GFX7-NEXT: s_or_b64 exec, exec, s[12:13] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, 
s1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s8, s2 +; GFX7-NEXT: s_mov_b32 s9, s3 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX8-NEXT: s_cbranch_execz .LBB126_4 +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s6 ; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x20 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s11 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: .LBB126_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: .LBB126_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol -; 
GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_cbranch_execnz .LBB126_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_ge_i64_e64 s[0:1], s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v4, v9, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v5, v8, s[0:1] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB126_2 +; GFX8-NEXT: ; %bb.3: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: .LBB126_4: ; %Flow5 +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 
vcc, 0, v0 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB126_4 +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GFX9-NEXT: s_add_u32 s0, s8, s0 -; GFX9-NEXT: s_addc_u32 s1, s9, s1 +; GFX9-NEXT: s_add_u32 s4, s8, s0 +; GFX9-NEXT: s_addc_u32 s5, s9, s1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20 +; GFX9-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: v_mov_b32_e32 v5, s12 -; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB126_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB126_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_ge_i64_e64 s[0:1], s[12:13], v[7:8] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v2, v8, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v3, v7, s[0:1] 
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[4:5] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[7:8] +; GFX9-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_cbranch_execnz .LBB126_2 +; GFX9-NEXT: ; %bb.3: ; %Flow +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: .LBB126_4: ; %Flow6 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -8088,92 +8558,115 @@ entry: define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_min_i64: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7-NEXT: s_cbranch_execz .LBB127_3 +; GFX7-NEXT: ; %bb.1: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; 
GFX7-NEXT: v_mov_b32_e32 v7, s2 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: .LBB127_1: ; %atomicrmw.start +; GFX7-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x0 +; GFX7-NEXT: s_mov_b32 s4, s0 +; GFX7-NEXT: s_mov_b32 s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s3 +; GFX7-NEXT: v_mov_b32_e32 v5, s2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-NEXT: v_mov_b32_e32 v3, s11 +; GFX7-NEXT: .LBB127_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB127_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, v6 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB127_2 +; GFX7-NEXT: .LBB127_3: ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_min_i64: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: 
s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_cbranch_execz .LBB127_3 +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: .LBB127_1: ; %atomicrmw.start +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: .LBB127_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB127_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_cbranch_execnz .LBB127_2 +; GFX8-NEXT: .LBB127_3: ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i64: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_cbranch_execz .LBB127_3 +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: 
v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_mov_b32_e32 v7, s2 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: .LBB127_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB127_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_cbranch_execnz .LBB127_2 +; GFX9-NEXT: .LBB127_3: ; GFX9-NEXT: s_endpgm entry: %tmp0 = atomicrmw min ptr %out, i64 %in seq_cst, !noalias.addrspace !1 @@ -8184,106 +8677,182 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7-LABEL: atomic_min_i64_ret_addr64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: s_and_saveexec_b64 
s[12:13], vcc +; GFX7-NEXT: s_cbranch_execz .LBB128_4 +; GFX7-NEXT: ; %bb.1: ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s6 -; GFX7-NEXT: s_addc_u32 s1, s1, s7 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_add_u32 s8, s0, s6 +; GFX7-NEXT: s_addc_u32 s9, s1, s7 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: .LBB128_1: ; %atomicrmw.start +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: .LBB128_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7-NEXT: s_cbranch_execnz .LBB128_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_ge_i64_e64 s[0:1], s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v5, v8, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v4, v9, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, v6 +; GFX7-NEXT: v_mov_b32_e32 v1, v7 +; GFX7-NEXT: 
v_mov_b32_e32 v2, v8 +; GFX7-NEXT: v_mov_b32_e32 v3, v9 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB128_2 +; GFX7-NEXT: ; %bb.3: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: .LBB128_4: ; %Flow5 +; GFX7-NEXT: s_or_b64 exec, exec, s[12:13] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s8, s2 +; GFX7-NEXT: s_mov_b32 s9, s3 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_min_i64_ret_addr64: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX8-NEXT: s_cbranch_execz .LBB128_4 +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s6 ; GFX8-NEXT: s_addc_u32 s1, s1, s7 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: 
s_load_dwordx2 s[10:11], s[0:1], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: .LBB128_1: ; %atomicrmw.start +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: .LBB128_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_cbranch_execnz .LBB128_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_ge_i64_e64 s[0:1], s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v4, v9, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v5, v8, s[0:1] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB128_2 +; GFX8-NEXT: ; %bb.3: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: .LBB128_4: ; %Flow5 +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s1, 
v1 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB128_4 +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GFX9-NEXT: s_add_u32 s0, s8, s0 -; GFX9-NEXT: s_addc_u32 s1, s9, s1 +; GFX9-NEXT: s_add_u32 s4, s8, s0 +; GFX9-NEXT: s_addc_u32 s5, s9, s1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: v_mov_b32_e32 v5, s12 -; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB128_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] -; GFX9-NEXT: 
v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB128_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_ge_i64_e64 s[0:1], s[12:13], v[7:8] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v2, v8, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v3, v7, s[0:1] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e64 s[0:1], v[0:1], v[7:8] +; GFX9-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_cbranch_execnz .LBB128_2 +; GFX9-NEXT: ; %bb.3: ; %Flow +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: .LBB128_4: ; %Flow5 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, 
i64 %index diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll index 76056d7fa1b17..8529d4988be2f 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll @@ -8,27 +8,48 @@ declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> % define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX942-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX942: ; %bb.0: -; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v2, 4.0 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], exec +; GFX942-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX942-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-NEXT: s_cbranch_execz .LBB0_2 +; GFX942-NEXT: ; %bb.1: +; GFX942-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX942-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX942-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 -; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_atomic_add_f32 v0, v1, s[2:3] sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 +; GFX942-NEXT: .LBB0_2: ; GFX942-NEXT: s_endpgm ; ; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v2, 4.0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX12-NEXT: s_cbranch_execz 
.LBB0_2 +; GFX12-NEXT: ; %bb.1: +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX12-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: .LBB0_2: ; GFX12-NEXT: s_endpgm %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0 ret void @@ -37,27 +58,48 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; GFX942-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: ; GFX942: ; %bb.0: -; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v2, 4.0 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], exec +; GFX942-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX942-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-NEXT: s_cbranch_execz .LBB1_2 +; GFX942-NEXT: ; %bb.1: +; GFX942-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX942-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX942-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 -; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_atomic_add_f32 v0, v1, s[2:3] sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 
sc1 +; GFX942-NEXT: .LBB1_2: ; GFX942-NEXT: s_endpgm ; ; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v2, 4.0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX12-NEXT: s_cbranch_execz .LBB1_2 +; GFX12-NEXT: ; %bb.1: +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX12-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: .LBB1_2: ; GFX12-NEXT: s_endpgm %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0 ret void diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index 873fceedd7b72..b37bb7fee664c 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -1256,12 +1256,12 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; 
GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_endpgm @@ -1269,12 +1269,12 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX942-LABEL: flat_atomic_fadd_f64_noret_pat: ; GFX942: ; %bb.0: ; %main_body ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] sc1 -; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: s_endpgm main_body: @@ -1287,23 +1287,23 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: flat_atomic_fadd_f64_noret_pat_agent: ; GFX942: ; %bb.0: ; %main_body ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 
v[2:3], s[0:1] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] -; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_endpgm main_body: @@ -1316,11 +1316,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol @@ -1329,11 +1329,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX942-LABEL: flat_atomic_fadd_f64_noret_pat_system: ; GFX942: ; %bb.0: ; %main_body ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] sc1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: s_endpgm @@ -1426,23 +1426,23 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GFX90A-NEXT: s_mov_b64 s[0:1], 0 +; 
GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm @@ -1450,12 +1450,12 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX942-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX942: ; %bb.0: ; %main_body ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] -; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) ; 
GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_endpgm main_body: diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll index ec80efc5f0362..65ae09b8d5bdf 100644 --- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll @@ -109,108 +109,102 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ret void } -define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 { +; FIXME: this function had amdgpu_kernel CC, but was removed as part of PR#137488. +; Need to add them back once we move infer address space pass to middle end. +define void @llvm_amdgcn_is_shared(ptr %ptr) #0 { ; GFX8V4-LABEL: llvm_amdgcn_is_shared: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x40 -; GFX8V4-NEXT: s_load_dword s1, s[8:9], 0x4 +; GFX8V4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8V4-NEXT: s_load_dword s4, s[6:7], 0x40 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 -; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX8V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX8V4-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1 +; GFX8V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8V4-NEXT: flat_store_dword v[0:1], v0 ; GFX8V4-NEXT: s_waitcnt vmcnt(0) -; GFX8V4-NEXT: s_endpgm +; GFX8V4-NEXT: s_setpc_b64 s[30:31] ; ; GFX8V5-LABEL: llvm_amdgcn_is_shared: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0xcc -; GFX8V5-NEXT: s_load_dword s1, s[8:9], 0x4 +; GFX8V5-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8V5-NEXT: s_mov_b64 s[4:5], 0xc4 +; GFX8V5-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 -; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX8V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX8V5-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1 +; GFX8V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8V5-NEXT: flat_store_dword 
v[0:1], v0 ; GFX8V5-NEXT: s_waitcnt vmcnt(0) -; GFX8V5-NEXT: s_endpgm +; GFX8V5-NEXT: s_setpc_b64 s[30:31] ; ; GFX9V4-LABEL: llvm_amdgcn_is_shared: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dword s2, s[8:9], 0x4 -; GFX9V4-NEXT: s_mov_b64 s[0:1], src_shared_base -; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V4-NEXT: s_cmp_eq_u32 s2, s1 -; GFX9V4-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9V4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9V4-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX9V4-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 +; GFX9V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9V4-NEXT: global_store_dword v[0:1], v0, off ; GFX9V4-NEXT: s_waitcnt vmcnt(0) -; GFX9V4-NEXT: s_endpgm +; GFX9V4-NEXT: s_setpc_b64 s[30:31] ; ; GFX9V5-LABEL: llvm_amdgcn_is_shared: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dword s2, s[8:9], 0x4 -; GFX9V5-NEXT: s_mov_b64 s[0:1], src_shared_base -; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V5-NEXT: s_cmp_eq_u32 s2, s1 -; GFX9V5-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9V5-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX9V5-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 +; GFX9V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9V5-NEXT: global_store_dword v[0:1], v0, off ; GFX9V5-NEXT: s_waitcnt vmcnt(0) -; GFX9V5-NEXT: s_endpgm +; GFX9V5-NEXT: s_setpc_b64 s[30:31] %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %ptr) %zext = zext i1 %is.shared to i32 store volatile i32 %zext, ptr addrspace(1) poison ret void } -define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 { +; FIXME: this function had amdgpu_kernel CC, but was removed as part of PR#137488. +; Need to add them back once we move infer address space pass to middle end. 
+define void @llvm_amdgcn_is_private(ptr %ptr) #0 { ; GFX8V4-LABEL: llvm_amdgcn_is_private: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x44 -; GFX8V4-NEXT: s_load_dword s1, s[8:9], 0x4 +; GFX8V4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8V4-NEXT: s_load_dword s4, s[6:7], 0x44 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 -; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX8V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX8V4-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1 +; GFX8V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8V4-NEXT: flat_store_dword v[0:1], v0 ; GFX8V4-NEXT: s_waitcnt vmcnt(0) -; GFX8V4-NEXT: s_endpgm +; GFX8V4-NEXT: s_setpc_b64 s[30:31] ; ; GFX8V5-LABEL: llvm_amdgcn_is_private: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0xc8 -; GFX8V5-NEXT: s_load_dword s1, s[8:9], 0x4 +; GFX8V5-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8V5-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8V5-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 -; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX8V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX8V5-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1 +; GFX8V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8V5-NEXT: flat_store_dword v[0:1], v0 ; GFX8V5-NEXT: s_waitcnt vmcnt(0) -; GFX8V5-NEXT: s_endpgm +; GFX8V5-NEXT: s_setpc_b64 s[30:31] ; ; GFX9V4-LABEL: llvm_amdgcn_is_private: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dword s2, s[8:9], 0x4 -; GFX9V4-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V4-NEXT: s_cmp_eq_u32 s2, s1 -; GFX9V4-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9V4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9V4-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX9V4-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 +; GFX9V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9V4-NEXT: global_store_dword v[0:1], v0, off ; GFX9V4-NEXT: s_waitcnt 
vmcnt(0) -; GFX9V4-NEXT: s_endpgm +; GFX9V4-NEXT: s_setpc_b64 s[30:31] ; ; GFX9V5-LABEL: llvm_amdgcn_is_private: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dword s2, s[8:9], 0x4 -; GFX9V5-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V5-NEXT: s_cmp_eq_u32 s2, s1 -; GFX9V5-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9V5-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9V5-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX9V5-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 +; GFX9V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9V5-NEXT: global_store_dword v[0:1], v0, off ; GFX9V5-NEXT: s_waitcnt vmcnt(0) -; GFX9V5-NEXT: s_endpgm +; GFX9V5-NEXT: s_setpc_b64 s[30:31] %is.private = call i1 @llvm.amdgcn.is.private(ptr %ptr) %zext = zext i1 %is.private to i32 store volatile i32 %zext, ptr addrspace(1) poison diff --git a/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll b/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll index 55a5d50f06bbd..f59b6065cd07f 100644 --- a/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll @@ -6,8 +6,6 @@ define amdgpu_kernel void @load_idx_idy(ptr addrspace(4) %disp, ptr %g) { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dword s6, s[4:5], 0x4 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_lshr_b32 s4, s6, 16 @@ -15,11 +13,9 @@ define amdgpu_kernel void @load_idx_idy(ptr addrspace(4) %disp, ptr %g) { ; CHECK-NEXT: s_lshl_b64 s[4:5], s[4:5], 6 ; CHECK-NEXT: s_add_u32 s0, s0, s4 ; CHECK-NEXT: s_addc_u32 s1, s1, s5 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:4 -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: global_load_ubyte v1, v0, s[0:1] offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; 
CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: global_store_byte v0, v1, s[2:3] ; CHECK-NEXT: s_endpgm entry: %disp1 = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() diff --git a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll index 7ffc2a6987742..25f442395f5f2 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll @@ -242,77 +242,60 @@ define amdgpu_kernel void @test_flat_misaligned_v2(ptr %arg) { ; SPLIT-LABEL: test_flat_misaligned_v2: ; SPLIT: ; %bb.0: ; %bb ; SPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; SPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SPLIT-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0) -; SPLIT-NEXT: v_add_co_u32 v0, s0, s0, v0 -; SPLIT-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; SPLIT-NEXT: v_add_co_u32 v2, vcc_lo, v0, 4 -; SPLIT-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo -; SPLIT-NEXT: s_clause 0x1 -; SPLIT-NEXT: flat_load_dword v4, v[2:3] -; SPLIT-NEXT: flat_load_dword v5, v[0:1] -; SPLIT-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; SPLIT-NEXT: flat_store_dword v[0:1], v4 -; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) -; SPLIT-NEXT: flat_store_dword v[2:3], v5 +; SPLIT-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] +; SPLIT-NEXT: s_waitcnt vmcnt(0) +; SPLIT-NEXT: v_mov_b32_e32 v2, v0 +; SPLIT-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1] ; SPLIT-NEXT: s_endpgm ; ; ALIGNED-GFX10-LABEL: test_flat_misaligned_v2: ; ALIGNED-GFX10: ; %bb.0: ; %bb ; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 -; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; ALIGNED-GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, 
v2 -; ALIGNED-GFX10-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; ALIGNED-GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] +; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0 +; ALIGNED-GFX10-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1] ; ALIGNED-GFX10-NEXT: s_endpgm ; ; UNALIGNED-GFX10-LABEL: test_flat_misaligned_v2: ; UNALIGNED-GFX10: ; %bb.0: ; %bb ; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 -; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; UNALIGNED-GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 -; UNALIGNED-GFX10-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; UNALIGNED-GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] +; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0 +; UNALIGNED-GFX10-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1] ; UNALIGNED-GFX10-NEXT: s_endpgm ; ; ALIGNED-GFX11-LABEL: test_flat_misaligned_v2: ; ALIGNED-GFX11: ; %bb.0: ; %bb ; ALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; ALIGNED-GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] -; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2 -; ALIGNED-GFX11-NEXT: flat_store_b64 v[0:1], v[3:4] +; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; 
ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX11-NEXT: global_load_b64 v[0:1], v3, s[0:1] +; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0 +; ALIGNED-GFX11-NEXT: global_store_b64 v3, v[1:2], s[0:1] ; ALIGNED-GFX11-NEXT: s_endpgm ; ; UNALIGNED-GFX11-LABEL: test_flat_misaligned_v2: ; UNALIGNED-GFX11: ; %bb.0: ; %bb ; UNALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; UNALIGNED-GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] -; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2 -; UNALIGNED-GFX11-NEXT: flat_store_b64 v[0:1], v[3:4] +; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX11-NEXT: global_load_b64 v[0:1], v3, s[0:1] +; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0 +; UNALIGNED-GFX11-NEXT: global_store_b64 v3, v[1:2], s[0:1] ; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -330,92 +313,68 @@ define amdgpu_kernel void @test_flat_misaligned_v4(ptr %arg) { ; SPLIT-LABEL: test_flat_misaligned_v4: ; SPLIT: ; %bb.0: ; %bb ; SPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; SPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SPLIT-NEXT: v_lshlrev_b32_e32 v7, 2, v0 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0) -; SPLIT-NEXT: v_add_co_u32 v0, s0, s0, v0 -; SPLIT-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; SPLIT-NEXT: v_add_co_u32 v2, vcc_lo, v0, 12 -; SPLIT-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo -; SPLIT-NEXT: v_add_co_u32 
v4, vcc_lo, v0, 4 -; SPLIT-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo -; SPLIT-NEXT: v_add_co_u32 v6, vcc_lo, v0, 8 -; SPLIT-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo -; SPLIT-NEXT: s_clause 0x3 -; SPLIT-NEXT: flat_load_dword v8, v[2:3] -; SPLIT-NEXT: flat_load_dword v9, v[4:5] -; SPLIT-NEXT: flat_load_dword v10, v[0:1] -; SPLIT-NEXT: flat_load_dword v11, v[6:7] -; SPLIT-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; SPLIT-NEXT: flat_store_dword v[6:7], v9 -; SPLIT-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2) -; SPLIT-NEXT: flat_store_dword v[2:3], v10 -; SPLIT-NEXT: flat_store_dword v[0:1], v8 -; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) -; SPLIT-NEXT: flat_store_dword v[4:5], v11 +; SPLIT-NEXT: global_load_dwordx4 v[0:3], v7, s[0:1] +; SPLIT-NEXT: s_waitcnt vmcnt(0) +; SPLIT-NEXT: v_mov_b32_e32 v4, v2 +; SPLIT-NEXT: v_mov_b32_e32 v5, v1 +; SPLIT-NEXT: v_mov_b32_e32 v6, v0 +; SPLIT-NEXT: global_store_dwordx4 v7, v[3:6], s[0:1] ; SPLIT-NEXT: s_endpgm ; ; ALIGNED-GFX10-LABEL: test_flat_misaligned_v4: ; ALIGNED-GFX10: ; %bb.0: ; %bb ; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v7, 2, v0 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_add_co_u32 v7, s0, s0, v0 -; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v8, s0, s1, 0, s0 -; ALIGNED-GFX10-NEXT: flat_load_dwordx4 v[0:3], v[7:8] -; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; ALIGNED-GFX10-NEXT: global_load_dwordx4 v[0:3], v7, s[0:1] +; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 ; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v5, v1 ; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v6, v0 -; ALIGNED-GFX10-NEXT: flat_store_dwordx4 v[7:8], v[3:6] +; ALIGNED-GFX10-NEXT: global_store_dwordx4 v7, v[3:6], s[0:1] ; ALIGNED-GFX10-NEXT: s_endpgm ; ; UNALIGNED-GFX10-LABEL: test_flat_misaligned_v4: ; UNALIGNED-GFX10: ; %bb.0: ; %bb ; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x24 -; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v7, 2, v0 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_add_co_u32 v7, s0, s0, v0 -; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v8, s0, s1, 0, s0 -; UNALIGNED-GFX10-NEXT: flat_load_dwordx4 v[0:3], v[7:8] -; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; UNALIGNED-GFX10-NEXT: global_load_dwordx4 v[0:3], v7, s[0:1] +; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 ; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v5, v1 ; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v6, v0 -; UNALIGNED-GFX10-NEXT: flat_store_dwordx4 v[7:8], v[3:6] +; UNALIGNED-GFX10-NEXT: global_store_dwordx4 v7, v[3:6], s[0:1] ; UNALIGNED-GFX10-NEXT: s_endpgm ; ; ALIGNED-GFX11-LABEL: test_flat_misaligned_v4: ; ALIGNED-GFX11: ; %bb.0: ; %bb ; ALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_add_co_u32 v7, s0, s0, v0 ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s1, 0, s0 -; ALIGNED-GFX11-NEXT: flat_load_b128 v[0:3], v[7:8] -; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v2 +; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v7, 2, v0 +; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX11-NEXT: global_load_b128 v[0:3], v7, s[0:1] +; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v5, v1 ; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v6, v0 -; ALIGNED-GFX11-NEXT: flat_store_b128 v[7:8], v[3:6] +; ALIGNED-GFX11-NEXT: global_store_b128 v7, v[3:6], s[0:1] ; ALIGNED-GFX11-NEXT: s_endpgm ; ; UNALIGNED-GFX11-LABEL: 
test_flat_misaligned_v4: ; UNALIGNED-GFX11: ; %bb.0: ; %bb ; UNALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_add_co_u32 v7, s0, s0, v0 ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s1, 0, s0 -; UNALIGNED-GFX11-NEXT: flat_load_b128 v[0:3], v[7:8] -; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v2 +; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v7, 2, v0 +; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX11-NEXT: global_load_b128 v[0:3], v7, s[0:1] +; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v5, v1 ; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v6, v0 -; UNALIGNED-GFX11-NEXT: flat_store_b128 v[7:8], v[3:6] +; UNALIGNED-GFX11-NEXT: global_store_b128 v7, v[3:6], s[0:1] ; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -437,84 +396,63 @@ define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) { ; SPLIT-LABEL: test_flat_misaligned_v3: ; SPLIT: ; %bb.0: ; %bb ; SPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; SPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SPLIT-NEXT: v_lshlrev_b32_e32 v5, 2, v0 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0) -; SPLIT-NEXT: v_add_co_u32 v0, s0, s0, v0 -; SPLIT-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; SPLIT-NEXT: v_add_co_u32 v2, vcc_lo, v0, 4 -; SPLIT-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo -; SPLIT-NEXT: v_add_co_u32 v4, vcc_lo, v0, 8 -; SPLIT-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo -; SPLIT-NEXT: s_clause 0x2 -; SPLIT-NEXT: flat_load_dword v6, v[2:3] -; SPLIT-NEXT: flat_load_dword v7, v[4:5] -; 
SPLIT-NEXT: flat_load_dword v8, v[0:1] -; SPLIT-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; SPLIT-NEXT: flat_store_dword v[4:5], v6 -; SPLIT-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2) -; SPLIT-NEXT: flat_store_dword v[0:1], v7 -; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2) -; SPLIT-NEXT: flat_store_dword v[2:3], v8 +; SPLIT-NEXT: global_load_dwordx3 v[0:2], v5, s[0:1] +; SPLIT-NEXT: s_waitcnt vmcnt(0) +; SPLIT-NEXT: v_mov_b32_e32 v3, v0 +; SPLIT-NEXT: v_mov_b32_e32 v4, v1 +; SPLIT-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; SPLIT-NEXT: s_endpgm ; ; ALIGNED-GFX10-LABEL: test_flat_misaligned_v3: ; ALIGNED-GFX10: ; %bb.0: ; %bb ; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v5, 2, v0 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_add_co_u32 v5, s0, s0, v0 -; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s1, 0, s0 -; ALIGNED-GFX10-NEXT: flat_load_dwordx3 v[0:2], v[5:6] -; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; ALIGNED-GFX10-NEXT: global_load_dwordx3 v[0:2], v5, s[0:1] +; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0 ; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1 -; ALIGNED-GFX10-NEXT: flat_store_dwordx3 v[5:6], v[2:4] +; ALIGNED-GFX10-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; ALIGNED-GFX10-NEXT: s_endpgm ; ; UNALIGNED-GFX10-LABEL: test_flat_misaligned_v3: ; UNALIGNED-GFX10: ; %bb.0: ; %bb ; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v5, 2, v0 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_add_co_u32 v5, s0, s0, v0 -; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s1, 0, s0 -; UNALIGNED-GFX10-NEXT: flat_load_dwordx3 v[0:2], v[5:6] -; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; UNALIGNED-GFX10-NEXT: global_load_dwordx3 v[0:2], v5, s[0:1] +; 
UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0 ; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1 -; UNALIGNED-GFX10-NEXT: flat_store_dwordx3 v[5:6], v[2:4] +; UNALIGNED-GFX10-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; UNALIGNED-GFX10-NEXT: s_endpgm ; ; ALIGNED-GFX11-LABEL: test_flat_misaligned_v3: ; ALIGNED-GFX11: ; %bb.0: ; %bb ; ALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_add_co_u32 v5, s0, s0, v0 ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s1, 0, s0 -; ALIGNED-GFX11-NEXT: flat_load_b96 v[0:2], v[5:6] -; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 -; ALIGNED-GFX11-NEXT: flat_store_b96 v[5:6], v[2:4] +; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v5, 2, v0 +; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX11-NEXT: global_load_b96 v[0:2], v5, s[0:1] +; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 +; ALIGNED-GFX11-NEXT: global_store_b96 v5, v[2:4], s[0:1] ; ALIGNED-GFX11-NEXT: s_endpgm ; ; UNALIGNED-GFX11-LABEL: test_flat_misaligned_v3: ; UNALIGNED-GFX11: ; %bb.0: ; %bb ; UNALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_add_co_u32 v5, s0, s0, v0 ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s1, 0, s0 -; 
UNALIGNED-GFX11-NEXT: flat_load_b96 v[0:2], v[5:6] -; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 -; UNALIGNED-GFX11-NEXT: flat_store_b96 v[5:6], v[2:4] +; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v5, 2, v0 +; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX11-NEXT: global_load_b96 v[0:2], v5, s[0:1] +; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 +; UNALIGNED-GFX11-NEXT: global_store_b96 v5, v[2:4], s[0:1] ; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -681,72 +619,60 @@ define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) { ; SPLIT-LABEL: test_flat_aligned_v2: ; SPLIT: ; %bb.0: ; %bb ; SPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; SPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SPLIT-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0) -; SPLIT-NEXT: v_add_co_u32 v0, s0, s0, v0 -; SPLIT-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; SPLIT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SPLIT-NEXT: v_mov_b32_e32 v4, v2 -; SPLIT-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; SPLIT-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] +; SPLIT-NEXT: s_waitcnt vmcnt(0) +; SPLIT-NEXT: v_mov_b32_e32 v2, v0 +; SPLIT-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1] ; SPLIT-NEXT: s_endpgm ; ; ALIGNED-GFX10-LABEL: test_flat_aligned_v2: ; ALIGNED-GFX10: ; %bb.0: ; %bb ; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 -; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; ALIGNED-GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 -; 
ALIGNED-GFX10-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; ALIGNED-GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] +; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0 +; ALIGNED-GFX10-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1] ; ALIGNED-GFX10-NEXT: s_endpgm ; ; UNALIGNED-GFX10-LABEL: test_flat_aligned_v2: ; UNALIGNED-GFX10: ; %bb.0: ; %bb ; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 -; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; UNALIGNED-GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 -; UNALIGNED-GFX10-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; UNALIGNED-GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] +; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0 +; UNALIGNED-GFX10-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1] ; UNALIGNED-GFX10-NEXT: s_endpgm ; ; ALIGNED-GFX11-LABEL: test_flat_aligned_v2: ; ALIGNED-GFX11: ; %bb.0: ; %bb ; ALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; ALIGNED-GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] -; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2 -; ALIGNED-GFX11-NEXT: flat_store_b64 v[0:1], v[3:4] +; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; ALIGNED-GFX11-NEXT: 
s_waitcnt lgkmcnt(0) +; ALIGNED-GFX11-NEXT: global_load_b64 v[0:1], v3, s[0:1] +; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0 +; ALIGNED-GFX11-NEXT: global_store_b64 v3, v[1:2], s[0:1] ; ALIGNED-GFX11-NEXT: s_endpgm ; ; UNALIGNED-GFX11-LABEL: test_flat_aligned_v2: ; UNALIGNED-GFX11: ; %bb.0: ; %bb ; UNALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; UNALIGNED-GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] -; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2 -; UNALIGNED-GFX11-NEXT: flat_store_b64 v[0:1], v[3:4] +; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX11-NEXT: global_load_b64 v[0:1], v3, s[0:1] +; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0 +; UNALIGNED-GFX11-NEXT: global_store_b64 v3, v[1:2], s[0:1] ; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -764,80 +690,68 @@ define amdgpu_kernel void @test_flat_aligned_v4(ptr %arg) { ; SPLIT-LABEL: test_flat_aligned_v4: ; SPLIT: ; %bb.0: ; %bb ; SPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; SPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SPLIT-NEXT: v_lshlrev_b32_e32 v7, 2, v0 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0) -; SPLIT-NEXT: v_add_co_u32 v7, s0, s0, v0 -; SPLIT-NEXT: v_add_co_ci_u32_e64 v8, s0, s1, 0, s0 -; SPLIT-NEXT: flat_load_dwordx4 v[0:3], v[7:8] -; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SPLIT-NEXT: global_load_dwordx4 v[0:3], v7, s[0:1] +; SPLIT-NEXT: 
s_waitcnt vmcnt(0) ; SPLIT-NEXT: v_mov_b32_e32 v4, v2 ; SPLIT-NEXT: v_mov_b32_e32 v5, v1 ; SPLIT-NEXT: v_mov_b32_e32 v6, v0 -; SPLIT-NEXT: flat_store_dwordx4 v[7:8], v[3:6] +; SPLIT-NEXT: global_store_dwordx4 v7, v[3:6], s[0:1] ; SPLIT-NEXT: s_endpgm ; ; ALIGNED-GFX10-LABEL: test_flat_aligned_v4: ; ALIGNED-GFX10: ; %bb.0: ; %bb ; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v7, 2, v0 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_add_co_u32 v7, s0, s0, v0 -; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v8, s0, s1, 0, s0 -; ALIGNED-GFX10-NEXT: flat_load_dwordx4 v[0:3], v[7:8] -; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; ALIGNED-GFX10-NEXT: global_load_dwordx4 v[0:3], v7, s[0:1] +; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 ; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v5, v1 ; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v6, v0 -; ALIGNED-GFX10-NEXT: flat_store_dwordx4 v[7:8], v[3:6] +; ALIGNED-GFX10-NEXT: global_store_dwordx4 v7, v[3:6], s[0:1] ; ALIGNED-GFX10-NEXT: s_endpgm ; ; UNALIGNED-GFX10-LABEL: test_flat_aligned_v4: ; UNALIGNED-GFX10: ; %bb.0: ; %bb ; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v7, 2, v0 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_add_co_u32 v7, s0, s0, v0 -; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v8, s0, s1, 0, s0 -; UNALIGNED-GFX10-NEXT: flat_load_dwordx4 v[0:3], v[7:8] -; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; UNALIGNED-GFX10-NEXT: global_load_dwordx4 v[0:3], v7, s[0:1] +; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 ; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v5, v1 ; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v6, v0 -; UNALIGNED-GFX10-NEXT: flat_store_dwordx4 v[7:8], v[3:6] +; UNALIGNED-GFX10-NEXT: 
global_store_dwordx4 v7, v[3:6], s[0:1] ; UNALIGNED-GFX10-NEXT: s_endpgm ; ; ALIGNED-GFX11-LABEL: test_flat_aligned_v4: ; ALIGNED-GFX11: ; %bb.0: ; %bb ; ALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_add_co_u32 v7, s0, s0, v0 ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s1, 0, s0 -; ALIGNED-GFX11-NEXT: flat_load_b128 v[0:3], v[7:8] -; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v2 +; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v7, 2, v0 +; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX11-NEXT: global_load_b128 v[0:3], v7, s[0:1] +; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v5, v1 ; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v6, v0 -; ALIGNED-GFX11-NEXT: flat_store_b128 v[7:8], v[3:6] +; ALIGNED-GFX11-NEXT: global_store_b128 v7, v[3:6], s[0:1] ; ALIGNED-GFX11-NEXT: s_endpgm ; ; UNALIGNED-GFX11-LABEL: test_flat_aligned_v4: ; UNALIGNED-GFX11: ; %bb.0: ; %bb ; UNALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_add_co_u32 v7, s0, s0, v0 ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s1, 0, s0 -; UNALIGNED-GFX11-NEXT: flat_load_b128 v[0:3], v[7:8] -; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v2 +; 
UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v7, 2, v0 +; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX11-NEXT: global_load_b128 v[0:3], v7, s[0:1] +; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v5, v1 ; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v6, v0 -; UNALIGNED-GFX11-NEXT: flat_store_b128 v[7:8], v[3:6] +; UNALIGNED-GFX11-NEXT: global_store_b128 v7, v[3:6], s[0:1] ; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -940,87 +854,68 @@ define amdgpu_kernel void @test_flat_v4_aligned8(ptr %arg) { ; SPLIT-LABEL: test_flat_v4_aligned8: ; SPLIT: ; %bb.0: ; %bb ; SPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; SPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SPLIT-NEXT: v_lshlrev_b32_e32 v7, 2, v0 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0) -; SPLIT-NEXT: v_add_co_u32 v0, s0, s0, v0 -; SPLIT-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; SPLIT-NEXT: v_add_co_u32 v2, vcc_lo, v0, 8 -; SPLIT-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo -; SPLIT-NEXT: s_clause 0x1 -; SPLIT-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; SPLIT-NEXT: flat_load_dwordx2 v[6:7], v[2:3] -; SPLIT-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; SPLIT-NEXT: v_mov_b32_e32 v8, v5 -; SPLIT-NEXT: v_mov_b32_e32 v9, v4 -; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SPLIT-NEXT: v_mov_b32_e32 v4, v7 -; SPLIT-NEXT: v_mov_b32_e32 v5, v6 -; SPLIT-NEXT: flat_store_dwordx2 v[2:3], v[8:9] -; SPLIT-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; SPLIT-NEXT: global_load_dwordx4 v[0:3], v7, s[0:1] +; SPLIT-NEXT: s_waitcnt vmcnt(0) +; SPLIT-NEXT: v_mov_b32_e32 v4, v2 +; SPLIT-NEXT: v_mov_b32_e32 v5, v1 +; SPLIT-NEXT: v_mov_b32_e32 v6, v0 +; SPLIT-NEXT: global_store_dwordx4 v7, v[3:6], s[0:1] ; SPLIT-NEXT: s_endpgm ; ; ALIGNED-GFX10-LABEL: test_flat_v4_aligned8: ; ALIGNED-GFX10: ; %bb.0: ; %bb ; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; ALIGNED-GFX10-NEXT: 
v_lshlrev_b32_e32 v7, 2, v0 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_add_co_u32 v7, s0, s0, v0 -; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v8, s0, s1, 0, s0 -; ALIGNED-GFX10-NEXT: flat_load_dwordx4 v[0:3], v[7:8] -; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; ALIGNED-GFX10-NEXT: global_load_dwordx4 v[0:3], v7, s[0:1] +; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 ; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v5, v1 ; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v6, v0 -; ALIGNED-GFX10-NEXT: flat_store_dwordx4 v[7:8], v[3:6] +; ALIGNED-GFX10-NEXT: global_store_dwordx4 v7, v[3:6], s[0:1] ; ALIGNED-GFX10-NEXT: s_endpgm ; ; UNALIGNED-GFX10-LABEL: test_flat_v4_aligned8: ; UNALIGNED-GFX10: ; %bb.0: ; %bb ; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v7, 2, v0 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_add_co_u32 v7, s0, s0, v0 -; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v8, s0, s1, 0, s0 -; UNALIGNED-GFX10-NEXT: flat_load_dwordx4 v[0:3], v[7:8] -; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; UNALIGNED-GFX10-NEXT: global_load_dwordx4 v[0:3], v7, s[0:1] +; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 ; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v5, v1 ; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v6, v0 -; UNALIGNED-GFX10-NEXT: flat_store_dwordx4 v[7:8], v[3:6] +; UNALIGNED-GFX10-NEXT: global_store_dwordx4 v7, v[3:6], s[0:1] ; UNALIGNED-GFX10-NEXT: s_endpgm ; ; ALIGNED-GFX11-LABEL: test_flat_v4_aligned8: ; ALIGNED-GFX11: ; %bb.0: ; %bb ; ALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX11-NEXT: 
v_add_co_u32 v7, s0, s0, v0 ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s1, 0, s0 -; ALIGNED-GFX11-NEXT: flat_load_b128 v[0:3], v[7:8] -; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v2 +; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v7, 2, v0 +; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-GFX11-NEXT: global_load_b128 v[0:3], v7, s[0:1] +; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v5, v1 ; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v6, v0 -; ALIGNED-GFX11-NEXT: flat_store_b128 v[7:8], v[3:6] +; ALIGNED-GFX11-NEXT: global_store_b128 v7, v[3:6], s[0:1] ; ALIGNED-GFX11-NEXT: s_endpgm ; ; UNALIGNED-GFX11-LABEL: test_flat_v4_aligned8: ; UNALIGNED-GFX11: ; %bb.0: ; %bb ; UNALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_add_co_u32 v7, s0, s0, v0 ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s1, 0, s0 -; UNALIGNED-GFX11-NEXT: flat_load_b128 v[0:3], v[7:8] -; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v2 +; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v7, 2, v0 +; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED-GFX11-NEXT: global_load_b128 v[0:3], v7, s[0:1] +; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v5, v1 ; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v6, v0 -; UNALIGNED-GFX11-NEXT: flat_store_b128 v[7:8], v[3:6] +; UNALIGNED-GFX11-NEXT: global_store_b128 v7, v[3:6], s[0:1] ; UNALIGNED-GFX11-NEXT: s_endpgm bb: 
%lid = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll index a0db4ea8bc12a..3e4ac3358178d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll @@ -484,46 +484,51 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #0 { ; CI-LABEL: flat_atomic_dec_ret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s10, s6 +; CI-NEXT: s_mov_b32 s11, s7 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s2 +; CI-NEXT: s_mov_b32 s9, s3 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: buffer_atomic_dec v0, off, s[8:11], 0 glc +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_ret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: buffer_atomic_dec v0, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: 
v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_dec_ret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: flat_atomic_dec v2, v[0:1], v2 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v1, v0, v1, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr, i32 42, i32 0, i32 0, i1 false) store i32 %result, ptr %out @@ -534,50 +539,51 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #0 ; CI-LABEL: flat_atomic_dec_ret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s10, s6 +; CI-NEXT: s_mov_b32 s11, s7 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s2, s2, 16 -; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s2 +; CI-NEXT: s_mov_b32 s9, s3 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: buffer_atomic_dec v0, off, s[8:11], 0 offset:16 glc +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: 
s_mov_b32 s5, s1 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_ret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s2, s2, 16 -; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: buffer_atomic_dec v0, off, s[8:11], 0 offset:16 glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v1, v0, v1, s[2:3] offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false) @@ -589,36 +595,35 @@ define amdgpu_kernel void 
@flat_atomic_dec_noret_i32(ptr %ptr) nounwind { ; CI-LABEL: flat_atomic_dec_noret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_atomic_dec v[0:1], v2 -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_atomic_dec v0, off, s[0:3], 0 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_noret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_atomic_dec v[0:1], v2 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_atomic_dec v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_dec_noret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_atomic_dec v[0:1], v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr, i32 42, i32 0, i32 0, i1 false) @@ -629,40 +634,35 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) nounwind { ; CI-LABEL: flat_atomic_dec_noret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; CI-NEXT: v_mov_b32_e32 v2, 42 +; 
CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s0, s0, 16 -; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_atomic_dec v[0:1], v2 -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_atomic_dec v0, off, s[0:3], 0 offset:16 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_noret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_atomic_dec v[0:1], v2 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_atomic_dec v0, off, s[0:3], 0 offset:16 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_atomic_dec v[0:1], v2 offset:16 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 @@ -674,21 +674,19 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; CI-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; 
CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s3 -; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v2 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc -; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; CI-NEXT: v_mov_b32_e32 v3, 42 -; CI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b64 s[8:9], s[2:3] +; CI-NEXT: s_mov_b64 s[10:11], s[6:7] +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: buffer_atomic_dec v2, v[0:1], s[8:11], 0 addr64 offset:20 glc +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_store_dword v[0:1], v3 +; CI-NEXT: s_mov_b64 s[4:5], s[0:1] +; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_ret_i32_offset_addr64: @@ -703,7 +701,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -714,19 +712,13 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 42 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: 
flat_atomic_dec v3, v[0:1], v3 offset:20 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v1, v0, v1, s[2:3] offset:20 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: flat_store_dword v[0:1], v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, ptr %ptr, i32 %id @@ -742,15 +734,13 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #0 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_atomic_dec v[0:1], v2 -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_atomic_dec v2, v[0:1], s[0:3], 0 addr64 offset:20 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; @@ -766,7 +756,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_atomic_dec v[0:1], v2 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; @@ -774,13 +764,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #0 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: flat_atomic_dec v[0:1], v2 offset:20 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] offset:20 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -794,33 +781,41 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #0 { ; CI-LABEL: flat_atomic_dec_ret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 s10, s6 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s2 +; CI-NEXT: s_mov_b32 s9, s3 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 s11, s7 +; CI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[8:11], 0 glc +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_ret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 s10, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 
+; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_dec_ret_i64: @@ -828,15 +823,12 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #0 { ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, ptr %out @@ -847,37 +839,41 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #0 ; CI-LABEL: flat_atomic_dec_ret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 s10, s6 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s2, s2, 32 -; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
+; CI-NEXT: s_mov_b32 s8, s2 +; CI-NEXT: s_mov_b32 s9, s3 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 s11, s7 +; CI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[8:11], 0 offset:32 glc +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_ret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 s10, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s2, s2, 32 -; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[8:11], 0 offset:32 glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_dec_ret_i64_offset: @@ -885,15 +881,12 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #0 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] offset:32 glc 
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false) @@ -906,12 +899,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) nounwind { ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[0:3], 0 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; @@ -919,12 +912,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) nounwind { ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; @@ -933,11 +926,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) nounwind { ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %ptr, i64 42, i32 0, i32 0, i1 false) @@ -949,14 +941,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) nounwind { ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s0, s0, 32 -; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[0:3], 0 offset:32 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; @@ -964,14 +954,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) nounwind { ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 32 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[0:3], 0 offset:32 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; @@ -980,11 +968,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) nounwind { ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 
v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 @@ -996,22 +983,20 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; CI-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s3 -; CI-NEXT: v_add_i32_e32 v2, vcc, s2, v4 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc -; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b64 s[8:9], s[2:3] +; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: s_mov_b64 s[10:11], s[6:7] +; CI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[8:11], 0 addr64 offset:40 glc +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v4 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_mov_b64 s[4:5], s[0:1] +; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_ret_i64_offset_addr64: @@ -1027,7 +1012,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; VI-NEXT: v_mov_b32_e32 v1, 0 ; 
VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 @@ -1038,20 +1023,14 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; GFX9-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v0, vcc -; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[3:4], v[1:2] offset:40 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v3, v[1:2], s[2:3] offset:40 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr %ptr, i32 %id @@ -1066,17 +1045,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #0 ; CI-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v0 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: 
v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[0:3], 0 addr64 offset:40 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; @@ -1093,22 +1070,19 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: flat_atomic_dec_x2 v[3:4], v[1:2] offset:40 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v0, v[1:2], s[0:1] offset:40 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll index 36b9ddac8ef41..7ab49dd9b1ed1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll @@ -1050,46 +1050,51 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #0 { ; CI-LABEL: flat_atomic_inc_ret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CI-NEXT: v_mov_b32_e32 v2, 42 
+; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s10, s6 +; CI-NEXT: s_mov_b32 s11, s7 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s2 +; CI-NEXT: s_mov_b32 s9, s3 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: buffer_atomic_inc v0, off, s[8:11], 0 glc +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_ret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: buffer_atomic_inc v0, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_ret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: flat_atomic_inc v2, v[0:1], v2 glc -; 
GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc v1, v0, v1, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %ptr, i32 42, i32 0, i32 0, i1 false) store i32 %result, ptr %out @@ -1100,50 +1105,51 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #0 ; CI-LABEL: flat_atomic_inc_ret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s10, s6 +; CI-NEXT: s_mov_b32 s11, s7 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s2, s2, 16 -; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s2 +; CI-NEXT: s_mov_b32 s9, s3 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: buffer_atomic_inc v0, off, s[8:11], 0 offset:16 glc +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_ret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s2, s2, 16 -; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_atomic_inc v2, v[0:1], 
v2 glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: buffer_atomic_inc v0, off, s[8:11], 0 offset:16 glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: flat_atomic_inc v2, v[0:1], v2 offset:16 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc v1, v0, v1, s[2:3] offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false) @@ -1155,36 +1161,35 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) nounwind { ; CI-LABEL: flat_atomic_inc_noret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_atomic_inc v[0:1], v2 -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_atomic_inc v0, off, s[0:3], 0 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: 
s_endpgm ; ; VI-LABEL: flat_atomic_inc_noret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_atomic_inc v[0:1], v2 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_atomic_inc v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_noret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_atomic_inc v[0:1], v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %ptr, i32 42, i32 0, i32 0, i1 false) @@ -1195,40 +1200,35 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) nounwind { ; CI-LABEL: flat_atomic_inc_noret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s0, s0, 16 -; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_atomic_inc v[0:1], v2 -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_atomic_inc v0, off, s[0:3], 0 offset:16 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_noret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_atomic_inc v[0:1], v2 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_atomic_inc v0, off, s[0:3], 0 offset:16 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_atomic_inc v[0:1], v2 offset:16 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 @@ -1240,21 +1240,19 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; CI-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s3 -; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v2 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc -; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; CI-NEXT: v_mov_b32_e32 v3, 42 -; CI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b64 s[8:9], s[2:3] +; CI-NEXT: s_mov_b64 s[10:11], s[6:7] +; CI-NEXT: v_mov_b32_e32 v2, 42 +; 
CI-NEXT: buffer_atomic_inc v2, v[0:1], s[8:11], 0 addr64 offset:20 glc +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_store_dword v[0:1], v3 +; CI-NEXT: s_mov_b64 s[4:5], s[0:1] +; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_ret_i32_offset_addr64: @@ -1269,7 +1267,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -1280,19 +1278,13 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 42 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: flat_atomic_inc v3, v[0:1], v3 offset:20 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc v1, v0, v1, s[2:3] offset:20 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: flat_store_dword v[0:1], v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, ptr %ptr, i32 %id @@ -1308,15 
+1300,13 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #0 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_atomic_inc v[0:1], v2 -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_atomic_inc v2, v[0:1], s[0:3], 0 addr64 offset:20 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; @@ -1332,7 +1322,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_atomic_inc v[0:1], v2 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; @@ -1340,13 +1330,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #0 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: flat_atomic_inc v[0:1], v2 offset:20 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] offset:20 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -1432,33 +1419,41 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #0 { ; CI-LABEL: 
flat_atomic_inc_ret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 s10, s6 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s2 +; CI-NEXT: s_mov_b32 s9, s3 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 s11, s7 +; CI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[8:11], 0 glc +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_ret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 s10, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_ret_i64: @@ -1466,15 +1461,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, 
ptr %ptr) #0 { ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, ptr %out @@ -1485,37 +1477,41 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #0 ; CI-LABEL: flat_atomic_inc_ret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 s10, s6 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s2, s2, 32 -; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s2 +; CI-NEXT: s_mov_b32 s9, s3 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 s11, s7 +; CI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[8:11], 0 offset:32 glc +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_ret_i64_offset: ; VI: ; 
%bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 s10, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s2, s2, 32 -; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[8:11], 0 offset:32 glc +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset: @@ -1523,15 +1519,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #0 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:32 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false) @@ -1544,12 +1537,12 @@ define 
amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) nounwind { ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[0:3], 0 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; @@ -1557,12 +1550,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) nounwind { ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; @@ -1571,11 +1564,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) nounwind { ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %ptr, i64 42, i32 0, i32 0, i1 false) @@ -1587,14 +1579,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) nounwind { ; CI: ; %bb.0: ; CI-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s0, s0, 32 -; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[0:3], 0 offset:32 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; @@ -1602,14 +1592,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) nounwind { ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 32 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[0:3], 0 offset:32 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; @@ -1618,11 +1606,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) nounwind { ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 @@ -1634,22 +1621,20 @@ define amdgpu_kernel void 
@flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; CI-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s3 -; CI-NEXT: v_add_i32_e32 v2, vcc, s2, v4 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc -; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b64 s[8:9], s[2:3] +; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: s_mov_b64 s[10:11], s[6:7] +; CI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[8:11], 0 addr64 offset:40 glc +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v4 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_mov_b64 s[4:5], s[0:1] +; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_ret_i64_offset_addr64: @@ -1665,7 +1650,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 @@ -1676,20 +1661,14 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v0, vcc -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[3:4], v[1:2] offset:40 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v3, v[1:2], s[2:3] offset:40 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr %ptr, i32 %id @@ -1704,17 +1683,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #0 ; CI-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v0 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[0:3], 0 addr64 offset:40 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; @@ -1731,22 +1708,19 @@ define amdgpu_kernel void 
@flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: flat_atomic_inc_x2 v[3:4], v[1:2] offset:40 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc_x2 v0, v[1:2], s[0:1] offset:40 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll index 9606c68684957..21be5c2e47678 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -411,24 +411,21 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body ; GFX1013-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0 +; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 +; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000 ; GFX1013-NEXT: v_mov_b32_e32 v6, 4.0 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000 ; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40c00000 ; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40e00000 ; GFX1013-NEXT: v_mov_b32_e32 v10, 0x41000000 
; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_add_co_u32 v2, s0, s8, v0 -; GFX1013-NEXT: v_add_co_ci_u32_e64 v3, s0, s9, 0, s0 -; GFX1013-NEXT: v_add_co_u32 v4, s0, s10, v0 -; GFX1013-NEXT: v_add_co_ci_u32_e64 v5, s0, s11, 0, s0 -; GFX1013-NEXT: flat_load_dword v0, v[2:3] -; GFX1013-NEXT: flat_load_dword v1, v[4:5] +; GFX1013-NEXT: s_clause 0x1 +; GFX1013-NEXT: global_load_dword v0, v2, s[8:9] +; GFX1013-NEXT: global_load_dword v1, v2, s[10:11] ; GFX1013-NEXT: v_mov_b32_e32 v2, 0 -; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0 -; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 -; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000 -; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[12:15] ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] @@ -437,25 +434,22 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 -; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX1030-NEXT: v_mov_b32_e32 v10, 0x41000000 -; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40e00000 -; GFX1030-NEXT: v_mov_b32_e32 v8, 0x40c00000 -; GFX1030-NEXT: v_mov_b32_e32 v7, 0x40a00000 -; GFX1030-NEXT: v_mov_b32_e32 v6, 4.0 -; GFX1030-NEXT: v_mov_b32_e32 v5, 0x40400000 -; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 +; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX1030-NEXT: v_mov_b32_e32 v11, 0x41000000 +; GFX1030-NEXT: v_mov_b32_e32 v10, 0x40e00000 +; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40c00000 +; GFX1030-NEXT: v_mov_b32_e32 v8, 0x40a00000 +; GFX1030-NEXT: v_mov_b32_e32 v7, 4.0 +; GFX1030-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_add_co_u32 v0, s0, s0, v2 -; GFX1030-NEXT: v_add_co_ci_u32_e64 
v1, null, s1, 0, s0 -; GFX1030-NEXT: v_add_co_u32 v2, s0, s2, v2 -; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 -; GFX1030-NEXT: flat_load_dword v0, v[0:1] -; GFX1030-NEXT: flat_load_dword v1, v[2:3] -; GFX1030-NEXT: v_mov_b32_e32 v2, 0 -; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0 -; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[4:7] +; GFX1030-NEXT: s_clause 0x1 +; GFX1030-NEXT: global_load_dword v1, v0, s[0:1] +; GFX1030-NEXT: global_load_dword v2, v0, s[2:3] +; GFX1030-NEXT: s_waitcnt vmcnt(0) +; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[1:11], s[4:7] ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1030-NEXT: s_endpgm @@ -464,24 +458,18 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v7, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x41000000 +; GFX11-NEXT: v_dual_mov_b32 v3, 0x40400000 :: v_dual_mov_b32 v4, 4.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v1, 0x40e00000 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0 ; GFX11-NEXT: v_mov_b32_e32 v8, 2.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, 4.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; GFX11-NEXT: v_add_co_u32 v2, s0, s2, v2 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 -; GFX11-NEXT: flat_load_b32 v9, v[0:1] -; GFX11-NEXT: flat_load_b32 v10, v[2:3] -; GFX11-NEXT: v_mov_b32_e32 v1, 0x40e00000 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: 
global_load_b32 v9, v0, s[0:1] +; GFX11-NEXT: global_load_b32 v10, v0, s[2:3] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; GFX11-NEXT: v_mov_b32_e32 v2, 0x41000000 -; GFX11-NEXT: v_mov_b32_e32 v3, 0x40400000 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[6:8], v[3:5], v[0:2]], s[4:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] @@ -491,25 +479,18 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX12-SDAG: ; %bb.0: ; %main_body ; GFX12-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v7, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0x41000000 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0x40400000 :: v_dual_mov_b32 v4, 4.0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x40e00000 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v8, 2.0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, 4.0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, s0, v2 -; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; GFX12-SDAG-NEXT: v_add_co_u32 v2, s0, s2, v2 -; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 -; GFX12-SDAG-NEXT: flat_load_b32 v9, v[0:1] -; GFX12-SDAG-NEXT: flat_load_b32 v10, v[2:3] -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x40e00000 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: global_load_b32 v9, v0, s[0:1] +; GFX12-SDAG-NEXT: global_load_b32 v10, v0, s[2:3] ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0x41000000 -; GFX12-SDAG-NEXT: 
v_mov_b32_e32 v3, 0x40400000 -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[6:8], v[3:5], v[0:2]], s[4:7] ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[0:3] @@ -523,30 +504,23 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX12-GISEL-NEXT: s_mov_b32 s9, 4.0 ; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x40400000 ; GFX12-GISEL-NEXT: s_mov_b32 s12, 0x40c00000 -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-GISEL-NEXT: s_mov_b32 s14, 0x41000000 ; GFX12-GISEL-NEXT: s_mov_b32 s13, 0x40e00000 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v6, s12 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v7, s13 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, s9 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v8, s14 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-GISEL-NEXT: s_mov_b32 s2, 2.0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: global_load_b32 v9, v0, s[0:1] +; GFX12-GISEL-NEXT: global_load_b32 v10, v0, s[2:3] ; GFX12-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX12-GISEL-NEXT: s_mov_b32 s2, 2.0 ; GFX12-GISEL-NEXT: s_mov_b32 s1, 1.0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffd -; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX12-GISEL-NEXT: flat_load_b32 v9, v[0:1] -; GFX12-GISEL-NEXT: flat_load_b32 v10, v[2:3] -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 +; GFX12-GISEL-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s10 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v7, s13 +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[4:7] ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b128 v[0:1], v[0:3] @@ -575,21 +549,18 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body ; GFX1013-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0 +; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 +; GFX1013-NEXT: v_mov_b32_e32 v5, 0x44004200 ; GFX1013-NEXT: v_mov_b32_e32 v6, 0x46004500 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x48004700 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_add_co_u32 v2, s0, s8, v0 -; GFX1013-NEXT: v_add_co_ci_u32_e64 v3, s0, s9, 0, s0 -; GFX1013-NEXT: v_add_co_u32 v4, s0, s10, v0 -; GFX1013-NEXT: v_add_co_ci_u32_e64 v5, s0, s11, 0, s0 -; GFX1013-NEXT: flat_load_dword v0, v[2:3] -; GFX1013-NEXT: flat_load_dword v1, v[4:5] +; GFX1013-NEXT: s_clause 0x1 +; GFX1013-NEXT: global_load_dword v0, v2, s[8:9] +; GFX1013-NEXT: global_load_dword v1, v2, s[10:11] ; GFX1013-NEXT: v_mov_b32_e32 v2, 0 -; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0 -; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 -; GFX1013-NEXT: v_mov_b32_e32 v5, 0x44004200 -; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[12:15] a16 ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] @@ -598,22 +569,19 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr 
%p_node_ ; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 -; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 -; GFX1030-NEXT: v_mov_b32_e32 v5, 0x44004200 -; GFX1030-NEXT: v_mov_b32_e32 v6, 0x46004500 -; GFX1030-NEXT: v_mov_b32_e32 v7, 0x48004700 +; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX1030-NEXT: v_mov_b32_e32 v3, 0 +; GFX1030-NEXT: v_mov_b32_e32 v6, 0x44004200 +; GFX1030-NEXT: v_mov_b32_e32 v7, 0x46004500 +; GFX1030-NEXT: v_mov_b32_e32 v8, 0x48004700 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_add_co_u32 v0, s0, s0, v2 -; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; GFX1030-NEXT: v_add_co_u32 v2, s0, s2, v2 -; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 -; GFX1030-NEXT: flat_load_dword v0, v[0:1] -; GFX1030-NEXT: flat_load_dword v1, v[2:3] -; GFX1030-NEXT: v_mov_b32_e32 v2, 0 -; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0 -; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16 +; GFX1030-NEXT: s_clause 0x1 +; GFX1030-NEXT: global_load_dword v1, v0, s[0:1] +; GFX1030-NEXT: global_load_dword v2, v0, s[2:3] +; GFX1030-NEXT: s_waitcnt vmcnt(0) +; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[1:8], s[4:7] a16 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1030-NEXT: s_endpgm @@ -621,22 +589,17 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX11-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v5, 2.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: 
v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x48004500 +; GFX11-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v1, 0x47004400 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; GFX11-NEXT: v_add_co_u32 v2, s0, s2, v2 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 -; GFX11-NEXT: flat_load_b32 v6, v[0:1] -; GFX11-NEXT: flat_load_b32 v7, v[2:3] -; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400 -; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200 -; GFX11-NEXT: v_dual_mov_b32 v2, 0x48004500 :: v_dual_mov_b32 v3, 0 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v6, v0, s[0:1] +; GFX11-NEXT: global_load_b32 v7, v0, s[2:3] +; GFX11-NEXT: v_dual_mov_b32 v0, 0x46004200 :: v_dual_mov_b32 v5, 2.0 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[3:5], v[0:2]], s[4:7] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] @@ -645,23 +608,17 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX12-SDAG-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX12-SDAG: ; %bb.0: ; %main_body ; GFX12-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v5, 2.0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0x48004500 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v1, 0x47004400 +; 
GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, s0, v2 -; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; GFX12-SDAG-NEXT: v_add_co_u32 v2, s0, s2, v2 -; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 -; GFX12-SDAG-NEXT: flat_load_b32 v6, v[0:1] -; GFX12-SDAG-NEXT: flat_load_b32 v7, v[2:3] -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x47004400 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0x46004200 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0x48004500 :: v_dual_mov_b32 v3, 0 -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: global_load_b32 v6, v0, s[0:1] +; GFX12-SDAG-NEXT: global_load_b32 v7, v0, s[2:3] +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0x46004200 :: v_dual_mov_b32 v5, 2.0 +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[3:5], v[0:2]], s[4:7] a16 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[0:3] @@ -675,26 +632,18 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX12-GISEL-NEXT: s_mov_b32 s9, 0x44004700 ; GFX12-GISEL-NEXT: s_mov_b32 s10, 0x45004800 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, s9 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-GISEL-NEXT: s_mov_b32 s2, 2.0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: global_load_b32 v6, v0, s[0:1] +; GFX12-GISEL-NEXT: global_load_b32 v7, v0, s[2:3] ; GFX12-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX12-GISEL-NEXT: s_mov_b32 s2, 2.0 ; 
GFX12-GISEL-NEXT: s_mov_b32 s1, 1.0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffd -; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX12-GISEL-NEXT: flat_load_b32 v6, v[0:1] -; GFX12-GISEL-NEXT: flat_load_b32 v7, v[2:3] -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s10 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[4:7] a16 ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b128 v[0:1], v[0:3] @@ -726,6 +675,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1013-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX1013-NEXT: v_bfrev_b32_e32 v1, 4.0 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0 @@ -736,12 +686,9 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1013-NEXT: v_mov_b32_e32 v10, 0x40e00000 ; GFX1013-NEXT: v_mov_b32_e32 v11, 0x41000000 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_add_co_u32 v0, s4, s6, v0 -; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4 -; GFX1013-NEXT: flat_load_dword v2, v[0:1] +; GFX1013-NEXT: global_load_dword v2, v0, s[6:7] ; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c7 -; GFX1013-NEXT: v_bfrev_b32_e32 v1, 4.0 -; 
GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3] ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] @@ -753,7 +700,6 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mov_b32_e32 v11, 0x41000000 ; GFX1030-NEXT: v_mov_b32_e32 v10, 0x40e00000 ; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40c00000 @@ -762,13 +708,12 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1030-NEXT: v_mov_b32_e32 v6, 0x40400000 ; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_add_co_u32 v0, s4, s6, v0 -; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4 -; GFX1030-NEXT: flat_load_dword v2, v[0:1] +; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_bfrev_b32_e32 v1, 4.0 +; GFX1030-NEXT: s_waitcnt lgkmcnt(0) +; GFX1030-NEXT: global_load_dword v2, v0, s[6:7] ; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c7 -; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3] ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] @@ -781,20 +726,17 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11-NEXT: v_dual_mov_b32 v7, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0x41000000 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x40e00000 ; GFX11-NEXT: v_dual_mov_b32 v3, 0x40400000 :: v_dual_mov_b32 v4, 4.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: 
v_dual_mov_b32 v5, 0x40a00000 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: v_dual_mov_b32 v8, 2.0 :: v_dual_mov_b32 v9, 0xb36211c7 ; GFX11-NEXT: v_bfrev_b32_e32 v10, 4.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, s4, s6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4 -; GFX11-NEXT: flat_load_b32 v11, v[0:1] +; GFX11-NEXT: global_load_b32 v11, v0, s[6:7] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; GFX11-NEXT: v_mov_b32_e32 v1, 0x40e00000 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[6:8], v[3:5], v[0:2]], s[0:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] @@ -807,20 +749,17 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v7, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0x41000000 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x40e00000 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0x40400000 :: v_dual_mov_b32 v4, 4.0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX12-SDAG-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v6, 0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v8, 2.0 :: v_dual_mov_b32 v9, 0xb36211c7 ; GFX12-SDAG-NEXT: v_bfrev_b32_e32 v10, 4.0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_add_co_u32 v0, s4, s6, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4 -; GFX12-SDAG-NEXT: flat_load_b32 v11, v[0:1] +; GFX12-SDAG-NEXT: global_load_b32 v11, v0, s[6:7] ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x40e00000 -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[6:8], v[3:5], v[0:2]], s[0:3] ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[0:3] @@ -831,14 +770,14 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_mov_b32 s5, 1.0 ; GFX12-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v9, 0xb36211c7 ; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x40400000 -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX12-GISEL-NEXT: s_mov_b32 s12, 0x40c00000 ; GFX12-GISEL-NEXT: s_mov_b32 s10, 0x40a00000 +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-GISEL-NEXT: s_mov_b32 s9, 4.0 ; GFX12-GISEL-NEXT: s_mov_b32 s14, 0x41000000 ; GFX12-GISEL-NEXT: s_mov_b32 s13, 0x40e00000 @@ -846,18 +785,13 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX12-GISEL-NEXT: v_bfrev_b32_e32 v10, 4.0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v3, s8 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v7, s13 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v5, s10 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s6 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX12-GISEL-NEXT: global_load_b32 v11, v0, s[6:7] ; GFX12-GISEL-NEXT: s_mov_b32 s6, 2.0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-GISEL-NEXT: flat_load_b32 v11, v[0:1] ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: 
v_mov_b32_e32 v2, s6 -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[0:2], v[3:5], v[6:8]], s[0:3] ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b128 v[0:1], v[0:3] @@ -887,6 +821,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1013-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX1013-NEXT: v_bfrev_b32_e32 v1, 4.0 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0 @@ -894,12 +829,9 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x46004500 ; GFX1013-NEXT: v_mov_b32_e32 v8, 0x48004700 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_add_co_u32 v0, s4, s6, v0 -; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4 -; GFX1013-NEXT: flat_load_dword v2, v[0:1] +; GFX1013-NEXT: global_load_dword v2, v0, s[6:7] ; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6 -; GFX1013-NEXT: v_bfrev_b32_e32 v1, 4.0 -; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16 ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] @@ -911,19 +843,17 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX1030-NEXT: v_mov_b32_e32 v3, 0 +; GFX1030-NEXT: v_bfrev_b32_e32 v1, 4.0 ; GFX1030-NEXT: v_mov_b32_e32 v6, 0x44004200 ; GFX1030-NEXT: v_mov_b32_e32 v7, 0x46004500 ; GFX1030-NEXT: 
v_mov_b32_e32 v8, 0x48004700 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_add_co_u32 v0, s4, s6, v0 -; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4 -; GFX1030-NEXT: flat_load_dword v2, v[0:1] -; GFX1030-NEXT: v_bfrev_b32_e32 v1, 4.0 +; GFX1030-NEXT: global_load_dword v2, v0, s[6:7] ; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c6 -; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] @@ -936,20 +866,16 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0x48004500 -; GFX11-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX11-NEXT: v_mov_b32_e32 v6, 0xb36211c6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v1, 0x47004400 :: v_dual_mov_b32 v4, 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, 0xb36211c6 ; GFX11-NEXT: v_bfrev_b32_e32 v7, 4.0 ; GFX11-NEXT: v_mov_b32_e32 v5, 2.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, s4, s6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4 -; GFX11-NEXT: flat_load_b32 v8, v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400 +; GFX11-NEXT: global_load_b32 v8, v0, s[6:7] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[3:5], v[0:2]], s[0:3] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] @@ -962,20 +888,16 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; 
GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0x48004500 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v6, 0xb36211c6 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x47004400 :: v_dual_mov_b32 v4, 1.0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v6, 0xb36211c6 ; GFX12-SDAG-NEXT: v_bfrev_b32_e32 v7, 4.0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v5, 2.0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_add_co_u32 v0, s4, s6, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4 -; GFX12-SDAG-NEXT: flat_load_b32 v8, v[0:1] -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x47004400 +; GFX12-SDAG-NEXT: global_load_b32 v8, v0, s[6:7] ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0x46004200 -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[3:5], v[0:2]], s[0:3] a16 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[0:3] @@ -986,28 +908,23 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_mov_b32 s5, 1.0 ; GFX12-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x42004600 ; GFX12-GISEL-NEXT: s_mov_b32 s9, 0x44004700 -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX12-GISEL-NEXT: s_mov_b32 s10, 0x45004800 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v6, 0xb36211c6 +; GFX12-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v6, 0xb36211c6 :: v_dual_mov_b32 v5, s10 ; GFX12-GISEL-NEXT: v_bfrev_b32_e32 v7, 4.0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s8 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s9 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, s9 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: global_load_b32 v8, v0, s[6:7] ; GFX12-GISEL-NEXT: s_mov_b32 s6, 2.0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-GISEL-NEXT: flat_load_b32 v8, v[0:1] ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s6 -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[0:3] a16 ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b128 v[0:1], v[0:3] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll index 9e1815b48abfd..dad165ff3fadf 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll @@ -116,111 +116,105 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; FIXME: setcc (zero_extend (setcc)), 1) not folded out, resulting in ; select and vcc branch. 
-define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { +define void @is_private_sgpr(ptr inreg %ptr) { ; SI-LABEL: is_private_sgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[8:9], 0x1 -; SI-NEXT: s_load_dword s1, s[8:9], 0x32 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], 0xc0 +; SI-NEXT: s_load_dword s4, s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s0, s1 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; SI-NEXT: s_cmp_eq_u32 s17, s4 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB1_2 ; SI-NEXT: ; %bb.1: ; %bb0 -; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: .LBB1_2: ; %bb1 -; SI-NEXT: s_endpgm +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; CI-SDAG-LABEL: is_private_sgpr: ; CI-SDAG: ; %bb.0: -; CI-SDAG-NEXT: s_load_dword s0, s[8:9], 0x1 -; CI-SDAG-NEXT: s_load_dword s1, s[8:9], 0x32 -; CI-SDAG-NEXT: s_add_i32 s12, s12, s17 -; CI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-SDAG-NEXT: s_mov_b64 s[4:5], 0xc0 +; CI-SDAG-NEXT: s_load_dword s4, s[4:5], 0x0 ; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; CI-SDAG-NEXT: s_cmp_eq_u32 s0, s1 -; CI-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CI-SDAG-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; CI-SDAG-NEXT: s_cmp_eq_u32 s17, s4 +; CI-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0 +; CI-SDAG-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; CI-SDAG-NEXT: s_cbranch_vccnz .LBB1_2 ; CI-SDAG-NEXT: ; %bb.1: ; %bb0 ; CI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; CI-SDAG-NEXT: flat_store_dword v[0:1], v0 ; CI-SDAG-NEXT: s_waitcnt 
vmcnt(0) ; CI-SDAG-NEXT: .LBB1_2: ; %bb1 -; CI-SDAG-NEXT: s_endpgm +; CI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: is_private_sgpr: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dword s2, s[8:9], 0x4 -; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_cmp_eq_u32 s2, s1 -; GFX9-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-SDAG-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX9-SDAG-NEXT: s_cmp_eq_u32 s17, s5 +; GFX9-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-SDAG-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-SDAG-NEXT: s_cbranch_vccnz .LBB1_2 ; GFX9-SDAG-NEXT: ; %bb.1: ; %bb0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-SDAG-NEXT: global_store_dword v[0:1], v0, off ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: .LBB1_2: ; %bb1 -; GFX9-SDAG-NEXT: s_endpgm +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CI-GISEL-LABEL: is_private_sgpr: ; CI-GISEL: ; %bb.0: -; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; CI-GISEL-NEXT: s_load_dword s0, s[8:9], 0x32 -; CI-GISEL-NEXT: s_add_i32 s12, s12, s17 -; CI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-GISEL-NEXT: s_mov_b64 s[4:5], 0xc0 +; CI-GISEL-NEXT: s_load_dword s4, s[4:5], 0x0 ; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; CI-GISEL-NEXT: s_cmp_lg_u32 s1, s0 +; CI-GISEL-NEXT: s_cmp_lg_u32 s17, s4 ; CI-GISEL-NEXT: s_cbranch_scc1 .LBB1_2 ; CI-GISEL-NEXT: ; %bb.1: ; %bb0 ; CI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; CI-GISEL-NEXT: flat_store_dword v[0:1], v0 ; CI-GISEL-NEXT: s_waitcnt vmcnt(0) ; CI-GISEL-NEXT: .LBB1_2: ; %bb1 -; CI-GISEL-NEXT: s_endpgm +; CI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: is_private_sgpr: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], 
s[8:9], 0x0 -; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], src_private_base -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s1, s3 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s17, s5 ; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX9-GISEL-NEXT: ; %bb.1: ; %bb0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: global_store_dword v[0:1], v0, off ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: .LBB1_2: ; %bb1 -; GFX9-GISEL-NEXT: s_endpgm +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: is_private_sgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: s_mov_b64 s[2:3], src_private_base -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_lg_u32 s1, s3 +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: s_cmp_lg_u32 s17, s5 ; GFX10-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX10-NEXT: ; %bb.1: ; %bb0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: global_store_dword v[0:1], v0, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: .LBB1_2: ; %bb1 -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: is_private_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b64 s[2:3], src_private_base -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_cmp_lg_u32 s1, s3 ; GFX11-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX11-NEXT: ; %bb.1: ; %bb0 @@ -228,7 +222,7 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: .LBB1_2: ; %bb1 -; GFX11-NEXT: s_endpgm +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call i1 @llvm.amdgcn.is.private(ptr %ptr) br i1 %val, label %bb0, label %bb1 diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll index c364c391559ea..cf8608fa23d30 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll @@ -149,7 +149,7 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; FIXME: setcc (zero_extend (setcc)), 1) not folded out, resulting in ; select and vcc branch. -define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { +define void @is_local_sgpr(ptr inreg %ptr) { ; CIT-LABEL: is_local_sgpr: ; CIT: ; %bb.0: ; CIT-NEXT: s_load_dword s0, s[6:7], 0x1 @@ -186,108 +186,102 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; SI-LABEL: is_local_sgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[8:9], 0x1 -; SI-NEXT: s_load_dword s1, s[8:9], 0x33 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], 0xc4 +; SI-NEXT: s_load_dword s4, s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s0, s1 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; SI-NEXT: s_cmp_eq_u32 s17, s4 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB1_2 ; SI-NEXT: ; %bb.1: ; %bb0 -; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: .LBB1_2: ; %bb1 -; SI-NEXT: s_endpgm +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; CI-SDAG-LABEL: is_local_sgpr: ; CI-SDAG: ; %bb.0: -; CI-SDAG-NEXT: s_load_dword s0, s[8:9], 0x1 -; CI-SDAG-NEXT: s_load_dword s1, s[8:9], 0x33 -; CI-SDAG-NEXT: s_add_i32 s12, s12, s17 -; CI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-SDAG-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-SDAG-NEXT: s_mov_b64 s[4:5], 0xc4 +; CI-SDAG-NEXT: s_load_dword s4, s[4:5], 0x0 ; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; CI-SDAG-NEXT: s_cmp_eq_u32 s0, s1 -; CI-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CI-SDAG-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; CI-SDAG-NEXT: s_cmp_eq_u32 s17, s4 +; CI-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0 +; CI-SDAG-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; CI-SDAG-NEXT: s_cbranch_vccnz .LBB1_2 ; CI-SDAG-NEXT: ; %bb.1: ; %bb0 ; CI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; CI-SDAG-NEXT: flat_store_dword v[0:1], v0 ; CI-SDAG-NEXT: s_waitcnt vmcnt(0) ; CI-SDAG-NEXT: .LBB1_2: ; %bb1 -; CI-SDAG-NEXT: s_endpgm +; CI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: is_local_sgpr: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dword s2, s[8:9], 0x4 -; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_cmp_eq_u32 s2, s1 -; GFX9-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-SDAG-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX9-SDAG-NEXT: s_cmp_eq_u32 s17, s5 +; GFX9-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-SDAG-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-SDAG-NEXT: s_cbranch_vccnz .LBB1_2 ; GFX9-SDAG-NEXT: ; %bb.1: ; %bb0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-SDAG-NEXT: global_store_dword v[0:1], v0, off ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: .LBB1_2: ; %bb1 -; GFX9-SDAG-NEXT: s_endpgm +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CI-GISEL-LABEL: is_local_sgpr: ; CI-GISEL: ; %bb.0: -; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; CI-GISEL-NEXT: s_load_dword s0, s[8:9], 0x33 -; CI-GISEL-NEXT: s_add_i32 s12, s12, s17 -; CI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
CI-GISEL-NEXT: s_mov_b64 s[4:5], 0xc4 +; CI-GISEL-NEXT: s_load_dword s4, s[4:5], 0x0 ; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; CI-GISEL-NEXT: s_cmp_lg_u32 s1, s0 +; CI-GISEL-NEXT: s_cmp_lg_u32 s17, s4 ; CI-GISEL-NEXT: s_cbranch_scc1 .LBB1_2 ; CI-GISEL-NEXT: ; %bb.1: ; %bb0 ; CI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; CI-GISEL-NEXT: flat_store_dword v[0:1], v0 ; CI-GISEL-NEXT: s_waitcnt vmcnt(0) ; CI-GISEL-NEXT: .LBB1_2: ; %bb1 -; CI-GISEL-NEXT: s_endpgm +; CI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: is_local_sgpr: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s1, s3 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s17, s5 ; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX9-GISEL-NEXT: ; %bb.1: ; %bb0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: global_store_dword v[0:1], v0, off ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: .LBB1_2: ; %bb1 -; GFX9-GISEL-NEXT: s_endpgm +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: is_local_sgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: s_mov_b64 s[2:3], src_shared_base -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_lg_u32 s1, s3 +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX10-NEXT: s_cmp_lg_u32 s17, s5 ; GFX10-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX10-NEXT: ; %bb.1: ; %bb0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: global_store_dword v[0:1], v0, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: .LBB1_2: ; %bb1 -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: is_local_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
; GFX11-NEXT: s_mov_b64 s[2:3], src_shared_base -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_cmp_lg_u32 s1, s3 ; GFX11-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX11-NEXT: ; %bb.1: ; %bb0 @@ -295,7 +289,7 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: .LBB1_2: ; %bb1 -; GFX11-NEXT: s_endpgm +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call i1 @llvm.amdgcn.is.shared(ptr %ptr) br i1 %val, label %bb0, label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll index 9e518589ac5b3..c5bb564e5b727 100644 --- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll @@ -7,28 +7,24 @@ define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, [8 x i32], ptr %ptr0, [8 x i32], ptr %ptr1, [8 x i32], ptr addrspace(1) %ptr2) { ; GCN-LABEL: select_ptr_crash_i64_flat: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[8:9], 0x0 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x28 ; GCN-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x50 +; GCN-NEXT: s_load_dword s6, s[8:9], 0x0 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x78 ; GCN-NEXT: s_add_i32 s12, s12, s17 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NEXT: s_cmp_eq_u32 s6, 0 -; GCN-NEXT: s_cselect_b32 s0, s0, s2 +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cselect_b32 s1, s1, s3 +; GCN-NEXT: s_cselect_b32 s0, s0, s2 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: s_add_u32 s0, s0, 4 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: flat_load_dword v0, v[0:1] -; GCN-NEXT: 
v_mov_b32_e32 v2, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: flat_load_dword v1, v[1:2] -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm %tmp2 = icmp eq i32 %tmp, 0 diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll index 874dece6b728d..1973c54be6abb 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll @@ -11,20 +11,18 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s ; GFX12-NEXT: s_cbranch_scc1 .LBB0_3 ; GFX12-NEXT: ; %bb.1: ; %for.body.preheader ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 ; GFX12-NEXT: .LBB0_2: ; %for.body ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX12-NEXT: global_load_b128 v[1:4], v0, s[2:3] offset:-176 ; GFX12-NEXT: s_add_co_i32 s6, s6, -1 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 -; GFX12-NEXT: flat_load_b128 v[0:3], v[0:1] offset:-176 ; GFX12-NEXT: s_cmp_lg_u32 s6, 0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: flat_store_b128 v[4:5], v[0:3] ; GFX12-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX12-NEXT: .LBB0_3: ; %for.end ; GFX12-NEXT: s_endpgm @@ -37,21 +35,19 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s ; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB0_3 ; GFX12-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader ; GFX12-SPREFETCH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SPREFETCH-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0 ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 ; GFX12-SPREFETCH-NEXT: .LBB0_2: ; %for.body ; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffe -; GFX12-SPREFETCH-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-SPREFETCH-NEXT: global_load_b128 v[1:4], v0, s[2:3] offset:-176 ; GFX12-SPREFETCH-NEXT: s_prefetch_data s[2:3], 0x0, null, 0 -; GFX12-SPREFETCH-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 ; GFX12-SPREFETCH-NEXT: s_add_co_i32 s6, s6, -1 -; GFX12-SPREFETCH-NEXT: flat_load_b128 v[0:3], v[0:1] offset:-176 ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 ; GFX12-SPREFETCH-NEXT: s_cmp_lg_u32 s6, 0 +; GFX12-SPREFETCH-NEXT: s_wait_loadcnt 0x0 +; GFX12-SPREFETCH-NEXT: global_store_b128 v0, v[1:4], s[0:1] ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 -; GFX12-SPREFETCH-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SPREFETCH-NEXT: flat_store_b128 v[4:5], v[0:3] ; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX12-SPREFETCH-NEXT: .LBB0_3: ; %for.end ; GFX12-SPREFETCH-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll index 8d020b9e1a603..0f35923afff23 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll @@ -9,24 +9,20 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0 ; CHECK-LABEL: memcpy_p0_p0_minsize: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: v_mov_b32_e32 v12, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v12, s3 -; CHECK-NEXT: v_mov_b32_e32 v11, s2 -; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46 -; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44 -; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32 
-; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16 -; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12] -; CHECK-NEXT: v_mov_b32_e32 v12, s1 -; CHECK-NEXT: v_mov_b32_e32 v11, s0 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46 -; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44 -; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32 -; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16 -; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7] +; CHECK-NEXT: global_load_dwordx2 v[8:9], v12, s[2:3] offset:32 +; CHECK-NEXT: global_load_dwordx2 v[10:11], v12, s[2:3] offset:39 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v12, s[2:3] +; CHECK-NEXT: global_load_dwordx4 v[4:7], v12, s[2:3] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] offset:39 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16 ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false) @@ -165,62 +161,59 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8 -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_add_u32 s20, s20, s17 ; CHECK-NEXT: s_addc_u32 s21, s21, 0 +; CHECK-NEXT: v_mov_b32_e32 v25, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v26, s0 -; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124 -; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120 -; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116 -; 
CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112 -; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108 -; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104 -; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100 -; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:16 -; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:20 -; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:24 -; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:28 -; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:32 -; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:36 -; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:40 -; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:44 -; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:48 -; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:52 -; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:56 -; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:60 -; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:92 -; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:88 -; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84 -; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:80 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v25, s1 ; CHECK-NEXT: v_mov_b32_e32 v24, s0 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112 -; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:76 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:72 -; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:68 -; CHECK-NEXT: 
buffer_load_dword v0, v26, s[20:23], 0 offen offset:64 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96 -; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v24, s[20:23], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v2, v24, s[20:23], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v1, v24, s[20:23], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v0, v24, s[20:23], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v7, v24, s[20:23], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v6, v24, s[20:23], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v5, v24, s[20:23], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v4, v24, s[20:23], 0 offen offset:96 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-NEXT: buffer_load_dword v8, v24, s[20:23], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v9, v24, s[20:23], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v10, v24, s[20:23], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v11, v24, s[20:23], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v12, v24, s[20:23], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v13, v24, s[20:23], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v14, v24, s[20:23], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v15, v24, s[20:23], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v16, v24, s[20:23], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v17, v24, s[20:23], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v18, v24, s[20:23], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v19, v24, s[20:23], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v23, v24, s[20:23], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v22, v24, s[20:23], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v21, v24, s[20:23], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v20, v24, s[20:23], 0 offen offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(20) 
lgkmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v25, v[0:3], s[0:1] offset:112 +; CHECK-NEXT: buffer_load_dword v3, v24, s[20:23], 0 offen offset:76 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v2, v24, s[20:23], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v1, v24, s[20:23], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v0, v24, s[20:23], 0 offen offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(21) +; CHECK-NEXT: global_store_dwordx4 v25, v[4:7], s[0:1] offset:96 +; CHECK-NEXT: buffer_load_dword v4, v24, s[20:23], 0 offen ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:64 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:48 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:32 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; CHECK-NEXT: buffer_load_dword v5, v24, s[20:23], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v6, v24, s[20:23], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v24, s[20:23], 0 offen offset:12 +; CHECK-NEXT: s_waitcnt vmcnt(10) +; CHECK-NEXT: global_store_dwordx4 v25, v[20:23], s[0:1] offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(6) +; CHECK-NEXT: global_store_dwordx4 v25, v[0:3], s[0:1] offset:64 +; CHECK-NEXT: global_store_dwordx4 v25, v[16:19], s[0:1] offset:48 +; CHECK-NEXT: global_store_dwordx4 v25, v[12:15], s[0:1] offset:32 +; CHECK-NEXT: global_store_dwordx4 v25, v[8:11], s[0:1] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: global_store_dwordx4 v25, v[4:7], s[0:1] ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false) @@ 
-267,29 +260,28 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 { ; CHECK-LABEL: memcpy_p0_p3_minsize: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v16, 0 -; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset1:1 -; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:2 offset1:3 -; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:4 offset1:5 -; CHECK-NEXT: ds_read2_b64 v[12:15], v16 offset0:6 offset1:7 -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: v_mov_b32_e32 v20, 0 +; CHECK-NEXT: ds_read2_b64 v[0:3], v20 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[4:7], v20 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[8:11], v20 offset0:4 offset1:5 +; CHECK-NEXT: ds_read2_b64 v[12:15], v20 offset0:6 offset1:7 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v21, s1 -; CHECK-NEXT: v_mov_b32_e32 v20, s0 -; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] -; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[4:7] offset:16 -; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32 -; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9 -; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11 -; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13 -; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15 -; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48 +; CHECK-NEXT: global_store_dwordx4 v20, v[0:3], s[0:1] +; CHECK-NEXT: global_store_dwordx4 v20, v[4:7], s[0:1] offset:16 +; CHECK-NEXT: global_store_dwordx4 v20, v[8:11], s[0:1] offset:32 +; CHECK-NEXT: ds_read2_b64 v[0:3], v20 offset0:8 offset1:9 +; CHECK-NEXT: ds_read2_b64 v[4:7], v20 offset0:10 offset1:11 +; CHECK-NEXT: ds_read_b128 v[8:11], v20 offset:96 +; CHECK-NEXT: ds_read_b128 v[16:19], v20 offset:112 +; CHECK-NEXT: global_store_dwordx4 v20, v[12:15], s[0:1] offset:48 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: global_store_dwordx4 v20, 
v[0:3], s[0:1] offset:64 +; CHECK-NEXT: s_waitcnt lgkmcnt(2) +; CHECK-NEXT: global_store_dwordx4 v20, v[4:7], s[0:1] offset:80 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: global_store_dwordx4 v20, v[8:11], s[0:1] offset:96 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64 -; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[4:7] offset:80 -; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:96 -; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[16:19] offset:112 +; CHECK-NEXT: global_store_dwordx4 v20, v[16:19], s[0:1] offset:112 ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false) @@ -300,24 +292,20 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 { ; CHECK-LABEL: memcpy_p0_p0_optsize: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: v_mov_b32_e32 v12, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v12, s3 -; CHECK-NEXT: v_mov_b32_e32 v11, s2 -; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46 -; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44 -; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32 -; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16 -; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12] -; CHECK-NEXT: v_mov_b32_e32 v12, s1 -; CHECK-NEXT: v_mov_b32_e32 v11, s0 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46 -; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44 -; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32 -; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16 -; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7] +; CHECK-NEXT: global_load_dwordx2 v[8:9], v12, s[2:3] offset:32 +; CHECK-NEXT: global_load_dwordx2 v[10:11], v12, s[2:3] offset:39 +; CHECK-NEXT: 
global_load_dwordx4 v[0:3], v12, s[2:3] +; CHECK-NEXT: global_load_dwordx4 v[4:7], v12, s[2:3] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] offset:39 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16 ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false) @@ -456,62 +444,59 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8 -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_add_u32 s20, s20, s17 ; CHECK-NEXT: s_addc_u32 s21, s21, 0 +; CHECK-NEXT: v_mov_b32_e32 v25, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v26, s0 -; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124 -; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120 -; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116 -; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112 -; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108 -; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104 -; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100 -; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:16 -; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:20 -; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:24 -; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 
offen offset:28 -; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:32 -; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:36 -; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:40 -; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:44 -; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:48 -; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:52 -; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:56 -; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:60 -; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:92 -; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:88 -; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84 -; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:80 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v25, s1 ; CHECK-NEXT: v_mov_b32_e32 v24, s0 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112 -; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:76 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:72 -; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:68 -; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:64 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96 -; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v24, s[20:23], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v2, v24, s[20:23], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v1, v24, s[20:23], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v0, v24, s[20:23], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v7, v24, s[20:23], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v6, v24, s[20:23], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v5, v24, 
s[20:23], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v4, v24, s[20:23], 0 offen offset:96 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-NEXT: buffer_load_dword v8, v24, s[20:23], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v9, v24, s[20:23], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v10, v24, s[20:23], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v11, v24, s[20:23], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v12, v24, s[20:23], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v13, v24, s[20:23], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v14, v24, s[20:23], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v15, v24, s[20:23], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v16, v24, s[20:23], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v17, v24, s[20:23], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v18, v24, s[20:23], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v19, v24, s[20:23], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v23, v24, s[20:23], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v22, v24, s[20:23], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v21, v24, s[20:23], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v20, v24, s[20:23], 0 offen offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v25, v[0:3], s[0:1] offset:112 +; CHECK-NEXT: buffer_load_dword v3, v24, s[20:23], 0 offen offset:76 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v2, v24, s[20:23], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v1, v24, s[20:23], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v0, v24, s[20:23], 0 offen offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(21) +; CHECK-NEXT: global_store_dwordx4 v25, v[4:7], s[0:1] 
offset:96 +; CHECK-NEXT: buffer_load_dword v4, v24, s[20:23], 0 offen ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:64 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:48 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:32 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; CHECK-NEXT: buffer_load_dword v5, v24, s[20:23], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v6, v24, s[20:23], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v24, s[20:23], 0 offen offset:12 +; CHECK-NEXT: s_waitcnt vmcnt(10) +; CHECK-NEXT: global_store_dwordx4 v25, v[20:23], s[0:1] offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(6) +; CHECK-NEXT: global_store_dwordx4 v25, v[0:3], s[0:1] offset:64 +; CHECK-NEXT: global_store_dwordx4 v25, v[16:19], s[0:1] offset:48 +; CHECK-NEXT: global_store_dwordx4 v25, v[12:15], s[0:1] offset:32 +; CHECK-NEXT: global_store_dwordx4 v25, v[8:11], s[0:1] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: global_store_dwordx4 v25, v[4:7], s[0:1] ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false) @@ -558,29 +543,28 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 { ; CHECK-LABEL: memcpy_p0_p3_optsize: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v16, 0 -; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset1:1 -; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:2 offset1:3 -; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:4 offset1:5 -; CHECK-NEXT: ds_read2_b64 v[12:15], v16 offset0:6 offset1:7 -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: v_mov_b32_e32 v20, 0 +; CHECK-NEXT: ds_read2_b64 v[0:3], v20 offset1:1 +; CHECK-NEXT: 
ds_read2_b64 v[4:7], v20 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[8:11], v20 offset0:4 offset1:5 +; CHECK-NEXT: ds_read2_b64 v[12:15], v20 offset0:6 offset1:7 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v21, s1 -; CHECK-NEXT: v_mov_b32_e32 v20, s0 -; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] -; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[4:7] offset:16 -; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32 -; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9 -; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11 -; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13 -; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15 -; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48 +; CHECK-NEXT: global_store_dwordx4 v20, v[0:3], s[0:1] +; CHECK-NEXT: global_store_dwordx4 v20, v[4:7], s[0:1] offset:16 +; CHECK-NEXT: global_store_dwordx4 v20, v[8:11], s[0:1] offset:32 +; CHECK-NEXT: ds_read2_b64 v[0:3], v20 offset0:8 offset1:9 +; CHECK-NEXT: ds_read2_b64 v[4:7], v20 offset0:10 offset1:11 +; CHECK-NEXT: ds_read_b128 v[8:11], v20 offset:96 +; CHECK-NEXT: ds_read_b128 v[16:19], v20 offset:112 +; CHECK-NEXT: global_store_dwordx4 v20, v[12:15], s[0:1] offset:48 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: global_store_dwordx4 v20, v[0:3], s[0:1] offset:64 +; CHECK-NEXT: s_waitcnt lgkmcnt(2) +; CHECK-NEXT: global_store_dwordx4 v20, v[4:7], s[0:1] offset:80 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: global_store_dwordx4 v20, v[8:11], s[0:1] offset:96 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64 -; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[4:7] offset:80 -; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:96 -; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[16:19] offset:112 +; CHECK-NEXT: global_store_dwordx4 v20, v[16:19], s[0:1] offset:112 ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr 
addrspace(3) @shared, i64 128, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll index a476a5830ffad..4d26ea3aa909a 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll @@ -15,17 +15,12 @@ define amdgpu_kernel void @_Z6brokenPd(ptr %arg) { ; GCN-NEXT: s_add_u32 s0, s0, s17 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s4, 4 -; GCN-NEXT: s_addc_u32 s5, s5, 0 -; GCN-NEXT: flat_store_dword v[0:1], v2 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: flat_store_dword v[0:1], v3 +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm bb: %tmp = alloca double, align 8, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll index 61ac1fe92c278..3ec90c9aecd2c 100644 --- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll @@ -2752,16 +2752,16 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) { } define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) { -; GFX9-LABEL: flat_inst_salu_offset_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:1 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_store_byte v[0:1], v0 -; GFX9-NEXT: s_endpgm +; 
GFX9-SDAG-LABEL: flat_inst_salu_offset_1: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:1 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: flat_store_byte v[0:1], v0 +; GFX9-SDAG-NEXT: s_endpgm ; ; GFX10-LABEL: flat_inst_salu_offset_1: ; GFX10: ; %bb.0: @@ -2816,12 +2816,28 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) { ; GFX12-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-SDAG-FAKE16-NEXT: s_endpgm ; +; GFX9-GISEL-LABEL: flat_inst_salu_offset_1: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 1 +; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: flat_store_byte v[0:1], v0 +; GFX9-GISEL-NEXT: s_endpgm +; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_1: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 1 +; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:1 glc dlc +; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX11-GISEL-NEXT: s_endpgm @@ -2830,8 +2846,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) { ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: 
s_add_co_u32 s0, s0, 1 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm @@ -2842,16 +2861,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) { } define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) { -; GFX9-LABEL: flat_inst_salu_offset_11bit_max: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_store_byte v[0:1], v0 -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: flat_inst_salu_offset_11bit_max: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:2047 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: flat_store_byte v[0:1], v0 +; GFX9-SDAG-NEXT: s_endpgm ; ; GFX10-LABEL: flat_inst_salu_offset_11bit_max: ; GFX10: ; %bb.0: @@ -2906,12 +2925,28 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) { ; GFX12-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-SDAG-FAKE16-NEXT: s_endpgm ; +; GFX9-GISEL-LABEL: flat_inst_salu_offset_11bit_max: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff +; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: flat_store_byte v[0:1], v0 +; GFX9-GISEL-NEXT: s_endpgm +; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_11bit_max: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff +; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:2047 glc dlc +; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX11-GISEL-NEXT: s_endpgm @@ -2920,8 +2955,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) { ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm @@ -2932,16 +2970,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) { } define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) { -; GFX9-LABEL: flat_inst_salu_offset_12bit_max: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: 
flat_store_byte v[0:1], v0 -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: flat_inst_salu_offset_12bit_max: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: flat_store_byte v[0:1], v0 +; GFX9-SDAG-NEXT: s_endpgm ; ; GFX10-LABEL: flat_inst_salu_offset_12bit_max: ; GFX10: ; %bb.0: @@ -2996,12 +3034,28 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) { ; GFX12-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-SDAG-FAKE16-NEXT: s_endpgm ; +; GFX9-GISEL-LABEL: flat_inst_salu_offset_12bit_max: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfff +; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: flat_store_byte v[0:1], v0 +; GFX9-GISEL-NEXT: s_endpgm +; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_12bit_max: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfff +; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc +; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX11-GISEL-NEXT: s_endpgm @@ -3010,8 +3064,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) { ; GFX12-GISEL: ; %bb.0: ; 
GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm @@ -3122,8 +3179,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm @@ -3234,8 +3294,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfffff800 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:-2048 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm @@ -3346,8 +3409,11 @@ define amdgpu_kernel void 
@flat_inst_salu_offset_neg_12bit_max(ptr %p) { ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfffff000 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm @@ -3458,8 +3524,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xffffe000 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm @@ -3470,16 +3539,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { } define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) { -; GFX9-LABEL: flat_inst_salu_offset_2x_11bit_max: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_store_byte v[0:1], v0 -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_11bit_max: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x24 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: flat_store_byte v[0:1], v0 +; GFX9-SDAG-NEXT: s_endpgm ; ; GFX10-LABEL: flat_inst_salu_offset_2x_11bit_max: ; GFX10: ; %bb.0: @@ -3534,12 +3603,28 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) { ; GFX12-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-SDAG-FAKE16-NEXT: s_endpgm ; +; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_11bit_max: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfff +; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: flat_store_byte v[0:1], v0 +; GFX9-GISEL-NEXT: s_endpgm +; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_11bit_max: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfff +; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc +; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX11-GISEL-NEXT: s_endpgm @@ -3548,8 +3633,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) { ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff +; 
GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm @@ -3660,8 +3748,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm @@ -3772,8 +3863,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) { ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x3fff +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:16383 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm @@ -3884,8 +3978,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; 
GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfffff000 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm @@ -3996,8 +4093,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xffffe000 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm @@ -4108,8 +4208,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xffffc000 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:-16384 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll index 
58f0b9657476c..d6730dbbdb6f4 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll @@ -12,8 +12,9 @@ define amdgpu_kernel void @preload_ptr_kernarg_header(ptr inreg %arg) { ; ASM-NEXT: s_branch .LBB0_0 ; ASM-NEXT: .p2align 8 ; ASM-NEXT: .LBB0_0: +; ASM-NEXT: v_mov_b32_e32 v2, 0 ; ASM-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; ASM-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; ASM-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; ASM-NEXT: s_endpgm store ptr %arg, ptr %arg ret void @@ -29,9 +30,9 @@ define amdgpu_kernel void @preload_i32_kernarg_header(ptr inreg %arg, i32 inreg ; ASM-NEXT: s_branch .LBB1_0 ; ASM-NEXT: .p2align 8 ; ASM-NEXT: .LBB1_0: -; ASM-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; ASM-NEXT: v_mov_b32_e32 v2, s10 -; ASM-NEXT: flat_store_dword v[0:1], v2 +; ASM-NEXT: v_mov_b32_e32 v0, 0 +; ASM-NEXT: v_mov_b32_e32 v1, s10 +; ASM-NEXT: global_store_dword v0, v1, s[8:9] ; ASM-NEXT: s_endpgm store i32 %arg1, ptr %arg ret void diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll index 29448ab2d822e..826d737d68ae3 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll @@ -8,52 +8,37 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; GFX900-LABEL: scalar_to_vector_v8i16: ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v5, s3 -; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 -; GFX900-NEXT: v_mov_b32_e32 v0, s0 -; GFX900-NEXT: v_mov_b32_e32 v1, s1 -; GFX900-NEXT: v_mov_b32_e32 v2, s0 +; GFX900-NEXT: v_mov_b32_e32 v1, s0 +; GFX900-NEXT: v_mov_b32_e32 v2, s1 ; 
GFX900-NEXT: v_mov_b32_e32 v3, s0 -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX900-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX900-NEXT: v_mov_b32_e32 v4, s0 +; GFX900-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] ; GFX900-NEXT: s_endpgm ; ; GFX906-LABEL: scalar_to_vector_v8i16: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX906-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX906-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX906-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v5, s3 -; GFX906-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 -; GFX906-NEXT: v_mov_b32_e32 v0, s0 -; GFX906-NEXT: v_mov_b32_e32 v1, s1 -; GFX906-NEXT: v_mov_b32_e32 v2, s0 +; GFX906-NEXT: v_mov_b32_e32 v1, s0 +; GFX906-NEXT: v_mov_b32_e32 v2, s1 ; GFX906-NEXT: v_mov_b32_e32 v3, s0 -; GFX906-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX906-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX906-NEXT: v_mov_b32_e32 v4, s0 +; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] ; GFX906-NEXT: s_endpgm ; ; GFX908-LABEL: scalar_to_vector_v8i16: ; GFX908: ; %bb.0: ; %entry ; GFX908-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX908-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX908-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, s3 -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 -; GFX908-NEXT: v_mov_b32_e32 v0, s0 -; GFX908-NEXT: v_mov_b32_e32 v1, s1 -; GFX908-NEXT: v_mov_b32_e32 v2, s0 +; GFX908-NEXT: v_mov_b32_e32 v1, s0 +; GFX908-NEXT: v_mov_b32_e32 v2, s1 ; GFX908-NEXT: v_mov_b32_e32 v3, s0 -; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX908-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX908-NEXT: v_mov_b32_e32 v4, s0 +; GFX908-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] ; GFX908-NEXT: 
s_endpgm ; ; GFX90A-LABEL: scalar_to_vector_v8i16: @@ -61,17 +46,12 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4 -; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX90A-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, s3 -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: v_mov_b32_e32 v3, s0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX90A-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX90A-NEXT: s_endpgm entry: %val.1.i32 = extractelement <2 x i32> %in, i64 0 @@ -95,52 +75,37 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; GFX900-LABEL: scalar_to_vector_v8f16: ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v5, s3 -; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 -; GFX900-NEXT: v_mov_b32_e32 v0, s0 -; GFX900-NEXT: v_mov_b32_e32 v1, s1 +; GFX900-NEXT: v_mov_b32_e32 v1, s0 +; GFX900-NEXT: v_mov_b32_e32 v2, s1 +; GFX900-NEXT: v_mov_b32_e32 v4, s0 ; GFX900-NEXT: v_mov_b32_e32 v3, s0 -; GFX900-NEXT: v_mov_b32_e32 v2, s0 -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX900-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX900-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] ; GFX900-NEXT: s_endpgm ; ; GFX906-LABEL: scalar_to_vector_v8f16: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; 
GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX906-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX906-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX906-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v5, s3 -; GFX906-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 -; GFX906-NEXT: v_mov_b32_e32 v0, s0 -; GFX906-NEXT: v_mov_b32_e32 v1, s1 +; GFX906-NEXT: v_mov_b32_e32 v1, s0 +; GFX906-NEXT: v_mov_b32_e32 v2, s1 +; GFX906-NEXT: v_mov_b32_e32 v4, s0 ; GFX906-NEXT: v_mov_b32_e32 v3, s0 -; GFX906-NEXT: v_mov_b32_e32 v2, s0 -; GFX906-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX906-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] ; GFX906-NEXT: s_endpgm ; ; GFX908-LABEL: scalar_to_vector_v8f16: ; GFX908: ; %bb.0: ; %entry ; GFX908-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX908-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX908-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, s3 -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 -; GFX908-NEXT: v_mov_b32_e32 v0, s0 -; GFX908-NEXT: v_mov_b32_e32 v1, s1 +; GFX908-NEXT: v_mov_b32_e32 v1, s0 +; GFX908-NEXT: v_mov_b32_e32 v2, s1 +; GFX908-NEXT: v_mov_b32_e32 v4, s0 ; GFX908-NEXT: v_mov_b32_e32 v3, s0 -; GFX908-NEXT: v_mov_b32_e32 v2, s0 -; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX908-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX908-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: scalar_to_vector_v8f16: @@ -148,17 +113,12 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4 -; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX90A-NEXT: s_addc_u32 
flat_scratch_hi, s13, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, s3 -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s0 ; GFX90A-NEXT: v_mov_b32_e32 v2, s0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX90A-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX90A-NEXT: s_endpgm entry: %val.1.float = extractelement <2 x float> %in, i64 0 diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll index 61da875cf2f28..fc66fddce5da5 100644 --- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -12,18 +12,18 @@ define amdgpu_kernel void @no_reorder_flat_load_local_store_local_load(ptr addrs ; CI-LABEL: no_reorder_flat_load_local_store_local_load: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v4, 0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_load_dword s0, s[4:5], 0x9 -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: ds_write_b128 v4, v[0:3] offset:512 ; CI-NEXT: ds_read2_b32 v[0:1], v4 offset0:129 offset1:130 -; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 ; CI-NEXT: s_endpgm ; @@ -32,15 +32,13 @@ define amdgpu_kernel void @no_reorder_flat_load_local_store_local_load(ptr addrs ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 
s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_write_b128 v4, v[0:3] offset:512 ; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset0:129 offset1:130 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 ; GFX9-NEXT: s_endpgm %ptr1 = getelementptr %struct.lds, ptr addrspace(3) @stored_lds_struct, i32 0, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll index a376262e6d539..b948e298bd4fe 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll @@ -190,7 +190,7 @@ define amdgpu_kernel void @barrier_vmcnt_flat(ptr %arg) { ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_barrier ; GFX8-NEXT: flat_store_dword v[0:1], v4 ; GFX8-NEXT: s_endpgm @@ -199,18 +199,16 @@ define amdgpu_kernel void @barrier_vmcnt_flat(ptr %arg) { ; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v2, v1, s[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc -; GFX9-NEXT: flat_load_dword v4, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc 
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_barrier -; GFX9-NEXT: flat_store_dword v[0:1], v4 +; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -244,7 +242,7 @@ define amdgpu_kernel void @barrier_vscnt_flat(ptr %arg) { ; GFX8-NEXT: v_mov_b32_e32 v3, 1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_barrier ; GFX8-NEXT: flat_store_dword v[0:1], v3 ; GFX8-NEXT: s_endpgm @@ -258,15 +256,15 @@ define amdgpu_kernel void @barrier_vscnt_flat(ptr %arg) { ; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc -; GFX9-NEXT: flat_store_dword v[2:3], v1 +; GFX9-NEXT: global_store_dword v[2:3], v1, off ; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] ; GFX9-NEXT: v_mov_b32_e32 v3, 1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_barrier -; GFX9-NEXT: flat_store_dword v[0:1], v3 +; GFX9-NEXT: global_store_dword v[0:1], v3, off ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -305,7 +303,7 @@ define amdgpu_kernel void @barrier_vmcnt_vscnt_flat(ptr %arg) { ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_barrier ; GFX8-NEXT: flat_store_dword v[0:1], v3 ; GFX8-NEXT: s_endpgm @@ -319,18 +317,16 @@ define amdgpu_kernel void @barrier_vmcnt_vscnt_flat(ptr %arg) { ; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 
v3, vcc, v4, v3, vcc -; GFX9-NEXT: flat_store_dword v[2:3], v1 +; GFX9-NEXT: global_store_dword v[2:3], v1, off ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: flat_load_dword v3, v[2:3] +; GFX9-NEXT: global_load_dword v3, v2, s[0:1] ; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_barrier -; GFX9-NEXT: flat_store_dword v[0:1], v3 +; GFX9-NEXT: global_store_dword v[0:1], v3, off ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -369,7 +365,6 @@ define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(ptr %arg) { ; GFX8-NEXT: flat_load_dword v3, v[2:3] ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -386,19 +381,16 @@ define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(ptr %arg) { ; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc -; GFX9-NEXT: flat_store_dword v[2:3], v1 +; GFX9-NEXT: global_store_dword v[2:3], v1, off ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: flat_load_dword v3, v[2:3] +; GFX9-NEXT: global_load_dword v3, v2, s[0:1] ; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_barrier -; 
GFX9-NEXT: flat_store_dword v[0:1], v3 +; GFX9-NEXT: global_store_dword v[0:1], v3, off ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -479,7 +471,7 @@ define amdgpu_kernel void @load_vmcnt_flat(ptr %arg) { ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_store_dword v[0:1], v4 ; GFX8-NEXT: s_endpgm ; @@ -487,17 +479,15 @@ define amdgpu_kernel void @load_vmcnt_flat(ptr %arg) { ; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v2, v1, s[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc -; GFX9-NEXT: flat_load_dword v4, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_store_dword v[0:1], v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll index 99fe986cf6378..60bb38f863e8e 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll @@ -66,7 +66,9 @@ define amdgpu_kernel void @store_global_from_flat(ptr %generic_scalar) #0 { define amdgpu_kernel void @store_group_from_flat(ptr %generic_scalar) #0 { ; CHECK-LABEL: define 
amdgpu_kernel void @store_group_from_flat( ; CHECK-SAME: ptr [[GENERIC_SCALAR:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[_TMP0:%.*]] = addrspacecast ptr [[GENERIC_SCALAR]] to ptr addrspace(3) +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[GENERIC_SCALAR]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(1) [[TMP1]] to ptr +; CHECK-NEXT: [[_TMP0:%.*]] = addrspacecast ptr [[TMP2]] to ptr addrspace(3) ; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(3) [[_TMP0]], align 4 ; CHECK-NEXT: ret void ; @@ -78,7 +80,9 @@ define amdgpu_kernel void @store_group_from_flat(ptr %generic_scalar) #0 { define amdgpu_kernel void @store_private_from_flat(ptr %generic_scalar) #0 { ; CHECK-LABEL: define amdgpu_kernel void @store_private_from_flat( ; CHECK-SAME: ptr [[GENERIC_SCALAR:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[_TMP0:%.*]] = addrspacecast ptr [[GENERIC_SCALAR]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[GENERIC_SCALAR]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(1) [[TMP1]] to ptr +; CHECK-NEXT: [[_TMP0:%.*]] = addrspacecast ptr [[TMP2]] to ptr addrspace(5) ; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[_TMP0]], align 4 ; CHECK-NEXT: ret void ; @@ -136,8 +140,10 @@ define amdgpu_kernel void @load_store_private(ptr addrspace(5) nocapture %input, define amdgpu_kernel void @load_store_flat(ptr nocapture %input, ptr nocapture %output) #0 { ; CHECK-LABEL: define amdgpu_kernel void @load_store_flat( ; CHECK-SAME: ptr captures(none) [[INPUT:%.*]], ptr captures(none) [[OUTPUT:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[INPUT]], align 4 -; CHECK-NEXT: store i32 [[VAL]], ptr [[OUTPUT]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1) +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[TMP1]], align 4 +; CHECK-NEXT: store i32 [[VAL]], ptr 
addrspace(1) [[TMP2]], align 4 ; CHECK-NEXT: ret void ; %val = load i32, ptr %input, align 4 diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll index 39af91b81110d..8fa8a1f4cf2b7 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll @@ -4,21 +4,28 @@ define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) { ; CHECK-LABEL: InferNothing: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s6, s[4:5], 0x24 -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; CHECK-NEXT: s_mov_b64 s[0:1], exec +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_cbranch_execz .LBB0_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s2, s[4:5], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x2c +; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s7, s6, 31 -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 -; CHECK-NEXT: s_add_u32 s0, s2, s0 -; CHECK-NEXT: s_addc_u32 s1, s3, s1 -; CHECK-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-NEXT: v_add_co_u32_e64 v2, vcc, -8, s0 -; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc -; CHECK-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_ashr_i32 s3, s2, 31 +; CHECK-NEXT: s_lshl_b64 s[2:3], s[2:3], 3 +; CHECK-NEXT: s_add_u32 s2, s8, s2 +; CHECK-NEXT: s_addc_u32 s3, s9, s3 +; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; CHECK-NEXT: v_mul_f64 v[0:1], s[10:11], v[0:1] +; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] offset:-8 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol +; 
CHECK-NEXT: .LBB0_2: ; CHECK-NEXT: s_endpgm entry: %i = add nsw i32 %a, -1 @@ -66,35 +73,47 @@ entry: define protected amdgpu_kernel void @InferMixed(i32 %a, ptr addrspace(1) %b, double %c, ptr %d) { ; CHECK-LABEL: InferMixed: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x3c ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; CHECK-NEXT: s_mov_b64 s[6:7], exec +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; CHECK-NEXT: s_mov_b64 s[8:9], exec +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: s_and_saveexec_b64 s[10:11], vcc +; CHECK-NEXT: s_cbranch_execz .LBB2_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_bcnt1_i32_b64 s8, s[8:9] +; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], s8 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] -; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1] +; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[6:7] +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol +; CHECK-NEXT: .LBB2_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[10:11] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_mov_b64 s[6:7], exec ; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CHECK-NEXT: s_and_saveexec_b64 s[8:9], vcc -; CHECK-NEXT: s_cbranch_execz .LBB2_2 -; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_cbranch_execz .LBB2_4 +; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: s_load_dword s4, s[4:5], 0x24 +; CHECK-NEXT: s_bcnt1_i32_b64 s5, s[6:7] +; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], s5 +; CHECK-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_ashr_i32 s5, s4, 
31 -; CHECK-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CHECK-NEXT: s_add_u32 s0, s0, s4 -; CHECK-NEXT: s_addc_u32 s1, s1, s5 -; CHECK-NEXT: s_bcnt1_i32_b64 s4, s[6:7] -; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], s4 -; CHECK-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1] +; CHECK-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; CHECK-NEXT: s_add_u32 s0, s0, s2 +; CHECK-NEXT: s_addc_u32 s1, s1, s3 ; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] offset:-7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: .LBB2_2: +; CHECK-NEXT: .LBB2_4: ; CHECK-NEXT: s_endpgm entry: %i = add nsw i32 %a, -1 diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll index 48becdeba1c6a..0844f6683527f 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll @@ -48,7 +48,8 @@ define amdgpu_kernel void @memset_global_to_flat_no_md(ptr addrspace(1) %global. 
define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group(ptr %dest, ptr addrspace(3) %src.group.ptr, i64 %size) #0 { ; CHECK-LABEL: define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group( ; CHECK-SAME: ptr [[DEST:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 4 [[DEST]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DEST]] to ptr addrspace(1) +; CHECK-NEXT: call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) align 4 [[TMP1]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] ; CHECK-NEXT: ret void ; %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr @@ -59,7 +60,8 @@ define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group(ptr %dest, define amdgpu_kernel void @memcpy_inline_flat_to_flat_replace_src_with_group(ptr %dest, ptr addrspace(3) %src.group.ptr) #0 { ; CHECK-LABEL: define amdgpu_kernel void @memcpy_inline_flat_to_flat_replace_src_with_group( ; CHECK-SAME: ptr [[DEST:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: call void @llvm.memcpy.inline.p0.p3.i64(ptr align 4 [[DEST]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 42, i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DEST]] to ptr addrspace(1) +; CHECK-NEXT: call void @llvm.memcpy.inline.p1.p3.i64(ptr addrspace(1) align 4 [[TMP1]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 42, i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] ; CHECK-NEXT: ret void ; %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr @@ -70,7 +72,8 @@ define amdgpu_kernel void @memcpy_inline_flat_to_flat_replace_src_with_group(ptr define amdgpu_kernel 
void @memcpy_flat_to_flat_replace_dest_with_group(ptr addrspace(3) %dest.group.ptr, ptr %src.ptr, i64 %size) #0 { ; CHECK-LABEL: define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_with_group( ; CHECK-SAME: ptr addrspace(3) [[DEST_GROUP_PTR:%.*]], ptr [[SRC_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) align 4 [[DEST_GROUP_PTR]], ptr align 4 [[SRC_PTR]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[SRC_PTR]] to ptr addrspace(1) +; CHECK-NEXT: call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) align 4 [[DEST_GROUP_PTR]], ptr addrspace(1) align 4 [[TMP1]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] ; CHECK-NEXT: ret void ; %cast.dest = addrspacecast ptr addrspace(3) %dest.group.ptr to ptr @@ -116,7 +119,8 @@ define amdgpu_kernel void @memcpy_group_to_flat_replace_dest_global(ptr addrspac define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct(ptr %dest, ptr addrspace(3) %src.group.ptr, i64 %size) #0 { ; CHECK-LABEL: define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct( ; CHECK-SAME: ptr [[DEST:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 4 [[DEST]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa.struct [[TBAA_STRUCT8:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DEST]] to ptr addrspace(1) +; CHECK-NEXT: call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) align 4 [[TMP1]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa.struct [[TBAA_STRUCT8:![0-9]+]] ; CHECK-NEXT: ret void ; %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr @@ -127,7 +131,8 @@ define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_tbaa_struc define amdgpu_kernel 
void @memcpy_flat_to_flat_replace_src_with_group_no_md(ptr %dest, ptr addrspace(3) %src.group.ptr, i64 %size) #0 { ; CHECK-LABEL: define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_no_md( ; CHECK-SAME: ptr [[DEST:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 4 [[DEST]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DEST]] to ptr addrspace(1) +; CHECK-NEXT: call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) align 4 [[TMP1]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false) ; CHECK-NEXT: ret void ; %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr @@ -138,8 +143,10 @@ define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_no_md(ptr define amdgpu_kernel void @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md(ptr %dest0, ptr %dest1, ptr addrspace(3) %src.group.ptr, i64 %size) #0 { ; CHECK-LABEL: define amdgpu_kernel void @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md( ; CHECK-SAME: ptr [[DEST0:%.*]], ptr [[DEST1:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 4 [[DEST0]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false) -; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 4 [[DEST1]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DEST0]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DEST1]] to ptr addrspace(1) +; CHECK-NEXT: call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) align 4 [[TMP1]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false) ; CHECK-NEXT: 
ret void ; %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr @@ -162,7 +169,8 @@ define amdgpu_kernel void @memcpy_group_flat_to_flat_self(ptr addrspace(3) %grou define amdgpu_kernel void @memmove_flat_to_flat_replace_src_with_group(ptr %dest, ptr addrspace(3) %src.group.ptr, i64 %size) #0 { ; CHECK-LABEL: define amdgpu_kernel void @memmove_flat_to_flat_replace_src_with_group( ; CHECK-SAME: ptr [[DEST:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: call void @llvm.memmove.p0.p3.i64(ptr align 4 [[DEST]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DEST]] to ptr addrspace(1) +; CHECK-NEXT: call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) align 4 [[TMP1]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] ; CHECK-NEXT: ret void ; %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr diff --git a/llvm/test/Transforms/OpenMP/barrier_removal.ll b/llvm/test/Transforms/OpenMP/barrier_removal.ll index 5b7544b1a7961..ade776d109067 100644 --- a/llvm/test/Transforms/OpenMP/barrier_removal.ll +++ b/llvm/test/Transforms/OpenMP/barrier_removal.ll @@ -682,11 +682,18 @@ m: } define internal void @write_then_barrier0(ptr %p) { -; CHECK-LABEL: define {{[^@]+}}@write_then_barrier0 -; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: store i32 0, ptr [[P]], align 4 -; CHECK-NEXT: call void @aligned_barrier() -; CHECK-NEXT: ret void +; MODULE-LABEL: define {{[^@]+}}@write_then_barrier0 +; MODULE-SAME: (ptr [[P:%.*]]) { +; MODULE-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1) +; MODULE-NEXT: store i32 0, ptr addrspace(1) [[TMP1]], align 4 +; MODULE-NEXT: call void @aligned_barrier() +; MODULE-NEXT: ret void +; +; CGSCC-LABEL: define {{[^@]+}}@write_then_barrier0 +; CGSCC-SAME: (ptr [[P:%.*]]) 
{ +; CGSCC-NEXT: store i32 0, ptr [[P]], align 4 +; CGSCC-NEXT: call void @aligned_barrier() +; CGSCC-NEXT: ret void ; store i32 0, ptr %p call void @aligned_barrier() @@ -695,7 +702,8 @@ define internal void @write_then_barrier0(ptr %p) { define internal void @barrier_then_write0(ptr %p) { ; MODULE-LABEL: define {{[^@]+}}@barrier_then_write0 ; MODULE-SAME: (ptr [[P:%.*]]) { -; MODULE-NEXT: store i32 0, ptr [[P]], align 4 +; MODULE-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1) +; MODULE-NEXT: store i32 0, ptr addrspace(1) [[TMP1]], align 4 ; MODULE-NEXT: ret void ; ; CGSCC-LABEL: define {{[^@]+}}@barrier_then_write0 @@ -711,7 +719,8 @@ define internal void @barrier_then_write0(ptr %p) { define internal void @barrier_then_write_then_barrier0(ptr %p) { ; MODULE-LABEL: define {{[^@]+}}@barrier_then_write_then_barrier0 ; MODULE-SAME: (ptr [[P:%.*]]) { -; MODULE-NEXT: store i32 0, ptr [[P]], align 4 +; MODULE-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1) +; MODULE-NEXT: store i32 0, ptr addrspace(1) [[TMP1]], align 4 ; MODULE-NEXT: call void @aligned_barrier() ; MODULE-NEXT: ret void ; diff --git a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll index 2f1aadc073142..81e11e048dfd0 100644 --- a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll +++ b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll @@ -85,8 +85,10 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr % ; CHECK-NEXT: br i1 [[TMP3]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]] ; CHECK: region.guarded: ; CHECK-NEXT: store i32 0, ptr [[X]], align 4, !noalias [[META7:![0-9]+]] -; CHECK-NEXT: store i32 1, ptr [[ARRAYIDX1_I]], align 4, !noalias [[META7]] -; CHECK-NEXT: store i32 [[N_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr [[ARRAYIDX2_I]], align 4, !noalias [[META7]] +; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[ARRAYIDX1_I]] to ptr addrspace(1) +; CHECK-NEXT: store i32 1, 
ptr addrspace(1) [[TMP4]], align 4, !noalias [[META7]] +; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr [[ARRAYIDX2_I]] to ptr addrspace(1) +; CHECK-NEXT: store i32 [[N_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr addrspace(1) [[TMP5]], align 4, !noalias [[META7]] ; CHECK-NEXT: br label [[REGION_GUARDED_END:%.*]] ; CHECK: region.guarded.end: ; CHECK-NEXT: br label [[REGION_BARRIER]] @@ -107,16 +109,17 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr % ; CHECK-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM4_I]] ; CHECK-NEXT: br label [[REGION_CHECK_TID5:%.*]] ; CHECK: region.check.tid5: -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[REGION_GUARDED4:%.*]], label [[REGION_BARRIER2:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0 +; CHECK-NEXT: br i1 [[TMP7]], label [[REGION_GUARDED4:%.*]], label [[REGION_BARRIER2:%.*]] ; CHECK: region.guarded4: -; CHECK-NEXT: store i32 [[SUB3_I]], ptr [[ARRAYIDX5_I]], align 4, !noalias [[META7]] +; CHECK-NEXT: [[TMP8:%.*]] = addrspacecast ptr [[ARRAYIDX5_I]] to ptr addrspace(1) +; CHECK-NEXT: store i32 [[SUB3_I]], ptr addrspace(1) [[TMP8]], align 4, !noalias [[META7]] ; CHECK-NEXT: br label [[REGION_GUARDED_END1:%.*]] ; CHECK: region.guarded.end1: ; CHECK-NEXT: br label [[REGION_BARRIER2]] ; CHECK: region.barrier2: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP4]]) +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP6]]) ; CHECK-NEXT: br label [[REGION_EXIT3]] ; CHECK: region.exit3: ; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 @@ -128,16 +131,17 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr % ; CHECK-NEXT: [[ARRAYIDX7_I:%.*]] = getelementptr inbounds i32, ptr 
[[X]], i64 [[IDXPROM6_I]] ; CHECK-NEXT: br label [[REGION_CHECK_TID10:%.*]] ; CHECK: region.check.tid10: -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0 -; CHECK-NEXT: br i1 [[TMP7]], label [[REGION_GUARDED9:%.*]], label [[REGION_BARRIER7:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 0 +; CHECK-NEXT: br i1 [[TMP10]], label [[REGION_GUARDED9:%.*]], label [[REGION_BARRIER7:%.*]] ; CHECK: region.guarded9: -; CHECK-NEXT: store i32 [[CALL_I]], ptr [[ARRAYIDX7_I]], align 4, !noalias [[META7]] +; CHECK-NEXT: [[TMP11:%.*]] = addrspacecast ptr [[ARRAYIDX7_I]] to ptr addrspace(1) +; CHECK-NEXT: store i32 [[CALL_I]], ptr addrspace(1) [[TMP11]], align 4, !noalias [[META7]] ; CHECK-NEXT: br label [[REGION_GUARDED_END6:%.*]] ; CHECK: region.guarded.end6: ; CHECK-NEXT: br label [[REGION_BARRIER7]] ; CHECK: region.barrier7: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP6]]) +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP9]]) ; CHECK-NEXT: br label [[REGION_EXIT8:%.*]] ; CHECK: region.exit8: ; CHECK-NEXT: [[CALL8_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] @@ -145,16 +149,17 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr % ; CHECK-NEXT: [[ARRAYIDX10_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM9_I]] ; CHECK-NEXT: br label [[REGION_CHECK_TID15:%.*]] ; CHECK: region.check.tid15: -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 -; CHECK-NEXT: br i1 [[TMP9]], label [[REGION_GUARDED14:%.*]], label [[REGION_BARRIER12:%.*]] +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 0 +; CHECK-NEXT: br 
i1 [[TMP13]], label [[REGION_GUARDED14:%.*]], label [[REGION_BARRIER12:%.*]] ; CHECK: region.guarded14: -; CHECK-NEXT: store i32 [[CALL8_I]], ptr [[ARRAYIDX10_I]], align 4, !noalias [[META7]] +; CHECK-NEXT: [[TMP14:%.*]] = addrspacecast ptr [[ARRAYIDX10_I]] to ptr addrspace(1) +; CHECK-NEXT: store i32 [[CALL8_I]], ptr addrspace(1) [[TMP14]], align 4, !noalias [[META7]] ; CHECK-NEXT: br label [[REGION_GUARDED_END11:%.*]] ; CHECK: region.guarded.end11: ; CHECK-NEXT: br label [[REGION_BARRIER12]] ; CHECK: region.barrier12: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP8]]) +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP12]]) ; CHECK-NEXT: br label [[REGION_EXIT13:%.*]] ; CHECK: region.exit13: ; CHECK-NEXT: [[CALL11_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] @@ -162,16 +167,17 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr % ; CHECK-NEXT: [[ARRAYIDX13_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM12_I]] ; CHECK-NEXT: br label [[REGION_CHECK_TID20:%.*]] ; CHECK: region.check.tid20: -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 0 -; CHECK-NEXT: br i1 [[TMP11]], label [[REGION_GUARDED19:%.*]], label [[REGION_BARRIER17:%.*]] +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 0 +; CHECK-NEXT: br i1 [[TMP16]], label [[REGION_GUARDED19:%.*]], label [[REGION_BARRIER17:%.*]] ; CHECK: region.guarded19: -; CHECK-NEXT: store i32 [[CALL11_I]], ptr [[ARRAYIDX13_I]], align 4, !noalias [[META7]] +; CHECK-NEXT: [[TMP17:%.*]] = addrspacecast ptr [[ARRAYIDX13_I]] to ptr addrspace(1) +; CHECK-NEXT: store i32 [[CALL11_I]], ptr addrspace(1) [[TMP17]], align 4, !noalias [[META7]] ; CHECK-NEXT: br label [[REGION_GUARDED_END16:%.*]] ; CHECK: region.guarded.end16: ; 
CHECK-NEXT: br label [[REGION_BARRIER17]] ; CHECK: region.barrier17: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP10]]) +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP15]]) ; CHECK-NEXT: br label [[REGION_EXIT18:%.*]] ; CHECK: region.exit18: ; CHECK-NEXT: [[CALL14_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] @@ -232,11 +238,13 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr % ; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR6]] ; CHECK-DISABLED-NEXT: store i32 0, ptr [[X]], align 4, !noalias [[META7:![0-9]+]] ; CHECK-DISABLED-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 1 -; CHECK-DISABLED-NEXT: store i32 1, ptr [[ARRAYIDX1_I]], align 4, !noalias [[META7]] +; CHECK-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[ARRAYIDX1_I]] to ptr addrspace(1) +; CHECK-DISABLED-NEXT: store i32 1, ptr addrspace(1) [[TMP2]], align 4, !noalias [[META7]] ; CHECK-DISABLED-NEXT: [[SEXT:%.*]] = shl i64 [[N]], 32 ; CHECK-DISABLED-NEXT: [[IDXPROM_I:%.*]] = ashr exact i64 [[SEXT]], 32 ; CHECK-DISABLED-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM_I]] -; CHECK-DISABLED-NEXT: store i32 [[N_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr [[ARRAYIDX2_I]], align 4, !noalias [[META7]] +; CHECK-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[ARRAYIDX2_I]] to ptr addrspace(1) +; CHECK-DISABLED-NEXT: store i32 [[N_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr addrspace(1) [[TMP3]], align 4, !noalias [[META7]] ; CHECK-DISABLED-NEXT: call void @usei8ptr(ptr captures(none) [[HEAP2STACK_H2S]]) #[[ATTR9:[0-9]+]] ; CHECK-DISABLED-NEXT: br label [[FOR_COND_I:%.*]] ; CHECK-DISABLED: for.cond.i: @@ -248,7 +256,8 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr % ; CHECK-DISABLED-NEXT: [[SUB3_I:%.*]] = add nsw i32 [[I_0_I]], -1 ; CHECK-DISABLED-NEXT: 
[[IDXPROM4_I:%.*]] = zext i32 [[I_0_I]] to i64 ; CHECK-DISABLED-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM4_I]] -; CHECK-DISABLED-NEXT: store i32 [[SUB3_I]], ptr [[ARRAYIDX5_I]], align 4, !noalias [[META7]] +; CHECK-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[ARRAYIDX5_I]] to ptr addrspace(1) +; CHECK-DISABLED-NEXT: store i32 [[SUB3_I]], ptr addrspace(1) [[TMP4]], align 4, !noalias [[META7]] ; CHECK-DISABLED-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 ; CHECK-DISABLED-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-DISABLED: __omp_outlined__.exit: @@ -256,15 +265,18 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr % ; CHECK-DISABLED-NEXT: [[CALL_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10:[0-9]+]], !noalias [[META7]] ; CHECK-DISABLED-NEXT: [[IDXPROM6_I:%.*]] = sext i32 [[CALL_I]] to i64 ; CHECK-DISABLED-NEXT: [[ARRAYIDX7_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM6_I]] -; CHECK-DISABLED-NEXT: store i32 [[CALL_I]], ptr [[ARRAYIDX7_I]], align 4, !noalias [[META7]] +; CHECK-DISABLED-NEXT: [[TMP5:%.*]] = addrspacecast ptr [[ARRAYIDX7_I]] to ptr addrspace(1) +; CHECK-DISABLED-NEXT: store i32 [[CALL_I]], ptr addrspace(1) [[TMP5]], align 4, !noalias [[META7]] ; CHECK-DISABLED-NEXT: [[CALL8_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] ; CHECK-DISABLED-NEXT: [[IDXPROM9_I:%.*]] = sext i32 [[CALL8_I]] to i64 ; CHECK-DISABLED-NEXT: [[ARRAYIDX10_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM9_I]] -; CHECK-DISABLED-NEXT: store i32 [[CALL8_I]], ptr [[ARRAYIDX10_I]], align 4, !noalias [[META7]] +; CHECK-DISABLED-NEXT: [[TMP6:%.*]] = addrspacecast ptr [[ARRAYIDX10_I]] to ptr addrspace(1) +; CHECK-DISABLED-NEXT: store i32 [[CALL8_I]], ptr addrspace(1) [[TMP6]], align 4, !noalias [[META7]] ; CHECK-DISABLED-NEXT: [[CALL11_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] 
; CHECK-DISABLED-NEXT: [[IDXPROM12_I:%.*]] = sext i32 [[CALL11_I]] to i64 ; CHECK-DISABLED-NEXT: [[ARRAYIDX13_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM12_I]] -; CHECK-DISABLED-NEXT: store i32 [[CALL11_I]], ptr [[ARRAYIDX13_I]], align 4, !noalias [[META7]] +; CHECK-DISABLED-NEXT: [[TMP7:%.*]] = addrspacecast ptr [[ARRAYIDX13_I]] to ptr addrspace(1) +; CHECK-DISABLED-NEXT: store i32 [[CALL11_I]], ptr addrspace(1) [[TMP7]], align 4, !noalias [[META7]] ; CHECK-DISABLED-NEXT: [[CALL14_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] ; CHECK-DISABLED-NEXT: [[CALL15_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] ; CHECK-DISABLED-NEXT: [[CALL16_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]]