Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 2973ed7

Browse files
committed
fix vgpr16 copy to sgpr32
1 parent 0b2ab11 commit 2973ed7

File tree

4 files changed

+121
-12
lines changed

4 files changed

+121
-12
lines changed

llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

+16-4
Original file line numberDiff line numberDiff line change
@@ -1086,10 +1086,22 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
10861086
TRI->getRegClassForOperandReg(*MRI, MI->getOperand(1));
10871087
size_t SrcSize = TRI->getRegSizeInBits(*SrcRC);
10881088
if (SrcSize == 16) {
1089-
// HACK to handle possible 16bit VGPR source
1090-
auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
1091-
TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
1092-
MIB.addReg(SrcReg, 0, AMDGPU::NoSubRegister);
1089+
assert(MF.getSubtarget<GCNSubtarget>().useRealTrue16Insts() &&
1090+
"We do not expect to see 16-bit copies from VGPR to SGPR unless "
1091+
"we have 16-bit VGPRs");
1092+
assert(MRI->getRegClass(DstReg) == &AMDGPU::SGPR_LO16RegClass ||
1093+
MRI->getRegClass(DstReg) == &AMDGPU::SReg_32RegClass ||
1094+
MRI->getRegClass(DstReg) == &AMDGPU::SReg_32_XM0RegClass);
1095+
// There is no V_READFIRSTLANE_B16, so legalize the dst/src reg to 32 bits
1096+
MRI->setRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);
1097+
Register VReg32 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1098+
const DebugLoc &DL = MI->getDebugLoc();
1099+
BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::SUBREG_TO_REG), VReg32)
1100+
.addImm(0)
1101+
.addReg(SrcReg, 0)
1102+
.addImm(AMDGPU::lo16);
1103+
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
1104+
.addReg(VReg32);
10931105
} else if (SrcSize == 32) {
10941106
auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
10951107
TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);

llvm/lib/Target/AMDGPU/VOP1Instructions.td

+1-8
Original file line numberDiff line numberDiff line change
@@ -1472,16 +1472,9 @@ def : GCNPat <
14721472

14731473
} // End OtherPredicates = [isGFX8Plus, p]
14741474

1475-
let True16Predicate = UseFakeTrue16Insts in {
1476-
def : GCNPat<
1477-
(i32 (DivergentUnaryFrag<anyext> i16:$src)),
1478-
(COPY $src)
1479-
>;
1480-
} // End True16Predicate = UseFakeTrue16Insts
1481-
14821475
let True16Predicate = UseRealTrue16Insts in {
14831476
def : GCNPat<
1484-
(i32 (UniformUnaryFrag<anyext> (i16 SReg_32:$src))),
1477+
(i32 (UniformUnaryFrag<anyext> i16:$src)),
14851478
(COPY $src)
14861479
>;
14871480

llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir

+63
Original file line numberDiff line numberDiff line change
@@ -53,3 +53,66 @@ body: |
5353
%3:sreg_32 = S_OR_B32 %2:sreg_32, %2:sreg_32, implicit-def $scc
5454
%4:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %3:sreg_32, 0, 0, 0, implicit $mode, implicit $exec
5555
...
56+
57+
---
58+
name: vgpr16_to_spgr32
59+
body: |
60+
; GCN-LABEL: name: vgpr16_to_spgr32
61+
; GCN: bb.0.entry:
62+
; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
63+
; GCN-NEXT: {{ $}}
64+
; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
65+
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
66+
; GCN-NEXT: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 killed [[COPY]], 0, 1, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) poison` + 8, align 4, addrspace 3)
67+
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DS_READ2_B32_gfx9_]].sub0
68+
; GCN-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, killed [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
69+
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
70+
; GCN-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:vgpr_32 = SUBREG_TO_REG 0, [[V_CVT_F16_F32_t16_e64_]], %subreg.lo16
71+
; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[SUBREG_TO_REG]], implicit $exec
72+
; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 killed [[S_MOV_B32_]], killed [[V_READFIRSTLANE_B32_]], implicit-def dead $scc
73+
; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 5
74+
; GCN-NEXT: [[S_MUL_I32_:%[0-9]+]]:sreg_32 = S_MUL_I32 killed [[S_AND_B32_]], killed [[S_MOV_B32_1]]
75+
; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2
76+
; GCN-NEXT: S_CMP_LG_U32 killed [[S_MUL_I32_]], killed [[S_MOV_B32_2]], implicit-def $scc
77+
; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
78+
; GCN-NEXT: S_BRANCH %bb.1
79+
; GCN-NEXT: {{ $}}
80+
; GCN-NEXT: bb.1:
81+
; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1
82+
; GCN-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[S_MOV_B32_3]]
83+
; GCN-NEXT: $sgpr0 = COPY [[S_MOV_B32_4]]
84+
; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0
85+
; GCN-NEXT: {{ $}}
86+
; GCN-NEXT: bb.2:
87+
; GCN-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 2
88+
; GCN-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[S_MOV_B32_5]]
89+
; GCN-NEXT: $sgpr0 = COPY [[S_MOV_B32_6]]
90+
; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0
91+
bb.0.entry:
92+
successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), %bb.2(50.00%)
93+
94+
%5:sreg_32 = IMPLICIT_DEF
95+
%6:vgpr_32 = COPY %5:sreg_32
96+
%4:vreg_64 = DS_READ2_B32_gfx9 killed %6:vgpr_32, 0, 1, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) poison` + 8, align 4, addrspace 3)
97+
%7:sgpr_32 = COPY %4.sub0:vreg_64
98+
%8:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, killed %7:sgpr_32, 0, 0, 0, implicit $mode, implicit $exec
99+
%9:sreg_32 = S_MOV_B32 65535
100+
%11:sreg_32 = COPY %8:vgpr_16
101+
%10:sreg_32 = S_AND_B32 killed %9:sreg_32, killed %11:sreg_32, implicit-def dead $scc
102+
%12:sreg_32 = S_MOV_B32 5
103+
%13:sreg_32 = S_MUL_I32 killed %10:sreg_32, killed %12:sreg_32
104+
%14:sreg_32 = S_MOV_B32 2
105+
S_CMP_LG_U32 killed %13:sreg_32, killed %14:sreg_32, implicit-def $scc
106+
S_CBRANCH_SCC1 %bb.2, implicit $scc
107+
S_BRANCH %bb.1
108+
bb.1:
109+
%17:sreg_32 = S_MOV_B32 1
110+
%18:sreg_32 = S_MOV_B32 killed %17:sreg_32
111+
$sgpr0 = COPY %18:sreg_32
112+
SI_RETURN_TO_EPILOG $sgpr0
113+
bb.2:
114+
%15:sreg_32 = S_MOV_B32 2
115+
%16:sreg_32 = S_MOV_B32 killed %15:sreg_32
116+
$sgpr0 = COPY %16:sreg_32
117+
SI_RETURN_TO_EPILOG $sgpr0
118+
...
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s
3+
4+
; Expect v_readfirstlane_b32 to read the full 32-bit VGPR that holds the 16-bit value (there is no 16-bit readfirstlane).
5+
define amdgpu_gs i32 @vgpr16_copyto_sgpr(ptr addrspace(3) %a, i32 %b, ptr addrspace(1) %out) {
6+
; CHECK-LABEL: vgpr16_copyto_sgpr:
7+
; CHECK: ; %bb.0: ; %entry
8+
; CHECK-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
9+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
10+
; CHECK-NEXT: v_cvt_f16_f32_e32 v0.l, v0
11+
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
12+
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
13+
; CHECK-NEXT: s_and_b32 s0, 0xffff, s0
14+
; CHECK-NEXT: s_mul_i32 s0, s0, 5
15+
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
16+
; CHECK-NEXT: s_cmp_lg_u32 s0, 2
17+
; CHECK-NEXT: s_cbranch_scc1 .LBB0_2
18+
; CHECK-NEXT: ; %bb.1: ; %a1
19+
; CHECK-NEXT: s_mov_b32 s0, 1
20+
; CHECK-NEXT: s_branch .LBB0_3
21+
; CHECK-NEXT: .LBB0_2: ; %a2
22+
; CHECK-NEXT: s_mov_b32 s0, 2
23+
; CHECK-NEXT: s_branch .LBB0_3
24+
; CHECK-NEXT: .LBB0_3:
25+
entry:
26+
%1 = load <4 x float>, ptr addrspace(3) poison, align 4
27+
%2 = extractelement <4 x float> %1, i32 0
28+
%3 = fptrunc float %2 to half
29+
%4 = bitcast half %3 to i16
30+
%5 = zext i16 %4 to i32
31+
%6 = add i32 %5, 1
32+
%7 = mul i32 %6, 5
33+
%8 = icmp eq i32 %7, 7
34+
br i1 %8, label %a1, label %a2
35+
36+
a1:
37+
ret i32 1
38+
39+
a2:
40+
ret i32 2
41+
}

0 commit comments

Comments
 (0)