@@ -6187,3 +6187,167 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace
61876187 store volatile i32 %after.offset , ptr addrspace (1 ) %out , align 4
61886188 ret void
61896189}
6190+
; Kernel that takes a single `half` argument followed by an addrspace(1)
; pointer and stores the argument through that pointer.
;
; The check lines below pin the expected code for each check prefix. The SI,
; VI and GFX9 sequences load the argument as one dword and the pointer as a
; dwordx2, then finish with a 16-bit store (buffer_store_short,
; flat_store_short and global_store_short respectively). The EG and CM
; sequences fetch the argument with VTX_READ_16 at offset 36 and write it with
; a MEM_RAT MSKOR store.
;
; NOTE(review) - the check lines look autogenerated (update_llc_test_checks
; style); presumably they should be regenerated rather than edited by hand.
; Confirm against the header of the test file, which is not visible here.
6191+ define amdgpu_kernel void @f16_arg (half %arg , ptr addrspace (1 ) %ptr ) {
6192+ ; SI-LABEL: f16_arg:
6193+ ; SI: ; %bb.0:
6194+ ; SI-NEXT: s_load_dword s6, s[4:5], 0x9
6195+ ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
6196+ ; SI-NEXT: s_mov_b32 s3, 0xf000
6197+ ; SI-NEXT: s_mov_b32 s2, -1
6198+ ; SI-NEXT: s_waitcnt lgkmcnt(0)
6199+ ; SI-NEXT: v_mov_b32_e32 v0, s6
6200+ ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
6201+ ; SI-NEXT: s_endpgm
6202+ ;
6203+ ; VI-LABEL: f16_arg:
6204+ ; VI: ; %bb.0:
6205+ ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
6206+ ; VI-NEXT: s_load_dword s2, s[4:5], 0x24
6207+ ; VI-NEXT: s_waitcnt lgkmcnt(0)
6208+ ; VI-NEXT: v_mov_b32_e32 v0, s0
6209+ ; VI-NEXT: v_mov_b32_e32 v1, s1
6210+ ; VI-NEXT: v_mov_b32_e32 v2, s2
6211+ ; VI-NEXT: flat_store_short v[0:1], v2
6212+ ; VI-NEXT: s_endpgm
6213+ ;
6214+ ; GFX9-LABEL: f16_arg:
6215+ ; GFX9: ; %bb.0:
6216+ ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0
6217+ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
6218+ ; GFX9-NEXT: v_mov_b32_e32 v0, 0
6219+ ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6220+ ; GFX9-NEXT: v_mov_b32_e32 v1, s2
6221+ ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
6222+ ; GFX9-NEXT: s_endpgm
6223+ ;
6224+ ; EG-LABEL: f16_arg:
6225+ ; EG: ; %bb.0:
6226+ ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
6227+ ; EG-NEXT: TEX 0 @6
6228+ ; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
6229+ ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
6230+ ; EG-NEXT: CF_END
6231+ ; EG-NEXT: PAD
6232+ ; EG-NEXT: Fetch clause starting at 6:
6233+ ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 36, #3
6234+ ; EG-NEXT: ALU clause starting at 8:
6235+ ; EG-NEXT: MOV * T0.X, 0.0,
6236+ ; EG-NEXT: ALU clause starting at 9:
6237+ ; EG-NEXT: AND_INT T0.W, KC0[2].Z, literal.x,
6238+ ; EG-NEXT: AND_INT * T1.W, T0.X, literal.y,
6239+ ; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
6240+ ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
6241+ ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
6242+ ; EG-NEXT: LSHL T0.X, T1.W, PV.W,
6243+ ; EG-NEXT: LSHL * T0.W, literal.x, PV.W,
6244+ ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
6245+ ; EG-NEXT: MOV T0.Y, 0.0,
6246+ ; EG-NEXT: MOV * T0.Z, 0.0,
6247+ ; EG-NEXT: LSHR * T1.X, KC0[2].Z, literal.x,
6248+ ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
6249+ ;
6250+ ; CM-LABEL: f16_arg:
6251+ ; CM: ; %bb.0:
6252+ ; CM-NEXT: ALU 0, @8, KC0[], KC1[]
6253+ ; CM-NEXT: TEX 0 @6
6254+ ; CM-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
6255+ ; CM-NEXT: MEM_RAT MSKOR T0.XW, T1.X
6256+ ; CM-NEXT: CF_END
6257+ ; CM-NEXT: PAD
6258+ ; CM-NEXT: Fetch clause starting at 6:
6259+ ; CM-NEXT: VTX_READ_16 T0.X, T0.X, 36, #3
6260+ ; CM-NEXT: ALU clause starting at 8:
6261+ ; CM-NEXT: MOV * T0.X, 0.0,
6262+ ; CM-NEXT: ALU clause starting at 9:
6263+ ; CM-NEXT: AND_INT * T0.W, KC0[2].Z, literal.x,
6264+ ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
6265+ ; CM-NEXT: AND_INT T0.Z, T0.X, literal.x,
6266+ ; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
6267+ ; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
6268+ ; CM-NEXT: LSHL T0.X, PV.Z, PV.W,
6269+ ; CM-NEXT: LSHL * T0.W, literal.x, PV.W,
6270+ ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
6271+ ; CM-NEXT: MOV T0.Y, 0.0,
6272+ ; CM-NEXT: MOV * T0.Z, 0.0,
6273+ ; CM-NEXT: LSHR * T1.X, KC0[2].Z, literal.x,
6274+ ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
6275+ store half %arg , ptr addrspace (1 ) %ptr
6276+ ret void
6277+ }
6278+
; Kernel that takes a `<2 x half>` argument followed by an addrspace(1)
; pointer and stores the whole vector through that pointer.
;
; The check lines below pin the expected code for each check prefix. The SI,
; VI and GFX9 sequences load the argument as a single dword and write it back
; with one dword store (buffer_store_dword, flat_store_dword and
; global_store_dword respectively). The EG and CM sequences read the two
; 16-bit halves separately with VTX_READ_16 at offsets 36 and 38, combine them
; with a left shift by 16 and OR_INT, and store the result with
; MEM_RAT_CACHELESS (STORE_RAW on EG, STORE_DWORD on CM).
;
; NOTE(review) - the check lines look autogenerated (update_llc_test_checks
; style); presumably they should be regenerated rather than edited by hand.
; Confirm against the header of the test file, which is not visible here.
6279+ define amdgpu_kernel void @v2f16_arg (<2 x half > %arg , ptr addrspace (1 ) %ptr ) {
6280+ ; SI-LABEL: v2f16_arg:
6281+ ; SI: ; %bb.0:
6282+ ; SI-NEXT: s_load_dword s6, s[4:5], 0x9
6283+ ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
6284+ ; SI-NEXT: s_mov_b32 s3, 0xf000
6285+ ; SI-NEXT: s_mov_b32 s2, -1
6286+ ; SI-NEXT: s_waitcnt lgkmcnt(0)
6287+ ; SI-NEXT: v_mov_b32_e32 v0, s6
6288+ ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
6289+ ; SI-NEXT: s_endpgm
6290+ ;
6291+ ; VI-LABEL: v2f16_arg:
6292+ ; VI: ; %bb.0:
6293+ ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
6294+ ; VI-NEXT: s_load_dword s2, s[4:5], 0x24
6295+ ; VI-NEXT: s_waitcnt lgkmcnt(0)
6296+ ; VI-NEXT: v_mov_b32_e32 v0, s0
6297+ ; VI-NEXT: v_mov_b32_e32 v1, s1
6298+ ; VI-NEXT: v_mov_b32_e32 v2, s2
6299+ ; VI-NEXT: flat_store_dword v[0:1], v2
6300+ ; VI-NEXT: s_endpgm
6301+ ;
6302+ ; GFX9-LABEL: v2f16_arg:
6303+ ; GFX9: ; %bb.0:
6304+ ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0
6305+ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
6306+ ; GFX9-NEXT: v_mov_b32_e32 v0, 0
6307+ ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6308+ ; GFX9-NEXT: v_mov_b32_e32 v1, s2
6309+ ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
6310+ ; GFX9-NEXT: s_endpgm
6311+ ;
6312+ ; EG-LABEL: v2f16_arg:
6313+ ; EG: ; %bb.0:
6314+ ; EG-NEXT: ALU 0, @10, KC0[], KC1[]
6315+ ; EG-NEXT: TEX 1 @6
6316+ ; EG-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[]
6317+ ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
6318+ ; EG-NEXT: CF_END
6319+ ; EG-NEXT: PAD
6320+ ; EG-NEXT: Fetch clause starting at 6:
6321+ ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 38, #3
6322+ ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 36, #3
6323+ ; EG-NEXT: ALU clause starting at 10:
6324+ ; EG-NEXT: MOV * T0.X, 0.0,
6325+ ; EG-NEXT: ALU clause starting at 11:
6326+ ; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
6327+ ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
6328+ ; EG-NEXT: OR_INT T0.X, T0.X, PV.W,
6329+ ; EG-NEXT: LSHR * T1.X, KC0[2].Z, literal.x,
6330+ ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
6331+ ;
6332+ ; CM-LABEL: v2f16_arg:
6333+ ; CM: ; %bb.0:
6334+ ; CM-NEXT: ALU 0, @10, KC0[], KC1[]
6335+ ; CM-NEXT: TEX 1 @6
6336+ ; CM-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[]
6337+ ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
6338+ ; CM-NEXT: CF_END
6339+ ; CM-NEXT: PAD
6340+ ; CM-NEXT: Fetch clause starting at 6:
6341+ ; CM-NEXT: VTX_READ_16 T1.X, T0.X, 38, #3
6342+ ; CM-NEXT: VTX_READ_16 T0.X, T0.X, 36, #3
6343+ ; CM-NEXT: ALU clause starting at 10:
6344+ ; CM-NEXT: MOV * T0.X, 0.0,
6345+ ; CM-NEXT: ALU clause starting at 11:
6346+ ; CM-NEXT: LSHL * T0.W, T1.X, literal.x,
6347+ ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
6348+ ; CM-NEXT: OR_INT * T0.X, T0.X, PV.W,
6349+ ; CM-NEXT: LSHR * T1.X, KC0[2].Z, literal.x,
6350+ ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
6351+ store <2 x half > %arg , ptr addrspace (1 ) %ptr
6352+ ret void
6353+ }
0 commit comments