Commit 0d37e79

RKSimon authored and dyung committed

[X86] lowerV4F32Shuffle - don't use INSERTPS if SHUFPS will suffice (#186468)

If we have 2 or more undef/undemanded elements, INSERTPS replaces them with explicit zeroed elements, which can cause infinite loops later in shuffle combining, depending on whether those elements are demanded or not. I'll try to improve the (minor) v2f32 regressions in a follow-up, but the infinite loop needs fixing first.

Fixes #186403

1 parent 9b72202 · commit 0d37e79

13 files changed: 239 additions & 237 deletions
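
For context: SHUFPS builds the low two lanes of its result from one operand and the high two lanes from the other, so a v4f32 mask is a "single SHUFPS" candidate exactly when lanes 0-1 agree on a source and lanes 2-3 agree on a source, with undef lanes (-1) free to match either. The sketch below is an illustrative, self-contained reimplementation of that predicate (`isSingleSHUFPSMaskSketch` is a hypothetical name, not the LLVM symbol), showing why the mask from the regression tests, <0,2,u,u>, now takes the SHUFPS path:

```cpp
#include <array>
#include <cassert>
#include <cstdio>

// Sketch of the predicate gating the INSERTPS path. Mask elements 0..3
// select from V1, 4..7 select from V2, and -1 marks an undef lane.
// SHUFPS takes its low two result lanes from one operand and its high two
// lanes from the other, so each half must draw from a single input.
static bool isSingleSHUFPSMaskSketch(const std::array<int, 4> &Mask) {
  // Low half: if both lanes are defined they must agree on a source.
  if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
    return false;
  // High half: same rule.
  if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
    return false;
  return true;
}

int main() {
  // <0,2,u,u> -- the mask from the regression tests below: lanes 0 and 2 of
  // V1 plus two undef lanes. One SHUFPS suffices, so INSERTPS (which would
  // force the undef lanes to explicit zeros) is no longer tried.
  assert(isSingleSHUFPSMaskSketch({0, 2, -1, -1}));
  // <0,4,1,5> interleaves sources within each half: not a single SHUFPS.
  assert(!isSingleSHUFPSMaskSketch({0, 4, 1, 5}));
  std::puts("ok");
}
```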

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 6 additions & 7 deletions

```diff
@@ -13915,19 +13915,18 @@ static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // when the V2 input is targeting element 0 of the mask -- that is the fast
   // case here.
   if (NumV2Elements == 1 && Mask[0] >= 4)
-    if (SDValue V = lowerShuffleAsElementInsertion(
-            DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+    if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2, Mask,
+                                                   Zeroable, Subtarget, DAG))
       return V;
 
-  if (Subtarget.hasSSE41()) {
+  if (Subtarget.hasSSE41() && !isSingleSHUFPSMask(Mask)) {
     // Use INSERTPS if we can complete the shuffle efficiently.
     if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
       return V;
 
-    if (!isSingleSHUFPSMask(Mask))
-      if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
-                                                            V2, Mask, DAG))
-        return BlendPerm;
+    if (SDValue BlendPerm =
+            lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1, V2, Mask, DAG))
+      return BlendPerm;
   }
 
   // Use low/high mov instructions. These are only valid in SSE1 because
```
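
Every test update below follows the same substitution: `vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero` becomes a `vxorps` that materializes a zero register plus `vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[2,3]`. The two are equivalent because any lane of a zeroed register is zero, so pulling lanes 2-3 of xmm2 into the high half writes the same zeros INSERTPS produced, without encoding an explicit zeroing mask that shuffle combining can get stuck on. A minimal scalar model of the two sequences (illustrative only, not LLVM code):

```cpp
#include <array>
#include <cassert>

using V4 = std::array<float, 4>;

// vinsertps xmm0 = xmm0[0,2],zero,zero:
// keep source lanes 0 and 2 in the low half, force the high half to zero.
static V4 insertpsForm(const V4 &Src) { return {Src[0], Src[2], 0.0f, 0.0f}; }

// vxorps %xmm2, %xmm2, %xmm2 ; vshufps xmm0 = xmm0[0,2],xmm2[2,3]:
// low half from source lanes 0 and 2, high half from lanes 2-3 of the
// zeroed register -- which are themselves zero.
static V4 shufpsForm(const V4 &Src) {
  V4 Zero = {0.0f, 0.0f, 0.0f, 0.0f}; // vxorps %xmm2, %xmm2, %xmm2
  return {Src[0], Src[2], Zero[2], Zero[3]};
}

int main() {
  V4 Src = {1.0f, 2.0f, 3.0f, 4.0f};
  // Both sequences yield {1, 3, 0, 0}.
  assert(insertpsForm(Src) == shufpsForm(Src));
}
```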

llvm/test/CodeGen/X86/avx2-masked-gather.ll

Lines changed: 8 additions & 4 deletions

```diff
@@ -9,7 +9,8 @@ declare <2 x i32> @llvm.masked.gather.v2i32(<2 x ptr> %ptrs, i32 %align, <2 x i1
 define <2 x i32> @masked_gather_v2i32(ptr %ptr, <2 x i1> %masks, <2 x i32> %passthro) {
 ; X86-LABEL: masked_gather_v2i32:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; X86-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[2,3]
 ; X86-NEXT:    vpslld $31, %xmm0, %xmm0
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
@@ -58,7 +59,8 @@ entry:
 define <4 x i32> @masked_gather_v2i32_concat(ptr %ptr, <2 x i1> %masks, <2 x i32> %passthro) {
 ; X86-LABEL: masked_gather_v2i32_concat:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; X86-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[2,3]
 ; X86-NEXT:    vpslld $31, %xmm0, %xmm0
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
@@ -110,7 +112,8 @@ declare <2 x float> @llvm.masked.gather.v2float(<2 x ptr> %ptrs, i32 %align, <2
 define <2 x float> @masked_gather_v2float(ptr %ptr, <2 x i1> %masks, <2 x float> %passthro) {
 ; X86-LABEL: masked_gather_v2float:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; X86-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[2,3]
 ; X86-NEXT:    vpslld $31, %xmm0, %xmm0
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
@@ -160,7 +163,8 @@ entry:
 define <4 x float> @masked_gather_v2float_concat(ptr %ptr, <2 x i1> %masks, <2 x float> %passthro) {
 ; X86-LABEL: masked_gather_v2float_concat:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; X86-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[2,3]
 ; X86-NEXT:    vpslld $31, %xmm0, %xmm0
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
```

llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll

Lines changed: 2 additions & 1 deletion

```diff
@@ -353,7 +353,8 @@ define <2 x i32> @test_gather_v2i32_data_index(ptr %base, <2 x i32> %ind, <2 x i
 ;
 ; WIDEN_AVX2-LABEL: test_gather_v2i32_data_index:
 ; WIDEN_AVX2:       # %bb.0:
-; WIDEN_AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
+; WIDEN_AVX2-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; WIDEN_AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[2,3]
 ; WIDEN_AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
 ; WIDEN_AVX2-NEXT:    vpgatherdd %xmm1, (%rdi,%xmm0,4), %xmm2
 ; WIDEN_AVX2-NEXT:    vmovdqa %xmm2, %xmm0
```

llvm/test/CodeGen/X86/masked_store_trunc.ll

Lines changed: 6 additions & 6 deletions

```diff
@@ -1808,9 +1808,9 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[2,3]
 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX1-NEXT:    vmaskmovps %xmm0, %xmm1, (%rdi)
 ; AVX1-NEXT:    retq
@@ -1819,9 +1819,9 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
+; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpxor %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[2,3]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX2-NEXT:    vpmaskmovd %xmm0, %xmm1, (%rdi)
 ; AVX2-NEXT:    retq
```
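
Note that in the truncstore tests the rewrite also renumbers registers: the zero vector from the initial `vpxor` must now stay live in xmm2 so `vshufps` can read its upper lanes, so the all-ones constant used to invert the compare moves to xmm3. Here the sequence stays the same length because a zero register already existed for the `vpcmpeqq`; only the register assignments change.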

llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll

Lines changed: 6 additions & 6 deletions

```diff
@@ -2603,9 +2603,9 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[2,3]
 ; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [2147483647,2147483647]
 ; AVX1-NEXT:    # xmm2 = mem[0,0]
 ; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm3
@@ -2622,9 +2622,9 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
+; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpxor %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[2,3]
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [2147483647,2147483647]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm3
 ; AVX2-NEXT:    vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
```

llvm/test/CodeGen/X86/masked_store_trunc_usat.ll

Lines changed: 6 additions & 6 deletions

```diff
@@ -2278,9 +2278,9 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[2,3]
 ; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
 ; AVX1-NEXT:    vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
 ; AVX1-NEXT:    vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -2292,9 +2292,9 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
+; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpxor %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[2,3]
 ; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
 ; AVX2-NEXT:    vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
 ; AVX2-NEXT:    vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
```
