Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 51df921

Browse files
bjopec-rhodes
authored and committed
[SelectionDAG] Make sure demanded lanes for AND/MUL-by-zero are frozen (llvm#180727)
DAGCombiner can fold a chain of INSERT_VECTOR_ELT into a vector AND/OR operation. This patch adds protection against making the vector more poisonous: the source vector is frozen when the elements that should be set to 0/-1 may be poison in the source vector. The patch also fixes a bug in SimplifyDemandedVectorElts for MUL/MULHU/MULHS/AND that could result in making the vector more poisonous. The problem was that we skipped demanding elements from Op0 that were known to be zero in Op1. But that could result in elements being simplified into poison when simplifying Op0, and then the result would be poison and not zero after the MUL/MULHU/MULHS/AND. The solution is to defensively make sure that we also demand all the originally demanded elements when simplifying Op0. These bugs were found when analysing the miscompiles in llvm#179448. The main culprit in llvm#179448 seems to have been the bug in DAGCombiner. The bug in SimplifyDemandedVectorElts surfaced when fixing the DAGCombiner, as that fix typically introduces the (AND (FREEZE x), y) pattern that wasn't handled correctly in SimplifyDemandedVectorElts. Also fixes llvm#180409. Also fixes llvm#176682. (cherry picked from commit 6420099)
1 parent bc0a819 commit 51df921

16 files changed

Lines changed: 151 additions & 54 deletions

llvm/include/llvm/CodeGen/SelectionDAG.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1723,6 +1723,11 @@ class SelectionDAG {
17231723
/// Return a freeze using the SDLoc of the value operand.
17241724
LLVM_ABI SDValue getFreeze(SDValue V);
17251725

1726+
/// Return a freeze of V if any of the demanded elts may be undef or poison.
1727+
/// If \p PoisonOnly is true, then only check for poison elements.
1728+
LLVM_ABI SDValue getFreeze(SDValue V, const APInt &DemandedElts,
1729+
bool PoisonOnly = false);
1730+
17261731
/// Return an AssertAlignSDNode.
17271732
LLVM_ABI SDValue getAssertAlign(const SDLoc &DL, SDValue V, Align A);
17281733

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24060,8 +24060,17 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
2406024060
// Build the mask and return the corresponding DAG node.
2406124061
auto BuildMaskAndNode = [&](SDValue TrueVal, SDValue FalseVal,
2406224062
unsigned MaskOpcode) {
24063-
for (unsigned I = 0; I != NumElts; ++I)
24063+
APInt InsertedEltMask = APInt::getZero(NumElts);
24064+
for (unsigned I = 0; I != NumElts; ++I) {
2406424065
Mask[I] = Ops[I] ? TrueVal : FalseVal;
24066+
if (Ops[I])
24067+
InsertedEltMask.setBit(I);
24068+
}
24069+
// Make sure to freeze the source vector in case any of the elements
24070+
// overwritten by the insert may be poison. Otherwise those elements
24071+
// could end up being poison instead of 0/-1 after the AND/OR.
24072+
CurVec =
24073+
DAG.getFreeze(CurVec, InsertedEltMask, /*PoisonOnly=*/true);
2406524074
return DAG.getNode(MaskOpcode, DL, VT, CurVec,
2406624075
DAG.getBuildVector(VT, DL, Mask));
2406724076
};

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2502,6 +2502,13 @@ SDValue SelectionDAG::getFreeze(SDValue V) {
25022502
return getNode(ISD::FREEZE, SDLoc(V), V.getValueType(), V);
25032503
}
25042504

2505+
SDValue SelectionDAG::getFreeze(SDValue V, const APInt &DemandedElts,
2506+
bool PoisonOnly) {
2507+
if (isGuaranteedNotToBeUndefOrPoison(V, DemandedElts, PoisonOnly))
2508+
return V;
2509+
return getFreeze(V);
2510+
}
2511+
25052512
/// getShiftAmountOperand - Return the specified value casted to
25062513
/// the target's desired shift amount type.
25072514
SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) {

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3794,19 +3794,22 @@ bool TargetLowering::SimplifyDemandedVectorElts(
37943794
if (SimplifyDemandedVectorElts(Op1, DemandedElts, SrcUndef, SrcZero, TLO,
37953795
Depth + 1))
37963796
return true;
3797-
// If we know that a demanded element was zero in Op1 we don't need to
3798-
// demand it in Op0 - its guaranteed to be zero.
3799-
APInt DemandedElts0 = DemandedElts & ~SrcZero;
3800-
if (SimplifyDemandedVectorElts(Op0, DemandedElts0, KnownUndef, KnownZero,
3797+
// FIXME: If we know that a demanded element was zero in Op1 we don't need
3798+
// to demand it in Op0 - its guaranteed to be zero. There is however a
3799+
// restriction, as we must not make any of the originally demanded elements
3800+
// more poisonous. We could reduce amount of elements demanded, but then we
3801+
// also need a to inform SimplifyDemandedVectorElts that some elements must
3802+
// not be made more poisonous.
3803+
if (SimplifyDemandedVectorElts(Op0, DemandedElts, KnownUndef, KnownZero,
38013804
TLO, Depth + 1))
38023805
return true;
38033806

3804-
KnownUndef &= DemandedElts0;
3805-
KnownZero &= DemandedElts0;
3807+
KnownUndef &= DemandedElts;
3808+
KnownZero &= DemandedElts;
38063809

3807-
// If every element pair has a zero/undef then just fold to zero.
3808-
// fold (and x, undef) -> 0 / (and x, 0) -> 0
3809-
// fold (mul x, undef) -> 0 / (mul x, 0) -> 0
3810+
// If every element pair has a zero/undef/poison then just fold to zero.
3811+
// fold (and x, undef/poison) -> 0 / (and x, 0) -> 0
3812+
// fold (mul x, undef/poison) -> 0 / (mul x, 0) -> 0
38103813
if (DemandedElts.isSubsetOf(SrcZero | KnownZero | SrcUndef | KnownUndef))
38113814
return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
38123815

llvm/test/CodeGen/AArch64/neon-dotreduce.ll

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -445,8 +445,9 @@ entry:
445445
define i32 @test_udot_v5i8_nomla(ptr nocapture readonly %a1) {
446446
; CHECK-SD-LABEL: test_udot_v5i8_nomla:
447447
; CHECK-SD: // %bb.0: // %entry
448-
; CHECK-SD-NEXT: ldr d0, [x0]
448+
; CHECK-SD-NEXT: ldr x8, [x0]
449449
; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
450+
; CHECK-SD-NEXT: fmov d0, x8
450451
; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
451452
; CHECK-SD-NEXT: ushll2 v2.4s, v0.8h, #0
452453
; CHECK-SD-NEXT: mov v1.s[0], v2.s[0]
@@ -2681,8 +2682,8 @@ define i32 @test_udot_v25i8_nomla(ptr nocapture readonly %a1) {
26812682
; CHECK-SD-NEXT: ldp q2, q1, [x0]
26822683
; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
26832684
; CHECK-SD-NEXT: ushll2 v3.8h, v1.16b, #0
2684-
; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
26852685
; CHECK-SD-NEXT: ushll v4.8h, v2.8b, #0
2686+
; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
26862687
; CHECK-SD-NEXT: ushll2 v2.8h, v2.16b, #0
26872688
; CHECK-SD-NEXT: ushll v3.4s, v3.4h, #0
26882689
; CHECK-SD-NEXT: uaddl2 v5.4s, v4.8h, v1.8h

llvm/test/CodeGen/X86/insertelement-zero.ll

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -539,3 +539,43 @@ define <4 x i32> @PR41512_loads(ptr %p1, ptr %p2) {
539539
%r = shufflevector <4 x i32> %ins1, <4 x i32> %ins2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
540540
ret <4 x i32> %r
541541
}
542+
543+
; Reproducer for bugs in DAGCombiner and SimplifyDemandedVectorElts.
544+
;
545+
; Problem was that DAGCombiner replaced INSERT_VECTOR_ELT by AND, without
546+
; considering that %i has poison elements. So instead of overwriting those
547+
; poison elements by inserting zeroes, we got "AND poison, 0" which is poison
548+
; and not guaranteed to be folded as zero.
549+
;
550+
; When solving the above by inserting a FREEZE another bug
551+
; surfaced. SimplifyDemandedVectorElts was not demanding elements that were
552+
; known to be AND:ed by zero. So the FREEZE ended up being removed and we
553+
; still got "AND poison, 0".
554+
;
555+
; Expected result is that the add reduction computes the sum 0+0+0+0+0+77+0+77 = 154.
556+
define i64 @fold_insertelement_to_and(i32 noundef %arg) {
557+
; SSE-LABEL: fold_insertelement_to_and:
558+
; SSE: # %bb.0:
559+
; SSE-NEXT: movl $154, %eax
560+
; SSE-NEXT: retq
561+
;
562+
; AVX1-LABEL: fold_insertelement_to_and:
563+
; AVX1: # %bb.0:
564+
; AVX1-NEXT: movl $154, %eax
565+
; AVX1-NEXT: retq
566+
;
567+
; AVX2-LABEL: fold_insertelement_to_and:
568+
; AVX2: # %bb.0:
569+
; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,77]
570+
; AVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm1
571+
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
572+
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
573+
; AVX2-NEXT: vmovq %xmm0, %rax
574+
; AVX2-NEXT: retq
575+
%i = shufflevector <8 x i64> zeroinitializer, <8 x i64> splat (i64 77), <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 4, i32 8, i32 6, i32 10>
576+
%i1 = insertelement <8 x i64> %i, i64 0, i64 0
577+
%i2 = insertelement <8 x i64> %i1, i64 0, i64 2
578+
%i3 = shufflevector <8 x i64> %i2, <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 5, i32 6, i32 7>
579+
%i4 = tail call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %i3)
580+
ret i64 %i4
581+
}

llvm/test/CodeGen/X86/pr134602.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ define i32 @PR134602(i16 %a0) {
1717
; X64-NEXT: movzwl %di, %eax
1818
; X64-NEXT: movd %eax, %xmm0
1919
; X64-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
20-
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
20+
; X64-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,2,2,2,4,5,6,7]
2121
; X64-NEXT: paddw %xmm0, %xmm1
2222
; X64-NEXT: movdqa %xmm1, %xmm0
2323
; X64-NEXT: psrld $16, %xmm0

llvm/test/CodeGen/X86/pr173924.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ define i256 @PR173924(<8 x i256> %a0) {
77
; CHECK: # %bb.0:
88
; CHECK-NEXT: movq %rdi, %rax
99
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %edi
10-
; CHECK-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
10+
; CHECK-NEXT: vmovdqu {{[0-9]+}}(%rsp), %xmm0
1111
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %edx
1212
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx
1313
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r8d

llvm/test/CodeGen/X86/vector-fshl-256.ll

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1004,6 +1004,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
10041004
; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
10051005
; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
10061006
; AVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
1007+
; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2
10071008
; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
10081009
; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0
10091010
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
@@ -1015,6 +1016,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
10151016
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
10161017
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
10171018
; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
1019+
; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2
10181020
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
10191021
; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm0
10201022
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
@@ -1026,6 +1028,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
10261028
; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4
10271029
; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
10281030
; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
1031+
; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2
10291032
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
10301033
; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm0
10311034
; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
@@ -1037,6 +1040,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
10371040
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
10381041
; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1
10391042
; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
1043+
; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm2
10401044
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
10411045
; AVX512BW-NEXT: vpsllw %xmm2, %ymm0, %ymm0
10421046
; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
@@ -1057,6 +1061,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
10571061
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
10581062
; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1
10591063
; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
1064+
; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm2
10601065
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
10611066
; AVX512VLBW-NEXT: vpsllw %xmm2, %ymm0, %ymm0
10621067
; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
@@ -1092,6 +1097,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
10921097
; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
10931098
; XOPAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
10941099
; XOPAVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
1100+
; XOPAVX2-NEXT: vpbroadcastw %xmm2, %xmm2
10951101
; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
10961102
; XOPAVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0
10971103
; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0

llvm/test/CodeGen/X86/vector-fshl-512.ll

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -552,6 +552,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
552552
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
553553
; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
554554
; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1
555+
; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2
555556
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
556557
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
557558
; AVX512F-NEXT: vpsllw %xmm2, %ymm3, %ymm3
@@ -570,6 +571,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
570571
; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
571572
; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
572573
; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1
574+
; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2
573575
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
574576
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
575577
; AVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm3
@@ -584,6 +586,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
584586
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
585587
; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1
586588
; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
589+
; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm2
587590
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
588591
; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm0
589592
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
@@ -601,6 +604,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
601604
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
602605
; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm1
603606
; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
607+
; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm2
604608
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
605609
; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm0
606610
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0

0 commit comments

Comments
 (0)