[X86][AVX] Match v4f64 blend from shuffle of scalar values. #135753
Conversation
@llvm/pr-subscribers-backend-x86

Author: Leon Clark (PeddleSpam)

Changes

Convert a BUILD_VECTOR of scalar values to a shuffle of shuffles that will lower to an AVX blend. This addresses a regression in #128938.

Full diff: https://github.com/llvm/llvm-project/pull/135753.diff

2 Files Affected:
- llvm/lib/Target/X86/X86ISelLowering.cpp (+33)
- llvm/test/CodeGen/X86/shuffle-blendw.ll (+386)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 84aaf86550842..382f089971537 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9040,6 +9040,39 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
unsigned NumElems = Op.getNumOperands();
+ // Match BUILD_VECTOR of scalars that we can lower to X86ISD::BLENDI via
+ // shuffles.
+ //
+ // v4f64 = BUILD_VECTOR X,Y,Y,X
+ // >>>
+ // t1: v4f64 = BUILD_VECTOR X,u,u,u
+ // t3: v4f64 = vector_shuffle<0,u,u,0> t1, u
+ // t2: v4f64 = BUILD_VECTOR Y,u,u,u
+ // t4: v4f64 = vector_shuffle<u,0,0,u> t2, u
+ // v4f64 = vector_shuffle<0,5,6,3> t3, t4
+ //
+ if (Subtarget.hasAVX() && VT == MVT::v4f64 && Op->getNumOperands() == 4u) {
+ auto Op0 = Op->getOperand(0u);
+ auto Op1 = Op->getOperand(1u);
+ auto Op2 = Op->getOperand(2u);
+ auto Op3 = Op->getOperand(3u);
+
+ // Match X,Y,Y,X inputs.
+ if (Op0 == Op3 && Op1 == Op2 && Op0 != Op1) {
+ auto PsnVal = DAG.getUNDEF(MVT::f64);
+
+ auto NewOp0 = DAG.getBuildVector(VT, dl, {Op0, PsnVal, PsnVal, PsnVal});
+ NewOp0 = DAG.getVectorShuffle(VT, dl, NewOp0, DAG.getUNDEF(VT),
+ {0, -1, -1, 0});
+
+ auto NewOp1 = DAG.getBuildVector(VT, dl, {Op1, PsnVal, PsnVal, PsnVal});
+ NewOp1 = DAG.getVectorShuffle(VT, dl, NewOp1, DAG.getUNDEF(VT),
+ {-1, 0, 0, -1});
+
+ return DAG.getVectorShuffle(VT, dl, NewOp0, NewOp1, {0, 5, 6, 3});
+ }
+ }
+
// Generate vectors for predicate vectors.
if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
diff --git a/llvm/test/CodeGen/X86/shuffle-blendw.ll b/llvm/test/CodeGen/X86/shuffle-blendw.ll
index 9f90657dc64d1..a1af29550f64f 100644
--- a/llvm/test/CodeGen/X86/shuffle-blendw.ll
+++ b/llvm/test/CodeGen/X86/shuffle-blendw.ll
@@ -263,3 +263,389 @@ define <8 x i16> @blendw_to_blendd_fail_16(<8 x i16> %x, <8 x i16> %y, <8 x i16>
%shuffle = shufflevector <8 x i16> %x1, <8 x i16> %y, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 14, i32 15>
ret <8 x i16> %shuffle
}
+
+define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) {
+; X86-SSE41-LABEL: blend_broadcasts_v4f64:
+; X86-SSE41: # %bb.0:
+; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE41-NEXT: movaps (%ecx), %xmm2
+; X86-SSE41-NEXT: movaps (%eax), %xmm1
+; X86-SSE41-NEXT: movaps %xmm2, %xmm0
+; X86-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X86-SSE41-NEXT: retl
+;
+; X64-SSE41-LABEL: blend_broadcasts_v4f64:
+; X64-SSE41: # %bb.0:
+; X64-SSE41-NEXT: movaps (%rdi), %xmm2
+; X64-SSE41-NEXT: movaps (%rsi), %xmm1
+; X64-SSE41-NEXT: movaps %xmm2, %xmm0
+; X64-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X64-SSE41-NEXT: retq
+;
+; X86-AVX-LABEL: blend_broadcasts_v4f64:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: vbroadcastsd (%ecx), %ymm0
+; X86-AVX-NEXT: vbroadcastsd (%eax), %ymm1
+; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; X86-AVX-NEXT: retl
+;
+; X64-AVX-LABEL: blend_broadcasts_v4f64:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vbroadcastsd (%rdi), %ymm0
+; X64-AVX-NEXT: vbroadcastsd (%rsi), %ymm1
+; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; X64-AVX-NEXT: retq
+;
+; X86-AVX2-LABEL: blend_broadcasts_v4f64:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX2-NEXT: vbroadcastsd (%ecx), %ymm0
+; X86-AVX2-NEXT: vbroadcastsd (%eax), %ymm1
+; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; X86-AVX2-NEXT: retl
+;
+; X64-AVX2-LABEL: blend_broadcasts_v4f64:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
+; X64-AVX2-NEXT: vbroadcastsd (%rsi), %ymm1
+; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; X64-AVX2-NEXT: retq
+;
+; X86-AVX512-LABEL: blend_broadcasts_v4f64:
+; X86-AVX512: # %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX512-NEXT: vbroadcastsd (%ecx), %ymm0
+; X86-AVX512-NEXT: vbroadcastsd (%eax), %ymm1
+; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; X86-AVX512-NEXT: retl
+;
+; X64-AVX512-LABEL: blend_broadcasts_v4f64:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: vbroadcastsd (%rdi), %ymm0
+; X64-AVX512-NEXT: vbroadcastsd (%rsi), %ymm1
+; X64-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; X64-AVX512-NEXT: retq
+ %ld0 = load <4 x double>, ptr %p0, align 32
+ %ld1 = load <4 x double>, ptr %p1, align 32
+ %bcst0 = shufflevector <4 x double> %ld0, <4 x double> undef, <4 x i32> zeroinitializer
+ %bcst1 = shufflevector <4 x double> %ld1, <4 x double> undef, <4 x i32> zeroinitializer
+ %blend = shufflevector <4 x double> %bcst0, <4 x double> %bcst1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+ ret <4 x double> %blend
+}
+
+define <4 x double> @blend_broadcasts_v2f64(ptr %p0, ptr %p1) {
+; X86-SSE41-LABEL: blend_broadcasts_v2f64:
+; X86-SSE41: # %bb.0:
+; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE41-NEXT: movaps (%ecx), %xmm2
+; X86-SSE41-NEXT: movaps (%eax), %xmm1
+; X86-SSE41-NEXT: movaps %xmm2, %xmm0
+; X86-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X86-SSE41-NEXT: retl
+;
+; X64-SSE41-LABEL: blend_broadcasts_v2f64:
+; X64-SSE41: # %bb.0:
+; X64-SSE41-NEXT: movaps (%rdi), %xmm2
+; X64-SSE41-NEXT: movaps (%rsi), %xmm1
+; X64-SSE41-NEXT: movaps %xmm2, %xmm0
+; X64-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X64-SSE41-NEXT: retq
+;
+; X86-AVX-LABEL: blend_broadcasts_v2f64:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; X86-AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; X86-AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
+; X86-AVX-NEXT: retl
+;
+; X64-AVX-LABEL: blend_broadcasts_v2f64:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; X64-AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; X64-AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
+; X64-AVX-NEXT: retq
+;
+; X86-AVX2-LABEL: blend_broadcasts_v2f64:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; X86-AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; X86-AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
+; X86-AVX2-NEXT: retl
+;
+; X64-AVX2-LABEL: blend_broadcasts_v2f64:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X64-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; X64-AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; X64-AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
+; X64-AVX2-NEXT: retq
+;
+; X86-AVX512-LABEL: blend_broadcasts_v2f64:
+; X86-AVX512: # %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,4,6,2]
+; X86-AVX512-NEXT: vpermi2pd %ymm1, %ymm2, %ymm0
+; X86-AVX512-NEXT: retl
+;
+; X64-AVX512-LABEL: blend_broadcasts_v2f64:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,4,6,2]
+; X64-AVX512-NEXT: vpermi2pd %ymm1, %ymm2, %ymm0
+; X64-AVX512-NEXT: retq
+ %ld0 = load <2 x double>, ptr %p0, align 32
+ %ld1 = load <2 x double>, ptr %p1, align 32
+ %blend = shufflevector <2 x double> %ld0, <2 x double> %ld1, <4 x i32> <i32 0, i32 2, i32 2, i32 0>
+ ret <4 x double> %blend
+}
+
+define <4 x double> @blend_broadcasts_v1f64(ptr %p0, ptr %p1) {
+; X86-SSE41-LABEL: blend_broadcasts_v1f64:
+; X86-SSE41: # %bb.0:
+; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X86-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; X86-SSE41-NEXT: movaps %xmm2, %xmm0
+; X86-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X86-SSE41-NEXT: retl
+;
+; X64-SSE41-LABEL: blend_broadcasts_v1f64:
+; X64-SSE41: # %bb.0:
+; X64-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X64-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; X64-SSE41-NEXT: movaps %xmm2, %xmm0
+; X64-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X64-SSE41-NEXT: retq
+;
+; X86-AVX-LABEL: blend_broadcasts_v1f64:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: vbroadcastsd (%ecx), %ymm0
+; X86-AVX-NEXT: vbroadcastsd (%eax), %ymm1
+; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; X86-AVX-NEXT: retl
+;
+; X64-AVX-LABEL: blend_broadcasts_v1f64:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vbroadcastsd (%rsi), %ymm0
+; X64-AVX-NEXT: vbroadcastsd (%rdi), %ymm1
+; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; X64-AVX-NEXT: retq
+;
+; X86-AVX2-LABEL: blend_broadcasts_v1f64:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX2-NEXT: vbroadcastsd (%ecx), %ymm0
+; X86-AVX2-NEXT: vbroadcastsd (%eax), %ymm1
+; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; X86-AVX2-NEXT: retl
+;
+; X64-AVX2-LABEL: blend_broadcasts_v1f64:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vbroadcastsd (%rsi), %ymm0
+; X64-AVX2-NEXT: vbroadcastsd (%rdi), %ymm1
+; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; X64-AVX2-NEXT: retq
+;
+; X86-AVX512-LABEL: blend_broadcasts_v1f64:
+; X86-AVX512: # %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX512-NEXT: vbroadcastsd (%ecx), %ymm0
+; X86-AVX512-NEXT: vbroadcastsd (%eax), %ymm1
+; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; X86-AVX512-NEXT: retl
+;
+; X64-AVX512-LABEL: blend_broadcasts_v1f64:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: vbroadcastsd (%rsi), %ymm0
+; X64-AVX512-NEXT: vbroadcastsd (%rdi), %ymm1
+; X64-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; X64-AVX512-NEXT: retq
+ %ld0 = load <1 x double>, ptr %p0, align 32
+ %ld1 = load <1 x double>, ptr %p1, align 32
+ %blend = shufflevector <1 x double> %ld0, <1 x double> %ld1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
+ ret <4 x double> %blend
+}
+
+define <4 x double> @blend_broadcasts_v1f64_4x(ptr %p0, ptr %p1) {
+; X86-SSE41-LABEL: blend_broadcasts_v1f64_4x:
+; X86-SSE41: # %bb.0:
+; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X86-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; X86-SSE41-NEXT: movaps %xmm2, %xmm0
+; X86-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X86-SSE41-NEXT: retl
+;
+; X64-SSE41-LABEL: blend_broadcasts_v1f64_4x:
+; X64-SSE41: # %bb.0:
+; X64-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X64-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; X64-SSE41-NEXT: movaps %xmm2, %xmm0
+; X64-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X64-SSE41-NEXT: retq
+;
+; X86-AVX-LABEL: blend_broadcasts_v1f64_4x:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: vbroadcastsd (%ecx), %ymm0
+; X86-AVX-NEXT: vbroadcastsd (%eax), %ymm1
+; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; X86-AVX-NEXT: retl
+;
+; X64-AVX-LABEL: blend_broadcasts_v1f64_4x:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vbroadcastsd (%rsi), %ymm0
+; X64-AVX-NEXT: vbroadcastsd (%rdi), %ymm1
+; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; X64-AVX-NEXT: retq
+;
+; X86-AVX2-LABEL: blend_broadcasts_v1f64_4x:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX2-NEXT: vbroadcastsd (%ecx), %ymm0
+; X86-AVX2-NEXT: vbroadcastsd (%eax), %ymm1
+; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; X86-AVX2-NEXT: retl
+;
+; X64-AVX2-LABEL: blend_broadcasts_v1f64_4x:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vbroadcastsd (%rsi), %ymm0
+; X64-AVX2-NEXT: vbroadcastsd (%rdi), %ymm1
+; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; X64-AVX2-NEXT: retq
+;
+; X86-AVX512-LABEL: blend_broadcasts_v1f64_4x:
+; X86-AVX512: # %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX512-NEXT: vbroadcastsd (%ecx), %ymm0
+; X86-AVX512-NEXT: vbroadcastsd (%eax), %ymm1
+; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; X86-AVX512-NEXT: retl
+;
+; X64-AVX512-LABEL: blend_broadcasts_v1f64_4x:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: vbroadcastsd (%rsi), %ymm0
+; X64-AVX512-NEXT: vbroadcastsd (%rdi), %ymm1
+; X64-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; X64-AVX512-NEXT: retq
+ %ld0 = load <1 x double>, ptr %p0, align 32
+ %ld1 = load <1 x double>, ptr %p1, align 32
+ %bcst0 = shufflevector <1 x double> %ld0, <1 x double> undef, <4 x i32> zeroinitializer
+ %bcst1 = shufflevector <1 x double> %ld1, <1 x double> undef, <4 x i32> zeroinitializer
+ %blend = shufflevector <4 x double> %bcst0, <4 x double> %bcst1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+ ret <4 x double> %blend
+}
+
+define <4 x double> @blend_broadcasts_v1f64_2x(ptr %p0, ptr %p1) {
+; X86-SSE41-LABEL: blend_broadcasts_v1f64_2x:
+; X86-SSE41: # %bb.0:
+; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X86-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; X86-SSE41-NEXT: movaps %xmm2, %xmm0
+; X86-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X86-SSE41-NEXT: retl
+;
+; X64-SSE41-LABEL: blend_broadcasts_v1f64_2x:
+; X64-SSE41: # %bb.0:
+; X64-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X64-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; X64-SSE41-NEXT: movaps %xmm2, %xmm0
+; X64-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X64-SSE41-NEXT: retq
+;
+; X86-AVX-LABEL: blend_broadcasts_v1f64_2x:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: vbroadcastsd (%ecx), %ymm0
+; X86-AVX-NEXT: vbroadcastsd (%eax), %ymm1
+; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; X86-AVX-NEXT: retl
+;
+; X64-AVX-LABEL: blend_broadcasts_v1f64_2x:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vbroadcastsd (%rsi), %ymm0
+; X64-AVX-NEXT: vbroadcastsd (%rdi), %ymm1
+; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; X64-AVX-NEXT: retq
+;
+; X86-AVX2-LABEL: blend_broadcasts_v1f64_2x:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX2-NEXT: vbroadcastsd (%ecx), %ymm0
+; X86-AVX2-NEXT: vbroadcastsd (%eax), %ymm1
+; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; X86-AVX2-NEXT: retl
+;
+; X64-AVX2-LABEL: blend_broadcasts_v1f64_2x:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vbroadcastsd (%rsi), %ymm0
+; X64-AVX2-NEXT: vbroadcastsd (%rdi), %ymm1
+; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; X64-AVX2-NEXT: retq
+;
+; X86-AVX512-LABEL: blend_broadcasts_v1f64_2x:
+; X86-AVX512: # %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX512-NEXT: vbroadcastsd (%ecx), %ymm0
+; X86-AVX512-NEXT: vbroadcastsd (%eax), %ymm1
+; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; X86-AVX512-NEXT: retl
+;
+; X64-AVX512-LABEL: blend_broadcasts_v1f64_2x:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: vbroadcastsd (%rsi), %ymm0
+; X64-AVX512-NEXT: vbroadcastsd (%rdi), %ymm1
+; X64-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; X64-AVX512-NEXT: retq
+ %ld0 = load <1 x double>, ptr %p0, align 32
+ %ld1 = load <1 x double>, ptr %p1, align 32
+ %bcst0 = shufflevector <1 x double> %ld0, <1 x double> undef, <2 x i32> zeroinitializer
+ %bcst1 = shufflevector <1 x double> %ld1, <1 x double> undef, <2 x i32> zeroinitializer
+ %blend = shufflevector <2 x double> %bcst0, <2 x double> %bcst1, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
+ ret <4 x double> %blend
+}
Thanks for looking at this
Based off test coverage for #135753 - these should be lowered to BLEND(BROADCAST(X),BROADCAST(Y))
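For reference, this is the shape of the v4f64 case (condensed from blend_broadcasts_v4f64 in the diff above), with the x86-64 AVX codegen the tests expect shown as comments:

define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) {
  ; Splat element 0 of each loaded vector, then take lanes 0 and 3 from
  ; %bcst0 and lanes 1 and 2 from %bcst1.
  %ld0 = load <4 x double>, ptr %p0, align 32
  %ld1 = load <4 x double>, ptr %p1, align 32
  %bcst0 = shufflevector <4 x double> %ld0, <4 x double> undef, <4 x i32> zeroinitializer
  %bcst1 = shufflevector <4 x double> %ld1, <4 x double> undef, <4 x i32> zeroinitializer
  %blend = shufflevector <4 x double> %bcst0, <4 x double> %bcst1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
  ret <4 x double> %blend
}
; Expected x86-64 AVX codegen (per the X64-AVX check lines):
;   vbroadcastsd (%rdi), %ymm0
;   vbroadcastsd (%rsi), %ymm1
;   vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
;   retq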
A few style issues, plus hopefully a fix for the AVX1 regression
✅ With the latest revision this PR passed the C/C++ code formatter.
Thanks, we can build on this pretty easily.
LGTM with one minor.
Thanks for the review 😊
Convert a BUILD_VECTOR of scalar values to a shuffle of shuffles that will lower to an AVX blend.

This addresses a regression in #128938.

Co-authored-by: Leon Clark <[email protected]>
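A minimal sketch of the kind of input the new lowering targets (hypothetical IR, not taken from the patch): an insertelement chain that becomes a v4f64 BUILD_VECTOR of the form X,Y,Y,X during SelectionDAG construction.

; Hypothetical example: on AVX targets this BUILD_VECTOR pattern is rewritten
; into the shuffle-of-shuffles form described in the code comment above.
define <4 x double> @build_xyyx(double %x, double %y) {
  %v0 = insertelement <4 x double> poison, double %x, i64 0
  %v1 = insertelement <4 x double> %v0, double %y, i64 1
  %v2 = insertelement <4 x double> %v1, double %y, i64 2
  %v3 = insertelement <4 x double> %v2, double %x, i64 3
  ; With the patch, the resulting BUILD_VECTOR X,Y,Y,X becomes
  ;   vector_shuffle<0,5,6,3>(vector_shuffle<0,u,u,0>(X,..),
  ;                           vector_shuffle<u,0,0,u>(Y,..))
  ; which is intended to select to an X86ISD::BLENDI of the two splat-like shuffles.
  ret <4 x double> %v3
}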