
[X86][DAGCombiner][SelectionDAG] - Fold Zext Build Vector to Bitcast of widen Build Vector #135010


Merged
merged 8 commits into llvm:main from ZextBuildVec May 6, 2025

Conversation

rohitaggarwal007
Contributor

I am working on a problem in which a kernel is SLP-vectorized, leading to the generation of insertelements followed by a zext. After lowering, the assembly looks like this:

vmovd   %r9d, %xmm0
vpinsrb $1, (%rdi,%rsi), %xmm0, %xmm0
vpinsrb $2, (%rdi,%rsi,2), %xmm0, %xmm0
vpinsrb $3, (%rdi,%rcx), %xmm0, %xmm0
vpinsrb $4, (%rdi,%rsi,4), %xmm0, %xmm0
vpinsrb $5, (%rdi,%rax), %xmm0, %xmm0
vpinsrb $6, (%rdi,%rcx,2), %xmm0, %xmm0
vpinsrb $7, (%rdi,%r8), %xmm0, %xmm0
vpmovzxbw       %xmm0, %xmm0            # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
vpmaddwd        (%rdx), %xmm0, %xmm0

After all the vpinsrb instructions, xmm0 looks like:
xmm0=xmm0[0],xmm0[1],xmm0[2],xmm0[3],xmm0[4],xmm0[5],xmm0[6],xmm0[7],zero,zero,zero,zero,zero,zero,zero,zero
Here vpmovzxbw performs the extension from i8 to i16, but it is an expensive operation and I want to remove it.

Optimization
Place each value in its final location while inserting, so that the zext can be avoided.
During lowering, we can write a custom LowerOperation for the zero_extend_vector_inreg opcode and override the current default handling with this custom implementation in the legalization step.

The proposed changes are shown below (an IR-level sketch of the equivalent rewrite follows the Discourse link):

vpinsrb $2, (%rdi,%rsi), %xmm0, %xmm0
vpinsrb $4, (%rdi,%rsi,2), %xmm0, %xmm0
vpinsrb $6, (%rdi,%rcx), %xmm0, %xmm0
vpinsrb $8, (%rdi,%rsi,4), %xmm0, %xmm0
vpinsrb $10, (%rdi,%rax), %xmm0, %xmm0
vpinsrb $12, (%rdi,%rcx,2), %xmm0, %xmm0
vpinsrb $14, (%rdi,%r8), %xmm0, %xmm0 # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
vpmaddwd        (%rdx), %xmm0, %xmm0

More details in the discourse topic https://discourse.llvm.org/t/improve-the-gathering-of-the-elements-so-that-unwanted-ext-operations-can-be-avoided/85443
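
As a rough illustration of why this works (this snippet is not part of the patch; it assumes little-endian x86 and uses a hypothetical function name), the zext of a <8 x i8> vector is equivalent to interleaving its bytes with zeros into a <16 x i8> vector and then bitcasting, which is exactly the layout the even-numbered vpinsrb immediates above build directly:

define <8 x i16> @zext_v8i8_as_bitcast(<8 x i8> %v) {
  ; Interleave the eight i8 elements with zero bytes: v0,0,v1,0,...,v7,0.
  %wide = shufflevector <8 x i8> %v, <8 x i8> zeroinitializer,
          <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11,
                      i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  ; On little-endian targets, each byte pair (v[i], 0) reads back as the
  ; zero-extended i16 value of v[i].
  %cast = bitcast <16 x i8> %wide to <8 x i16>
  ret <8 x i16> %cast
}

The patch performs the analogous rewrite at the SelectionDAG level on the build_vector operands themselves, so no separate extension instruction is needed.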

@llvmbot added the backend:X86 and llvm:SelectionDAG labels Apr 9, 2025
@llvmbot
Member

llvmbot commented Apr 9, 2025

@llvm/pr-subscribers-backend-webassembly
@llvm/pr-subscribers-backend-powerpc
@llvm/pr-subscribers-llvm-selectiondag

@llvm/pr-subscribers-backend-x86

Author: Rohit Aggarwal (rohitaggarwal007)

Changes

(Same as the PR description above.)

Full diff: https://github.com/llvm/llvm-project/pull/135010.diff

2 Files Affected:

  • (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+58)
  • (added) llvm/test/CodeGen/X86/WidenBuildVector.ll (+258)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 38376de5783ae..77c659aad0ed2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14195,6 +14195,61 @@ static SDValue widenAbs(SDNode *Extend, SelectionDAG &DAG) {
   return DAG.getZExtOrTrunc(NewAbs, SDLoc(Extend), VT);
 }
 
+// Try to widen the build vector and bitcast it to the type of the zext.
+// This is a special case for 128-bit vector types. The intention is to remove
+// the zext and replace it with a bitcast of the wider type. During lowering,
+// the bitcast is removed and the extra computation due to the zext is avoided.
+static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
+
+  assert(Extend->getOpcode() == ISD::ZERO_EXTEND && "Expected zero extend.");
+
+  EVT ExtendVT = Extend->getValueType(0);
+
+  SDValue BV = Extend->getOperand(0);
+  if (BV.getOpcode() != ISD::BUILD_VECTOR || !BV.hasOneUse())
+    return SDValue();
+
+  SDLoc dl(BV);
+  EVT VT = BV.getValueType();
+  EVT EltVT = BV.getOperand(0).getValueType();
+  unsigned NumElts = VT.getVectorNumElements();
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+  SmallVector<SDValue, 16> NewOps(BV->op_begin(), BV->op_end());
+  assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
+  // Fill the new elements with Zero.
+  NewOps.append(WidenNumElts - NumElts, DAG.getConstant(0, dl, EltVT));
+  // Compute the step used to place the elements at their final positions and
+  // to control the iteration.
+  unsigned step = WidenNumElts / NumElts;
+  if (WidenVT.is128BitVector()) {
+    if (Extend->getValueSizeInBits(0) == WidenVT.getSizeInBits()) {
+      for (int i = NumElts - 1, j = WidenNumElts - step; i > 0;
+           i--, j -= step) {
+        SDValue temp = NewOps[i];
+        NewOps[i] = NewOps[j];
+        NewOps[j] = temp;
+      }
+      // Create new build vector with WidenVT and NewOps
+      SDValue NewBV = DAG.getBuildVector(WidenVT, dl, NewOps);
+      // Replace the old build vector with the new one. Bitcast the
+      // new build vector to the type of the zext.
+      SDValue NewBVBitcast = DAG.getBitcast(ExtendVT, NewBV);
+      DAG.ReplaceAllUsesOfValueWith(SDValue(Extend, 0), NewBVBitcast);
+      LLVM_DEBUG(
+          dbgs() << DAG.getMachineFunction().getFunction().getName()
+                 << " - Widening buildvector and replace zext with bitcast\n";
+          BV.dump(); Extend->dump(); dbgs() << " to \n";
+          NewBV.getNode()->dump(); NewBVBitcast->dump(););
+      return NewBV;
+    }
+  }
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
@@ -14521,6 +14576,9 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
       return SDValue(CSENode, 0);
   }
 
+  if (SDValue V = widenBuildVec(N, DAG))
+    return V;
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/X86/WidenBuildVector.ll b/llvm/test/CodeGen/X86/WidenBuildVector.ll
new file mode 100644
index 0000000000000..d2924d016a1bf
--- /dev/null
+++ b/llvm/test/CodeGen/X86/WidenBuildVector.ll
@@ -0,0 +1,258 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mcpu=znver5 -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
+define dso_local i32 @foov8i8(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
+; CHECK-LABEL: foov8i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movzbl (%rdi), %eax
+; CHECK-NEXT:    leaq (%rsi,%rsi,2), %rcx
+; CHECK-NEXT:    leaq (%rsi,%rsi,4), %r8
+; CHECK-NEXT:    leaq (,%rsi,8), %r9
+; CHECK-NEXT:    subq %rsi, %r9
+; CHECK-NEXT:    vmovd %eax, %xmm0
+; CHECK-NEXT:    vpinsrb $2, (%rdi,%rsi), %xmm0, %xmm0
+; CHECK-NEXT:    vpinsrb $4, (%rdi,%rsi,2), %xmm0, %xmm0
+; CHECK-NEXT:    vpinsrb $6, (%rdi,%rcx), %xmm0, %xmm0
+; CHECK-NEXT:    vpinsrb $8, (%rdi,%rsi,4), %xmm0, %xmm0
+; CHECK-NEXT:    vpinsrb $10, (%rdi,%r8), %xmm0, %xmm0
+; CHECK-NEXT:    vpinsrb $12, (%rdi,%rcx,2), %xmm0, %xmm0
+; CHECK-NEXT:    vpinsrb $14, (%rdi,%r9), %xmm0, %xmm0
+; CHECK-NEXT:    vpmaddwd (%rdx), %xmm0, %xmm0
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vmovd %xmm0, %eax
+; CHECK-NEXT:    retq
+entry:
+  %var0 = load i8, ptr %a, align 1
+  %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
+  %var1 = load i8, ptr %arrayidx.1, align 1
+  %mul.2 = shl nsw i64 %a_stride, 1
+  %arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 %mul.2
+  %var2 = load i8, ptr %arrayidx.2, align 1
+  %mul.3 = mul nsw i64 %a_stride, 3
+  %arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 %mul.3
+  %var3 = load i8, ptr %arrayidx.3, align 1
+  %mul.4 = shl nsw i64 %a_stride, 2
+  %arrayidx.4 = getelementptr inbounds i8, ptr %a, i64 %mul.4
+  %var4 = load i8, ptr %arrayidx.4, align 1
+  %mul.5 = mul nsw i64 %a_stride, 5
+  %arrayidx.5 = getelementptr inbounds i8, ptr %a, i64 %mul.5
+  %var5 = load i8, ptr %arrayidx.5, align 1
+  %mul.6 = mul nsw i64 %a_stride, 6
+  %arrayidx.6 = getelementptr inbounds i8, ptr %a, i64 %mul.6
+  %var6 = load i8, ptr %arrayidx.6, align 1
+  %mul.7 = mul nsw i64 %a_stride, 7
+  %arrayidx.7 = getelementptr inbounds i8, ptr %a, i64 %mul.7
+  %var7 = load i8, ptr %arrayidx.7, align 1
+  %var8 = insertelement <8 x i8> poison, i8 %var0, i64 0
+  %var9 = insertelement <8 x i8> %var8, i8 %var1, i64 1
+  %var10 = insertelement <8 x i8> %var9, i8 %var2, i64 2
+  %var11 = insertelement <8 x i8> %var10, i8 %var3, i64 3
+  %var12 = insertelement <8 x i8> %var11, i8 %var4, i64 4
+  %var13 = insertelement <8 x i8> %var12, i8 %var5, i64 5
+  %var14 = insertelement <8 x i8> %var13, i8 %var6, i64 6
+  %var15 = insertelement <8 x i8> %var14, i8 %var7, i64 7
+  %var16 = zext <8 x i8> %var15 to <8 x i32>
+  %var17 = load <8 x i16>, ptr %b, align 2
+  %var18 = sext <8 x i16> %var17 to <8 x i32>
+  %var19 = mul nsw <8 x i32> %var18, %var16
+  %var20 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %var19)
+  ret i32 %var20
+}
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
+define dso_local i32 @foov4i8(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
+; CHECK-LABEL: foov4i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movzbl (%rdi), %eax
+; CHECK-NEXT:    leaq (%rsi,%rsi,2), %rcx
+; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; CHECK-NEXT:    vmovd %eax, %xmm0
+; CHECK-NEXT:    vpinsrb $4, (%rdi,%rsi), %xmm0, %xmm0
+; CHECK-NEXT:    vpinsrb $8, (%rdi,%rsi,2), %xmm0, %xmm0
+; CHECK-NEXT:    vpinsrb $12, (%rdi,%rcx), %xmm0, %xmm0
+; CHECK-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vmovd %xmm0, %eax
+; CHECK-NEXT:    retq
+entry:
+  %var0 = load i8, ptr %a, align 1
+  %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
+  %var1 = load i8, ptr %arrayidx.1, align 1
+  %mul.2 = shl nsw i64 %a_stride, 1
+  %arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 %mul.2
+  %var2 = load i8, ptr %arrayidx.2, align 1
+  %mul.3 = mul nsw i64 %a_stride, 3
+  %arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 %mul.3
+  %var3 = load i8, ptr %arrayidx.3, align 1
+  %var8 = insertelement <4 x i8> poison, i8 %var0, i64 0
+  %var9 = insertelement <4 x i8> %var8, i8 %var1, i64 1
+  %var10 = insertelement <4 x i8> %var9, i8 %var2, i64 2
+  %var11 = insertelement <4 x i8> %var10, i8 %var3, i64 3
+  %var16 = zext <4 x i8> %var11 to <4 x i32>
+  %var17 = load <4 x i16>, ptr %b, align 2
+  %var18 = sext <4 x i16> %var17 to <4 x i32>
+  %var19 = mul nsw <4 x i32> %var18, %var16
+  %var20 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %var19)
+  ret i32 %var20
+}
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
+define dso_local i32 @foov2i8(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
+; CHECK-LABEL: foov2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movzbl (%rdi), %eax
+; CHECK-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovd %eax, %xmm0
+; CHECK-NEXT:    vpinsrb $4, (%rdi,%rsi), %xmm0, %xmm0
+; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; CHECK-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vmovd %xmm0, %eax
+; CHECK-NEXT:    retq
+  %var0 = load i8, ptr %a, align 1
+  %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
+  %var1 = load i8, ptr %arrayidx.1, align 1
+  %var8 = insertelement <2 x i8> poison, i8 %var0, i64 0
+  %var9 = insertelement <2 x i8> %var8, i8 %var1, i64 1
+  %var16 = zext <2 x i8> %var9 to <2 x i32>
+  %var17 = load <2 x i16>, ptr %b, align 2
+  %var18 = sext <2 x i16> %var17 to <2 x i32>
+  %var19 = mul nsw <2 x i32> %var18, %var16
+  %var20 = tail call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %var19)
+  ret i32 %var20
+}
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
+define dso_local i64 @foov2i8_v2i64(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
+; CHECK-LABEL: foov2i8_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movzbl (%rdi), %eax
+; CHECK-NEXT:    vpmovsxbq (%rdx), %xmm1
+; CHECK-NEXT:    vmovd %eax, %xmm0
+; CHECK-NEXT:    vpinsrb $8, (%rdi,%rsi), %xmm0, %xmm0
+; CHECK-NEXT:    vpmuldq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; CHECK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vmovq %xmm0, %rax
+; CHECK-NEXT:    retq
+  %var0 = load i8, ptr %a, align 1
+  %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
+  %var1 = load i8, ptr %arrayidx.1, align 1
+  %var8 = insertelement <2 x i8> poison, i8 %var0, i64 0
+  %var9 = insertelement <2 x i8> %var8, i8 %var1, i64 1
+  %var16 = zext <2 x i8> %var9 to <2 x i64>
+  %var17 = load <2 x i8>, ptr %b, align 2
+  %var18 = sext <2 x i8> %var17 to <2 x i64>
+  %var19 = mul nsw <2 x i64> %var18, %var16
+  %var20 = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %var19)
+  ret i64 %var20
+}
+
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
+define dso_local i32 @foov4i16(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
+; CHECK-LABEL: foov4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movzwl (%rdi), %eax
+; CHECK-NEXT:    leaq (%rsi,%rsi,2), %rcx
+; CHECK-NEXT:    vpmovsxwd (%rdx), %xmm1
+; CHECK-NEXT:    vmovd %eax, %xmm0
+; CHECK-NEXT:    vpinsrw $2, (%rdi,%rsi), %xmm0, %xmm0
+; CHECK-NEXT:    vpinsrw $4, (%rdi,%rsi,2), %xmm0, %xmm0
+; CHECK-NEXT:    vpinsrw $6, (%rdi,%rcx), %xmm0, %xmm0
+; CHECK-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vmovd %xmm0, %eax
+; CHECK-NEXT:    retq
+entry:
+  %var0 = load i16, ptr %a, align 1
+  %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
+  %var1 = load i16, ptr %arrayidx.1, align 1
+  %mul.2 = shl nsw i64 %a_stride, 1
+  %arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 %mul.2
+  %var2 = load i16, ptr %arrayidx.2, align 1
+  %mul.3 = mul nsw i64 %a_stride, 3
+  %arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 %mul.3
+  %var3 = load i16, ptr %arrayidx.3, align 1
+  %var8 = insertelement <4 x i16> poison, i16 %var0, i64 0
+  %var9 = insertelement <4 x i16> %var8, i16 %var1, i64 1
+  %var10 = insertelement <4 x i16> %var9, i16 %var2, i64 2
+  %var11 = insertelement <4 x i16> %var10, i16 %var3, i64 3
+  %var16 = zext <4 x i16> %var11 to <4 x i32>
+  %var17 = load <4 x i16>, ptr %b, align 2
+  %var18 = sext <4 x i16> %var17 to <4 x i32>
+  %var19 = mul nsw <4 x i32> %var18, %var16
+  %var20 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %var19)
+  ret i32 %var20
+}
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
+define dso_local i32 @foov2i16(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
+; CHECK-LABEL: foov2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movzwl (%rdi), %eax
+; CHECK-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovd %eax, %xmm0
+; CHECK-NEXT:    vpinsrw $1, (%rdi,%rsi), %xmm0, %xmm0
+; CHECK-NEXT:    vpmovsxwd %xmm1, %xmm1
+; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vmovd %xmm0, %eax
+; CHECK-NEXT:    retq
+  %var0 = load i16, ptr %a, align 1
+  %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
+  %var1 = load i16, ptr %arrayidx.1, align 1
+  %var8 = insertelement <2 x i16> poison, i16 %var0, i64 0
+  %var9 = insertelement <2 x i16> %var8, i16 %var1, i64 1
+  %var16 = zext <2 x i16> %var9 to <2 x i32>
+  %var17 = load <2 x i16>, ptr %b, align 2
+  %var18 = sext <2 x i16> %var17 to <2 x i32>
+  %var19 = mul nsw <2 x i32> %var18, %var16
+  %var20 = tail call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %var19)
+  ret i32 %var20
+}
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
+define dso_local i64 @foov2i32(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
+; CHECK-LABEL: foov2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-NEXT:    vpmovsxdq (%rdx), %xmm1
+; CHECK-NEXT:    vpmullq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; CHECK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vmovq %xmm0, %rax
+; CHECK-NEXT:    retq
+  %var0 = load i32, ptr %a, align 1
+  %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
+  %var1 = load i32, ptr %arrayidx.1, align 1
+  %var8 = insertelement <2 x i32> poison, i32 %var0, i64 0
+  %var9 = insertelement <2 x i32> %var8, i32 %var1, i64 1
+  %var16 = zext <2 x i32> %var9 to <2 x i64>
+  %var17 = load <2 x i32>, ptr %b, align 2
+  %var18 = sext <2 x i32> %var17 to <2 x i64>
+  %var19 = mul nsw <2 x i64> %var18, %var16
+  %var20 = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %var19)
+  ret i64 %var20
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) #1
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #1
+declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) #1
+declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) #1
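
As a hypothetical worked example of the widening arithmetic above (not taken from the patch; little-endian x86 assumed): for a zext of <4 x i8> to <4 x i32>, the widened legal type is <16 x i8>, so step = 16 / 4 = 4 and element i is moved to index 4*i while the remaining lanes stay zero; the bitcast back to <4 x i32> then yields the value the zext would have produced, which matches the vpinsrb $4/$8/$12 immediates in the foov4i8 test above. Expressed in IR:

define <4 x i32> @widen_v4i8_sketch(<4 x i8> %v) {
  ; Place element i at byte index 4*i of a <16 x i8> vector, zero elsewhere
  ; (mask indices 4-7 select from the zeroinitializer operand).
  %wide = shufflevector <4 x i8> %v, <4 x i8> zeroinitializer,
          <16 x i32> <i32 0, i32 4, i32 5, i32 6,
                      i32 1, i32 4, i32 5, i32 6,
                      i32 2, i32 4, i32 5, i32 6,
                      i32 3, i32 4, i32 5, i32 6>
  ; Little-endian bitcast: each i32 lane has v[i] in its low byte and zeros above.
  %cast = bitcast <16 x i8> %wide to <4 x i32>
  ret <4 x i32> %cast
}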

@RKSimon RKSimon requested review from arsenm, RKSimon and phoebewang April 9, 2025 17:48
Collaborator

@RKSimon left a comment

please can you regenerate the broken tests?

@rohitaggarwal007
Contributor Author

please can you regenerate the broken tests?

Sure, will fix them

@rohitaggarwal007
Contributor Author

Is it a good idea to move the logic into X86ISelLowering.cpp so that it is enabled only for X86, in case other targets regress?

Comment on lines 802 to 804
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pinsrb $12, 3(%rdi), %xmm1
; SSE41-NEXT: pextrd $3, %xmm1, %eax
Contributor

Seems to also regress on X86?

Contributor Author

Yes, I will handle this case as well.

Comment on lines 14247 to 14251
LLVM_DEBUG(
dbgs() << DAG.getMachineFunction().getFunction().getName()
<< " - Widening buildvector and replace zext with bitcast\n";
BV.dump(); Extend->dump(); dbgs() << " to \n";
NewBV.getNode()->dump(); NewBVBitcast->dump(););
Contributor

I would remove this whole debug print. You should see the transform triggered in the generic DAG combine case. You also don't need all of the manual dumping; you can directly use operator<<.

Contributor Author

Sure, I will fix it

@rohitaggarwal007
Contributor Author

@RKSimon @phoebewang The zext-to-bitcast fold is not beneficial for other targets; the bitcast blocks their folds. We can move the optimization into X86ISelLowering, do the needed work there, and handle all the regressing test cases for X86.
Does that sound good?

Comment on lines 14198 to 14201
// Try to widen the build vector and bitcast it to the type of the zext.
// This is a special case for 128-bit vector types. The intention is to remove
// the zext and replace it with a bitcast of the wider type. During lowering,
// the bitcast is removed and the extra computation due to the zext is avoided.
Contributor

Add a DAG-format comment of the matched pattern.

Contributor Author

@arsenm Add a comment like zext(build_vec) -> bitcast(build_vec), which visualizes the transformation, right?

@RKSimon
Collaborator

RKSimon commented Apr 24, 2025

@RKSimon @phoebewang The zext-to-bitcast fold is not beneficial for other targets; the bitcast blocks their folds. We can move the optimization into X86ISelLowering, do the needed work there, and handle all the regressing test cases for X86. Does that sound good?

Sorry I missed this - will take a look

RKSimon added a commit that referenced this pull request Apr 24, 2025
@@ -0,0 +1,258 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mcpu=znver5 -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
Collaborator

You should be able to remove this WidenBuildVector.ll file now - I've pushed 8b2d269 which contains these (cleaned up) tests for a wider range of x86 targets - please can you merge with trunk to see the codegen diffs?

Contributor Author

Sure, will do it.

@rohitaggarwal007
Contributor Author

@RKSimon @phoebewang, I have handled the bad codegen cases for other targets.

@@ -14295,6 +14295,78 @@ static SDValue widenAbs(SDNode *Extend, SelectionDAG &DAG) {
return DAG.getZExtOrTrunc(NewAbs, SDLoc(Extend), VT);
}

// Try to widen the build vector and bitcast it to the type of the zext.
// This is a special case for 128-bit vector types. The intention is to remove
Contributor

This looks specific to X86; better to move it to X86ISelLowering.

Contributor Author

Done

Comment on lines 17 to 23
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB0_1
; CHECK-NEXT: # %bb.2: # %bb35
; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jmp .LBB0_1
Contributor

No real change in the file. Should be reverted.

Contributor Author

Done

@@ -274,7 +274,7 @@ define i64 @PR55050() {
; X86-NEXT: # %bb.1: # %if
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: .LBB15_2: # %exit
; X86-NEXT: movl %eax, %edx
; X86-NEXT: movl %eax, %edx
Contributor

ditto.

Contributor Author

Done

Contributor

@phoebewang left a comment

LGTM, thanks!

@rohitaggarwal007
Contributor Author

@phoebewang Thanks!
Can you please merge this into the main branch on my behalf?
I don't have write permission.

@phoebewang phoebewang merged commit fdbc30a into llvm:main May 6, 2025
11 checks passed
@rohitaggarwal007 rohitaggarwal007 deleted the ZextBuildVec branch May 6, 2025 08:48
IanWood1 pushed a commit to IanWood1/llvm-project that referenced this pull request May 6, 2025
IanWood1 pushed a commit to IanWood1/llvm-project that referenced this pull request May 6, 2025
IanWood1 pushed a commit to IanWood1/llvm-project that referenced this pull request May 6, 2025
GeorgeARM pushed a commit to GeorgeARM/llvm-project that referenced this pull request May 7, 2025
[X86][DAGCombiner][SelectionDAG] - Fold Zext Build Vector to Bitcast of widen Build Vector (llvm#135010)