From 1669e06ec25259ab0cf20591f3b521b84e8e26ce Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Fri, 12 Sep 2025 21:34:11 +0100 Subject: [PATCH 1/2] [VPlan] Introduce chainUsesScalarValues Introduce chainUsesScalarValues to dig through a recipe-chain, skipping widening decisions, and determine if the final leaves use only scalar values of the given root. Demonstrate its utility in narrowToSingleScalarRecipes, showing that it is essentially a drop-in replacement for onlyScalarValuesUsed for the purposes of optimizations. --- .../Transforms/Vectorize/VPlanTransforms.cpp | 12 ++-- llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 25 ++++++++ llvm/lib/Transforms/Vectorize/VPlanUtils.h | 5 ++ .../AArch64/conditional-branches-cost.ll | 64 ++++++++----------- .../extractvalue-no-scalarization-required.ll | 10 ++- .../AArch64/sve-widen-extractvalue.ll | 8 +-- .../LoopVectorize/RISCV/dead-ops-cost.ll | 6 +- .../LoopVectorize/X86/cost-model.ll | 14 ++-- ...licate-recipe-with-only-first-lane-used.ll | 8 +-- .../LoopVectorize/X86/uniform_load.ll | 39 ++++------- ...eref-pred-poison-ub-ops-feeding-pointer.ll | 18 +++--- llvm/test/Transforms/LoopVectorize/pr50686.ll | 18 ++---- .../LoopVectorize/single-scalar-cast-minbw.ll | 8 +-- 13 files changed, 109 insertions(+), 126 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 2cac5557daeee..92add4a96a5ed 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1322,15 +1322,11 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { continue; } - // Skip recipes that aren't single scalars or don't have only their - // scalar results used. In the latter case, we would introduce extra - // broadcasts. + // Only consider recipes that are single scalars whose scalar value is + // used by the final leaves in a recipe-chain walk: if the final leaves + // don't use the scalar value, it could introduce extra broadcasts. if (!vputils::isSingleScalar(RepOrWidenR) || - !all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) { - return U->usesScalars(RepOrWidenR) || - match(cast(U), - m_ExtractLastElement(m_VPValue())); - })) + !vputils::chainUsesScalarValues(RepOrWidenR)) continue; auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(), diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index c6c1ef3369825..3f21063330b03 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -29,6 +29,31 @@ bool vputils::onlyScalarValuesUsed(const VPValue *Def) { [Def](const VPUser *U) { return U->usesScalars(Def); }); } +bool vputils::chainUsesScalarValues(const VPValue *Root) { + SmallVector> Worklist; + for (const VPUser *V : Root->users()) + Worklist.emplace_back(Root, V); + while (!Worklist.empty()) { + auto [Op, U] = Worklist.pop_back_val(); + if (isa(U)) { + const VPValue *Def = cast(U)->getVPSingleValue(); + for (const VPUser *V : Def->users()) + Worklist.emplace_back(Def, V); + continue; + } + if (auto *Store = dyn_cast(U)) + if (vputils::isSingleScalar(Store->getStoredValue())) + continue; + if (auto *VPI = dyn_cast(U)) + if (VPI->isVectorToScalar()) + continue; + if (!U->usesScalars(Op)) + return false; + } + return true; +} + VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) { if (auto *Expanded = Plan.getSCEVExpansion(Expr)) return Expanded; diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index 77c099b271717..8fcb51ec98ade 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -28,6 +28,11 @@ bool onlyFirstPartUsed(const VPValue *Def); /// Returns true if only scalar values of \p Def are used by all users. bool onlyScalarValuesUsed(const VPValue *Def); +/// Digs through a chain of recipes starting from \p Root, skipping widening +/// decisions, and determines if the final leaves use only scalar values of \p +/// Root. +bool chainUsesScalarValues(const VPValue *Root); + /// Get or create a VPValue that corresponds to the expansion of \p Expr. If \p /// Expr is a SCEVConstant or SCEVUnknown, return a VPValue wrapping the live-in /// value. Otherwise return a VPExpandSCEVRecipe to expand \p Expr. If \p Plan's diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll index 884eeac09e1e3..c38f80a994e89 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll @@ -428,48 +428,36 @@ define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, pt ; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] ; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] ; DEFAULT: [[VECTOR_BODY]]: -; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE37:.*]] ] -; DEFAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META8:![0-9]+]] -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <4 x i32> poison, i32 [[TMP9]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT29:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT28]], <4 x i32> poison, <4 x i32> zeroinitializer -; DEFAULT-NEXT: [[TMP19:%.*]] = load i32, ptr [[B]], align 4, !alias.scope [[META11:![0-9]+]] -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP19]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; DEFAULT-NEXT: [[TMP6:%.*]] = or <4 x i32> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT29]] -; DEFAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META13:![0-9]+]] -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT31:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT30]], <4 x i32> poison, <4 x i32> zeroinitializer -; DEFAULT-NEXT: [[TMP8:%.*]] = icmp ugt <4 x i32> [[BROADCAST_SPLAT31]], [[TMP6]] +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE33:.*]] ] +; DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META8:![0-9]+]] +; DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[B]], align 4, !alias.scope [[META11:![0-9]+]] +; DEFAULT-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], [[TMP3]] +; DEFAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META13:![0-9]+]] +; DEFAULT-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP5]] +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP7]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer ; DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[D]], i64 [[INDEX]] -; DEFAULT-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0 -; DEFAULT-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; DEFAULT-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; DEFAULT: [[PRED_STORE_IF]]: -; DEFAULT-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 -; DEFAULT-NEXT: store i32 [[TMP11]], ptr [[E]], align 4, !alias.scope [[META15:![0-9]+]], !noalias [[META17:![0-9]+]] +; DEFAULT-NEXT: store i32 [[TMP5]], ptr [[E]], align 4, !alias.scope [[META15:![0-9]+]], !noalias [[META17:![0-9]+]] ; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE]] ; DEFAULT: [[PRED_STORE_CONTINUE]]: -; DEFAULT-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP8]], i32 1 -; DEFAULT-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF32:.*]], label %[[PRED_STORE_CONTINUE33:.*]] +; DEFAULT-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF28:.*]], label %[[PRED_STORE_CONTINUE29:.*]] +; DEFAULT: [[PRED_STORE_IF28]]: +; DEFAULT-NEXT: store i32 [[TMP5]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]] +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE29]] +; DEFAULT: [[PRED_STORE_CONTINUE29]]: +; DEFAULT-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF30:.*]], label %[[PRED_STORE_CONTINUE31:.*]] +; DEFAULT: [[PRED_STORE_IF30]]: +; DEFAULT-NEXT: store i32 [[TMP5]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]] +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE31]] +; DEFAULT: [[PRED_STORE_CONTINUE31]]: +; DEFAULT-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF32:.*]], label %[[PRED_STORE_CONTINUE33]] ; DEFAULT: [[PRED_STORE_IF32]]: -; DEFAULT-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 -; DEFAULT-NEXT: store i32 [[TMP13]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]] +; DEFAULT-NEXT: store i32 [[TMP5]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]] ; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE33]] ; DEFAULT: [[PRED_STORE_CONTINUE33]]: -; DEFAULT-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP8]], i32 2 -; DEFAULT-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF34:.*]], label %[[PRED_STORE_CONTINUE35:.*]] -; DEFAULT: [[PRED_STORE_IF34]]: -; DEFAULT-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 -; DEFAULT-NEXT: store i32 [[TMP15]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]] -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE35]] -; DEFAULT: [[PRED_STORE_CONTINUE35]]: -; DEFAULT-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP8]], i32 3 -; DEFAULT-NEXT: br i1 [[TMP21]], label %[[PRED_STORE_IF36:.*]], label %[[PRED_STORE_CONTINUE37]] -; DEFAULT: [[PRED_STORE_IF36]]: -; DEFAULT-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 -; DEFAULT-NEXT: store i32 [[TMP22]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]] -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE37]] -; DEFAULT: [[PRED_STORE_CONTINUE37]]: -; DEFAULT-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr [[TMP16]], i32 4, <4 x i1> [[TMP8]]), !alias.scope [[META19:![0-9]+]], !noalias [[META20:![0-9]+]] +; DEFAULT-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr [[TMP16]], i32 4, <4 x i1> [[BROADCAST_SPLAT]]), !alias.scope [[META19:![0-9]+]], !noalias [[META20:![0-9]+]] ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; DEFAULT-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; DEFAULT-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] @@ -727,10 +715,10 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <8 x float> poison, float [[TMP15]], i64 0 ; DEFAULT-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT8]], <8 x float> poison, <8 x i32> zeroinitializer ; DEFAULT-NEXT: [[TMP16:%.*]] = load float, ptr [[SRC_2]], align 4 -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[TMP16]], i64 0 +; DEFAULT-NEXT: [[TMP17:%.*]] = fmul float [[TMP16]], 0.000000e+00 +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[TMP17]], i64 0 ; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer -; DEFAULT-NEXT: [[TMP17:%.*]] = fmul <8 x float> [[BROADCAST_SPLAT]], zeroinitializer -; DEFAULT-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[BROADCAST_SPLAT9]], <8 x float> zeroinitializer, <8 x float> [[TMP17]]) +; DEFAULT-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[BROADCAST_SPLAT9]], <8 x float> zeroinitializer, <8 x float> [[BROADCAST_SPLAT]]) ; DEFAULT-NEXT: [[TMP19:%.*]] = load float, ptr [[SRC_3]], align 4 ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0 ; DEFAULT-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT10]], <8 x float> poison, <8 x i32> zeroinitializer diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll index efcd810203a44..379825bbbab88 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll @@ -16,7 +16,7 @@ ; CM: vector.ph: ; CM: CLONE ir<%a> = extractvalue ir<%sv> ; CM: CLONE ir<%b> = extractvalue ir<%sv> -; CM: WIDEN ir<%add> = add ir<%a>, ir<%b> +; CM: CLONE ir<%add> = add ir<%a>, ir<%b> ; CM: Successor(s): vector loop ; CM: LV: Scalar loop costs: 5. @@ -30,17 +30,15 @@ define void @test1(ptr %dst, {i64, i64} %sv) { ; FORCED-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; FORCED: [[VECTOR_PH]]: ; FORCED-NEXT: [[TMP0:%.*]] = extractvalue { i64, i64 } [[SV]], 0 -; FORCED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0 -; FORCED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer ; FORCED-NEXT: [[TMP4:%.*]] = extractvalue { i64, i64 } [[SV]], 1 -; FORCED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i64 0 +; FORCED-NEXT: [[TMP5:%.*]] = add i64 [[TMP0]], [[TMP4]] +; FORCED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i64 0 ; FORCED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer -; FORCED-NEXT: [[TMP1:%.*]] = add <2 x i64> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]] ; FORCED-NEXT: br label %[[VECTOR_BODY:.*]] ; FORCED: [[VECTOR_BODY]]: ; FORCED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; FORCED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[DST]], i32 [[INDEX]] -; FORCED-NEXT: store <2 x i64> [[TMP1]], ptr [[TMP2]], align 4 +; FORCED-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP2]], align 4 ; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; FORCED-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 ; FORCED-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll index 0c6a490ddf4ba..eceda0897b174 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll @@ -17,17 +17,15 @@ define void @widen_extractvalue(ptr %dst, {i64, i64} %sv) #0 { ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 1000, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 1000, [[N_MOD_VF]] ; CHECK-NEXT: [[EXTRACT0:%.*]] = extractvalue { i64, i64 } [[SV]], 0 -; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement poison, i64 [[EXTRACT0]], i64 0 -; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector [[DOTSPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { i64, i64 } [[SV]], 1 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TMP10]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[EXTRACT0]], [[TMP10]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TMP6]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = add [[DOTSPLAT2]], [[BROADCAST_SPLAT2]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[DST]], i32 [[INDEX]] -; CHECK-NEXT: store [[TMP7]], ptr [[TMP8]], align 8 +; CHECK-NEXT: store [[BROADCAST_SPLAT2]], ptr [[TMP8]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]] ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll index e014b9ddbcd0e..95e68d6cc300a 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll @@ -304,9 +304,9 @@ define void @test_phi_in_latch_redundant(ptr %dst, i32 %a) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[A]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = xor i32 [[A]], -1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP0]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = xor [[BROADCAST_SPLAT]], splat (i32 -1) ; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[TMP7:%.*]] = mul [[TMP6]], splat (i64 9) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] @@ -320,7 +320,7 @@ define void @test_phi_in_latch_redundant(ptr %dst, i32 %a) { ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[DST]], [[VEC_IND]] -; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( [[TMP19]], align 4 [[TMP16]], splat (i1 true), i32 [[TMP8]]) +; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( [[BROADCAST_SPLAT]], align 4 [[TMP16]], splat (i1 true), i32 [[TMP8]]) ; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP8]] to i64 ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP17]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll index 391653a2efe34..1b171dac3c655 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll @@ -335,7 +335,7 @@ define void @multi_exit(ptr %dst, ptr %src.1, ptr %src.2, i64 %A, i64 %B) #0 { ; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[TMP0]] ; CHECK-NEXT: [[UMIN7:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[A:%.*]]) ; CHECK-NEXT: [[TMP2:%.*]] = add nuw i64 [[UMIN7]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 28 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 14 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[B]], i64 1) @@ -370,14 +370,12 @@ define void @multi_exit(ptr %dst, ptr %src.1, ptr %src.2, i64 %A, i64 %B) #0 { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[SRC_1]], align 8, !alias.scope [[META6:![0-9]+]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP13]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[SRC_2]], align 8, !alias.scope [[META9:![0-9]+]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP14]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT9]], <2 x i64> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT10]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = and <2 x i1> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP21:%.*]] = and i1 [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[TMP21]], i64 0 +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP18:%.*]] = zext <2 x i1> [[TMP17]] to <2 x i8> ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i8> [[TMP18]], i32 1 ; CHECK-NEXT: store i8 [[TMP19]], ptr [[DST]], align 1, !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll b/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll index b35e211ff0d53..77b56e7643533 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll @@ -201,10 +201,6 @@ define float @uniform_load_replicating_select(ptr %A, ptr %B, i64 %1) { ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 7 ; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[A]], align 4 ; CHECK-NEXT: [[TMP10:%.*]] = fcmp ogt float [[TMP6]], 0.000000e+00 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i1> poison, i1 [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i1> [[TMP8]], i1 [[TMP10]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i1> [[TMP9]], i1 [[TMP10]], i32 2 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i1> [[TMP13]], i1 [[TMP10]], i32 3 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]] @@ -213,7 +209,9 @@ define float @uniform_load_replicating_select(ptr %A, ptr %B, i64 %1) { ; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP10]], ptr [[A]], ptr [[TMP16]] ; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP10]], ptr [[A]], ptr [[TMP17]] ; CHECK-NEXT: [[TMP22:%.*]] = select i1 [[TMP10]], ptr [[A]], ptr [[TMP18]] -; CHECK-NEXT: [[TMP23:%.*]] = select <4 x i1> [[TMP14]], <4 x float> splat (float 1.000000e+01), <4 x float> splat (float 1.000000e+00) +; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP10]], float 1.000000e+01, float 1.000000e+00 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP36]], i64 0 +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP19]], align 4 ; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP20]], align 4 ; CHECK-NEXT: [[TMP26:%.*]] = load float, ptr [[TMP21]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll index 62d08c8668235..4f685ab082e67 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll @@ -67,39 +67,24 @@ exit: define void @uniform_load_can_fold_users(ptr noalias %src, ptr %dst, i64 %start, double %d) { ; CHECK-LABEL: define void @uniform_load_can_fold_users( ; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr [[DST:%.*]], i64 [[START:%.*]], double [[D:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[START]], 1 -; CHECK-NEXT: [[SMIN:%.*]] = call i64 @llvm.smin.i64(i64 [[START]], i64 0) -; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[SMIN]] -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 2 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]] -; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 2 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[START]], [[N_VEC]] -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[TMP4:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ [[START]], %[[ENTRY]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[SRC]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[BROADCAST_SPLAT]], splat (double 9.000000e+00) -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = fmul double [[TMP5]], 9.000000e+00 ; CHECK-NEXT: [[TMP8:%.*]] = fdiv double [[TMP7]], [[D]] -; CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[TMP3]], 1 ; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[TMP4]], 1 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP11]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[TMP12]], i64 [[TMP10]] -; CHECK-NEXT: store double [[TMP8]], ptr [[TMP13]], align 8 ; CHECK-NEXT: store double [[TMP8]], ptr [[TMP14]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[IV_1_NEXT]] = add i64 [[TMP4]], 1 +; CHECK-NEXT: [[IV_2_NEXT]] = add i64 [[IV_2]], -1 +; CHECK-NEXT: [[EC:%.*]] = icmp sgt i64 [[IV_2]], 0 +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll index 70e730f0284c0..a2a5d56ff1bfb 100644 --- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll +++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll @@ -105,10 +105,10 @@ define void @ptr_depends_on_possibly_poison_value(ptr noalias %dst, i16 %off) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[OFF]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = sub i16 1, [[OFF]] +; CHECK-NEXT: [[TMP1:%.*]] = add i16 [[TMP0]], [[OFF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP0:%.*]] = sub <2 x i16> splat (i16 1), [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i16> [[TMP0]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE2:.*]] ] @@ -116,7 +116,7 @@ define void @ptr_depends_on_possibly_poison_value(ptr noalias %dst, i16 %off) { ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i16 9, [[DOTCAST]] ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <2 x i16> [[VEC_IND]], splat (i16 10) -; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i16> [[VEC_IND]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i16> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 ; CHECK-NEXT: br i1 [[TMP6]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; CHECK: [[PRED_STORE_IF]]: @@ -257,12 +257,12 @@ define void @ptr_depends_on_possibly_poison_value_from_load(ptr noalias %dst) { ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i16 9, [[DOTCAST]] ; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr @src, align 1 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <2 x i16> [[VEC_IND]], splat (i16 10) -; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i16> splat (i16 1), [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i16> [[TMP2]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i16> [[VEC_IND]], [[TMP3]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i16 1, [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = add i16 [[TMP2]], [[TMP0]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[TMP3]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i16> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 ; CHECK-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; CHECK: [[PRED_STORE_IF]]: diff --git a/llvm/test/Transforms/LoopVectorize/pr50686.ll b/llvm/test/Transforms/LoopVectorize/pr50686.ll index 14bcfde5b4423..1de075af4389d 100644 --- a/llvm/test/Transforms/LoopVectorize/pr50686.ll +++ b/llvm/test/Transforms/LoopVectorize/pr50686.ll @@ -18,20 +18,16 @@ define void @m(ptr nocapture %p, ptr nocapture %p2, i32 %q) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[P2]], align 4, !alias.scope [[META0:![0-9]+]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> zeroinitializer, [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX9_1]], align 4, !alias.scope [[META0]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <4 x i32> [[TMP2]], [[BROADCAST_SPLAT3]] -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX9_2]], align 4, !alias.scope [[META0]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[P2]], align 4, !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = sub nsw i32 0, [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX9_1]], align 4, !alias.scope [[META0]] +; CHECK-NEXT: [[TMP3:%.*]] = sub nsw i32 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX9_2]], align 4, !alias.scope [[META0]] +; CHECK-NEXT: [[TMP5:%.*]] = sub nsw i32 [[TMP3]], [[TMP4]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <4 x i32> [[TMP4]], [[BROADCAST_SPLAT5]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDEX]] -; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP7]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] +; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT5]], ptr [[TMP7]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 60 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll index 9a699826696ec..70adac2103feb 100644 --- a/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll +++ b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll @@ -84,12 +84,8 @@ define void @single_scalar_cast_stored(ptr %src, ptr %dst, i32 %n) { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC]], align 2, !alias.scope [[META4:![0-9]+]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i16> [[BROADCAST_SPLAT]], zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i16> [[BROADCAST_SPLAT]], splat (i16 15) -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i16 [[TMP0]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = and i16 [[TMP0]], 15 ; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], i16 0, i16 [[TMP4]] ; CHECK-NEXT: store i16 [[TMP5]], ptr [[DST]], align 2, !alias.scope [[META7:![0-9]+]], !noalias [[META4]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 From a95f5ca3280fbe823d94f4f17a75294cc7b195e0 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Sat, 13 Sep 2025 15:01:50 +0100 Subject: [PATCH 2/2] [VPlan] Fix WidenStore-related thinko; extend VPI --- llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 3f21063330b03..6cb9a978b4cb8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -37,16 +37,15 @@ bool vputils::chainUsesScalarValues(const VPValue *Root) { auto [Op, U] = Worklist.pop_back_val(); if (isa(U)) { - const VPValue *Def = cast(U)->getVPSingleValue(); + const VPValue *Def = cast(U); for (const VPUser *V : Def->users()) Worklist.emplace_back(Def, V); continue; } - if (auto *Store = dyn_cast(U)) - if (vputils::isSingleScalar(Store->getStoredValue())) - continue; + if (isa(U) && vputils::isSingleScalar(Op)) + continue; if (auto *VPI = dyn_cast(U)) - if (VPI->isVectorToScalar()) + if (VPI->isVectorToScalar() || VPI->isSingleScalar()) continue; if (!U->usesScalars(Op)) return false;