-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[VPlan] Compute cost of replicating calls in VPlan. (NFCI) #154291
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms Author: Florian Hahn (fhahn) Changes: Implement computing the scalarization overhead for replicating calls in VPlan, matching the legacy cost model. Depends on #154126. (Included in the PR) Full diff: https://github.com/llvm/llvm-project/pull/154291.diff 6 Files Affected:
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 9186419715cc4..5b3e42908c58f 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -961,12 +961,10 @@ class TargetTransformInfo {
TTI::TargetCostKind CostKind, bool ForPoisonSrc = true,
ArrayRef<Value *> VL = {}) const;
- /// Estimate the overhead of scalarizing an instructions unique
- /// non-constant operands. The (potentially vector) types to use for each of
- /// argument are passes via Tys.
+ /// Estimate the overhead of scalarizing operands with the given types. The
+ /// (potentially vector) types to use for each of argument are passes via Tys.
LLVM_ABI InstructionCost getOperandsScalarizationOverhead(
- ArrayRef<const Value *> Args, ArrayRef<Type *> Tys,
- TTI::TargetCostKind CostKind) const;
+ ArrayRef<Type *> Tys, TTI::TargetCostKind CostKind) const;
/// If target has efficient vector element load/store instructions, it can
/// return true here so that insertion/extraction costs are not added to
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 200cbafbaa6e2..183f1692746ce 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -459,8 +459,7 @@ class TargetTransformInfoImplBase {
}
virtual InstructionCost
- getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
- ArrayRef<Type *> Tys,
+ getOperandsScalarizationOverhead(ArrayRef<Type *> Tys,
TTI::TargetCostKind CostKind) const {
return 0;
}
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index aa9d1f0a1ccea..4a02ae4f8fcfb 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -18,6 +18,7 @@
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -347,6 +348,21 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return Cost;
}
+ /// Filter out constant and duplicated entries in \p Ops and return a vector
+ /// containing the corresponding types.
+ static SmallVector<Type *, 4>
+ filterConstantAndDuplicatedOperands(ArrayRef<const Value *> Ops,
+ ArrayRef<Type *> Tys) {
+ SmallPtrSet<const Value *, 4> UniqueOperands;
+ SmallVector<Type *, 4> FilteredTys;
+ for (const auto &[Op, Ty] : zip_equal(Ops, Tys)) {
+ if (isa<Constant>(Op) || !UniqueOperands.insert(Op).second)
+ continue;
+ FilteredTys.push_back(Ty);
+ }
+ return FilteredTys;
+ }
+
protected:
explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
: BaseT(DL) {}
@@ -935,29 +951,21 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
CostKind);
}
- /// Estimate the overhead of scalarizing an instructions unique
- /// non-constant operands. The (potentially vector) types to use for each of
+ /// Estimate the overhead of scalarizing an instructions
+ /// operands. The (potentially vector) types to use for each of
/// argument are passes via Tys.
InstructionCost getOperandsScalarizationOverhead(
- ArrayRef<const Value *> Args, ArrayRef<Type *> Tys,
- TTI::TargetCostKind CostKind) const override {
- assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
-
+ ArrayRef<Type *> Tys, TTI::TargetCostKind CostKind) const override {
InstructionCost Cost = 0;
- SmallPtrSet<const Value*, 4> UniqueOperands;
- for (int I = 0, E = Args.size(); I != E; I++) {
+ for (Type *Ty : Tys) {
// Disregard things like metadata arguments.
- const Value *A = Args[I];
- Type *Ty = Tys[I];
if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
!Ty->isPtrOrPtrVectorTy())
continue;
- if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
- if (auto *VecTy = dyn_cast<VectorType>(Ty))
- Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
- /*Extract*/ true, CostKind);
- }
+ if (auto *VecTy = dyn_cast<VectorType>(Ty))
+ Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
+ /*Extract*/ true, CostKind);
}
return Cost;
@@ -974,7 +982,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
InstructionCost Cost = getScalarizationOverhead(
RetTy, /*Insert*/ true, /*Extract*/ false, CostKind);
if (!Args.empty())
- Cost += getOperandsScalarizationOverhead(Args, Tys, CostKind);
+ Cost += getOperandsScalarizationOverhead(
+ filterConstantAndDuplicatedOperands(Args, Tys), CostKind);
else
// When no information on arguments is provided, we add the cost
// associated with one argument as a heuristic.
@@ -2156,8 +2165,9 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
/*Insert=*/true, /*Extract=*/false, CostKind);
}
}
- ScalarizationCost +=
- getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
+ ScalarizationCost += getOperandsScalarizationOverhead(
+ filterConstantAndDuplicatedOperands(Args, ICA.getArgTypes()),
+ CostKind);
}
IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 3141060a710ce..296209a3f917c 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -637,9 +637,8 @@ InstructionCost TargetTransformInfo::getScalarizationOverhead(
}
InstructionCost TargetTransformInfo::getOperandsScalarizationOverhead(
- ArrayRef<const Value *> Args, ArrayRef<Type *> Tys,
- TTI::TargetCostKind CostKind) const {
- return TTIImpl->getOperandsScalarizationOverhead(Args, Tys, CostKind);
+ ArrayRef<Type *> Tys, TTI::TargetCostKind CostKind) const {
+ return TTIImpl->getOperandsScalarizationOverhead(Tys, CostKind);
}
bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8894b1692d562..5c811a1a45b52 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1697,8 +1697,16 @@ class LoopVectorizationCostModel {
/// Returns a range containing only operands needing to be extracted.
SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
ElementCount VF) const {
- return SmallVector<Value *, 4>(make_filter_range(
- Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
+
+ SmallPtrSet<const Value *, 4> UniqueOperands;
+ SmallVector<Value *, 4> Res;
+ for (Value *Op : Ops) {
+ if (isa<Constant>(Op) || !UniqueOperands.insert(Op).second ||
+ !needsExtract(Op, VF))
+ continue;
+ Res.push_back(Op);
+ }
+ return Res;
}
public:
@@ -5610,8 +5618,7 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
SmallVector<Type *> Tys;
for (auto *V : filterExtractingOperands(Ops, VF))
Tys.push_back(maybeVectorizeType(V->getType(), VF));
- return Cost + TTI.getOperandsScalarizationOverhead(
- filterExtractingOperands(Ops, VF), Tys, CostKind);
+ return Cost + TTI.getOperandsScalarizationOverhead(Tys, CostKind);
}
void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index acd6a97344116..e3b5be879f61f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2975,12 +2975,8 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
// instruction cost.
return 0;
case Instruction::Call: {
- if (!isSingleScalar()) {
- // TODO: Handle remaining call costs here as well.
- if (VF.isScalable())
- return InstructionCost::getInvalid();
- break;
- }
+ if (!isSingleScalar() && VF.isScalable())
+ return InstructionCost::getInvalid();
auto *CalledFn =
cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue());
@@ -2990,7 +2986,37 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
SmallVector<Type *, 4> Tys;
for (VPValue *ArgOp : drop_end(operands()))
Tys.push_back(Ctx.Types.inferScalarType(ArgOp));
- return Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
+
+ InstructionCost ScalarCallCost =
+ Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
+ if (isSingleScalar())
+ return ScalarCallCost;
+
+ // Compute the cost of scalarizing the result and operands if needed.
+ InstructionCost ScalarizationCost = 0;
+ if (VF.isVector()) {
+ if (!ResultTy->isVoidTy()) {
+ for (Type *VectorTy : getContainedTypes(toVectorizedTy(ResultTy, VF))) {
+ ScalarizationCost += Ctx.TTI.getScalarizationOverhead(
+ cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
+ /*Insert=*/true,
+ /*Extract=*/false, Ctx.CostKind);
+ }
+ }
+ // Compute the cost of scalarizing the operands that require extraction.
+ SmallVector<Type *> Tys;
+ SmallPtrSet<const VPValue *, 4> UniqueOperands;
+ for (auto *Op : drop_end(operands())) {
+ if (isa<VPReplicateRecipe>(Op) || !UniqueOperands.insert(Op).second)
+ continue;
+ Tys.push_back(toVectorizedTy(Ctx.Types.inferScalarType(Op), VF));
+ }
+ ScalarizationCost +=
+ Ctx.TTI.getOperandsScalarizationOverhead(Tys, Ctx.CostKind);
+ }
+
+ return ScalarCallCost * (isSingleScalar() ? 1 : VF.getFixedValue()) +
+ ScalarizationCost;
}
case Instruction::Add:
case Instruction::Sub:
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I could be mis-reading the indentation here, but if it was single scalar, wouldn't this case have exited early at line 2992?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yep, removed, thanks
Implement computing the scalarization overhead for replicating calls in VPlan, matching the legacy cost model. Depends on llvm#154126.
ee47be3
to
93dfdaf
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for this! Nice to see the legacy cost model being removed in more places. :) I just had one comment about the existing code.
Type *ResultTy = Ctx.Types.inferScalarType(this); | ||
return Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind); | ||
InstructionCost ScalarCallCost = | ||
Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if the old code was even correct? It looks like in the legacy cost model (`LoopVectorizationCostModel::getVectorCallCost`) we selected the min of `getVectorIntrinsicCost` and `TTI.getCallInstrCost` if we're invoking an intrinsic.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The current code does not handle intrinsics for now (see bail-out at line 3008).
The twist with intrinsics is that if we chose the intrinsic cost, we only create VPReplicateRecipe for various pseudo-intrinsics. I'll add support for intrinsics as a follow-up.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah yes, sorry I missed that! OK great and thanks for doing this.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I just had a couple of nits, but LGTM!
SmallVector<Type *, 4> Tys; | ||
for (VPValue *ArgOp : drop_end(operands())) | ||
Tys.push_back(Ctx.Types.inferScalarType(ArgOp)); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: unrelated whitespace
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed thanks
} | ||
// Skip operands that do not require extraction/scalarization and do not | ||
// incur any overhead. | ||
SmallVector<Type *> Tys; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: Should this be `SmallVector<Type *, 4>` to be consistent with `UniqueOperands`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe the existing Tys SmallVector could be reused, with a clear() call? Or do you think you'll need the scalar types again?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Used clear for now, thanks!
… (#154291) Implement computing the scalarization overhead for replicating calls in VPlan, matching the legacy cost model. Depends on llvm/llvm-project#154126. PR: llvm/llvm-project#154291
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/65/builds/21782 Here is the relevant piece of the build log for reference:
|
…CI). Handle intrinsic calls in VPReplicateRecipe::computeCost. There are some pseudo-intrinsics for which the computed cost is known to be zero, so we handle those up front. Depends on llvm#154291.
…eRecipe (NFCI). (#154617) Handle intrinsic calls in VPReplicateRecipe::computeCost. There are some pseudo-intrinsics for which the computed cost is known to be zero, so we handle those up front. Depends on llvm/llvm-project#154291. PR: llvm/llvm-project#154617
Implement computing the scalarization overhead for replicating calls in
VPlan, matching the legacy cost model.
Depends on #154126. (Included in the PR)