#ifdef EXPENSIVE_CHECKS

using namespace std::placeholders;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions,
          "Number of vector instructions generated");
116 "Controls which SLP graphs should be vectorized.");
120 cl::desc(
"Run the SLP vectorization passes"));
124 cl::desc(
"Enable vectorization for wider vector utilization"));
128 cl::desc(
"Only vectorize if you gain more than this "
133 cl::desc(
"When true, SLP vectorizer bypasses profitability checks based on "
134 "heuristics and makes vectorization decision via cost modeling."));
138 cl::desc(
"Attempt to vectorize horizontal reductions"));
143 "Attempt to vectorize horizontal reductions feeding into a store"));
147 cl::desc(
"Improve the code quality by splitting alternate instructions"));
151 cl::desc(
"Attempt to vectorize for this register size in bits"));
155 cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
163 cl::desc(
"Limit the size of the SLP scheduling region per block"));
167 cl::desc(
"Attempt to vectorize for this register size in bits"));
171 cl::desc(
"Limit the recursion depth when building a vectorizable tree"));
175 cl::desc(
"Only vectorize small trees if they are fully vectorizable"));
181 cl::desc(
"The maximum look-ahead depth for operand reordering scores"));
190 cl::desc(
"The maximum look-ahead depth for searching best rooting option"));
194 cl::desc(
"The minimum number of loads, which should be considered strided, "
195 "if the stride is > 1 or is runtime value"));
199 cl::desc(
"The maximum stride, considered to be profitable."));
203 cl::desc(
"Disable tree reordering even if it is "
204 "profitable. Used for testing only."));
208 cl::desc(
"Generate strided loads even if they are not "
209 "profitable. Used for testing only."));
213 cl::desc(
"Display the SLP trees with Graphviz"));
217 cl::desc(
"Try to vectorize with non-power-of-2 number of elements."));
222 cl::desc(
"Try to replace values with the idempotent instructions for "
223 "better vectorization."));
  Ty = Ty->getScalarType();
         !Ty->isPPC_FP128Ty();

    return SI->getValueOperand()->getType();
    return CI->getOperand(0)->getType();
    return IE->getOperand(1)->getType();

         "ScalableVectorType is not supported.");
  return VecTy->getNumElements();
                                               Type *Ty, unsigned Sz) {
  if (NumParts == 0 || NumParts >= Sz)

  if (NumParts == 0 || NumParts >= Sz)
  return (Sz / RegVF) * RegVF;
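  // Note: the line above rounds Sz down to a whole number of vector
  // registers, e.g. Sz = 7 with RegVF = 4 yields (7 / 4) * 4 = 4 elements.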
                            I * VecTyNumElements, VecTyNumElements)))
        : Mask[I] * VecTyNumElements + J;
  unsigned SVNumElements =
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    Value *Src = SV->getOperand(0);
      if (SV->getOperand(0) != Src)
      if (!SV->isExtractSubvectorMask(Index))
      ExpectedIndex.set(Index / ShuffleMaskSize);
  if (!ExpectedIndex.all())
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
  unsigned SVNumElements =
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    for (int M : SV->getShuffleMask())
                           : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
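  // Note: the last part may be a tail, e.g. Size = 10 split into parts of
  // PartNumElems = 4 gives 4, 4 and min(4, 10 - 2 * 4) = 2 elements.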
  OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
    if (BB != II->getParent())

  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (!FirstNonUndef) {
    if (V != FirstNonUndef)
  return FirstNonUndef != nullptr;
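// Note: the loop above is a splat check that tolerates undefs: it succeeds
// only if every non-undef value in VL matches the first non-undef one found.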
    return Cmp->isCommutative();
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
              if (match(U.getUser(),
                        m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                  (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::abs>(
                               m_Specific(U.get()), m_ConstantInt(Flag))) &&
                     (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
           (BO->getOpcode() == Instruction::FSub &&
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
  return I->isCommutative();
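  // Note on the Sub/FSub special cases above: swapping the operands of a
  // subtraction only negates its result, so a sub whose sole users are eq/ne
  // compares against zero or llvm.abs/llvm.fabs calls can safely be treated
  // as commutative.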
    constexpr unsigned IntrinsicNumOperands = 2;
    return IntrinsicNumOperands;
  return I->getNumOperands();

  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,

    if (CI->getValue().uge(VT->getNumElements()))
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
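    // Worked example for the two statements above: an element inserted at
    // lane 3 of a <4 x i32> vector that itself lives at aggregate offset
    // Index maps to the flat index Index * 4 + 3.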
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
  return std::all_of(It, VL.end(), [&](Value *V) {
    if (auto *CI = dyn_cast<CmpInst>(V))
      return BasePred == CI->getPredicate();
    if (auto *I = dyn_cast<Instruction>(V))
      return I->getOpcode() == Opcode;
    return isa<PoisonValue>(V);
    if (MaskArg == UseMask::UndefsAsMask)
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
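    // Note: this mirrors shufflevector mask semantics, where an index below
    // VF selects a lane of the first operand and an index >= VF selects lane
    // (Value - VF) of the second operand.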
template <bool IsPoisonOnly = false>
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (!UseMask.empty()) {
        if (*Idx < UseMask.size() && !UseMask.test(*Idx))
    for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
      if (Constant *Elem = C->getAggregateElement(I))
            (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
static std::optional<TargetTransformInfo::ShuffleKind>
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        return std::max(S, VTy->getNumElements());
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
    Value *Vec = EE->getVectorOperand();
  ShuffleMode CommonShuffleMode = Unknown;
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    auto *Vec = EI->getVectorOperand();
    if (Idx->getValue().uge(Size))
    unsigned IntIdx = Idx->getValue().getZExtValue();
    if (!Vec1 || Vec1 == Vec) {
    } else if (!Vec2 || Vec2 == Vec) {
    if (CommonShuffleMode == Permute)
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
    CommonShuffleMode = Select;
  if (CommonShuffleMode == Select && Vec2)
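  // Note: a lane keeps the cheaper "select" classification only while
  // Mask[I] % Size == I, i.e. each element stays in its own position and only
  // the source vector (Vec1 or Vec2) varies; a single displaced lane demotes
  // the whole sequence to a permute.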
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
      return CI->getZExtValue();
  if (EI->getNumIndices() != 1)
  return *EI->idx_begin();
bool isValidForAlternation(unsigned Opcode) {

class BinOpSameOpcodeHelper {
  using MaskType = std::uint_fast16_t;
  constexpr static std::initializer_list<unsigned> SupportedOp = {
      Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
      Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
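  // Illustrative note: these opcodes are mutually convertible when paired
  // with suitable constants, e.g. "shl x, 1" equals "mul x, 2", while
  // "add x, 0", "sub x, 0", "or x, 0" and "xor x, 0" are all just "x".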
    MainOpBIT = 0b100000000,

  static std::pair<ConstantInt *, unsigned>
  isBinOpWithConstantInt(const Instruction *I) {
    unsigned Opcode = I->getOpcode();
    if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
        Opcode == Instruction::AShr)

  struct InterchangeableInfo {
    MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
                    MulBIT | AShrBIT | ShlBIT;
    MaskType SeenBefore = 0;
    InterchangeableInfo(const Instruction *I) : I(I) {}
    bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
      if (Mask & InterchangeableMask) {
        SeenBefore |= OpcodeInMaskForm;
        Mask &= InterchangeableMask;
    bool equal(unsigned Opcode) {
      return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
      MaskType Candidate = Mask & SeenBefore;
      if (Candidate & MainOpBIT)
        return I->getOpcode();
      if (Candidate & ShlBIT)
        return Instruction::Shl;
      if (Candidate & AShrBIT)
        return Instruction::AShr;
      if (Candidate & MulBIT)
        return Instruction::Mul;
      if (Candidate & AddBIT)
        return Instruction::Add;
      if (Candidate & SubBIT)
        return Instruction::Sub;
      if (Candidate & AndBIT)
        return Instruction::And;
      if (Candidate & OrBIT)
        return Instruction::Or;
      if (Candidate & XorBIT)
        return Instruction::Xor;
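      // The cascade above picks one representative opcode from the candidate
      // set (opcodes every instruction seen so far can be converted to),
      // preferring the main opcode, then shl, ashr, mul, add, sub, and, or,
      // xor.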
    bool hasCandidateOpcode(unsigned Opcode) const {
      MaskType Candidate = Mask & SeenBefore;
      case Instruction::Shl:
        return Candidate & ShlBIT;
      case Instruction::AShr:
        return Candidate & AShrBIT;
      case Instruction::Mul:
        return Candidate & MulBIT;
      case Instruction::Add:
        return Candidate & AddBIT;
      case Instruction::Sub:
        return Candidate & SubBIT;
      case Instruction::And:
        return Candidate & AndBIT;
      case Instruction::Or:
        return Candidate & OrBIT;
      case Instruction::Xor:
        return Candidate & XorBIT;
      case Instruction::LShr:
      case Instruction::FAdd:
      case Instruction::FSub:
      case Instruction::FMul:
      case Instruction::SDiv:
      case Instruction::UDiv:
      case Instruction::FDiv:
      case Instruction::SRem:
      case Instruction::URem:
      case Instruction::FRem:
      unsigned FromOpcode = I->getOpcode();
      if (FromOpcode == ToOpcode)
      auto [CI, Pos] = isBinOpWithConstantInt(I);
      const APInt &FromCIValue = CI->getValue();
      unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
      switch (FromOpcode) {
      case Instruction::Shl:
        if (ToOpcode == Instruction::Mul) {
          assert(FromCIValue.isZero() && "Cannot convert the instruction.");
          ToCIValue = ToOpcode == Instruction::And
                          : APInt::getZero(FromCIValueBitWidth);
      case Instruction::Mul:
        if (ToOpcode == Instruction::Shl) {
          ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
          assert(FromCIValue.isOne() && "Cannot convert the instruction.");
          ToCIValue = ToOpcode == Instruction::And
                          : APInt::getZero(FromCIValueBitWidth);
      case Instruction::Add:
      case Instruction::Sub:
        if (FromCIValue.isZero()) {
                 "Cannot convert the instruction.");
          ToCIValue = FromCIValue;
      case Instruction::And:
          ToCIValue = ToOpcode == Instruction::Mul
                          : APInt::getZero(FromCIValueBitWidth);
        assert(FromCIValue.isZero() && "Cannot convert the instruction.");
      Value *LHS = I->getOperand(1 - Pos);
          ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
      ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or ||
        FromOpcode == Instruction::Xor) &&
       ToOpcode == Instruction::Sub))
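      // Worked example for the Mul case above: "mul x, 8" becomes a shift of
      // logBase2(8) = 3, i.e. "shl x, 3"; the Shl -> Mul direction needs the
      // constant 2^3 = 8 instead.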
  InterchangeableInfo MainOp;
  InterchangeableInfo AltOp;
  bool isValidForAlternation(const Instruction *I) const {
    return ::isValidForAlternation(MainOp.I->getOpcode()) &&
           ::isValidForAlternation(I->getOpcode());
  bool initializeAltOp(const Instruction *I) {
    if (!isValidForAlternation(I))

  BinOpSameOpcodeHelper(const Instruction *MainOp,
                        const Instruction *AltOp = nullptr)
      : MainOp(MainOp), AltOp(AltOp) {
  bool add(const Instruction *I) {
           "BinOpSameOpcodeHelper only accepts BinaryOperator.");
    unsigned Opcode = I->getOpcode();
    MaskType OpcodeInMaskForm;
    case Instruction::Shl:
      OpcodeInMaskForm = ShlBIT;
    case Instruction::AShr:
      OpcodeInMaskForm = AShrBIT;
    case Instruction::Mul:
      OpcodeInMaskForm = MulBIT;
    case Instruction::Add:
      OpcodeInMaskForm = AddBIT;
    case Instruction::Sub:
      OpcodeInMaskForm = SubBIT;
    case Instruction::And:
      OpcodeInMaskForm = AndBIT;
    case Instruction::Or:
      OpcodeInMaskForm = OrBIT;
    case Instruction::Xor:
      OpcodeInMaskForm = XorBIT;
      return MainOp.equal(Opcode) ||
             (initializeAltOp(I) && AltOp.equal(Opcode));
    MaskType InterchangeableMask = OpcodeInMaskForm;
    ConstantInt *CI = isBinOpWithConstantInt(I).first;
      constexpr MaskType CanBeAll =
          XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
      const APInt &CIValue = CI->getValue();
      case Instruction::Shl:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
      case Instruction::Mul:
        if (CIValue.isOne()) {
          InterchangeableMask = CanBeAll;
        InterchangeableMask = MulBIT | ShlBIT;
      case Instruction::Add:
      case Instruction::Sub:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
      case Instruction::And:
        InterchangeableMask = CanBeAll;
      case Instruction::Xor:
        InterchangeableMask = XorBIT | OrBIT | AndBIT | SubBIT | AddBIT;
        InterchangeableMask = CanBeAll;
    return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
           (initializeAltOp(I) &&
            AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
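    // Example: "shl x, 0" leaves x unchanged, so the CanBeAll mask lets it
    // stand in for any supported opcode with an identity constant, whereas
    // "shl x, 3" is only interchangeable with "mul x, 8".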
  unsigned getMainOpcode() const { return MainOp.getOpcode(); }
  bool hasCandidateOpcode(unsigned Opcode) const {
    return MainOp.hasCandidateOpcode(Opcode);
  bool hasAltOp() const { return AltOp.I; }
  unsigned getAltOpcode() const {
    return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
    return MainOp.getOperand(I);
class InstructionsState {
  bool HasCopyables = false;

    assert(valid() && "InstructionsState is invalid.");
    assert(valid() && "InstructionsState is invalid.");

  unsigned getOpcode() const { return getMainOp()->getOpcode(); }
  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }
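  // Note: when MainOp != AltOp the bundle is an "alternate" sequence, e.g.
  // lanes alternating between add and sub, which later lowers to one vector
  // add, one vector sub, and a shuffle blending the two results.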
  Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
    assert(MainOp && "MainOp cannot be nullptr.");
    if (I->getOpcode() == MainOp->getOpcode())
    assert(AltOp && "AltOp cannot be nullptr.");
    if (I->getOpcode() == AltOp->getOpcode())
    if (!I->isBinaryOp())
    BinOpSameOpcodeHelper Converter(MainOp);
    if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
      BinOpSameOpcodeHelper AltConverter(AltOp);
      if (AltConverter.add(I) && AltConverter.add(AltOp) &&
          AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
    if (Converter.hasAltOp() && !isAltShuffle())
    return Converter.hasAltOp() ? AltOp : MainOp;
  bool isShiftOp() const {
    return getMainOp()->isShift() && getAltOp()->isShift();

    return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();

  bool isMulDivLikeOp() const {
    constexpr std::array<unsigned, 8> MulDiv = {
        Instruction::Mul, Instruction::FMul, Instruction::SDiv,
        Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
        Instruction::URem, Instruction::FRem};

  bool isAddSubLikeOp() const {
    constexpr std::array<unsigned, 4> AddSub = {
        Instruction::Add, Instruction::Sub, Instruction::FAdd,

  bool isCmpOp() const {
    return (getOpcode() == Instruction::ICmp ||

  bool valid() const { return MainOp && AltOp; }

  explicit operator bool() const { return valid(); }
  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp,
                    bool HasCopyables = false)
      : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }
  bool isCopyableElement(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
    if (I->getParent() != MainOp->getParent() &&
    if (I->getOpcode() == MainOp->getOpcode())
    if (!I->isBinaryOp())
    BinOpSameOpcodeHelper Converter(MainOp);
  bool isNonSchedulable(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (getMainOp() == V)
    if (isCopyableElement(V)) {
      auto IsNonSchedulableCopyableElement = [this](Value *V) {
        return !I || isa<PHINode>(I) ||
               I->getParent() != MainOp->getParent() ||
                !MainOp->comesBefore(I));
      return IsNonSchedulableCopyableElement(V);

  bool areInstructionsWithCopyableElements() const {
    assert(valid() && "InstructionsState is invalid.");
    return HasCopyables;
std::pair<Instruction *, SmallVector<Value *>>
  Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
  assert(SelectedOp && "Cannot convert the instruction.");
  if (I->isBinaryOp()) {
    return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
  for (Value *V : VL) {
    if (Inst->getOpcode() == Opcode)

      BaseOp0 == Op0 || BaseOp1 == Op1 ||
         "Assessing comparisons of different types?");
  return (BasePred == Pred &&
         (BasePred == SwappedPred &&

    return InstructionsState::invalid();
    return InstructionsState::invalid();
      (VL.size() == 2 && InstCnt < 2))
    return InstructionsState::invalid();
  unsigned AltOpcode = Opcode;
  BinOpSameOpcodeHelper BinOpHelper(MainOp);
  bool SwappedPredsCompatible = IsCmpOp && [&]() {
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;

    return InstructionsState::invalid();
  bool AnyPoison = InstCnt != VL.size();
    if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return InstructionsState::invalid();
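    // Note: div/rem and calls are rejected here when any lane is poison,
    // since vectorization would execute the operation unconditionally in
    // every lane, which is unsafe for potentially trapping or side-effecting
    // instructions.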
    unsigned InstOpcode = I->getOpcode();
      if (BinOpHelper.add(I))
      Value *Op1 = I->getOperand(0);
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
      if (Opcode == AltOpcode) {
        assert(isValidForAlternation(Opcode) &&
               isValidForAlternation(InstOpcode) &&
               "Cast isn't safe for alternation, logic needs to be updated!");
        AltOpcode = InstOpcode;
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        assert(InstOpcode == AltOpcode &&
               "Alternate instructions are only supported by BinaryOperator "
        if ((VL.size() == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
        if (MainOp != AltOp) {
        } else if (BasePred != CurrentPred) {
              isValidForAlternation(InstOpcode) &&
              "CmpInst isn't safe for alternation, logic needs to be updated!");
          if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
              AltPred == CurrentPred || AltPred == SwappedCurrentPred)
    } else if (InstOpcode == Opcode) {
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator and "
        if (Gep->getNumOperands() != 2 ||
          return InstructionsState::invalid();
          return InstructionsState::invalid();
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState::invalid();
          return InstructionsState::invalid();
        if (Call->hasOperandBundles() &&
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
          return InstructionsState::invalid();
          return InstructionsState::invalid();
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState::invalid();
      return InstructionsState::invalid();

    assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
    assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
         "Incorrect implementation of allSameOpcode.");
  InstructionsState S(MainOp, AltOp);
         "Invalid InstructionsState.");

  return all_of(VL, [&](Value *V) { return V->getType() == Ty; });
  unsigned Opcode = UserInst->getOpcode();
  case Instruction::Load: {
  case Instruction::Store: {
    return (SI->getPointerOperand() == Scalar);
  case Instruction::Call: {
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
             Arg.value().get() == Scalar;

    return LI->isSimple();
    return SI->isSimple();
    return !MI->isVolatile();
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
         (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
         "SubMask with many inputs support must be larger than the mask.");
    Mask.append(SubMask.begin(), SubMask.end());
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
    NewMask[I] = Mask[SubMask[I]];
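  // Worked example of the composition above: Mask = {3, 2, 1, 0} followed by
  // SubMask = {1, 0, 3, 2} gives NewMask[I] = Mask[SubMask[I]] = {2, 3, 0, 1}.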
  const size_t Sz = Order.size();
  for (unsigned I = 0; I < Sz; ++I) {
      UnusedIndices.reset(Order[I]);
      MaskedIndices.set(I);
  if (MaskedIndices.none())
         "Non-synced masked/available indices.");
    assert(Idx >= 0 && "Indices must be synced.");
                             unsigned Opcode0, unsigned Opcode1) {
    OpcodeMask.set(Lane * ScalarTyNumElements,
                   Lane * ScalarTyNumElements + ScalarTyNumElements);

         "Expected scalar constants.");
    std::fill_n(NewVal.begin() + I * VF, VF, V);

  const unsigned E = Indices.size();
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
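// Worked example: Indices = {2, 0, 1} yields Mask = {1, 2, 0}, since
// Mask[Indices[I]] = I inverts the permutation.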
  assert(!Mask.empty() && "Expected non-empty mask.");
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
      Scalars[Mask[I]] = Prev[I];
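// Worked example: Prev = {a, b, c} with Mask = {2, 0, 1} stores a into lane
// 2, b into lane 0 and c into lane 1, giving Scalars = {b, c, a}.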
    auto *IO = dyn_cast<Instruction>(V);
    return isa<PHINode>(IO) || IO->getParent() != I->getParent();

  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
           auto *IU = dyn_cast<Instruction>(U);
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);
  return !VL.empty() &&

  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&

                  const unsigned Limit = std::numeric_limits<unsigned>::max()) {
  unsigned NumParts = TTI.getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= Limit)
  if (NumParts >= Sz || Sz % NumParts != 0 ||
  class ScheduleEntity;
  class ScheduleCopyableData;
  class ScheduleBundle;

  struct StridedPtrInfo {
    Value *StrideVal = nullptr;
    const SCEV *StrideSCEV = nullptr;
      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),
    MinVecRegSize = TTI->getMinVectorRegisterBitWidth();

      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;

    const TreeEntry &Root = *VectorizableTree.front();
    if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
        !Root.Scalars.front()->getType()->isIntegerTy())
      return std::nullopt;
    auto It = MinBWs.find(&Root);
    if (It != MinBWs.end())
    if (Root.getOpcode() == Instruction::ZExt ||
        Root.getOpcode() == Instruction::SExt)
      return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
                            Root.getOpcode() == Instruction::SExt);
    return std::nullopt;

    return MinBWs.at(VectorizableTree.front().get()).second;

    if (ReductionBitWidth == 0 ||
        !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
        ReductionBitWidth >=
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
          VectorizableTree.front()->Scalars.front()->getType(),
          VectorizableTree.front()->getVectorFactor());
          VectorizableTree.front()->Scalars.front()->getContext(),
          VectorizableTree.front()->getVectorFactor());
    VectorizableTree.clear();
    ScalarToTreeEntries.clear();
    OperandsToTreeEntry.clear();
    ScalarsInSplitNodes.clear();
    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    LoadEntriesToVectorize.clear();
    IsGraphTransformMode = false;
    GatheredLoadsEntriesFirst.reset();
    CompressEntryToData.clear();
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    ExternalUsesWithNonUsers.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
    ReductionBitWidth = 0;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
    TreeEntryToStridedPtrInfoMap.clear();
    assert(!Order.empty() && "expected non-empty order");
    const unsigned Sz = Order.size();
      return P.value() == P.index() || P.value() == Sz;

                             bool IgnoreReorder);

  std::optional<OrdersType>
    return MaxVecRegSize;

    return MinVecRegSize;

    unsigned MaxVF = MaxVFOption.getNumOccurrences()
                         ? MaxVFOption
                         : TTI->getMaximumVF(ElemWidth, Opcode);
    return MaxVF ? MaxVF : UINT_MAX;
                            Align Alignment, const int64_t Diff, Value *Ptr0,
                            Value *PtrN, StridedPtrInfo &SPtrInfo) const;

                       StridedPtrInfo &SPtrInfo, unsigned *BestVF = nullptr,
                       bool TryRecursiveCheck = true) const;

    ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));

  template <typename T>
    return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";
      : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
        MaxLevel(MaxLevel) {}

      auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
        auto AllUsersVectorized = [U1, U2, this](Value *V) {
            return U == U1 || U == U2 || R.isVectorized(U);
        return AllUsersVectorized(V1) && AllUsersVectorized(V2);
      if (R.TTI->isLegalBroadcastLoad(V1->getType(),
          ((int)V1->getNumUses() == NumLanes ||
           AllUsersAreInternal(V1, V2)))

    auto CheckSameEntryOrFail = [&]() {
          any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))

      if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
        return CheckSameEntryOrFail();
          LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
          LI2->getPointerOperand(), DL, SE, true);
      if (!Dist || *Dist == 0) {
            R.TTI->isLegalMaskedGather(
        return CheckSameEntryOrFail();
      if (std::abs(*Dist) > NumLanes / 2)

      Value *EV2 = nullptr;
      int Dist = Idx2 - Idx1;
      if (std::abs(Dist) == 0)
      if (std::abs(Dist) > NumLanes / 2)
      return CheckSameEntryOrFail();

      if (I1->getParent() != I2->getParent())
        return CheckSameEntryOrFail();
          (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
           !S.isAltShuffle()) &&
          S.getMainOp()->getNumOperands();
      return CheckSameEntryOrFail();
    int ShallowScoreAtThisLevel =
    if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
        (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
         ShallowScoreAtThisLevel))
      return ShallowScoreAtThisLevel;
    assert(I1 && I2 && "Should have early exited.");
    for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
         OpIdx1 != NumOperands1; ++OpIdx1) {
      int MaxTmpScore = 0;
      unsigned MaxOpIdx2 = 0;
      bool FoundBest = false;
              ? I2->getNumOperands()
              : std::min(I2->getNumOperands(), OpIdx1 + 1);
      assert(FromIdx <= ToIdx && "Bad index");
      for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
        if (Op2Used.count(OpIdx2))
            I1, I2, CurrLevel + 1, {});
            TmpScore > MaxTmpScore) {
          MaxTmpScore = TmpScore;
        Op2Used.insert(MaxOpIdx2);
        ShallowScoreAtThisLevel += MaxTmpScore;
    return ShallowScoreAtThisLevel;
  struct OperandData {
    OperandData() = default;
    OperandData(Value *V, bool APO, bool IsUsed)
        : V(V), APO(APO), IsUsed(IsUsed) {}

    bool IsUsed = false;

  enum class ReorderingMode {

  unsigned ArgSize = 0;

  const Loop *L = nullptr;

  OperandData &getData(unsigned OpIdx, unsigned Lane) {
    return OpsVec[OpIdx][Lane];

  const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
    return OpsVec[OpIdx][Lane];

    for (unsigned OpIdx = 0, NumOperands = getNumOperands();
      for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
        OpsVec[OpIdx][Lane].IsUsed = false;

  void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
    std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
  int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
    Value *IdxLaneV = getData(Idx, Lane).V;
    unsigned UniquesCount = Uniques.size();
    auto IdxIt = Uniques.find(IdxLaneV);
    unsigned UniquesCntWithIdxLaneV =
        IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
    auto OpIdxIt = Uniques.find(OpIdxLaneV);
    unsigned UniquesCntWithOpIdxLaneV =
        OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
    if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
    return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
                        UniquesCntWithOpIdxLaneV,
                    UniquesCntWithOpIdxLaneV -
           ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
                ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
                : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
  int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
    Value *IdxLaneV = getData(Idx, Lane).V;
    return R.areAllUsersVectorized(IdxLaneI)

  static const int ScoreScaleFactor = 10;

                        int Lane, unsigned OpIdx, unsigned Idx,
      int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
      if (Score <= -SplatScore) {
        Score += SplatScore;
        Score *= ScoreScaleFactor;
        Score += getExternalUseScore(Lane, OpIdx, Idx);
  std::optional<unsigned>
  getBestOperand(unsigned OpIdx, int Lane, int LastLane,
    unsigned NumOperands = getNumOperands();
    Value *OpLastLane = getData(OpIdx, LastLane).V;
    ReorderingMode RMode = ReorderingModes[OpIdx];
    if (RMode == ReorderingMode::Failed)
      return std::nullopt;
    bool OpIdxAPO = getData(OpIdx, Lane).APO;
    std::optional<unsigned> Idx;
        BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
    bool IsUsed = RMode == ReorderingMode::Splat ||
                  RMode == ReorderingMode::Constant ||
                  RMode == ReorderingMode::Load;
    for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
      OperandData &OpData = getData(Idx, Lane);
      bool OpAPO = OpData.APO;
      if (OpAPO != OpIdxAPO)
      case ReorderingMode::Load:
      case ReorderingMode::Opcode: {
        bool LeftToRight = Lane > LastLane;
        Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
        Value *OpRight = (LeftToRight) ? Op : OpLastLane;
        int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                      OpIdx, Idx, IsUsed, UsedLanes);
        if (Score > static_cast<int>(BestOp.Score) ||
            (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
          BestOp.Score = Score;
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
      case ReorderingMode::Constant:
          (!BestOp.Score && L && L->isLoopInvariant(Op))) {
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
      case ReorderingMode::Splat:
        IsUsed = Op == OpLastLane;
        if (Op == OpLastLane) {
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
      case ReorderingMode::Failed:
      getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
    return std::nullopt;
  unsigned getBestLaneToStartReordering() const {
    unsigned Min = UINT_MAX;
    unsigned SameOpNumber = 0;
    for (int I = getNumLanes(); I > 0; --I) {
      unsigned Lane = I - 1;
      OperandsOrderData NumFreeOpsHash =
          getMaxNumOperandsThatCanBeReordered(Lane);
      if (NumFreeOpsHash.NumOfAPOs < Min) {
        Min = NumFreeOpsHash.NumOfAPOs;
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
        auto [It, Inserted] =
            HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
    unsigned BestLane = 0;
    unsigned CntMin = UINT_MAX;
      if (Data.second.first < CntMin) {
        CntMin = Data.second.first;
        BestLane = Data.second.second;
  struct OperandsOrderData {
    unsigned NumOfAPOs = UINT_MAX;
    unsigned NumOpsWithSameOpcodeParent = 0;

  OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
    unsigned CntTrue = 0;
    unsigned NumOperands = getNumOperands();
    bool AllUndefs = true;
    unsigned NumOpsWithSameOpcodeParent = 0;
      const OperandData &OpData = getData(OpIdx, Lane);
          I->getParent() != Parent) {
        if (NumOpsWithSameOpcodeParent == 0) {
          NumOpsWithSameOpcodeParent = 1;
        Parent = I->getParent();
        --NumOpsWithSameOpcodeParent;
        ++NumOpsWithSameOpcodeParent;
    OperandsOrderData Data;
    Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
    Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
                      const InstructionsState &S) {
      return VL.size() == getNumLanes();
           "Expected same number of lanes");
    assert(S.valid() && "InstructionsState is invalid.");
    OpsVec.resize(ArgSize);
    unsigned NumLanes = VL.size();
    for (OperandDataVec &Ops : OpsVec)
      Ops.resize(NumLanes);

      bool IsInverseOperation = false;
      if (S.isCopyableElement(VL[Lane])) {
        assert(I && "Expected instruction");
        auto [SelectedOp, Ops] = convertTo(I, S);
        bool APO = (OpIdx == 0) ? false : IsInverseOperation;
  unsigned getNumOperands() const { return ArgSize; }

  unsigned getNumLanes() const { return OpsVec[0].size(); }

  Value *getValue(unsigned OpIdx, unsigned Lane) const {
    return getData(OpIdx, Lane).V;

  bool empty() const { return OpsVec.empty(); }

  void clear() { OpsVec.clear(); }
  bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
           "Op is expected to be getValue(OpIdx, Lane).");
    bool OpAPO = getData(OpIdx, Lane).APO;
    bool IsInvariant = L && L->isLoopInvariant(Op);
    for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
      bool FoundCandidate = false;
      for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
        OperandData &Data = getData(OpI, Ln);
        if (Data.APO != OpAPO || Data.IsUsed)
        Value *OpILane = getValue(OpI, Lane);
            L->isLoopInvariant(Data.V))) {
          FoundCandidate = true;
      if (!FoundCandidate)
    return getNumLanes() == 2 || Cnt > 1;
3202 "Op is expected to be getValue(OpIdx, Lane).");
3203 bool OpAPO = getData(
OpIdx, Lane).APO;
3204 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3208 const OperandData &
Data = getData(OpI, Ln);
3209 if (
Data.APO != OpAPO ||
Data.IsUsed)
3211 Value *OpILn = getValue(OpI, Ln);
3212 return (L && L->isLoopInvariant(OpILn)) ||
              const InstructionsState &S, const BoUpSLP &R)
      : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
        L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
    appendOperands(RootVL, Operands, S);

           "Expected same num of lanes across all operands");
    for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
      OpVL[Lane] = OpsVec[OpIdx][Lane].V;
    unsigned NumOperands = getNumOperands();
    unsigned NumLanes = getNumLanes();
    unsigned FirstLane = getBestLaneToStartReordering();
      if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
          !canBeVectorized(OpILane0, OpIdx, FirstLane))
        ReorderingModes[OpIdx] = ReorderingMode::Splat;
        ReorderingModes[OpIdx] = ReorderingMode::Load;
        ReorderingModes[OpIdx] = ReorderingMode::Opcode;
        ReorderingModes[OpIdx] = ReorderingMode::Constant;
        ReorderingModes[OpIdx] = ReorderingMode::Splat;

    auto &&SkipReordering = [this]() {
      for (const OperandData &Data : Op0)
           ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
        if (any_of(Op, [&UniqueValues](const OperandData &Data) {
      return UniqueValues.size() != 2 &&
                             UniqueValues.size());

    if (SkipReordering())

    bool StrategyFailed = false;
    for (unsigned I = 0; I < NumOperands; ++I)
      MainAltOps[I].push_back(getData(I, FirstLane).V);

    UsedLanes.set(FirstLane);
    for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
      for (int Direction : {+1, -1}) {
        int Lane = FirstLane + Direction * Distance;
        if (Lane < 0 || Lane >= (int)NumLanes)
        UsedLanes.set(Lane);
        int LastLane = Lane - Direction;
        assert(LastLane >= 0 && LastLane < (int)NumLanes &&
          std::optional<unsigned> BestIdx =
              getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
                             MainAltOps[OpIdx], UsedLanes);
            swap(OpIdx, *BestIdx, Lane);
            StrategyFailed = true;
            OperandData &AltOp = getData(OpIdx, Lane);
            InstructionsState OpS =
            if (OpS && OpS.isAltShuffle())
    if (!StrategyFailed)
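    // Note: the loop above grows the lane order outward from the best
    // starting lane, one lane per direction per step, so each lane is matched
    // against an already-reordered neighbor (LastLane = Lane - Direction).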
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    case ReorderingMode::Load:
    case ReorderingMode::Opcode:
    case ReorderingMode::Constant:
    case ReorderingMode::Splat:
    case ReorderingMode::Failed:

    const unsigned Indent = 2;
    for (const OperandDataVec &OpDataVec : OpsVec) {
      OS << "Operand " << Cnt++ << "\n";
      for (const OperandData &OpData : OpDataVec) {
        OS.indent(Indent) << "{";
        if (Value *V = OpData.V)
        OS << ", APO:" << OpData.APO << "}\n";
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
                                               Candidates[I].second,
      if (Score > BestScore) {

    DeletedInstructions.insert(I);

  template <typename T>
      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
    for (T *V : DeadVals) {
    for (T *V : DeadVals) {
      if (!V || !Processed.insert(V).second)
      for (Use &U : I->operands()) {
            OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
              return Entry->VectorizedValue == OpI;
      I->dropAllReferences();
    for (T *V : DeadVals) {
      if (!I->getParent())
            cast<Instruction>(U.getUser()));
             "trying to erase instruction with users.");
      I->removeFromParent();
    while (!DeadInsts.empty()) {
      if (!VI || !VI->getParent())
             "Live instruction found in dead worklist!");
      assert(VI->use_empty() && "Instructions with uses are not dead.");
      for (Use &OpU : VI->operands()) {
        Value *OpV = OpU.get();
        if (!DeletedInstructions.contains(OpI) &&
            (!OpI->getType()->isVectorTy() ||
             none_of(VectorValuesAndScales,
                     [&](const std::tuple<Value *, unsigned, bool> &V) {
                       return std::get<0>(V) == OpI;
      VI->removeFromParent();
      SE->forgetValue(VI);
    return AnalyzedReductionsRoots.count(I);
    AnalyzedReductionsRoots.insert(I);
    return AnalyzedReductionVals.contains(hash_value(VL));
    AnalyzedReductionVals.insert(hash_value(VL));
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();

    return MustGather.contains(V);
    return NonScheduledFirst.contains(V);

    assert(V && "V cannot be nullptr.");
    return ScalarToTreeEntries.contains(V);
  bool collectValuesToDemote(
      const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
      bool &IsProfitableToDemote, bool IsTruncRoot) const;

  void buildReorderableOperands(

  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;

  bool areAllUsersVectorized(

  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
  TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
    return const_cast<TreeEntry *>(
        getOperandEntry(const_cast<const TreeEntry *>(E), Idx));

  Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;

  getCastContextHint(const TreeEntry &TE) const;

      const InstructionsState &LocalState,
      unsigned InterleaveFactor = 0);

      bool ResizeAllowed = false) const;

  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);

  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);

  Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);

  Instruction &getLastInstructionInBundle(const TreeEntry *E);

  std::optional<TargetTransformInfo::ShuffleKind>
                            unsigned NumParts) const;

  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(

  isGatherShuffledEntry(
                        unsigned NumParts, bool ForOrder = false);

                   Type *ScalarTy) const;

  void setInsertPointAfterBundle(const TreeEntry *E);

  bool isFullyVectorizableTinyTree(bool ForReduction) const;

  void tryToVectorizeGatheredLoads(
      std::tuple<BasicBlock *, Value *, Type *>,

  collectUserStores(const BoUpSLP::TreeEntry *TE) const;

  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;

  void reorderGatherNode(TreeEntry &TE);
    TreeEntry(VecTreeTy &Container) : Container(Container) {}

      if (State == TreeEntry::SplitVectorize)
      SmallVector<int> Mask;

    SmallVector<int> getSplitMask() const {
      assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
             "Expected only split vectorize node.");
      unsigned CommonVF = std::max<unsigned>(
          CombinedEntriesWithIndices.back().second,
          Scalars.size() - CombinedEntriesWithIndices.back().second);
      for (auto [Idx, I] : enumerate(ReorderIndices))
            Idx + (Idx >= CombinedEntriesWithIndices.back().second
                       ? CommonVF - CombinedEntriesWithIndices.back().second

    void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
                          ArrayRef<int> MaskOrder);
      if (Mask.size() != VL.size() && VL.size() == Scalars.size())
        return std::equal(VL.begin(), VL.end(), Scalars.begin());
                        [Scalars](Value *V, int Idx) {
                          return (isa<UndefValue>(V) &&
                                  Idx == PoisonMaskElem) ||
                                 (Idx != PoisonMaskElem && V == Scalars[Idx]);
      if (!ReorderIndices.empty()) {
        SmallVector<int> Mask;
        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          return IsSame(Scalars, Mask);
      return IsSame(Scalars, ReuseShuffleIndices);
    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
      SmallBitVector Used(getNumOperands());
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (getOperand(K) == TE.getOperand(I)) {
        if (PrevCount == Used.count())
    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();

    bool isGather() const { return State == NeedToGather; }

    WeakTrackingVH VectorizedValue = nullptr;
    enum CombinedOpcode {
      MinMax = Instruction::OtherOpsEnd + 1,
    CombinedOpcode CombinedOp = NotCombinedOp;

    SmallVector<int, 4> ReuseShuffleIndices;

    SmallVector<unsigned, 4> ReorderIndices;

    VecTreeTy &Container;

    EdgeInfo UserTreeIndex;

    SmallPtrSet<const Value *, 4> CopyableElements;

    InstructionsState S = InstructionsState::invalid();

    unsigned InterleaveFactor = 0;

    bool DoesNotNeedToSchedule = false;
      if (Operands.size() < OpIdx + 1)
        Operands.resize(OpIdx + 1);
             "Number of operands is greater than the number of scalars.");

    unsigned getInterleaveFactor() const { return InterleaveFactor; }
    void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
    void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
    bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }

        setOperand(I, Operands[I]);

    void reorderOperands(ArrayRef<int> Mask) {
      return Operands[OpIdx];
      return Operands[OpIdx];

    unsigned getNumOperands() const { return Operands.size(); }

    Value *getSingleOperand(unsigned OpIdx) const {
      return Operands[OpIdx][0];

    bool isAltShuffle() const { return S.isAltShuffle(); }
    Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
      return S.getMatchingMainOpOrAltOp(I);
      if (I && getMatchingMainOpOrAltOp(I))
      return S.getMainOp();

    void setOperations(const InstructionsState &S) {
      assert(S && "InstructionsState is invalid.");

    Instruction *getMainOp() const { return S.getMainOp(); }

    Instruction *getAltOp() const { return S.getAltOp(); }

    unsigned getOpcode() const { return S.getOpcode(); }

    unsigned getAltOpcode() const { return S.getAltOpcode(); }

    bool hasState() const { return S.valid(); }

    void addCopyableElement(Value *V) {
      assert(S.isCopyableElement(V) && "Not a copyable element.");
      CopyableElements.insert(V);

    bool isCopyableElement(Value *V) const {
      return CopyableElements.contains(V);

    bool hasCopyableElements() const { return !CopyableElements.empty(); }

    const InstructionsState &getOperations() const { return S; }
    unsigned findLaneForValue(Value *V) const {
      unsigned FoundLane = getVectorFactor();
      for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
           std::advance(It, 1)) {
        FoundLane = std::distance(Scalars.begin(), It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())
        if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
      assert(FoundLane < getVectorFactor() && "Unable to find given value.");

    buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
                          SmallVectorImpl<int> &Mask,
                          SmallVectorImpl<Value *> *OpScalars = nullptr,
                          SmallVectorImpl<Value *> *AltScalars = nullptr) const;
    bool isNonPowOf2Vec() const {
      return IsNonPowerOf2;

    hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;

    Value *getOrdered(unsigned Idx) const {
      assert(isGather() && "Must be used only for buildvectors/gathers.");
      if (ReorderIndices.empty())
        return Scalars[Idx];
      SmallVector<int> Mask;
      return Scalars[Mask[Idx]];
      dbgs() << Idx << ".\n";
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
      dbgs() << "State: ";
      if (S && hasCopyableElements())
        dbgs() << "[[Copyable]] ";
        if (InterleaveFactor > 0) {
          dbgs() << "Vectorize with interleave factor " << InterleaveFactor
          dbgs() << "Vectorize\n";
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
      case CompressVectorize:
        dbgs() << "CompressVectorize\n";
        dbgs() << "NeedToGather\n";
      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";
      case SplitVectorize:
        dbgs() << "SplitVectorize\n";
        dbgs() << "MainOp: " << *S.getMainOp() << "\n";
        dbgs() << "AltOp: " << *S.getAltOp() << "\n";
        dbgs() << "MainOp: NULL\n";
        dbgs() << "AltOp: NULL\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "UserTreeIndex: ";
        dbgs() << UserTreeIndex;
        dbgs() << "<invalid>";
      if (!CombinedEntriesWithIndices.empty()) {
        dbgs() << "Combined entries: ";
          dbgs() << "Entry index " << P.first << " with offset " << P.second;

                     StringRef Banner) const {
    dbgs() << "SLP: " << Banner << ":\n";
    dbgs() << "SLP: Costs:\n";
    dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
    dbgs() << "SLP: VectorCost = " << VecCost << "\n";
    dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
    dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
           << ReuseShuffleCost + VecCost - ScalarCost << "\n";
                          const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {}) {
    auto Invalid = ScheduleBundle::invalid();
    return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);

                          const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {},
                          unsigned InterleaveFactor = 0) {
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                                ReuseShuffleIndices, ReorderIndices);
    if (E && InterleaveFactor > 0)
      E->setInterleave(InterleaveFactor);
                          TreeEntry::EntryState EntryState,
                          ScheduleBundle &Bundle, const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {}) {
    assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
                         EntryState == TreeEntry::SplitVectorize)) ||
            (Bundle && EntryState != TreeEntry::NeedToGather &&
             EntryState != TreeEntry::SplitVectorize)) &&
           "Need to vectorize gather entry?");
    if (GatheredLoadsEntriesFirst.has_value() &&
        EntryState == TreeEntry::NeedToGather && S &&
        S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
        !UserTreeIdx.UserTE)
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    if (UserTreeIdx.UserTE)
      OperandsToTreeEntry.try_emplace(
          std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
            ReuseShuffleIndices.empty()) &&
           "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->setOperations(S);
      Last->Scalars.assign(VL.size(), nullptr);
                [VL](unsigned Idx) -> Value * {
                  if (Idx >= VL.size())
                    return UndefValue::get(VL.front()->getType());
      Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    if (EntryState == TreeEntry::SplitVectorize) {
      assert(S && "Split nodes must have operations.");
      Last->setOperations(S);
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        auto It = ScalarsInSplitNodes.find(V);
        if (It == ScalarsInSplitNodes.end()) {
          ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
    } else if (!Last->isGather()) {
          (!S.areInstructionsWithCopyableElements() &&
          all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
        Last->setDoesNotNeedToSchedule();
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        if (S.isCopyableElement(V)) {
          Last->addCopyableElement(V);
        auto It = ScalarToTreeEntries.find(V);
        if (It == ScalarToTreeEntries.end()) {
          ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
      assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
             "Bundle and VL out of sync");
      if (!Bundle.getBundle().empty()) {
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
        auto *BundleMember = Bundle.getBundle().begin();
        SmallPtrSet<Value *, 4> Processed;
        for (Value *V : VL) {
          if (S.isNonSchedulable(V) || !Processed.insert(V).second)
        assert(BundleMember == Bundle.getBundle().end() &&
               "Bundle and VL out of sync");
        Bundle.setTreeEntry(Last);

      bool AllConstsOrCasts = true;
      for (Value *V : VL) {
        if (S && S.areInstructionsWithCopyableElements() &&
            S.isCopyableElement(V))
          Last->addCopyableElement(V);
        AllConstsOrCasts &= I && I->getType()->isIntegerTy();
        if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
            !UserTreeIdx.UserTE->isGather())
          ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
      if (AllConstsOrCasts)
            std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert_range(VL);

    if (UserTreeIdx.UserTE)
      Last->UserTreeIndex = UserTreeIdx;
  TreeEntry::VecTreeTy VectorizableTree;

    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();

    assert(V && "V cannot be nullptr.");
    auto It = ScalarToTreeEntries.find(V);
    if (It == ScalarToTreeEntries.end())
    return It->getSecond();

    assert(V && "V cannot be nullptr.");
    auto It = ScalarsInSplitNodes.find(V);
    if (It == ScalarsInSplitNodes.end())
    return It->getSecond();
                               bool SameVF = false) const {
    assert(V && "V cannot be nullptr.");
    for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
      if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))

  bool areAltOperandsProfitable(const InstructionsState &S,
  class ScalarsVectorizationLegality {
    InstructionsState S;
    bool TryToFindDuplicates;
    bool TrySplitVectorize;

    ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
                                 bool TryToFindDuplicates = true,
                                 bool TrySplitVectorize = false)
        : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
          TrySplitVectorize(TrySplitVectorize) {
      assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
             "Inconsistent state");
    const InstructionsState &getInstructionsState() const { return S; }
    bool isLegal() const { return IsLegal; }
    bool tryToFindDuplicates() const { return TryToFindDuplicates; }
    bool trySplitVectorize() const { return TrySplitVectorize; }

  ScalarsVectorizationLegality
                              bool TryCopyableElementsVectorization) const;

  TreeEntry::EntryState getScalarsVectorizationState(
      bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
      SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;

  SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
      OperandsToTreeEntry;

  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;

  SmallDenseMap<Value *, unsigned> InstrElementSize;

  SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;

  SetVector<const TreeEntry *> PostponedGathers;

  using ValueToGatherNodesMap =
      DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
  ValueToGatherNodesMap ValueToGatherNodes;

  SetVector<unsigned> LoadEntriesToVectorize;

  bool IsGraphTransformMode = false;

  std::optional<unsigned> GatheredLoadsEntriesFirst;

  SmallDenseMap<const TreeEntry *,
                std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
      CompressEntryToData;
  struct ExternalUser {
    ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
        : Scalar(S), User(U), E(E), Lane(L) {}

    Value *Scalar = nullptr;

    llvm::User *User = nullptr;

  using UserList = SmallVector<ExternalUser, 16>;
  /// Checks whether two instructions may access the same memory location,
  /// memoizing the per-pair answer in AliasCache.
  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
                 Instruction *Inst2) {
    // First check whether the result is already in the cache.
    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
    auto Res = AliasCache.try_emplace(Key);
    if (!Res.second)
      return Res.first->second;
    bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
    // Store the result in the cache.
    Res.first->getSecond() = Aliased;
    return Aliased;
  }

  using AliasCacheKey = std::pair<Instruction *, Instruction *>;

  /// Cache for alias results: a pair of instructions maps to "may alias".
  SmallDenseMap<AliasCacheKey, bool> AliasCache;

  /// Batched alias analysis; caches queries across the whole tree.
  BatchAAResults BatchAA;
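  // Illustrative sketch (not from the original source): the try_emplace-based
  // memoization in isAliased() follows this general pattern, where
  // 'computeExpensive' stands in for the BatchAA query and 'Key' for the
  // instruction pair:
  //
  // \code
  //   SmallDenseMap<KeyT, bool> Cache;
  //   bool lookupOrCompute(KeyT Key) {
  //     auto Res = Cache.try_emplace(Key); // inserts 'false' if Key is new
  //     if (!Res.second)                   // already present: reuse it
  //       return Res.first->second;
  //     Res.first->second = computeExpensive(Key);
  //     return Res.first->second;
  //   }
  // \endcode
  //
  // Doing the try_emplace first means a single hash lookup serves both the
  // hit and the miss path.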
  /// Instructions scheduled for deletion after vectorization.
  DenseSet<Instruction *> DeletedInstructions;

  /// Roots of reductions that were already analyzed.
  SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;

  /// Hashes of reduction value lists that were already analyzed.
  DenseSet<size_t> AnalyzedReductionVals;

  /// Values already analyzed for minimum-bitwidth shrinking.
  DenseSet<Value *> AnalyzedMinBWVals;

  /// A list of scalars with their out-of-tree users that need to be extracted
  /// from the vectorized value.
  UserList ExternalUses;

  /// Scalars that are cheaper to keep as the original scalar than to extract
  /// from the vector.
  SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;

  /// Externally used scalars without a specific instruction user.
  SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;

  /// Ephemeral values (feeding assumes and the like) that must not be
  /// vectorized.
  SmallPtrSet<const Value *, 32> EphValues;

  /// Gather/shuffle/extract sequences emitted by the vectorizer; candidates
  /// for CSE.
  SetVector<Instruction *> GatherShuffleExtractSeq;

  /// Blocks containing such sequences.
  DenseSet<BasicBlock *> CSEBlocks;

  /// Hashes of load lists already known to be non-vectorizable.
  DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
  /// Base class for everything the scheduler operates on: per-instruction
  /// data, bundles, and copyable-element data. Uses LLVM-style RTTI to
  /// dispatch to the concrete kind.
  class ScheduleEntity {
    friend class ScheduleBundle;
    friend class ScheduleData;
    friend class ScheduleCopyableData;

  protected:
    enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
    Kind getKind() const { return K; }
    ScheduleEntity(Kind K) : K(K) {}

    /// Used for ordering in the ready list.
    int SchedulingPriority = 0;
    /// True once the entity has been emitted by the scheduler.
    bool IsScheduled = false;
    const Kind K = Kind::ScheduleData;

  public:
    ScheduleEntity() = delete;
    void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
    int getSchedulingPriority() const { return SchedulingPriority; }
    bool isReady() const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->isReady();
      const auto *CD = cast<ScheduleCopyableData>(this);
      return CD->isReady();
    }
    bool hasValidDependencies() const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->hasValidDependencies();
      const auto *CD = cast<ScheduleCopyableData>(this);
      return CD->hasValidDependencies();
    }
    int getUnscheduledDeps() const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->getUnscheduledDeps();
      const auto *CD = cast<ScheduleCopyableData>(this);
      return CD->getUnscheduledDeps();
    }
    int incrementUnscheduledDeps(int Incr) {
      if (auto *SD = dyn_cast<ScheduleData>(this))
        return SD->incrementUnscheduledDeps(Incr);
      // ...
    }
    int getDependencies() const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->getDependencies();
      // ...
    }
    Instruction *getInst() const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->getInst();
      // ...
    }
    bool isScheduled() const { return IsScheduled; }
    void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
    static bool classof(const ScheduleEntity *) { return true; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(raw_ostream &OS) const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->dump(OS);
      const auto *CD = cast<ScheduleCopyableData>(this);
      return CD->dump(OS);
    }
#endif
  };

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const BoUpSLP::ScheduleEntity &SE) {
    // ...
  }
#endif
  /// Per-instruction scheduling bookkeeping: dependency counts plus the
  /// memory and control dependencies inside the current scheduling region.
  class ScheduleData final : public ScheduleEntity {
  public:
    /// Marker that the dependencies have not been calculated yet.
    enum { InvalidDeps = -1 };

    ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleData;
    }

    void init(int BlockSchedulingRegionID, Instruction *I) {
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();
      // ...
    }

    /// Verify basic self-consistency of the dependency counters.
    void verify() {
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        assert(UnscheduledDeps == Dependencies && "invariant");
      }
      if (IsScheduled) {
        assert(hasValidDependencies() && UnscheduledDeps == 0 &&
               "unexpected scheduled state");
      }
    }

    /// Returns true if the dependency information has been calculated.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled dependencies.
    bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }

    /// Modifies the number of unscheduled dependencies and returns the
    /// remaining count.
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      return UnscheduledDeps;
    }

    /// Sets the number of unscheduled dependencies back to the number of
    /// dependencies.
    void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }

    /// Clears all dependency information.
    void clearDependencies() {
      clearDirectDependencies();
      MemoryDependencies.clear();
      ControlDependencies.clear();
    }

    /// Clears only the direct (def-use) dependency information.
    void clearDirectDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      IsScheduled = false;
    }

    int getUnscheduledDeps() const { return UnscheduledDeps; }
    int getDependencies() const { return Dependencies; }
    void initDependencies() { Dependencies = 0; }
    void incDependencies() { Dependencies++; }

    int getSchedulingRegionID() const { return SchedulingRegionID; }

    ArrayRef<ScheduleData *> getMemoryDependencies() const {
      return MemoryDependencies;
    }
    void addMemoryDependency(ScheduleData *Dep) {
      MemoryDependencies.push_back(Dep);
    }
    ArrayRef<ScheduleData *> getControlDependencies() const {
      return ControlDependencies;
    }
    void addControlDependency(ScheduleData *Dep) {
      ControlDependencies.push_back(Dep);
    }
    ScheduleData *getNextLoadStore() const { return NextLoadStore; }
    void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }

    void dump(raw_ostream &OS) const { OS << *Inst; }

  private:
    /// The instruction this data is attached to.
    Instruction *Inst = nullptr;

    /// Singly linked list of all memory instructions (loads, stores, calls)
    /// in the block, ordered by position.
    ScheduleData *NextLoadStore = nullptr;

    /// The dependent memory instructions; derived on demand.
    SmallVector<ScheduleData *> MemoryDependencies;

    /// Instructions this instruction depends on control-wise.
    SmallVector<ScheduleData *> ControlDependencies;

    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID = 0;

    /// The number of dependencies, i.e. predecessors in the dependency graph;
    /// InvalidDeps if not yet calculated.
    int Dependencies = InvalidDeps;

    /// Dependencies minus the already-scheduled ones; the instruction becomes
    /// ready when this reaches zero.
    int UnscheduledDeps = InvalidDeps;
  };

  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const BoUpSLP::ScheduleData &SD) {
    // ...
  }
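  // Illustrative sketch (hypothetical values, not from the original source):
  // how the two counters interact. A node with three predecessors ends
  // dependency calculation with Dependencies == UnscheduledDeps == 3; each
  // time a predecessor is emitted the scheduler decrements the unscheduled
  // count, and at zero the node becomes ready:
  //
  // \code
  //   ScheduleData N;
  //   N.initDependencies();            // Dependencies = 0
  //   for (int I = 0; I < 3; ++I)
  //     N.incDependencies();           // Dependencies = 3
  //   N.resetUnscheduledDeps();        // UnscheduledDeps = 3
  //   N.incrementUnscheduledDeps(-1);  // 2
  //   N.incrementUnscheduledDeps(-1);  // 1
  //   N.incrementUnscheduledDeps(-1);  // 0 -> N.isReady() == true
  // \endcode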
  /// A bundle of schedule entities that must be scheduled together as one
  /// unit: the members of one vectorizable tree entry.
  class ScheduleBundle final : public ScheduleEntity {
    /// The schedule data for the instructions in the bundle.
    SmallVector<ScheduleEntity *> Bundle;
    /// True if this bundle is valid.
    bool IsValid = true;
    /// The tree entry the bundle corresponds to.
    TreeEntry *TE = nullptr;
    ScheduleBundle(bool IsValid)
        : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}

  public:
    ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleBundle;
    }

    /// Verify basic self-consistency of all members.
    void verify() const {
      for (const ScheduleEntity *SD : Bundle) {
        if (SD->hasValidDependencies()) {
          assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
                 "invariant");
        } else {
          assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
                 "invariant");
        }
        if (isScheduled()) {
          assert(SD->hasValidDependencies() &&
                 SD->getUnscheduledDeps() == 0 &&
                 "unexpected scheduled state");
        }
      }
    }

    /// Returns the number of unscheduled dependencies in the whole bundle.
    int unscheduledDepsInBundle() const {
      assert(*this && "bundle must not be empty");
      int Sum = 0;
      for (const ScheduleEntity *BundleMember : Bundle) {
        if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
          return ScheduleData::InvalidDeps;
        Sum += BundleMember->getUnscheduledDeps();
      }
      return Sum;
    }

    /// Returns true if the dependency information has been calculated for
    /// all members.
    bool hasValidDependencies() const {
      return all_of(Bundle, [](const ScheduleEntity *SD) {
        return SD->hasValidDependencies();
      });
    }

    /// Returns true if the whole bundle is ready for scheduling.
    bool isReady() const {
      assert(*this && "bundle must not be empty");
      return unscheduledDepsInBundle() == 0 && !isScheduled();
    }

    // ...
    void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
    void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
    TreeEntry *getTreeEntry() const { return TE; }

    static ScheduleBundle invalid() { return {false}; }
    operator bool() const { return IsValid; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(raw_ostream &OS) const {
      // ...
      OS << *SD->getInst();
      // ...
    }
#endif
  };

  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const BoUpSLP::ScheduleBundle &Bundle) {
    // ...
  }
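  // Illustrative worked example (hypothetical values): a bundle is ready only
  // when every member is ready, because readiness is defined on the sum of
  // the members' unscheduled deps. With members A (0 deps left) and B (1 dep
  // left), unscheduledDepsInBundle() == 0 + 1 == 1, so the bundle stays
  // blocked; only after B's last dependency is scheduled does the sum reach 0
  // and the whole group become schedulable as one unit. This is what keeps
  // the lanes of one vector instruction from being torn apart.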
  /// Scheduling info for a "copyable" element: an instruction that a tree
  /// entry reuses through a copy rather than owning it directly. Dependency
  /// counting is kept per user edge.
  class ScheduleCopyableData final : public ScheduleEntity {
    /// The instruction this data models.
    Instruction *Inst = nullptr;
    /// The user edge (user tree entry + operand index) this copy belongs to.
    EdgeInfo EI;
    /// In the current scheduling region if this matches the current
    /// SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID = 0;
    /// The bundle this copyable data is part of.
    ScheduleBundle &Bundle;

  public:
    ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
                         const EdgeInfo &EI, ScheduleBundle &Bundle)
        : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
          SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleCopyableData;
    }

    /// Verify basic self-consistency of the dependency counters.
    void verify() {
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        assert(UnscheduledDeps == Dependencies && "invariant");
      }
      if (IsScheduled) {
        assert(hasValidDependencies() && UnscheduledDeps == 0 &&
               "unexpected scheduled state");
      }
    }

    /// Returns true if the dependency information has been calculated.
    bool hasValidDependencies() const {
      return Dependencies != ScheduleData::InvalidDeps;
    }

    /// Returns true if it is ready for scheduling.
    bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }

    /// Modifies the number of unscheduled dependencies and returns the
    /// remaining count.
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      assert(UnscheduledDeps >= 0 && "invariant");
      return UnscheduledDeps;
    }

    /// Sets the number of unscheduled dependencies back to the number of
    /// dependencies.
    void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }

    int getUnscheduledDeps() const { return UnscheduledDeps; }
    int getDependencies() const { return Dependencies; }
    void initDependencies() { Dependencies = 0; }
    void incDependencies() { Dependencies++; }

    int getSchedulingRegionID() const { return SchedulingRegionID; }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = ScheduleData::InvalidDeps;
      UnscheduledDeps = ScheduleData::InvalidDeps;
      IsScheduled = false;
    }

    Instruction *getInst() const { return Inst; }
    const EdgeInfo &getEdgeInfo() const { return EI; }
    ScheduleBundle &getBundle() { return Bundle; }
    const ScheduleBundle &getBundle() const { return Bundle; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }
#endif

  private:
    /// The number of dependencies; InvalidDeps if not yet calculated.
    int Dependencies = ScheduleData::InvalidDeps;

    /// Dependencies minus the already-scheduled ones.
    int UnscheduledDeps = ScheduleData::InvalidDeps;
  };
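  // Note (editorial, inferred from the surrounding structure): unlike
  // ScheduleData, which is keyed purely by instruction, one instruction can
  // carry several ScheduleCopyableData records, one per (user tree entry,
  // operand index) edge that treats it as a copyable element. Because the
  // counters live per edge, the same instruction can be "ready" for one user
  // entry while still blocked for another, which is why the lookup maps below
  // are indexed by edge, by instruction, and by user.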
  /// Per-basic-block state used while scheduling one vectorization region.
  struct BlockScheduling {
    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}

    void clear() {
      // ...
      ScheduledBundles.clear();
      ScheduledBundlesList.clear();
      ScheduleCopyableDataMap.clear();
      ScheduleCopyableDataMapByInst.clear();
      ScheduleCopyableDataMapByInstUser.clear();
      ScheduleCopyableDataMapByUsers.clear();
      // ...
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;
      RegionHasStackSave = false;

      // Reduce the maximum schedule region size by the size of the previous
      // scheduling run.
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      // ...
      ScheduleRegionSize = 0;

      // Make a new scheduling region, i.e. all existing ScheduleData is not
      // in the new region yet.
      ++SchedulingRegionID;
    }
    ScheduleData *getScheduleData(Instruction *I) {
      if (BB != I->getParent())
        // Avoid lookup if it can't possibly be in the map.
        return nullptr;
      ScheduleData *SD = ScheduleDataMap.lookup(I);
      if (SD && isInSchedulingRegion(*SD))
        return SD;
      return nullptr;
    }

    ScheduleData *getScheduleData(Value *V) {
      // ...
    }

    ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
                                                  const Value *V) const {
      if (ScheduleCopyableDataMap.empty())
        return nullptr;
      auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
      if (It == ScheduleCopyableDataMap.end())
        return nullptr;
      ScheduleCopyableData *SD = It->getSecond().get();
      if (!isInSchedulingRegion(*SD))
        return nullptr;
      return SD;
    }

    SmallVector<ScheduleCopyableData *>
    getScheduleCopyableData(const Value *User, unsigned OperandIdx,
                            const Value *V) const {
      if (ScheduleCopyableDataMapByInstUser.empty())
        return {};
      const auto It = ScheduleCopyableDataMapByInstUser.find(
          std::make_pair(std::make_pair(User, OperandIdx), V));
      if (It == ScheduleCopyableDataMapByInstUser.end())
        return {};
      SmallVector<ScheduleCopyableData *> Res;
      for (ScheduleCopyableData *SD : It->getSecond()) {
        if (isInSchedulingRegion(*SD))
          Res.push_back(SD);
      }
      return Res;
    }
    /// Checks whether every use of \p Op in \p User is modeled by copyable
    /// data, i.e. the operand is fully replaced by copies in the graph
    /// (trailing parameters elided in this excerpt).
    bool areAllOperandsReplacedByCopyableData(Instruction *User,
                                              /* ... */) const {
      if (ScheduleCopyableDataMap.empty())
        return false;
      SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
      SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
      for (const Use &U : User->operands()) {
        // ...
        if (Entries.empty())
          continue;
        for (TreeEntry *TE : Entries) {
          // ...
          bool IsCommutativeUser = /* ... */;
          EdgeInfo EI(TE, U.getOperandNo());
          // ...
          OrderedEntriesCount.try_emplace(TE, 0).first->getSecond();
          if (!getScheduleCopyableData(EI, Op) && OpCnt < NumOps)
            // ...
          ++PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
                .first->getSecond();
          // ...
        }
      }
      if (!PotentiallyReorderedEntriesCount.empty()) {
        for (auto &P : PotentiallyReorderedEntriesCount) {
          auto *It = find(P.first->Scalars, User);
          assert(It != P.first->Scalars.end() &&
                 "User is not in the tree entry");
          int Lane = std::distance(P.first->Scalars.begin(), It);
          assert(Lane >= 0 && "Lane is not found");
          // Check if the user is reordered inside the tree entry.
          if (!P.first->ReorderIndices.empty())
            Lane = P.first->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
                 "Couldn't find extract lane");
          SmallVector<unsigned> OpIndices;
          // Iterate over the operand indices of P.first->getMainOp().
          for (unsigned OpIdx : /* ... */) {
            if (P.first->getOperand(OpIdx)[Lane] == Op &&
                getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
              // ...
          }
        }
        return all_of(PotentiallyReorderedEntriesCount,
                      [&](const std::pair<const TreeEntry *, unsigned> &P) {
                        return P.second == NumOps - 1;
                      });
      }
      // ...
    }
    /// Returns all copyable-data records attached to \p I inside the current
    /// scheduling region.
    SmallVector<ScheduleCopyableData *>
    getScheduleCopyableData(const Instruction *I) const {
      if (ScheduleCopyableDataMapByInst.empty())
        return {};
      const auto It = ScheduleCopyableDataMapByInst.find(I);
      if (It == ScheduleCopyableDataMapByInst.end())
        return {};
      SmallVector<ScheduleCopyableData *> Res;
      for (ScheduleCopyableData *SD : It->getSecond()) {
        if (isInSchedulingRegion(*SD))
          Res.push_back(SD);
      }
      return Res;
    }

    /// Returns all copyable-data records whose modeled copies are used by
    /// \p User inside the current scheduling region.
    SmallVector<ScheduleCopyableData *>
    getScheduleCopyableDataUsers(const Instruction *User) const {
      if (ScheduleCopyableDataMapByUsers.empty())
        return {};
      const auto It = ScheduleCopyableDataMapByUsers.find(User);
      if (It == ScheduleCopyableDataMapByUsers.end())
        return {};
      SmallVector<ScheduleCopyableData *> Res;
      for (ScheduleCopyableData *SD : It->getSecond()) {
        if (isInSchedulingRegion(*SD))
          Res.push_back(SD);
      }
      return Res;
    }
    /// Registers a new copyable-data record for instruction \p I on edge
    /// \p EI and cross-indexes it in all lookup maps.
    ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
                                                  Instruction *I,
                                                  int SchedulingRegionID,
                                                  ScheduleBundle &Bundle) {
      assert(!getScheduleCopyableData(EI, I) && "already in the map");
      ScheduleCopyableData *CD =
          ScheduleCopyableDataMap
              .try_emplace(std::make_pair(EI, I),
                           std::make_unique<ScheduleCopyableData>(
                               SchedulingRegionID, I, EI, Bundle))
              .first->getSecond()
              .get();
      ScheduleCopyableDataMapByInst[I].push_back(CD);
      // Find the lane of I within the user entry's operand list.
      // ...
      assert(It != Op.end() && "Lane not set");
      SmallPtrSet<Instruction *, 4> Visited;
      do {
        int Lane = std::distance(Op.begin(), It);
        assert(Lane >= 0 && "Lane not set");
        if (/* ... */ !EI.UserTE->ReorderIndices.empty())
          Lane = EI.UserTE->ReorderIndices[Lane];
        assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
               "Couldn't find extract lane");
        // ...
        if (!Visited.insert(In).second) {
          // ...
        }
        ScheduleCopyableDataMapByInstUser
            .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I))
            // ...
        ScheduleCopyableDataMapByUsers.try_emplace(I)
            // ...
        // If the user entry is itself modeled as a copy of its own user,
        // drop that record from the users map again.
        EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
        if (ScheduleCopyableData *UserCD =
                getScheduleCopyableData(UserEI, In))
          ScheduleCopyableDataMapByUsers[I].remove(UserCD);
        // ...
      } while (It != Op.end());
      ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert(
          /* ... */);
      // ...
      return *CD;
    }

    /// Returns the bundles the instruction \p I takes part in.
    ArrayRef<ScheduleBundle *> getScheduleBundles(Instruction *I) const {
      auto It = ScheduledBundles.find(I);
      if (It == ScheduledBundles.end())
        return {};
      return It->getSecond();
    }
    /// Checks whether the entity belongs to the current scheduling region.
    bool isInSchedulingRegion(const ScheduleEntity &SD) const {
      if (const auto *Data = dyn_cast<ScheduleData>(&SD))
        return Data->getSchedulingRegionID() == SchedulingRegionID;
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SD))
        return CD->getSchedulingRegionID() == SchedulingRegionID;
      return all_of(cast<ScheduleBundle>(SD).getBundle(),
                    [&](const ScheduleEntity *BundleMember) {
                      return isInSchedulingRegion(*BundleMember);
                    });
    }
    /// Un-bundles a group of instructions: after an entity is emitted, walk
    /// its def-use, memory, and control dependencies, decrement the
    /// unscheduled-deps counters of everything that depended on it, and move
    /// newly ready entities into \p ReadyList.
    template <typename ReadyListType>
    void schedule(const BoUpSLP &R, const InstructionsState &S,
                  const EdgeInfo &EI, ScheduleEntity *Data,
                  ReadyListType &ReadyList) {
      auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
                                     ArrayRef<ScheduleBundle *> Bundles) {
        // Decrement the unscheduled counter and insert into the ready list
        // if ready.
        auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
          if ((IsControl || Data->hasValidDependencies()) &&
              Data->incrementUnscheduledDeps(-1) == 0) {
            // No more unscheduled dependencies after decrementing, so the
            // dependent entity may go into the ready list.
            SmallVector<ScheduleBundle *> CopyableBundle;
            ArrayRef<ScheduleBundle *> Bundles;
            if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
              CopyableBundle.push_back(&CD->getBundle());
              Bundles = CopyableBundle;
            } else {
              Bundles = getScheduleBundles(Data->getInst());
            }
            if (!Bundles.empty()) {
              for (ScheduleBundle *Bundle : Bundles) {
                if (Bundle->unscheduledDepsInBundle() == 0) {
                  assert(!Bundle->isScheduled() &&
                         "already scheduled bundle gets ready");
                  ReadyList.insert(Bundle);
                  LLVM_DEBUG(dbgs()
                             << "SLP:    gets ready: " << *Bundle << "\n");
                }
              }
              return;
            }
            assert(!Data->isScheduled() &&
                   "already scheduled bundle gets ready");
            assert(isa<ScheduleData>(Data) && "Expected non-copyable data");
            ReadyList.insert(Data);
            // ...
          }
        };

        // Decrement for an operand instruction, preferring its copyable
        // records if there are any.
        auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx,
                                      Instruction *I) {
          if (!ScheduleCopyableDataMap.empty()) {
            SmallVector<ScheduleCopyableData *> CopyableData =
                getScheduleCopyableData(User, OpIdx, I);
            for (ScheduleCopyableData *CD : CopyableData)
              DecrUnsched(CD, /*IsControl=*/false);
            if (!CopyableData.empty())
              return;
          }
          if (ScheduleData *OpSD = getScheduleData(I))
            DecrUnsched(OpSD, /*IsControl=*/false);
        };

        if (!Bundles.empty()) {
          // The member is part of bundles: operands may have been reordered
          // during tree building, so walk the tree-entry operands lane by
          // lane, counting how often each operand instruction is used.
          auto *In = BundleMember->getInst();
          SmallDenseMap<const Instruction *, unsigned> OperandsUses;
          unsigned TotalOpCount = 0;
          if (/* ... */) {
            TotalOpCount = OperandsUses[In] = 1;
          } else {
            for (const Use &U : In->operands()) {
              // ...
              ++Res.first->getSecond();
              // ...
            }
          }
          auto DecrUnschedForInst = [&](Instruction *I, TreeEntry *UserTE,
                                        unsigned OpIdx) {
            if (!ScheduleCopyableDataMap.empty()) {
              const EdgeInfo EI = {UserTE, OpIdx};
              if (ScheduleCopyableData *CD =
                      getScheduleCopyableData(EI, I)) {
                DecrUnsched(CD, /*IsControl=*/false);
                return;
              }
            }
            auto It = OperandsUses.find(I);
            assert(It != OperandsUses.end() && "Operand not found");
            if (It->second > 0) {
              --It->getSecond();
              assert(TotalOpCount > 0 && "No more operands to decrement");
              --TotalOpCount;
              if (ScheduleData *OpSD = getScheduleData(I))
                DecrUnsched(OpSD, /*IsControl=*/false);
            }
          };
          for (ScheduleBundle *Bundle : Bundles) {
            if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
              break;
            // Need to search for the lane since the tree entry can be
            // reordered.
            int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(),
                                     find(Bundle->getTreeEntry()->Scalars, In));
            assert(Lane >= 0 && "Lane not set");
            if (/* ... */ !Bundle->getTreeEntry()->ReorderIndices.empty())
              Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
            assert(Lane < static_cast<int>(
                              Bundle->getTreeEntry()->Scalars.size()) &&
                   "Couldn't find extract lane");
            // ...
            assert((/* ... */ In->getNumOperands() ==
                        Bundle->getTreeEntry()->getNumOperands() ||
                    Bundle->getTreeEntry()->isCopyableElement(In)) &&
                   "Missed TreeEntry operands?");
            for (unsigned OpIdx : /* ... */)
              if (auto *I = dyn_cast<Instruction>(
                      Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
                // ...
                DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx);
              }
          }
        } else {
          // A stand-alone instruction: no operand reordering took place, so
          // access its operands directly.
          for (Use &U : BundleMember->getInst()->operands())
            if (auto *I = dyn_cast<Instruction>(U.get())) {
              LLVM_DEBUG(dbgs() << "SLP:   check for readiness (def): " << *I
                                << "\n");
              DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
            }
        }
        // Handle the memory dependencies.
        if (auto *SD = dyn_cast<ScheduleData>(BundleMember)) {
          SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
          for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
            if (!VisitedMemory.insert(MemoryDep).second)
              continue;
            LLVM_DEBUG(dbgs() << "SLP:   check for readiness (mem): "
                              << *MemoryDep << "\n");
            DecrUnsched(MemoryDep);
          }
          // Handle the control dependencies.
          SmallPtrSet<const ScheduleData *, 4> VisitedControl;
          for (ScheduleData *Dep : SD->getControlDependencies()) {
            if (!VisitedControl.insert(Dep).second)
              continue;
            LLVM_DEBUG(dbgs() << "SLP:   check for readiness (ctrl): " << *Dep
                              << "\n");
            DecrUnsched(Dep, /*IsControl=*/true);
          }
        }
      };
      if (auto *SD = dyn_cast<ScheduleData>(Data)) {
        SD->setScheduled(/*Scheduled=*/true);
        // ...
        auto *In = SD->getInst();
        if (R.isVectorized(In)) {
          // Build pseudo-bundles for the tree entries the instruction
          // participates in.
          // ...
          for (TreeEntry *TE : Entries) {
            if (/* ... */ In->getNumOperands() != TE->getNumOperands())
              continue;
            auto &BundlePtr = PseudoBundles.emplace_back(
                std::make_unique<ScheduleBundle>());
            BundlePtr->setTreeEntry(TE);
            // ...
          }
        }
        ProcessBundleMember(SD, Bundles);
      } else {
        ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
        Bundle.setScheduled(/*Scheduled=*/true);
        // ...
        auto AreAllBundlesScheduled =
            [&](const ScheduleEntity *SD,
                ArrayRef<ScheduleBundle *> SDBundles) {
              return !SDBundles.empty() &&
                     all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
                       return SDBundle->isScheduled();
                     });
            };
        for (ScheduleEntity *SD : Bundle.getBundle()) {
          // ...
          SDBundles = getScheduleBundles(SD->getInst());
          if (AreAllBundlesScheduled(SD, SDBundles)) {
            SD->setScheduled(/*Scheduled=*/true);
            // ...
          }
        }
      }
    }
    /// Sanity-checks the scheduling region and the ready list (debug only).
    void verify() {
      // ...
      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
        if (!Bundles.empty()) {
          for (ScheduleBundle *Bundle : Bundles) {
            assert(isInSchedulingRegion(*Bundle) &&
                   "primary schedule data not in window?");
            // ...
          }
          continue;
        }
        auto *SD = getScheduleData(I);
        if (!SD)
          continue;
        assert(isInSchedulingRegion(*SD) &&
               "primary schedule data not in window?");
        // ...
      }
      assert(all_of(ReadyInsts,
                    [](const ScheduleEntity *Bundle) {
                      return Bundle->isReady();
                    }) &&
             "item in ready list not ready?");
    }
    /// Puts all instructions into the ReadyList which are ready for
    /// scheduling.
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      SmallPtrSet<ScheduleBundle *, 16> Visited;
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD && SD->hasValidDependencies() && SD->isReady()) {
          ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
          if (!Bundles.empty()) {
            for (ScheduleBundle *Bundle : Bundles) {
              if (!Visited.insert(Bundle).second)
                continue;
              if (Bundle->hasValidDependencies() && Bundle->isReady()) {
                ReadyList.insert(Bundle);
                LLVM_DEBUG(dbgs() << "SLP:    initially in ready list: "
                                  << *Bundle << "\n");
              }
            }
            continue;
          }
          ReadyList.insert(SD);
          LLVM_DEBUG(dbgs()
                     << "SLP:    initially in ready list: " << *SD << "\n");
        }
      }
    }
    /// Builds a bundle from the given list of values (declaration; leading
    /// parameters elided in this excerpt).
    ScheduleBundle &buildBundle(/* ... */ const InstructionsState &S,
                                const EdgeInfo &EI);

    /// Checks if a bundle of instructions can be scheduled, i.e. has no
    /// cyclic dependencies. This is a dry run: no instructions are actually
    /// moved (declaration; leading parameters elided).
    std::optional<ScheduleBundle *>
    tryScheduleBundle(/* ... */ const InstructionsState &S,
                      const EdgeInfo &EI);

    /// Allocates schedule data chunks.
    ScheduleData *allocateScheduleDataChunks();

    /// Extends the scheduling region so that \p V is inside the region.
    /// \returns true if the region size stays within the limit.
    bool extendSchedulingRegion(Value *V, const InstructionsState &S);

    /// Initializes the ScheduleData structures for new instructions in the
    /// scheduling region.
    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    /// Updates the dependency information of a bundle and of all instructions
    /// or bundles which depend on it.
    void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
                               /* ... */);

    /// Sets all instructions in the scheduling region to un-scheduled.
    void resetSchedule();
    /// Attaches ScheduleData to an instruction.
    SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;

    /// Attaches copyable-element data to a (user edge, value) pair.
    SmallDenseMap<std::pair<EdgeInfo, const Value *>,
                  std::unique_ptr<ScheduleCopyableData>>
        ScheduleCopyableDataMap;

    /// All copyable-element records for a given instruction.
    SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
        ScheduleCopyableDataMapByInst;

    /// Copyable-element records keyed by ((user, operand index), value).
    SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>,
                  /* ... */>
        ScheduleCopyableDataMapByInstUser;

    /// Copyable-element records keyed by the using instruction.
    SmallDenseMap</* ... */,
                  SmallSetVector<ScheduleCopyableData *, 4>>
        ScheduleCopyableDataMapByUsers;

    /// The bundles each instruction is a member of.
    SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
        ScheduledBundles;
    // ...

    /// The ready-list for scheduling (only used for the dry run).
    SetVector<ScheduleEntity *> ReadyInsts;

    // ...
    /// The first memory-accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *FirstLoadStoreInRegion = nullptr;

    /// The last memory-accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *LastLoadStoreInRegion = nullptr;

    /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
    /// region?
    bool RegionHasStackSave = false;

    /// The current size of the scheduling region.
    int ScheduleRegionSize = 0;

    // ...
    /// The ID of the current scheduling region. For a new vectorization
    /// iteration this is incremented, which "removes" all ScheduleData from
    /// the region.
    int SchedulingRegionID = 1;
  };

  /// Attaches the BlockScheduling structures to basic blocks.
  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;

  /// Performs the "real" scheduling. Done before vectorization is actually
  /// performed in a basic block.
  void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);

  /// List of users to ignore during scheduling and that don't need
  /// extracting.
  const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
  /// DenseMapInfo for OrdersType: hashes an order as a range of its elements.
  struct OrdersTypeDenseMapInfo {
    // ...
    static unsigned getHashValue(const OrdersType &V) {
      // ...
    }
    // ...
  };

  // Analyses and helpers used throughout the pass.
  ScalarEvolution *SE;
  TargetTransformInfo *TTI;
  TargetLibraryInfo *TLI;
  // ...
  AssumptionCache *AC;
  // ...
  const DataLayout *DL;
  OptimizationRemarkEmitter *ORE;

  unsigned MaxVecRegSize; // Set by TTI or overridden by cl::opt.
  unsigned MinVecRegSize;

  /// Instruction builder to construct the vectorized tree.
  IRBuilder<TargetFolder> Builder;

  /// A map of scalar integer values to the smallest bit width with which
  /// they can legally be represented. The values map to (width, signed)
  /// pairs, where "signed" is true if the value must be sign-extended, rather
  /// than zero-extended, back to its original width.
  DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;

  /// The bitwidth of the reduction this tree feeds, if any.
  unsigned ReductionBitWidth = 0;

  /// The size of the vectorizable tree before minbitwidth-driven
  /// transformations.
  unsigned BaseGraphSize = 1;

  /// If the tree contains casts, holds the maximum and minimum bitwidths
  /// seen among them.
  std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;

  /// Indices of tree entries that may need extra bitwidth analysis.
  DenseSet<unsigned> ExtraBitWidthNodes;

/// DenseMapInfo specialization for BoUpSLP::EdgeInfo, combining the infos of
/// the user tree entry and the edge index (only fragments survive in this
/// excerpt).
template <> struct DenseMapInfo<BoUpSLP::EdgeInfo> {
  // ... SecondInfo::getEmptyKey());
  // ... SecondInfo::getTombstoneKey());
  // ... SecondInfo::getHashValue(Val.EdgeIdx));
  // ...
};
/// GraphTraits over the vectorizable tree: tree entries are the nodes, user
/// edges the child iterators (fragments; boilerplate elided).
template <> struct GraphTraits<BoUpSLP *> {
  // ...
  struct ChildIteratorType
      : public iterator_adaptor_base<
            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
    // ...
  };

  static NodeRef getEntryNode(BoUpSLP &R) {
    return R.VectorizableTree[0].get();
  }

  static ChildIteratorType child_begin(NodeRef N) {
    return {&N->UserTreeIndex, N->Container};
  }

  static ChildIteratorType child_end(NodeRef N) {
    return {&N->UserTreeIndex + 1, N->Container};
  }

  // ...
  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
};
/// DOT printing support for the SLP graph (fragments).
    OS << Entry->Idx << ".\n";
    // ...
    for (auto *V : Entry->Scalars) {
      // Mark scalars that have external users.
      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
            return EU.Scalar == V;
          }))
        // ...
    }
    // ...
    if (Entry->isGather())
      // ...
    if (Entry->State == TreeEntry::ScatterVectorize ||
        Entry->State == TreeEntry::StridedVectorize ||
        Entry->State == TreeEntry::CompressVectorize)
      return "color=blue";
    // ...
  // Erase all of the instructions collected for deletion, re-parenting the
  // detached ones first so they can be removed safely.
  for (auto *I : DeletedInstructions) {
    if (!I->getParent()) {
      // Temporarily insert the instruction back so it can be erased from its
      // parent and from memory later.
      if (isa<PHINode>(I))
        // Phi nodes must be the very first instructions in the block.
        I->insertBefore(F->getEntryBlock(),
                        F->getEntryBlock().getFirstNonPHIIt());
      else
        I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
      continue;
    }
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
          /* ... */)
        // ...
    }
    I->dropAllReferences();
  }
  for (auto *I : DeletedInstructions) {
    assert(I->use_empty() &&
           "trying to erase instruction with users.");
    I->eraseFromParent();
  }
  // ...

#ifdef EXPENSIVE_CHECKS
  // ...
#endif
/// Reorders the given \p Reuses mask according to the given \p Mask: the
/// reuses mask is rewritten as if the node's operands had been permuted by
/// \p Mask.
static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
         "Expected non-empty mask.");
  SmallVector<int> Prev(Reuses.begin(), Reuses.end());
  Prev.swap(Reuses);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Reuses[Mask[I]] = Prev[I];
}
/// Reorders the given \p Order according to the given \p Mask. If the result
/// is an identity order, \p Order is cleared.
static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
                         bool BottomOrder = false) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  unsigned Sz = Mask.size();
  if (BottomOrder) {
    SmallVector<unsigned> PrevOrder;
    if (Order.empty()) {
      PrevOrder.resize(Sz);
      std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
    } else {
      PrevOrder.swap(Order);
    }
    Order.assign(Sz, Sz);
    for (unsigned I = 0; I < Sz; ++I)
      if (Mask[I] != PoisonMaskElem)
        Order[I] = PrevOrder[Mask[I]];
    if (all_of(enumerate(Order), [&](const auto &Data) {
          return Data.value() == Sz || Data.index() == Data.value();
        })) {
      Order.clear();
      return;
    }
    // ...
    return;
  }
  SmallVector<int> MaskOrder;
  if (Order.empty()) {
    MaskOrder.resize(Sz);
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  } else {
    inversePermutation(Order, MaskOrder);
  }
  reorderReuses(MaskOrder, Mask);
  // ...
  Order.assign(Sz, Sz);
  for (unsigned I = 0; I < Sz; ++I)
    if (MaskOrder[I] != PoisonMaskElem)
      Order[MaskOrder[I]] = I;
  // ...
}
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
                                  bool TopToBottom, bool IgnoreReorder) {
  assert(TE.isGather() && "Expected gather node only.");
  // ...
  SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
  Type *ScalarTy = GatheredScalars.front()->getType();
  size_t NumScalars = GatheredScalars.size();
  if (/* ... */)
    return std::nullopt;
  // ...
  auto ExtractShuffles =
      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  auto GatherShuffles =
      isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
                            /* ... */);
  // No shuffled operands - ignore.
  if (GatherShuffles.empty() && ExtractShuffles.empty())
    return std::nullopt;
  OrdersType CurrentOrder(NumScalars, NumScalars);
  if (GatherShuffles.size() == 1 && /* ... */
      Entries.front().front()->isSame(TE.Scalars)) {
    // Perfect match in the graph; the node will be reused.
    if (/* ... */)
      return std::nullopt;
    // Same user - will be handled as a permutation of the same node.
    if (Entries.front().front()->UserTreeIndex.UserTE ==
        TE.UserTreeIndex.UserTE)
      return std::nullopt;
    // Do not reorder against the root node.
    if (!IgnoreReorder && Entries.front().front()->Idx == 0)
      return std::nullopt;
    // A 2-element bundle permuted oddly against a reused-shuffle entry stays
    // as-is.
    if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
        TE.getVectorFactor() == 2 && Mask.size() == 2 &&
        any_of(enumerate(Mask), [](const auto &P) {
          return P.value() % 2 != static_cast<int>(P.index()) % 2;
        }))
      return std::nullopt;
    std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
    return CurrentOrder;
  }
  auto IsSplatMask = [](ArrayRef<int> Mask) {
    // ...
    return all_of(Mask, [&](int I) {
      // ...
    });
  };
  // Exclude splat masks: no meaningful order can be recovered from them.
  if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
       (Entries.size() != 1 ||
        Entries.front().front()->ReorderIndices.empty())) ||
      (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
    return std::nullopt;
  // Build the order from the per-part shuffle masks.
  SmallBitVector ShuffledSubMasks(NumParts);
  auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
                                  ArrayRef<int> Mask, int PartSz, int NumParts,
                                  function_ref<unsigned(unsigned)> GetVF) {
    for (int I : seq<int>(0, NumParts)) {
      if (ShuffledSubMasks.test(I))
        continue;
      const int VF = GetVF(I);
      if (VF == 0)
        continue;
      // ...
      // Shuffle of at least 2 vectors - ignore.
      if (any_of(Slice, [&](unsigned I) { return I != NumScalars; })) {
        // ...
        ShuffledSubMasks.set(I);
        continue;
      }
      // Try to include as many elements from the mask as possible.
      int FirstMin = INT_MAX;
      int SecondVecFound = false;
      for (int K : seq<int>(0, /* ... */)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem) {
          Value *V = GatheredScalars[I * PartSz + K];
          if (isConstant(V) && !isa<PoisonValue>(V)) {
            SecondVecFound = true;
            break;
          }
          continue;
        }
        // ...
        SecondVecFound = true;
        // ...
      }
      FirstMin = (FirstMin / PartSz) * PartSz;
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        // ...
        ShuffledSubMasks.set(I);
        continue;
      }
      for (int K : seq<int>(0, /* ... */)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem)
          continue;
        Idx -= FirstMin;
        if (Idx >= PartSz) {
          SecondVecFound = true;
          break;
        }
        if (CurrentOrder[I * PartSz + Idx] >
                static_cast<unsigned>(I * PartSz + K) &&
            CurrentOrder[I * PartSz + Idx] !=
                static_cast<unsigned>(I * PartSz + Idx))
          CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
      }
      if (SecondVecFound) {
        // ...
        ShuffledSubMasks.set(I);
        continue;
      }
    }
  };
  // ...
  if (!ExtractShuffles.empty())
    TransformMaskToOrder(
        CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
          if (!ExtractShuffles[I])
            return 0U;
          unsigned VF = 0;
          unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
          for (unsigned Idx : seq<unsigned>(Sz)) {
            int K = I * PartSz + Idx;
            // Map back through the reuse and reorder indices, if any.
            if (!TE.ReuseShuffleIndices.empty())
              K = TE.ReuseShuffleIndices[K];
            // ...
            if (!TE.ReorderIndices.empty())
              K = std::distance(TE.ReorderIndices.begin(),
                                find(TE.ReorderIndices, K));
            // ...
            VF = std::max(VF, /* ... */ .getKnownMinValue());
          }
          return VF;
        });
  // Special corner case: a single shuffle of the same entry.
  if (GatherShuffles.size() == 1 && NumParts != 1) {
    if (ShuffledSubMasks.any())
      return std::nullopt;
    PartSz = NumScalars;
    // ...
  }
  if (!Entries.empty())
    TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts,
                         [&](unsigned I) {
                           if (!GatherShuffles[I])
                             return 0U;
                           return std::max(
                               Entries[I].front()->getVectorFactor(),
                               Entries[I].back()->getVectorFactor());
                         });
  unsigned NumUndefs = count(CurrentOrder, NumScalars);
  if (ShuffledSubMasks.all() ||
      (NumScalars > 2 && NumUndefs >= NumScalars / 2))
    return std::nullopt;
  return std::move(CurrentOrder);
}
/// Checks whether the pair of pointers can be considered "compatible" for
/// vectorization purposes, optionally comparing the opcodes of their index
/// operands.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
                                  const TargetLibraryInfo &TLI,
                                  bool CompareOpcodes = true) {
  // ...
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  return (!GEP1 || GEP1->getNumOperands() == 2) &&
         (!GEP2 || GEP2->getNumOperands() == 2) &&
         (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
           (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
          !CompareOpcodes ||
          (GEP1 && GEP2 &&
           getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
}

/// Calculates the minimal alignment across the list as a common alignment.
template <typename T>
static Align computeCommonAlignment(ArrayRef<Value *> VL) {
  Align CommonAlignment = cast<T>(VL.front())->getAlign();
  for (Value *V : VL.drop_front())
    CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
  return CommonAlignment;
}
6368 "Order is empty. Please check it before using isReverseOrder.");
6369 unsigned Sz = Order.
size();
6371 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
/// Checks whether the pointers form a run with a common (possibly runtime)
/// stride computable via SCEV, filling the sorted indices if the run is not
/// already consecutive. Returns the stride SCEV, or null on failure.
static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps,
                                     Type *ElemTy, const DataLayout &DL,
                                     ScalarEvolution &SE,
                                     SmallVectorImpl<unsigned> &SortedIndices
                                     /* ... */) {
  SmallVector<const SCEV *> SCEVs;
  const SCEV *PtrSCEVLowest = nullptr;
  const SCEV *PtrSCEVHighest = nullptr;
  // Find the lowest and highest pointer SCEVs in one pass.
  for (const Value *Ptr : PointerOps) {
    // ...
    if (!PtrSCEVLowest && !PtrSCEVHighest) {
      PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
      continue;
    }
    // ...
    PtrSCEVLowest = PtrSCEV;
    // ...
    PtrSCEVHighest = PtrSCEV;
  }
  // Dist = PtrSCEVHighest - PtrSCEVLowest.
  // ...
  int Size = DL.getTypeStoreSize(ElemTy);
  auto TryGetStride = [&](const SCEV *Dist,
                          const SCEV *Multiplier) -> const SCEV * {
    if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
      if (M->getOperand(0) == Multiplier)
        return M->getOperand(1);
      if (M->getOperand(1) == Multiplier)
        return M->getOperand(0);
      return nullptr;
    }
    if (Multiplier == Dist)
      return SE.getConstant(Dist->getType(), 1);
    return nullptr;
  };
  // Stride_in_elements = Dist / element_size / (num_elems - 1).
  const SCEV *Stride = nullptr;
  if (Size != 1 || SCEVs.size() > 2) {
    // ...
    Stride = TryGetStride(Dist, Sz);
    if (!Stride)
      return nullptr;
  }
  // ...
  using DistOrdPair = std::pair<int64_t, int>;
  auto Compare = llvm::less_first();
  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
  int Cnt = 0;
  bool IsConsecutive = true;
  for (const SCEV *PtrSCEV : SCEVs) {
    int64_t Dist = 0;
    if (PtrSCEV != PtrSCEVLowest) {
      // ...
      const SCEV *Coeff = TryGetStride(Diff, Stride);
      if (!Coeff)
        return nullptr;
      const auto *SC = dyn_cast<SCEVConstant>(Coeff);
      if (!SC)
        return nullptr;
      // ...
      Dist = SC->getAPInt().getZExtValue();
    }
    // ...
    auto Res = Offsets.emplace(Dist, Cnt);
    if (!Res.second)
      return nullptr;
    // Consecutive order if the inserted element is the last one.
    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
    ++Cnt;
  }
  if (Offsets.size() != SCEVs.size())
    return nullptr;
  SortedIndices.clear();
  if (!IsConsecutive) {
    // Fill SortedIndices with the order defined by the sorted offsets.
    SortedIndices.resize(PointerOps.size(), 0);
    Cnt = 0;
    for (const std::pair<int64_t, int> &Pair : Offsets) {
      SortedIndices[Cnt] = Pair.second;
      ++Cnt;
    }
  }
  // ...
  return Stride;
}
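// Illustrative example (hypothetical IR, not from the original source): for
// pointers %p, %p + %s and %p + 2*%s, the SCEVs are {%p, (%p + %s),
// (%p + (2 * %s))}. The distance between the highest and lowest pointer is
// (2 * %s); TryGetStride divides out the constant multiplier 2 and recovers
// the runtime stride %s. Each pointer's offset (0, 1 and 2 strides) then
// lands on a distinct multiple, so the list qualifies as a strided access
// with a stride known only at run time.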
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy);

// Shuffle-cost helper (fragments): detects when a mask is really a subvector
// insert that crosses the source width and costs it accordingly.
  int NumSrcElts = Tp->getElementCount().getKnownMinValue();
  if (/* ... */
      ShuffleVectorInst::isInsertSubvectorMask(Mask, NumSrcElts, NumSubElts,
                                               Index)) {
    if (Index + NumSubElts > NumSrcElts &&
        Index + NumSrcElts <= static_cast<int>(Mask.size()))
      // ...
  }

// Scalarization-overhead helper (fragments): for vector-typed "scalars" the
// per-element costs become subvector insert/extract shuffles at offset
// I * ScalarTyNumElements.
  assert(/* ... */ && "ScalableVectorType is not supported.");
  assert(/* ... */ && "Incorrect usage.");
  unsigned ScalarTyNumElements = VecTy->getNumElements();
  for (/* ... */) {
    if (!DemandedElts[I])
      continue;
    // ... shuffle cost at offset I * ScalarTyNumElements into VecTy (insert)
    // ... shuffle cost at offset I * ScalarTyNumElements into VecTy (extract)
  }

// Wrapper around TTI::getVectorInstrCost that accounts for revectorized
// scalars (fragments).
static InstructionCost getVectorInstrCost(
    /* ... */,
    ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
  if (Opcode == Instruction::ExtractElement) {
    // A vector-typed scalar extract is a subvector shuffle at
    // Index * VecTy->getNumElements().
    // ...
  }
  return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
                                ScalarUserAndIdx);
}

// Wrapper around TTI::getExtractWithExtendCost (fragments).
  auto *SubTp =
      getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
  return /* shuffle cost at */ Index * ScalarTy->getNumElements() /* into
         SubTp */ +
         /* ... */;
  // ...
  return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);

/// Correctly creates an insert_subvector-style shuffle of \p V into \p Vec at
/// \p Index, delegating to \p Generator when provided.
static Value *createInsertVector(
    IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
  // ...
  auto *Begin = std::next(Mask.begin(), Index);
  std::iota(Begin, std::next(Begin, SubVecVF), 0);
  // ...
  std::iota(Mask.begin(), Mask.end(), 0);
  std::iota(std::next(Mask.begin(), Index),
            std::next(Mask.begin(), Index + SubVecVF), VecVF);
  if (Generator)
    return Generator(Vec, V, Mask);
  // ...
  std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
  // ...
}

/// Creates an extract_subvector-style shuffle reading \p SubVecVF elements
/// starting at \p Index.
static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
                                  unsigned SubVecVF, unsigned Index) {
  // ...
  SmallVector<int> Mask(SubVecVF);
  std::iota(Mask.begin(), Mask.end(), Index);
  return Builder.CreateShuffleVector(Vec, Mask);
}
/// Builds a compress-style mask from the element positions of the pointers
/// and, as a by-product, detects whether the positions also form a constant
/// stride.
  const unsigned Sz = PointerOps.size();
  // ...
  CompressMask[0] = 0;
  // Start from a strided hypothesis and drop it on the first mismatch.
  std::optional<unsigned> Stride = 0;
  // ...
  std::optional<int64_t> OptPos = /* ... */;
  if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
    return false;
  unsigned Pos = static_cast<unsigned>(*OptPos);
  CompressMask[I] = Pos;
  // ...
  if (Pos != *Stride * I)
    // ...
  return Stride.has_value();
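// Illustrative example (hypothetical positions, not from the original
// source): loads at element offsets {0, 2, 4, 6} from the base pointer give
// CompressMask = {0, 2, 4, 6}. Every position equals 2 * I, so the strided
// hypothesis survives and the group can also be treated as a stride-2
// access. Offsets {0, 2, 3, 7} would keep the compress mask but reset the
// stride at I = 2, where Pos == 3 differs from 2 * 2.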
// Checks whether a gather of loads can instead be emitted as one wide
// (possibly masked) load followed by a compress shuffle, and whether that
// beats the gather cost (fragments).
  InterleaveFactor = 0;
  // ...
  const size_t Sz = VL.size();
  // ...
  // External uses: extracting a scalar afterwards may be costlier than
  // keeping the scalar load.
  if (AreAllUsersVectorized(V))
    // ...
  InstructionCost ExtractCost =
      TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
                             Mask.empty() ? I : Mask[I]);
  // ...
  if (ExtractCost <= ScalarCost)
    return false;
  // ...
  if (Order.empty()) {
    Ptr0 = PointerOps.front();
    PtrN = PointerOps.back();
  } else {
    Ptr0 = PointerOps[Order.front()];
    PtrN = PointerOps[Order.back()];
  }
  std::optional<int64_t> Diff = /* ... */;
  // ...
  const size_t MaxRegSize = /* ... */;
  // Reject very large distances between the outermost pointers.
  if (*Diff / Sz >= MaxRegSize / 8)
    return false;
  // ...
  Align CommonAlignment = LI->getAlign();
  // Decide whether the tail/holes force a masked load.
  // ... (Ptr0, LoadVecTy, CommonAlignment, DL, ...)
  if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
                                         LI->getPointerAddressSpace()))
    return false;
  // ...
  assert(CompressMask.size() >= 2 && "At least two elements are required");
  // ...
  auto [ScalarGEPCost, VectorGEPCost] =
      getGEPCosts(/* ... */, Instruction::GetElementPtr, CostKind, ScalarTy,
                  LoadVecTy);
  // ...
  InstructionCost LoadCost =
      IsMasked
          ? TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy,
                                      CommonAlignment,
                                      LI->getPointerAddressSpace(), CostKind)
          : TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
                                LI->getPointerAddressSpace(), CostKind);
  if (IsStrided && !IsMasked && Order.empty()) {
    // Check for potential segmented (interleaved) loads.
    // ...
    AlignedLoadVecTy = LoadVecTy;
    if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
                                         /* ... */
                                         LI->getPointerAddressSpace())) {
      InstructionCost InterleavedCost =
          VectorGEPCost + TTI.getInterleavedMemoryOpCost(
                              Instruction::Load, AlignedLoadVecTy,
                              CompressMask[1], {}, CommonAlignment,
                              LI->getPointerAddressSpace(), CostKind,
                              IsMasked);
      if (InterleavedCost < GatherCost) {
        InterleaveFactor = CompressMask[1];
        LoadVecTy = AlignedLoadVecTy;
        // ...
      }
    }
  }
  if (!Order.empty()) {
    // Rewrite the compress mask through the sorting mask.
    // ...
    NewMask[I] = CompressMask[Mask[I]];
    // ...
    CompressMask.swap(NewMask);
  }
  InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
  return TotalVecCost < GatherCost;

// Convenience overload discarding the analysis outputs (fragments).
  // ...
  unsigned InterleaveFactor;
  // ...
  return isMaskedLoadCompress(/* ... */, AreAllUsersVectorized, IsMasked,
                              InterleaveFactor, CompressMask, LoadVecTy);
/// Checks whether the (sorted) loads form a strided access: the total
/// distance must split into Sz - 1 equal steps and every pointer must land on
/// its own distinct multiple of that step.
bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
                            Align Alignment, const int64_t Diff, Value *Ptr0,
                            Value *PtrN, StridedPtrInfo &SPtrInfo) const {
  const size_t Sz = PointerOps.size();
  if (Diff % (Sz - 1) != 0)
    return false;
  auto *VecTy = getWidenedType(ScalarTy, Sz);

  // Try to generate a strided load node if:
  // 1. The target supports strided loads.
  // 2. The number of loads is large enough, or the potential stride is a
  //    profitable power-of-2, or any pointer is used outside the graph
  //    (so the address computation is reusable anyway).
  auto IsAnyPointerUsedOutGraph = any_of(PointerOps, [&](Value *V) {
    return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
             return !isVectorized(U) && !MustGather.contains(U);
           });
  });
  const uint64_t AbsoluteDiff = std::abs(Diff);
  if (IsAnyPointerUsedOutGraph ||
      (AbsoluteDiff > Sz &&
       (/* ... */
        (AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
      Diff == -(static_cast<int64_t>(Sz) - 1)) {
    int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
    if (Diff != Stride * static_cast<int64_t>(Sz - 1))
      return false;
    if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
      return false;
    // Iterate through all pointers and check that all distances are unique
    // multiples of the stride.
    SmallSet<int64_t, 4> Dists;
    for (Value *Ptr : PointerOps) {
      int64_t Dist = 0;
      if (Ptr == PtrN)
        Dist = Diff;
      else if (Ptr != Ptr0)
        Dist = /* ... */;
      if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
        break;
    }
    if (Dists.size() == Sz) {
      Type *StrideTy = DL->getIndexType(Ptr0->getType());
      SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
      // ...
      return true;
    }
  }
  return false;
}
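// Illustrative arithmetic (hypothetical values, not from the original
// source): four loads whose first and last elements are 6 elements apart
// give Diff = 6 and Sz - 1 = 3, hence Stride = 2. The remaining pointers
// must then sit at the distinct multiples 2 and 4. Element offsets
// {0, 2, 4, 6} qualify; {0, 2, 3, 6} fail because 3 does not satisfy the
// (Dist / Stride) * Stride == Dist check.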
// Classifies a bundle of loads: consecutive, compressible, strided, scatter,
// or plain gather (fragments; the function continues with signature
// parameters elided).
    unsigned *BestVF, bool TryRecursiveCheck) const {
  // ...
  // Don't vectorize types with padding: the loaded bytes would not match the
  // scalar semantics.
  if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
    return LoadsState::Gather;

  // All loads in the bundle must be simple: no atomic or volatile loads.
  const size_t Sz = VL.size();
  // ...
  auto *POIter = PointerOps.begin();
  for (Value *V : VL) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return LoadsState::Gather;
    *POIter = L->getPointerOperand();
    ++POIter;
  }

  // Check the order of the pointer operands.
  bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
  // ...
  if (!IsSorted) {
    if (const SCEV *Stride =
            calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order);
        Stride && TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
      // ...
      SPtrInfo.StrideSCEV = Stride;
      return LoadsState::StridedVectorize;
    }
    if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
        TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
      return LoadsState::Gather;
    // ...
  }
  Value *Ptr0;
  Value *PtrN;
  if (Order.empty()) {
    Ptr0 = PointerOps.front();
    PtrN = PointerOps.back();
  } else {
    Ptr0 = PointerOps[Order.front()];
    PtrN = PointerOps[Order.back()];
  }
  std::optional<int64_t> Diff = /* ... */;
  // Check that the sorted loads are consecutive.
  if (static_cast<uint64_t>(*Diff) == Sz - 1)
    return LoadsState::Vectorize;
  if (isMaskedLoadCompress(/* ... */, *TLI, [&](Value *V) {
        return areAllUsersVectorized(/* ... */);
      }))
    return LoadsState::CompressVectorize;
  // ...
  if (isStridedLoad(PointerOps, ScalarTy, Alignment, *Diff, Ptr0, PtrN,
                    SPtrInfo))
    return LoadsState::StridedVectorize;
  if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
      TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
    // ...

  // Compare a masked gather of the whole bundle against splitting it into
  // smaller vector loads plus the shuffles that stitch them back together.
  auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
                                                unsigned *BestVF,
                                                bool ProfitableGatherPointers) {
    // ...
    auto [ScalarGEPCost, VectorGEPCost] =
        getGEPCosts(/* ... */, Instruction::GetElementPtr, CostKind, ScalarTy,
                    VecTy);
    // Estimate the cost of the masked-gather GEP: roughly a buildvector
    // unless the pointer is a splat.
    Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
    // ...
    if (static_cast<unsigned>(count_if(/* ... */)) < /* ... */)
      // ...
    // Scalar load costs.
    // ... C + TTI.getInstructionCost(...)
    // Masked gather cost.
    InstructionCost MaskedGatherCost =
        TTI.getGatherScatterOpCost(/* ... */,
                                   /*VariableMask=*/false, CommonAlignment,
                                   CostKind) +
        (ProfitableGatherPointers ? 0 : VectorGEPCost);
    // ...
    constexpr unsigned ListLimit = 4;
    if (!TryRecursiveCheck || VL.size() < ListLimit)
      return /* ... */;
    // ...
    unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
    // Try progressively smaller VFs on contiguous slices of the bundle.
    for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
      // ... recursive canVectorizeLoads on VL.slice(Cnt, VF), passing
      //     PointerOps, SPtrInfo, BestVF, ...
      // Slices that gather mark their lanes as demanded:
      DemandedElts.setBits(Cnt, Cnt + VF);
      // ...
    }
    if (!DemandedElts.isZero()) {
      // Cost the still-scalar lanes individually.
      // ...
      if (DemandedElts[Idx])
        // ...
    }
    // ...
    auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
        /* ... */, LI0->getPointerOperand(), Instruction::GetElementPtr,
        CostKind, ScalarTy, SubVecTy);
    // ...
    if (static_cast<unsigned>(count_if(/* ... */)) <
            PointerOps.size() - 1 ||
        /* ... */)
      // ...
    // Accumulate the per-slice load cost by the detected kind:
    //   consecutive:
    //     TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
    //                         LI0->getPointerAddressSpace(), CostKind, ...)
    //   strided:
    VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
                                            LI0->getPointerOperand(),
                                            /* ... */);
    //   compressed:
    VecLdCost += TTI.getMaskedMemoryOpCost(
                     Instruction::Load, SubVecTy, CommonAlignment,
                     LI0->getPointerAddressSpace(), CostKind) +
                 /* ... */;
    //   scatter:
    VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
                                            LI0->getPointerOperand(),
                                            /* ... */);
    // Shuffle that inserts slice I back into the full vector:
    // ...
    ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
    // ...
    // If the masked gather is at least as expensive, prefer the split loads.
    if (MaskedGatherCost >= VecLdCost &&
        /* ... */)
      // ...
  };
  // ...
  // Trace through GEPs to check the invariance of the pointers.
  bool ProfitableGatherPointers =
      L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
                       return L->isLoopInvariant(V);
                     })) /* ... */;
  if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
        auto *GEP = dyn_cast<GetElementPtrInst>(P);
        return /* ... */
               (GEP && GEP->getNumOperands() == 2 &&
                /* ... */);
      })) {
    // ...
    if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
                                                     ProfitableGatherPointers))
      return LoadsState::Gather;
    // ...
  }
  // ...
7203 std::pair<BasicBlock *, Value *>,
7207 .try_emplace(std::make_pair(
7211 SortedIndices.
clear();
7213 auto Key = std::make_pair(BBs[Cnt + 1],
7215 bool Found =
any_of(Bases.try_emplace(
Key).first->second,
7217 std::optional<int64_t> Diff =
7218 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7219 ElemTy, Ptr, DL, SE,
7224 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7230 if (Bases.size() > VL.
size() / 2 - 1)
7234 Bases.find(
Key)->second.emplace_back().emplace_back(
Ptr, 0, Cnt + 1);
7238 if (Bases.size() == VL.
size())
7241 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7242 Bases.front().second.size() == VL.
size()))
7247 auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
7256 FirstPointers.
insert(P1);
7257 SecondPointers.
insert(P2);
7263 "Unable to find matching root.");
7266 for (
auto &
Base : Bases) {
7267 for (
auto &Vec :
Base.second) {
7268 if (Vec.size() > 1) {
7270 int64_t InitialOffset = std::get<1>(Vec[0]);
7271 bool AnyConsecutive =
7273 return std::get<1>(
P.value()) ==
7274 int64_t(
P.index()) + InitialOffset;
7278 if (!AnyConsecutive)
7283 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7287 for (
auto &
T : Bases)
7288 for (
const auto &Vec :
T.second)
7289 for (
const auto &
P : Vec)
7293 "Expected SortedIndices to be the size of VL");
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
  assert(TE.isGather() && "Expected gather node only.");
  Type *ScalarTy = TE.Scalars[0]->getType();

  SmallVector<Value *> Ptrs;
  Ptrs.reserve(TE.Scalars.size());
  SmallVector<BasicBlock *> BBs;
  BBs.reserve(TE.Scalars.size());
  for (Value *V : TE.Scalars) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return std::nullopt;
    Ptrs.push_back(L->getPointerOperand());
    BBs.push_back(L->getParent());
  }

  BoUpSLP::OrdersType Order;
  if (!LoadEntriesToVectorize.contains(TE.Idx) &&
      clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
    return std::move(Order);
  return std::nullopt;
}
/// Checks whether the insertelement instructions \p VU and \p V belong to the
/// same build_vector sequence without clashing insert indices (fragments).
  if (VU->getType() != V->getType())
    return false;
  // Multiply-used inserts are separate build vectors.
  if (!VU->hasOneUse() && !V->hasOneUse())
    return false;
  // ...
  if (Idx1 == std::nullopt || Idx2 == std::nullopt)
    return false;
  // Walk the vector operands of both chains, tracking reused indices.
  // ...
  bool IsReusedIdx = false;
  do {
    if (IE2 == VU && !IE1)
      return VU->hasOneUse();
    if (IE1 == V && !IE2)
      return V->hasOneUse();
    if (IE1 && IE1 != V) {
      // ...
      IsReusedIdx |= ReusedIdx.test(Idx1);
      ReusedIdx.set(Idx1);
      if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
        IE1 = nullptr;
      // ...
    }
    if (IE2 && IE2 != VU) {
      // ...
      IsReusedIdx |= ReusedIdx.test(Idx2);
      ReusedIdx.set(Idx2);
      if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
        IE2 = nullptr;
      // ...
    }
  } while (!IsReusedIdx && (IE1 || IE2));
  return false;
static bool isAlternateInstruction(const Instruction *I,
                                   const Instruction *MainOp,
                                   const Instruction *AltOp,
                                   const TargetLibraryInfo &TLI);

std::optional<BoUpSLP::OrdersType>
BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
                           bool IgnoreReorder) {
  // No need to reorder if the node needs a reuse shuffle anyway; but check
  // whether the reuse mask can be improved.
  if (!TE.ReuseShuffleIndices.empty()) {
    // FIXME: support ReuseShuffleIndices for non-power-of-two vectors.
    assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
           "Reshuffling scalars not yet supported for nodes with padding");
    if (/* ... */)
      return std::nullopt;
    // Check whether the reuse mask is "clustered", i.e. each scalar is used
    // once per submask of size Sz.
    unsigned Sz = TE.Scalars.size();
    if (TE.isGather()) {
      if (std::optional<OrdersType> CurrentOrder =
              findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
        SmallVector<int> Mask;
        // ...
        ::addMask(Mask, TE.ReuseShuffleIndices);
        OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
        unsigned Sz = TE.Scalars.size();
        for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
          // ...
          Res[Idx + K * Sz] = I + K * Sz;
        }
        return std::move(Res);
      }
    }
    if (Sz == 2 && TE.getVectorFactor() == 4 &&
        /* ... */ 2 * TE.getVectorFactor() /* ... */ == 1)
      return std::nullopt;
    if (TE.ReuseShuffleIndices.size() % Sz != 0)
      return std::nullopt;
    // ...
    SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
    if (TE.ReorderIndices.empty())
      std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
    else
      inversePermutation(TE.ReorderIndices, ReorderMask);
    ::addMask(ReorderMask, TE.ReuseShuffleIndices);
    unsigned VF = ReorderMask.size();
    // Validate one cluster of size Sz at a time.
    for (unsigned I = 0; I < VF; I += Sz) {
      int Val = PoisonMaskElem;
      unsigned UndefCnt = 0;
      unsigned Limit = std::min(Sz, VF - I);
      if (/* ... */ ||
          Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
          /* ... */)
        return std::nullopt;
      // ...
      for (unsigned K = 0; K < NumParts; ++K) {
        unsigned Idx = Val + Sz * K;
        if (Idx < VF && I + K < VF)
          ResOrder[Idx] = I + K;
      }
    }
    return std::move(ResOrder);
  }
  unsigned VF = TE.getVectorFactor();
  // Try to build a correct order for extractelement instructions.
  SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
                              TE.ReuseShuffleIndices.end());
  if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
      all_of(TE.Scalars, [Sz](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
        return Idx && *Idx < Sz;
      })) {
    assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
                                 "by BinaryOperator and CastInst.");
    SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
    if (TE.ReorderIndices.empty())
      std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
    else
      inversePermutation(TE.ReorderIndices, ReorderMask);
    for (unsigned I = 0; I < VF; ++I) {
      int &Idx = ReusedMask[I];
      if (Idx == PoisonMaskElem)
        continue;
      Value *V = TE.Scalars[ReorderMask[Idx]];
      // ...
      Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
    }
  }
  // Build the order of the VF-sized submasks.
  OrdersType ResOrder(VF);
  std::iota(ResOrder.begin(), ResOrder.end(), 0);
  auto *It = ResOrder.begin();
  for (unsigned K = 0; K < VF; K += Sz) {
    // ...
    if (SubMask.front() == PoisonMaskElem)
      std::iota(SubMask.begin(), SubMask.end(), 0);
    // ...
    transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
    std::advance(It, Sz);
  }
  if (/* ... */ all_of(enumerate(ResOrder), [](const auto &Data) {
        return Data.index() == Data.value();
      }))
    return std::nullopt; // No need to reorder.
  return std::move(ResOrder);

  if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
      (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
       /* ... */) &&
      (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
    return std::nullopt;
  if (TE.State == TreeEntry::SplitVectorize ||
      ((TE.State == TreeEntry::Vectorize ||
        TE.State == TreeEntry::StridedVectorize ||
        TE.State == TreeEntry::CompressVectorize) &&
       /* ... */)) {
    assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
           "Alternate instructions are only supported by "
           "BinaryOperator and CastInst.");
    return TE.ReorderIndices;
  }
  if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
      TE.isAltShuffle()) {
    assert(TE.ReuseShuffleIndices.empty() &&
           "ReuseShuffleIndices should be "
           "empty for alternate instructions.");
    SmallVector<int> Mask;
    TE.buildAltOpShuffleMask(
        [&](Instruction *I) {
          assert(TE.getMatchingMainOpOrAltOp(I) &&
                 "Unexpected main/alternate opcode");
          return isAlternateInstruction(I, TE.getMainOp(), TE.getAltOp(),
                                        *TLI);
        },
        Mask);
    const int VF = TE.getVectorFactor();
    OrdersType ResOrder(VF, VF);
    for (/* ... */)
      ResOrder[Mask[I] % VF] = I;
    return std::move(ResOrder);
  }
  if (!TE.ReorderIndices.empty())
    return TE.ReorderIndices;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
    if (!TE.ReorderIndices.empty())
      return TE.ReorderIndices;

    // For each phi, find the head of the build-vector chain it feeds, if any.
    SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
    for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
      // ...
      auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
      // ...
      while (II && II->hasOneUse() && II->getParent() == BB) {
        // ...
      }
      // ...
    }

    auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
      assert(BB1 != BB2 && "Expected different basic blocks.");
      if (!DT->isReachableFromEntry(BB1))
        return false;
      if (!DT->isReachableFromEntry(BB2))
        return true;
      auto *NodeA = DT->getNode(BB1);
      auto *NodeB = DT->getNode(BB2);
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
    };
    auto PHICompare = [&](unsigned I1, unsigned I2) {
      Value *V1 = TE.Scalars[I1];
      Value *V2 = TE.Scalars[I2];
      // ...
      if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
        return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
                                    FirstUserOfPhi2->getParent());
      // ...
      if (UserBVHead[I1] && !UserBVHead[I2])
        return true;
      if (!UserBVHead[I1])
        return false;
      if (UserBVHead[I1] == UserBVHead[I2])
        // ...
      // ...
      return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
                                  /* ... */);
      return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
      // ...
      if (EE1->getOperand(0) == EE2->getOperand(0))
        // ...
      if (!Inst1 && Inst2)
        return false;
      if (Inst1 && Inst2) {
        // ...
      }
      assert(/* ... */ &&
             "Expected either instructions or arguments vector operands.");
      return P1->getArgNo() < P2->getArgNo();
    };
    OrdersType Phis(TE.Scalars.size());
    std::iota(Phis.begin(), Phis.end(), 0);
    stable_sort(Phis, PHICompare);
    if (/* ... */)
      return std::nullopt;
    return std::move(Phis);
  }
  if (TE.isGather() &&
      (!TE.hasState() || !TE.isAltShuffle() ||
       ScalarsInSplitNodes.contains(TE.getMainOp())) &&
      /* ... */) {
    // A gather of extractelements may be representable as just a shuffle of
    // a single vector.
    if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
         /* ... */) &&
        all_of(TE.Scalars, [](Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
        })) {
      // ...
      bool Reuse =
          canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
      if (Reuse || !CurrentOrder.empty())
        return std::move(CurrentOrder);
    }
    // For near-splat gathers: inserting the non-poison scalar at lane 0 plus
    // a permute may be cheaper than inserting it at its own lane.
    int Sz = TE.Scalars.size();
    // ...
    const auto *It = find_if(TE.Scalars, IsaPred<Instruction>);
    if (It == TE.Scalars.begin())
      return OrdersType();
    // ...
    if (It != TE.Scalars.end()) {
      // ...
      unsigned Idx = std::distance(TE.Scalars.begin(), It);
      // ...
      if (InsertFirstCost + PermuteCost < InsertIdxCost) {
        // ...
        return std::move(Order);
      }
    }
    // ...
    return std::nullopt;
    if (TE.Scalars.size() >= 3)
      if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
        return Order;
    // Check if the order of vectorized loads can be included; for masked
    // gathers do extra analysis.
    if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
      SmallVector<Value *> PointerOps;
      OrdersType CurrentOrder;
      StridedPtrInfo SPtrInfo;
      LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
                                         CurrentOrder, PointerOps, SPtrInfo);
      if (/* ... */)
        return std::move(CurrentOrder);
    }
    // ...
    if (std::optional<OrdersType> CurrentOrder =
            findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
      return CurrentOrder;
  }
  return std::nullopt;
}
/// Checks whether the mask consists of the same non-identity cluster of size
/// \p Sz repeated over its whole length.
  ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
  // ...
  for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
    ArrayRef<int> Cluster = Mask.slice(I, Sz);
    if (Cluster != FirstCluster)
      return false;
  }
  return true;
void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
  // Reorder the reuses mask.
  reorderReuses(TE.ReuseShuffleIndices, Mask);
  const unsigned Sz = TE.Scalars.size();
  // For vectorized nodes and non-clustered reuses nothing else is needed.
  if (!TE.isGather() ||
      /* ... */)
    return;
  SmallVector<int> NewMask;
  inversePermutation(TE.ReorderIndices, NewMask);
  addMask(NewMask, TE.ReuseShuffleIndices);
  // Clear the reorder since it is going to be applied to the new mask.
  TE.ReorderIndices.clear();
  // Try to improve gathered nodes with clustered reuses, if possible.
  ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
  SmallVector<unsigned> NewOrder(Slice);
  // ...
  // Fill the reuses mask with identity submasks.
  for (auto *It = TE.ReuseShuffleIndices.begin(),
            *End = TE.ReuseShuffleIndices.end();
       It != End; std::advance(It, Sz))
    std::iota(It, std::next(It, Sz), 0);
}
7783 "Expected same size of orders");
7784 size_t Sz = Order.
size();
7787 if (Order[Idx] != Sz)
7788 UsedIndices.
set(Order[Idx]);
7790 if (SecondaryOrder.
empty()) {
7792 if (Order[Idx] == Sz && !UsedIndices.
test(Idx))
7796 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
7797 !UsedIndices.
test(SecondaryOrder[Idx]))
7798 Order[Idx] = SecondaryOrder[Idx];
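// Illustrative worked example (hypothetical values, not from the original
// source; Sz = 4, and Sz itself is the "unset" sentinel): combining
// Order = {0, 4, 4, 3} with SecondaryOrder = {0, 1, 2, 3} fills only the
// unset slots whose candidate index is still unused. Slot 1 takes 1 and
// slot 2 takes 2, giving {0, 1, 2, 3}; the already-set entries 0 and 3 are
// never overwritten, so two partially known orders can be merged without
// ever producing a duplicate index.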
  constexpr unsigned TinyVF = 2;
  constexpr unsigned TinyTree = 10;
  constexpr unsigned PhiOpsLimit = 12;
  constexpr unsigned GatherLoadsLimit = 2;
  if (VectorizableTree.size() <= TinyTree)
    return true;
  if (VectorizableTree.front()->hasState() &&
      !VectorizableTree.front()->isGather() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::PHI ||
       (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
        (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
         VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
      VectorizableTree.front()->ReorderIndices.empty()) {
    // Tiny phi-rooted trees with very wide phis are not worth reordering.
    if (VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::PHI &&
        VectorizableTree.front()->Scalars.size() == TinyVF &&
        VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
      return false;
    // For store-rooted trees, skip reordering when almost all nodes are
    // either unordered or trivially reorderable.
    if (VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Store &&
        VectorizableTree.front()->ReorderIndices.empty()) {
      const unsigned ReorderedSplitsCnt =
          count_if(VectorizableTree,
                   [&](const std::unique_ptr<TreeEntry> &TE) {
                     return TE->State == TreeEntry::SplitVectorize &&
                            !TE->ReorderIndices.empty() &&
                            TE->UserTreeIndex.UserTE &&
                            TE->UserTreeIndex.UserTE->State ==
                                TreeEntry::Vectorize &&
                            /* ... */;
                   });
      if (ReorderedSplitsCnt <= 1 &&
          static_cast<unsigned>(count_if(
              VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
                return ((!TE->isGather() &&
                         (TE->ReorderIndices.empty() ||
                          (TE->UserTreeIndex.UserTE &&
                           TE->UserTreeIndex.UserTE->State ==
                               TreeEntry::Vectorize &&
                           !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
                                .empty()))) ||
                        (TE->isGather() && TE->ReorderIndices.empty() &&
                         (!TE->hasState() || TE->isAltShuffle() ||
                          TE->getOpcode() == Instruction::Load ||
                          TE->getOpcode() == Instruction::ZExt ||
                          TE->getOpcode() == Instruction::SExt))) &&
                       (VectorizableTree.front()->getVectorFactor() > TinyVF ||
                        !TE->isGather() ||
                        none_of(TE->Scalars, [&](Value *V) {
                          return !isConstant(V) && isVectorized(V);
                        }));
              })) >= VectorizableTree.size() - ReorderedSplitsCnt)
        return false;
    }
    bool HasPhis = false;
    bool HasLoad = true;
    unsigned GatherLoads = 0;
    for (const std::unique_ptr<TreeEntry> &TE :
         ArrayRef(VectorizableTree).drop_front()) {
      if (TE->State == TreeEntry::SplitVectorize)
        continue;
      if (!TE->hasState()) {
        // ...
        if (VectorizableTree.front()->Scalars.size() == TinyVF &&
            /* ... */)
          return false;
        continue;
      }
      if (TE->getOpcode() == Instruction::Load &&
          TE->ReorderIndices.empty()) {
        if (!TE->isGather()) {
          // ...
        } else {
          ++GatherLoads;
          if (GatherLoads >= GatherLoadsLimit)
            return false;
        }
      }
      if (TE->getOpcode() == Instruction::GetElementPtr ||
          /* ... */)
        continue;
      if (TE->getOpcode() != Instruction::PHI &&
          (!TE->hasCopyableElements() ||
           /* fewer phis than */ TE->Scalars.size() / 2))
        return false;
      if (VectorizableTree.front()->Scalars.size() == TinyVF &&
          TE->getNumOperands() > PhiOpsLimit)
        return false;
      // ...
    }
    // ...
  }
  // ...
void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
                                          ArrayRef<int> MaskOrder) {
  assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
  SmallVector<int> NewMask(getVectorFactor());
  SmallVector<int> NewMaskOrder(getVectorFactor());
  std::iota(NewMask.begin(), NewMask.end(), 0);
  std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
  if (Idx == 0) {
    copy(Mask, NewMask.begin());
    copy(MaskOrder, NewMaskOrder.begin());
  } else {
    assert(Idx == 1 && "Expected either 0 or 1 index.");
    unsigned Offset = CombinedEntriesWithIndices.back().second;
    copy(Mask, std::next(NewMask.begin(), Offset));
    copy(MaskOrder, std::next(NewMaskOrder.begin(), Offset));
  }
  // ...
  ReorderIndices.clear();
}
  // Maps a tree entry to the reorder indices requested by its external
  // store users.
  DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
      ExternalUserReorderMap;
  // Find all reorderable nodes with the given VF: currently vectorized
  // stores, loads, extracts, plus some gathers of extracts.
  for_each(VectorizableTree, [&, &TTIRef = *TTI](
                                 const std::unique_ptr<TreeEntry> &TE) {
    // Look for external users that will probably be vectorized.
    SmallVector<OrdersType, 1> ExternalUserReorderIndices =
        findExternalStoreUsersReorderIndices(TE.get());
    if (!ExternalUserReorderIndices.empty()) {
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      ExternalUserReorderMap.try_emplace(
          TE.get(), std::move(ExternalUserReorderIndices));
    }

    // Patterns like [fadd,fsub] can be combined into a single instruction on
    // some targets; reordering into [fsub,fadd] would block that, so only
    // consider alt-shuffle nodes the target can actually combine.
    if (TE->hasState() && TE->isAltShuffle() &&
        TE->State != TreeEntry::SplitVectorize) {
      Type *ScalarTy = TE->Scalars[0]->getType();
      // ...
      unsigned Opcode0 = TE->getOpcode();
      unsigned Opcode1 = TE->getAltOpcode();
      // ...
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
        // ...
      }
    }
    bool IgnoreReorder =
        !UserIgnoreList && VectorizableTree.front()->hasState() &&
        (VectorizableTree.front()->getOpcode() ==
             Instruction::InsertElement ||
         VectorizableTree.front()->getOpcode() == Instruction::Store);
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
      // Do not include ordering for nodes used in alt-opcode vectorization;
      // these are better reordered during the bottom-to-top stage.
      const TreeEntry *UserTE = TE.get();
      while (/* ... */) {
        if (!UserTE->UserTreeIndex)
          break;
        if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
            UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
            UserTE->UserTreeIndex.UserTE->Idx != 0)
          return;
        UserTE = UserTE->UserTreeIndex.UserTE;
      }
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::SplitVectorize ||
            TE->State == TreeEntry::CompressVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
      if (TE->State == TreeEntry::Vectorize &&
          TE->getOpcode() == Instruction::PHI)
        PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
    }
  });
  // Reorder the graph nodes according to their vectorization factor, widest
  // first.
  for (unsigned VF = VectorizableTree.front()->getVectorFactor();
       !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
    auto It = VFToOrderedEntries.find(VF);
    if (It == VFToOrderedEntries.end())
      continue;
    // Try to find the most profitable order: count how often each order is
    // requested and reorder the scalars of the nodes accordingly.
    // ...
    for (const TreeEntry *OpTE : OrderedEntries) {
      // Nodes that must shuffle their reuses anyway need no reordering; the
      // reorder shuffle merges into the reuse shuffle.
      if (!OpTE->ReuseShuffleIndices.empty() &&
          !GathersToOrders.count(OpTE) &&
          OpTE->State != TreeEntry::SplitVectorize)
        continue;
      // Count the uses of each order.
      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
                           &PhisToOrders]() -> const OrdersType & {
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
          auto It = GathersToOrders.find(OpTE);
          if (It != GathersToOrders.end())
            return It->second;
        }
        if (OpTE->hasState() && OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(OpTE);
          if (It != AltShufflesToOrders.end())
            return It->second;
        }
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::PHI) {
          auto It = PhisToOrders.find(OpTE);
          if (It != PhisToOrders.end())
            return It->second;
        }
        return OpTE->ReorderIndices;
      }();
      // First consider the order requested by the external users.
      auto It = ExternalUserReorderMap.find(OpTE);
      if (It != ExternalUserReorderMap.end()) {
        const auto &ExternalUserReorderIndices = It->second;
        // A node with reused scalars but external uses votes for the natural
        // order instead.
        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
          OrdersUses.try_emplace(OrdersType(), 0).first->second +=
              ExternalUserReorderIndices.size();
        } else {
          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
            ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
        }
        // ...
      }
      // Stores actually store the mask, not the order: invert it first.
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        // ...
        unsigned E = Order.size();
        OrdersType CurrentOrder(E, E);
        transform(Mask, CurrentOrder.begin(), [E](int Idx) {
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        });
        // ...
        ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
      } else {
        ++OrdersUses.try_emplace(Order, 0).first->second;
      }
    }
    if (OrdersUses.empty())
      continue;
    // Choose the most used order.
    unsigned IdentityCnt = 0;
    unsigned FilledIdentityCnt = 0;
    // ...
    for (auto &Pair : OrdersUses) {
      // Identity-like orders all count toward the identity.
      if (!Pair.first.empty())
        FilledIdentityCnt += Pair.second;
      IdentityCnt += Pair.second;
      // ...
    }
    // ...
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      // Prefer identity order. But if a filled identity (non-empty order)
      // ties with the new candidate, take the candidate.
      if (Cnt < Pair.second ||
          (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
           Cnt == Pair.second && !BestOrder.empty() &&
           /* ... */)) {
        // ...
        BestOrder = Pair.first;
        // ...
      }
      // ...
    }
    // Materialize the chosen order as masks for the rewrite below.
    // ...
    unsigned E = BestOrder.size();
    transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    });
    // Do the actual reordering of the nodes with the given VF.
    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      if (TE->Scalars.size() != VF) {
        if (TE->ReuseShuffleIndices.size() == VF) {
          assert(TE->State != TreeEntry::SplitVectorize &&
                 "Split vectorized not expected.");
          // Reorder the reuses masks of operands with a smaller VF so the
          // shuffles still match.
          assert(/* ... */
                 (!TE->UserTreeIndex ||
                  TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
                  TE->UserTreeIndex.UserTE->Scalars.size() ==
                      TE->Scalars.size() ||
                  TE->UserTreeIndex.UserTE->State ==
                      TreeEntry::SplitVectorize) &&
                 "All users must be of VF size.");
          // ...
          if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
              /* ... */)
            // ...
          reorderNodeWithReuses(*TE, Mask);
          // Update orders in user split-vectorize nodes.
          if (TE->UserTreeIndex &&
              TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
            TE->UserTreeIndex.UserTE->reorderSplitNode(
                TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
        }
        continue;
      }
      if ((TE->State == TreeEntry::SplitVectorize &&
           TE->ReuseShuffleIndices.empty()) ||
          ((TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize) &&
           /* ... */)) {
        assert((!TE->isAltShuffle() ||
                (TE->State == TreeEntry::SplitVectorize &&
                 TE->ReuseShuffleIndices.empty())) &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        // Build correct orders for extract{element,value}, loads and stores.
        // ...
        TE->reorderOperands(Mask);
      } else {
        // Reorder the node and its operands.
        TE->reorderOperands(Mask);
        assert(TE->ReorderIndices.empty() &&
               "Expected empty reorder sequence.");
        // ...
      }
      if (!TE->ReuseShuffleIndices.empty()) {
        // Apply the reversed order to keep the original ordering of the
        // reused elements and avoid extra reorder shuffling.
        // ...
        addMask(NewReuses, TE->ReuseShuffleIndices);
        TE->ReuseShuffleIndices.swap(NewReuses);
      } else if (TE->UserTreeIndex &&
                 TE->UserTreeIndex.UserTE->State ==
                     TreeEntry::SplitVectorize) {
        // Update orders in user split-vectorize nodes.
        TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
                                                   Mask, MaskOrder);
      }
    }
  }
void BoUpSLP::buildReorderableOperands(
    TreeEntry *UserTE,
    SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    /* ... */) {
  for (unsigned I : seq<unsigned>(UserTE->getNumOperands())) {
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 (OpData.second->State == TreeEntry::Vectorize ||
                  OpData.second->State == TreeEntry::StridedVectorize ||
                  OpData.second->State == TreeEntry::CompressVectorize ||
                  OpData.second->State == TreeEntry::SplitVectorize);
        }))
      continue;
    // Do not request operands that do not exist for this opcode.
    if (UserTE->hasState()) {
      if (UserTE->getOpcode() == Instruction::ExtractElement ||
          UserTE->getOpcode() == Instruction::ExtractValue)
        continue;
      if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
        continue;
      if (UserTE->getOpcode() == Instruction::Store &&
          UserTE->State == TreeEntry::Vectorize && I == 1)
        continue;
      if (UserTE->getOpcode() == Instruction::Load &&
          (UserTE->State == TreeEntry::Vectorize ||
           UserTE->State == TreeEntry::StridedVectorize ||
           UserTE->State == TreeEntry::CompressVectorize))
        continue;
    }
    TreeEntry *TE = getOperandEntry(UserTE, I);
    assert(TE && "Expected operand entry.");
    if (!TE->isGather()) {
      // Add the node to the list of ordered nodes with the identity order.
      Edges.emplace_back(I, TE);
      // Add ScatterVectorize nodes too: only a reordering of the scalars is
      // required there.
      if (TE->State == TreeEntry::ScatterVectorize &&
          TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
        // ...
      continue;
    }
    if (ReorderableGathers.contains(TE))
      // ...
  }
}
void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
  struct TreeEntryCompare {
    bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
      if (LHS->UserTreeIndex && RHS->UserTreeIndex)
        return LHS->UserTreeIndex.UserTE->Idx <
               RHS->UserTreeIndex.UserTE->Idx;
      return LHS->Idx < RHS->Idx;
    }
  };
  PriorityQueue<TreeEntry *, SmallVector<TreeEntry *>, TreeEntryCompare>
      Queue;
  SmallPtrSet<TreeEntry *, 4> GathersToOrders;
  SmallPtrSet<TreeEntry *, 4> NonVectorized;
  // ...
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::StridedVectorize &&
        TE->State != TreeEntry::CompressVectorize &&
        TE->State != TreeEntry::SplitVectorize)
      NonVectorized.insert(TE.get());
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) {
      Queue.push(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.insert(TE.get());
    }
  }
  // 1. Propagate the orders through the graph nodes, bottom to top.
  while (!Queue.empty()) {
    // 1.1. Collect the set of the operands of the same user node, whose
    // orders need to be unified.
    std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>
        Users;
    TreeEntry *TE = Queue.top();
    const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
    SmallVector<TreeEntry *> OrderedOps;
    while (!Queue.empty()) {
      TE = Queue.top();
      if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
        break;
      Queue.pop();
      OrderedOps.push_back(TE);
    }
    SmallPtrSet<const TreeEntry *, 4> Visited;
    for (TreeEntry *TE : OrderedOps) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize ||
            (TE->isGather() && GathersToOrders.contains(TE))) ||
          !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
          !Visited.insert(TE).second)
        continue;
      Users.first = TE->UserTreeIndex.UserTE;
      Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
    }
    // ...
    auto &Data = Users;
    if (Data.first->State == TreeEntry::SplitVectorize) {
      assert(
          Data.second.size() <= 2 &&
          "Expected not greater than 2 operands for split vectorize node.");
      if (any_of(Data.second,
                 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
        continue;
      assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
             "Expected exactly 2 entries.");
      for (const auto &P : Data.first->CombinedEntriesWithIndices) {
        TreeEntry &OpTE = *VectorizableTree[P.first];
        OrdersType Order = OpTE.ReorderIndices;
        if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
          if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
            continue;
          const auto BestOrder =
              getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder);
          if (!BestOrder || BestOrder->empty())
            continue;
          Order = *BestOrder;
        }
        // ...
        const unsigned E = Order.size();
        SmallVector<int> MaskOrder(E, PoisonMaskElem);
        transform(Order, MaskOrder.begin(), [E](unsigned I) {
          return I < E ? static_cast<int>(I) : PoisonMaskElem;
        });
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
        if (!OpTE.ReorderIndices.empty()) {
          OpTE.ReorderIndices.clear();
        } else if (!OpTE.ReuseShuffleIndices.empty()) {
          // ...
        } else {
          assert(OpTE.isGather() && "Expected only gather/buildvector node.");
          reorderScalars(OpTE.Scalars, Mask);
        }
      }
      if (Data.first->ReuseShuffleIndices.empty() &&
          !Data.first->ReorderIndices.empty()) {
        // Insert user node to the list to try to sink reordering deeper in
        // the graph.
        Queue.push(Data.first);
      }
      continue;
    }
      buildReorderableOperands(Data.first, Data.second, NonVectorized,
                               GatherOps);
      // ...
      SmallPtrSet<const TreeEntry *, 4> VisitedOps;
      for (const auto &Op : Data.second) {
        TreeEntry *OpTE = Op.second;
        if (!VisitedOps.insert(OpTE).second)
          continue;
        if (!OpTE->ReuseShuffleIndices.empty() &&
            !GathersToOrders.count(OpTE))
          continue;
        const auto Order = [&]() -> const OrdersType {
          if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
            return getReorderingData(*OpTE, /*TopToBottom=*/false,
                                     IgnoreReorder)
                .value_or(OrdersType());
          return OpTE->ReorderIndices;
        }();
        // The order is partially ordered, skip it in favor of the full one.
        if (Order.size() == 1)
          continue;
        // ...
        Value *Root =
            OpTE->hasState() ? OpTE->getMainOp() : OpTE->Scalars.front();
        auto GetSameNodesUsers = [&](Value *Root) {
          SmallSetVector<TreeEntry *, 4> Res;
          for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
            if (TE != OpTE && TE->UserTreeIndex &&
                TE->getVectorFactor() == OpTE->getVectorFactor() &&
                TE->Scalars.size() == OpTE->Scalars.size() &&
                ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
                 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
              Res.insert(TE->UserTreeIndex.UserTE);
          }
          for (const TreeEntry *TE : getTreeEntries(Root)) {
            if (TE != OpTE && TE->UserTreeIndex &&
                TE->getVectorFactor() == OpTE->getVectorFactor() &&
                TE->Scalars.size() == OpTE->Scalars.size() &&
                ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
                 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
              Res.insert(TE->UserTreeIndex.UserTE);
          }
          return Res;
        };
        auto GetNumOperands = [](const TreeEntry *TE) {
          if (TE->State == TreeEntry::SplitVectorize)
            return TE->getNumOperands();
          if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()))
            return static_cast<unsigned>(CI->arg_size());
          return TE->getNumOperands();
        };
        auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
                                                     const TreeEntry *TE) {
          // ... (profitability checks elided)
          return all_of(seq<unsigned>(GetNumOperands(TE)), [&](unsigned Idx) {
            const TreeEntry *Op = getOperandEntry(TE, Idx);
            if (Op->isGather() && Op->hasState()) {
              const TreeEntry *VecOp =
                  getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
              if (VecOp)
                Op = VecOp;
            }
            if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
              return true;
            // ...
            return false;
          });
        };
        SmallPtrSet<const TreeEntry *, 4> RevisitedOps;
        auto Users = GetSameNodesUsers(Root);
        if (all_of(Users, [&](TreeEntry *UTE) {
              if (!RevisitedOps.insert(UTE).second)
                return true;
              return UTE == Data.first || !UTE->ReorderIndices.empty() ||
                     !UTE->ReuseShuffleIndices.empty() ||
                     (UTE->UserTreeIndex &&
                      UTE->UserTreeIndex.UserTE == Data.first) ||
                     (Data.first->UserTreeIndex &&
                      Data.first->UserTreeIndex.UserTE == UTE) ||
                     (IgnoreReorder && UTE->UserTreeIndex &&
                      UTE->UserTreeIndex.UserTE->Idx == 0) ||
                     NodeShouldBeReorderedWithOperands(UTE);
            })) {
          for (TreeEntry *UTE : Users) {
            // ...
            for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
              const TreeEntry *Op = getOperandEntry(UTE, Idx);
              // ...
              Queue.push(const_cast<TreeEntry *>(Op));
            }
          }
        }
        unsigned NumOps = count_if(
            Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
              return P.second == OpTE;
            });
        // Stores actually store the mask, not the order, need to invert.
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
          assert(!OpTE->isAltShuffle() &&
                 "Alternate instructions are only supported by BinaryOperator "
                 "and CastInst.");
          SmallVector<int> Mask;
          inversePermutation(Order, Mask);
          unsigned E = Order.size();
          OrdersType CurrentOrder(E, E);
          transform(Mask, CurrentOrder.begin(), [E](int Idx) {
            return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
          });
          fixupOrderingIndices(CurrentOrder);
          OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
        } else {
          OrdersUses.try_emplace(Order, 0).first->second += NumOps;
        }
        auto Res = OrdersUses.try_emplace(OrdersType(), 0);
        const auto AllowsReordering = [&](const TreeEntry *TE) {
          if (!TE->ReorderIndices.empty() ||
              !TE->ReuseShuffleIndices.empty() ||
              (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
              (IgnoreReorder && TE->Idx == 0))
            return true;
          if (TE->isGather()) {
            // ...
          }
          return false;
        };
        if (OpTE->UserTreeIndex) {
          TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
          if (!VisitedUsers.insert(UserTE).second)
            continue;
          // ...
          if (AllowsReordering(UserTE))
            continue;
          // ...
          if (static_cast<unsigned>(count_if(
                  Ops, [UserTE, &AllowsReordering](
                           const std::pair<unsigned, TreeEntry *> &Op) {
                    return AllowsReordering(Op.second) &&
                           Op.second->UserTreeIndex.UserTE == UserTE;
                  })) <= Ops.size() / 2)
            ++Res.first->second;
        }
      }
      if (OrdersUses.empty()) {
        // ...
        continue;
      }
      // Choose the most used order.
      unsigned IdentityCnt = 0;
      unsigned VF = Data.second.front().second->getVectorFactor();
      OrdersType IdentityOrder(VF, VF);
      for (auto &Pair : OrdersUses) {
        if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
          IdentityCnt += Pair.second;
          combineOrders(IdentityOrder, Pair.first);
        }
      }
      MutableArrayRef<unsigned> BestOrder = IdentityOrder;
      unsigned Cnt = IdentityCnt;
      for (auto &Pair : OrdersUses) {
        // Prefer identity order.
        if (Cnt < Pair.second) {
          combineOrders(Pair.first, BestOrder);
          BestOrder = Pair.first;
          Cnt = Pair.second;
        } else {
          combineOrders(BestOrder, Pair.first);
        }
      }
      // Set order of the user node.
      if (isIdentityOrder(BestOrder))
        continue;
      fixupOrderingIndices(BestOrder);
      SmallVector<int> Mask;
      inversePermutation(BestOrder, Mask);
      SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
      unsigned E = BestOrder.size();
      transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
        return I < E ? static_cast<int>(I) : PoisonMaskElem;
      });
      for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
        TreeEntry *TE = Op.second;
        if (!VisitedOps.insert(TE).second)
          continue;
        if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
          reorderNodeWithReuses(*TE, Mask);
          continue;
        }
        // Gathers are processed separately.
        if (TE->State != TreeEntry::Vectorize &&
            TE->State != TreeEntry::StridedVectorize &&
            TE->State != TreeEntry::CompressVectorize &&
            TE->State != TreeEntry::SplitVectorize &&
            (TE->State != TreeEntry::ScatterVectorize ||
             TE->ReorderIndices.empty()))
          continue;
        assert((BestOrder.size() == TE->ReorderIndices.size() ||
                TE->ReorderIndices.empty()) &&
               "Non-matching sizes of user/operand entries.");
        reorderOrder(TE->ReorderIndices, Mask);
        if (IgnoreReorder && TE == VectorizableTree.front().get())
          IgnoreReorder = false;
      }
      // For gathers just need to reorder its scalars.
      for (TreeEntry *Gather : GatherOps) {
        assert(Gather->ReorderIndices.empty() &&
               "Unexpected reordering of gathers.");
        if (!Gather->ReuseShuffleIndices.empty()) {
          // ...
          continue;
        }
        reorderScalars(Gather->Scalars, Mask);
      }
      // Reorder operands of the user node and set the ordering for the user
      // node itself.
      auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
        return TE.isAltShuffle() &&
               (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
                TE.ReorderIndices.empty());
      };
      if (Data.first->State != TreeEntry::Vectorize ||
          !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
              Data.first->getMainOp()) ||
          IsNotProfitableAltCodeNode(*Data.first))
        Data.first->reorderOperands(Mask);
      if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
          IsNotProfitableAltCodeNode(*Data.first) ||
          Data.first->State == TreeEntry::StridedVectorize ||
          Data.first->State == TreeEntry::CompressVectorize) {
        reorderScalars(Data.first->Scalars, Mask);
        reorderOrder(Data.first->ReorderIndices, MaskOrder,
                     /*BottomOrder=*/true);
        if (Data.first->ReuseShuffleIndices.empty() &&
            !Data.first->ReorderIndices.empty() &&
            !IsNotProfitableAltCodeNode(*Data.first)) {
          // Insert user node to the list to try to sink reordering deeper in
          // the graph.
          Queue.push(Data.first);
        }
      }
  }
  // If the reordering is unnecessary, just remove the reorder.
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
}
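/// NOTE (inferred summary, not in the original source): returns the scalar
/// instruction that addresses the first element in memory for the given
/// entry. For strided load/store entries with a reversed order this is the
/// scalar at the front of the reorder indices rather than the first scalar
/// in the bundle.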
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
  if (Entry.hasState() &&
      (Entry.getOpcode() == Instruction::Store ||
       Entry.getOpcode() == Instruction::Load) &&
      Entry.State == TreeEntry::StridedVectorize &&
      !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
    return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
  return dyn_cast<Instruction>(Entry.Scalars.front());
}
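/// NOTE (inferred summary, not in the original source): buildExternalUses()
/// walks every vectorized tree entry and records in ExternalUses each scalar
/// that is used outside the tree, used as an extra argument, or has too many
/// users, so that it can later be extracted from the vectorized value.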
void BoUpSLP::buildExternalUses(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
  DenseMap<Value *, unsigned> ScalarToExtUses;
  const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
  // Collect the values that we need to extract from the tree.
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();
    // No need to handle users of gathered values.
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
      continue;
    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      // ...
      auto It = ScalarToExtUses.find(Scalar);
      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
        continue;
      if (Scalar->hasNUsesOrMore(NumVectScalars)) {
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
                          << " from " << *Scalar << "for many users.\n");
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
        ExternalUsesWithNonUsers.insert(Scalar);
        continue;
      }
      // Check if the scalar is externally used as an extra arg.
      const auto ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
        // ...
      }
      for (User *U : Scalar->users()) {
        auto *UserInst = dyn_cast<Instruction>(U);
        // ...
        // Ignore users in the user ignore list.
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
          continue;
        // Skip in-tree scalars that become vectors.
        if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
            !UseEntries.empty()) {
          // Some in-tree scalars will remain as scalar in vectorized
          // instructions. If that is the case, the one in FoundLane will be
          // used.
          if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
                 isa<LoadInst, StoreInst>(UserInst)) ||
                isa<CallInst>(UserInst)) ||
              all_of(UseEntries, [&](TreeEntry *UseEntry) {
                return UseEntry->State == TreeEntry::ScatterVectorize ||
                       !doesInTreeUserNeedToExtract(
                           Scalar, getRootEntryInstruction(*UseEntry), TLI,
                           TTI);
              })) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                              << ".\n");
            assert(none_of(UseEntries,
                           [](TreeEntry *UseEntry) {
                             return UseEntry->isGather();
                           }) /*...*/);
            continue;
          }
        }
        if (It != ScalarToExtUses.end()) {
          ExternalUses[It->second].User = nullptr;
          break;
        }
        // ...
        if (U && Scalar->hasNUsesOrMore(UsesLimit))
          U = nullptr;
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
                          << " from lane " << FoundLane << " from " << *Scalar
                          << ".\n");
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
        ExternalUsesWithNonUsers.insert(Scalar);
        if (!U)
          break;
      }
    }
  }
}
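/// NOTE (inferred summary, not in the original source): collectUserStores()
/// groups external stores that consume scalars of \p TE by (basic block,
/// stored type, underlying pointer object), keeping at most one store per
/// lane; canFormVector() then checks whether such a group is consecutive in
/// memory and computes the induced reorder indices.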
SmallVector<SmallVector<StoreInst *>>
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
                SmallVector<StoreInst *>>
      PtrToStoresMap;
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    // ...
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(U);
      // If this doesn't look like a usable store in this function, bail out.
      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
          !isValidElementType(SI->getValueOperand()->getType()))
        continue;
      // ...
      auto &StoresVec =
          PtrToStoresMap[{SI->getParent(),
                          SI->getValueOperand()->getType(), Ptr}];
      // For now just keep one store per pointer object per lane.
      if (StoresVec.size() > Lane)
        continue;
      if (!StoresVec.empty()) {
        std::optional<int64_t> Diff = getPointersDiff(
            SI->getValueOperand()->getType(), SI->getPointerOperand(),
            SI->getValueOperand()->getType(),
            StoresVec.front()->getPointerOperand(), *DL, *SE,
            /*StrictCheck=*/true);
        // We failed to compare the pointers so just abandon this store.
        if (!Diff)
          continue;
      }
      StoresVec.push_back(SI);
    }
  }
  SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
  unsigned I = 0;
  for (auto &P : PtrToStoresMap)
    Res[I++].swap(P.second);
  return Res;
}

bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
                            OrdersType &ReorderIndices) const {
  // To avoid calling getPointersDiff() while sorting, we create a vector of
  // pairs {store, offset from first} and sort this instead.
  SmallVector<std::pair<int64_t, unsigned>> StoreOffsetVec;
  StoreInst *S0 = StoresVec[0];
  StoreOffsetVec.emplace_back(0, 0);
  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
    StoreInst *SI = StoresVec[Idx];
    std::optional<int64_t> Diff = getPointersDiff(
        S0->getValueOperand()->getType(), S0->getPointerOperand(),
        SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
        /*StrictCheck=*/true);
    if (Diff)
      StoreOffsetVec.emplace_back(*Diff, Idx);
  }
  // Check if the stores are consecutive.
  if (StoreOffsetVec.size() != StoresVec.size())
    return false;
  sort(StoreOffsetVec, llvm::less_first());
  unsigned Idx = 0;
  int64_t PrevDist = 0;
  for (const auto &P : StoreOffsetVec) {
    if (Idx > 0 && P.first != PrevDist + 1)
      return false;
    PrevDist = P.first;
    ++Idx;
  }
  // Calculate the shuffle indices according to their offset against the
  // sorted StoreOffsetVec.
  ReorderIndices.assign(StoresVec.size(), 0);
  bool IsIdentity = true;
  for (auto [I, P] : enumerate(StoreOffsetVec)) {
    ReorderIndices[P.second] = I;
    IsIdentity &= P.second == I;
  }
  // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
  // reorderTopToBottom() and reorderBottomToTop(), so we are following the
  // same convention here.
  if (IsIdentity)
    ReorderIndices.clear();
  return true;
}

#ifndef NDEBUG
LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
  for (unsigned Idx : Order)
    dbgs() << Idx << ", ";
  dbgs() << "\n";
}
#endif
SmallVector<BoUpSLP::OrdersType, 1>
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();
  SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
  // Holds the reorder indices for each candidate store vector that is a user
  // of the current TreeEntry.
  SmallVector<OrdersType, 1> ExternalReorderIndices;
  for (ArrayRef<StoreInst *> StoresVec : Stores) {
    // If we have fewer than NumLanes stores, then we can't form a vector.
    if (StoresVec.size() != NumLanes)
      continue;
    // If the stores are not consecutive then abandon this StoresVec.
    OrdersType ReorderIndices;
    if (!canFormVector(StoresVec, ReorderIndices))
      continue;
    // The scalars in StoresVec can form a vector instruction, so record the
    // reorder indices.
    ExternalReorderIndices.push_back(ReorderIndices);
  }
  return ExternalReorderIndices;
}
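/// NOTE (inferred summary, not in the original source): two buildTree()
/// entry points follow, with and without a user-ignore list; both reset
/// per-tree state and recurse into buildTreeRec() from the roots.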
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        const SmallDenseSet<Value *> &UserIgnoreLst) {
  deleteTree();
  assert(TreeEntryToStridedPtrInfoMap.empty() &&
         "TreeEntryToStridedPtrInfoMap is not cleared");
  UserIgnoreList = &UserIgnoreLst;
  if (!allSameType(Roots))
    return;
  buildTreeRec(Roots, 0, EdgeInfo());
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
  deleteTree();
  assert(TreeEntryToStridedPtrInfoMap.empty() &&
         "TreeEntryToStridedPtrInfoMap is not cleared");
  if (!allSameType(Roots))
    return;
  buildTreeRec(Roots, 0, EdgeInfo());
}
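/// NOTE (inferred summary, not in the original source): the static helper
/// below clusters scalar loads by (parent block, type, base pointer) and
/// pointer distance so that loads that were gathered can later be
/// re-examined for vectorization as a group.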
static void gatherPossiblyVectorizableLoads(
    const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
    ScalarEvolution &SE, const TargetTransformInfo &TTI,
    SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>>
        &GatheredLoads,
    bool AddNew = true) {
  SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>> ClusteredLoads;
  SmallVector<SmallDenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
  for (Value *V : VL) {
    auto *LI = dyn_cast<LoadInst>(V);
    if (!LI)
      continue;
    if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
      continue;
    bool IsFound = false;
    for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
      assert(LI->getParent() == Data.front().first->getParent() &&
             LI->getType() == Data.front().first->getType() &&
             "Expected loads with the same type, same parent and same "
             "underlying pointer.");
      std::optional<int64_t> Dist = getPointersDiff(
          LI->getType(), LI->getPointerOperand(),
          Data.front().first->getType(),
          Data.front().first->getPointerOperand(), DL, SE,
          /*StrictCheck=*/true);
      if (!Dist)
        continue;
      auto It = Map.find(*Dist);
      if (It != Map.end() && It->second != LI)
        continue;
      if (It == Map.end()) {
        Data.emplace_back(LI, *Dist);
        Map.try_emplace(*Dist, LI);
      }
      IsFound = true;
      break;
    }
    if (!IsFound) {
      ClusteredLoads.emplace_back().emplace_back(LI, 0);
      ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
    }
  }
  auto FindMatchingLoads =
      [&](ArrayRef<std::pair<LoadInst *, int64_t>> Loads,
          SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>>
              &GatheredLoads,
          SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
          int64_t &Offset, unsigned &Start) {
        if (Loads.empty())
          return GatheredLoads.end();
        LoadInst *LI = Loads.front().first;
        for (auto [Idx, Data] : enumerate(GatheredLoads)) {
          if (Idx < Start)
            continue;
          ToAdd.clear();
          // ...
          std::optional<int64_t> Dist = getPointersDiff(
              LI->getType(), LI->getPointerOperand(),
              Data.front().first->getType(),
              Data.front().first->getPointerOperand(), DL, SE,
              /*StrictCheck=*/true);
          if (!Dist)
            continue;
          SmallSet<int64_t, 4> DataDists;
          SmallPtrSet<LoadInst *, 4> DataLoads;
          for (std::pair<LoadInst *, int64_t> P : Data) {
            DataDists.insert(P.second);
            DataLoads.insert(P.first);
          }
          // Found matching gathered loads - check if all loads are unique or
          // can be effectively vectorized.
          unsigned NumUniques = 0;
          for (auto [Cnt, Pair] : enumerate(Loads)) {
            bool Used = DataLoads.contains(Pair.first);
            if (!Used && !DataDists.contains(*Dist + Pair.second)) {
              ++NumUniques;
              ToAdd.insert(Cnt);
            } else if (Used) {
              Repeated.insert(Cnt);
            }
          }
          if (NumUniques > 0 &&
              (Loads.size() == NumUniques ||
               (Loads.size() - NumUniques >= 2 &&
                Loads.size() - NumUniques >= Loads.size() / 2 /*...*/))) {
            Offset = *Dist;
            Start = Idx + 1;
            return std::next(GatheredLoads.begin(), Idx);
          }
        }
        ToAdd.clear();
        return GatheredLoads.end();
      };
  for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
    unsigned Start = 0;
    SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
    int64_t Offset = 0;
    auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
                                 Offset, Start);
    while (It != GatheredLoads.end()) {
      assert(!LocalToAdd.empty() && "Expected some elements to add.");
      for (unsigned Idx : LocalToAdd)
        It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
      ToAdd.insert_range(LocalToAdd);
      It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
                             Start);
    }
    if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
          return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
        })) {
      auto AddNewLoads =
          [&](SmallVectorImpl<std::pair<LoadInst *, int64_t>> &Loads) {
            for (unsigned Idx : seq<unsigned>(Data.size())) {
              if (ToAdd.contains(Idx) || Repeated.contains(Idx))
                continue;
              Loads.push_back(Data[Idx]);
            }
          };
      if (!AddNew) {
        LoadInst *LI = Data.front().first;
        auto *It = find_if(
            GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
              return PD.front().first->getParent() == LI->getParent() &&
                     PD.front().first->getType() == LI->getType();
            });
        while (It != GatheredLoads.end()) {
          AddNewLoads(*It);
          It = std::find_if(
              std::next(It), GatheredLoads.end(),
              [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
                return PD.front().first->getParent() == LI->getParent() &&
                       PD.front().first->getType() == LI->getType();
              });
        }
        continue;
      }
      GatheredLoads.emplace_back().append(Data.begin(), Data.end());
      AddNewLoads(GatheredLoads.emplace_back());
    }
  }
}
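/// NOTE (inferred summary, not in the original source):
/// tryToVectorizeGatheredLoads() takes the clustered gathered loads, sorts
/// each cluster by pointer distance, splits it into consecutive ranges, and
/// attempts to build vectorized (or strided/masked-gather) tree entries for
/// each range, falling back to masked gathers for the leftovers.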
void BoUpSLP::tryToVectorizeGatheredLoads(
    const SmallMapVector<
        std::tuple<BasicBlock *, Value *, Type *>,
        SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
        &GatheredLoads) {
  GatheredLoadsEntriesFirst = VectorizableTree.size();
  SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
      LoadEntriesToVectorize.size());
  for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
    Set.insert_range(VectorizableTree[Idx]->Scalars);
  // Sort loads by distance.
  auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
                       const std::pair<LoadInst *, int64_t> &L2) {
    return L1.second > L2.second;
  };
  auto IsMaskedGatherSupported = [&](ArrayRef<LoadInst *> Loads) {
    // ...
    auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
    return TTI->isLegalMaskedGather(Ty, Alignment) &&
           !TTI->forceScalarizeMaskedGather(Ty, Alignment);
  };
  auto GetVectorizedRanges =
      [this](ArrayRef<LoadInst *> Loads,
             SmallPtrSetImpl<const Value *> &VectorizedLoads,
             SmallVectorImpl<LoadInst *> &NonVectorized, bool Final,
             unsigned MaxVF) {
        SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
        unsigned StartIdx = 0;
        SmallVector<int> CandidateVFs;
        for (int NumElts = getFloorFullVectorNumberOfElements(
                 *TTI, Loads.front()->getType(), MaxVF);
             NumElts > 1;
             NumElts = getFloorFullVectorNumberOfElements(
                 *TTI, Loads.front()->getType(), NumElts - 1))
          CandidateVFs.push_back(NumElts);
        if (Final && CandidateVFs.empty())
          return Results;
        unsigned BestVF = Final ? CandidateVFs.back() : 0;
        for (unsigned NumElts : CandidateVFs) {
          if (Final && NumElts > BestVF)
            continue;
          SmallVector<unsigned> MaskedGatherVectorized;
          for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E; ++Cnt) {
            ArrayRef<LoadInst *> Slice =
                ArrayRef(Loads).slice(Cnt, std::min<unsigned>(NumElts, E - Cnt));
            if (VectorizedLoads.count(Slice.front()) ||
                VectorizedLoads.count(Slice.back()) ||
                areKnownNonVectorizableLoads(Slice))
              continue;
            // Check if it is profitable to try vectorizing gathered loads.
            bool AllowToVectorize = false;
            // Check if it is profitable to vectorize 2-element loads.
            if (NumElts == 2) {
              bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
                  Slice.front()->getType(), ElementCount::getFixed(NumElts));
              auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
                for (LoadInst *LI : Slice) {
                  // If single use/user - allow to vectorize.
                  if (LI->hasOneUse())
                    continue;
                  // Check if number of uses equals the number of users (no
                  // repeated uses).
                  if (static_cast<unsigned int>(std::distance(
                          LI->user_begin(), LI->user_end())) !=
                      LI->getNumUses())
                    return false;
                  if (!IsLegalBroadcastLoad)
                    continue;
                  // ...
                  for (User *U : LI->users()) {
                    // ...
                    for (const TreeEntry *UTE : getTreeEntries(U)) {
                      for (int I : seq<int>(UTE->getNumOperands())) {
                        if (all_of(UTE->getOperand(I), [LI](Value *V) {
                              return V == LI || isa<PoisonValue>(V);
                            }))
                          // Found legal broadcast - do not vectorize.
                          return false;
                      }
                    }
                  }
                }
                return true;
              };
              AllowToVectorize = CheckIfAllowed(Slice);
            } else {
              AllowToVectorize =
                  (NumElts >= 3 ||
                   any_of(ValueToGatherNodes.at(Slice.front()),
                          [=](const TreeEntry *TE) {
                            return TE->Scalars.size() == 2 &&
                                   ((TE->Scalars.front() == Slice.front() &&
                                     TE->Scalars.back() == Slice.back()) ||
                                    (TE->Scalars.front() == Slice.back() &&
                                     TE->Scalars.back() == Slice.front()));
                          })) /*...*/;
            }
            if (AllowToVectorize) {
              SmallVector<Value *> PointerOps;
              OrdersType CurrentOrder;
              // Try to build vector load.
              ArrayRef<Value *> Values(
                  reinterpret_cast<Value *const *>(Slice.begin()),
                  Slice.size());
              StridedPtrInfo SPtrInfo;
              LoadsState LS =
                  canVectorizeLoads(Values, Slice.front(), CurrentOrder,
                                    PointerOps, SPtrInfo, &BestVF);
              if (LS != LoadsState::Gather ||
                  (BestVF > 1 &&
                   static_cast<unsigned>(NumElts) == 2 * BestVF)) {
                if (LS == LoadsState::ScatterVectorize) {
                  if (MaskedGatherVectorized.empty() ||
                      Cnt >= MaskedGatherVectorized.back() + NumElts)
                    MaskedGatherVectorized.push_back(Cnt);
                  continue;
                }
                if (LS != LoadsState::Gather) {
                  Results.emplace_back(Values, LS);
                  VectorizedLoads.insert_range(Slice);
                  // If we vectorized the initial block, no need to try to
                  // vectorize it again.
                  if (Cnt == StartIdx)
                    StartIdx += NumElts;
                }
                // Check if the whole array was vectorized already - exit.
                if (StartIdx >= Loads.size())
                  break;
                // Erase last masked gather candidate, if another candidate
                // within the range is found to be better.
                if (!MaskedGatherVectorized.empty() &&
                    Cnt < MaskedGatherVectorized.back() + NumElts)
                  MaskedGatherVectorized.pop_back();
                Cnt += NumElts - 1;
                continue;
              }
            }
            if (!AllowToVectorize || BestVF == 0)
              registerNonVectorizableLoads(Slice);
          }
          // Mark masked gathers candidates as vectorized, if any.
          for (unsigned Cnt : MaskedGatherVectorized) {
            ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
                Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
            ArrayRef<Value *> Values(
                reinterpret_cast<Value *const *>(Slice.begin()),
                Slice.size());
            Results.emplace_back(Values, LoadsState::ScatterVectorize);
            VectorizedLoads.insert_range(Slice);
            // If we vectorized the initial block, no need to try to vectorize
            // it again.
            if (Cnt == StartIdx)
              StartIdx += NumElts;
          }
        }
        for (LoadInst *LI : Loads) {
          if (!VectorizedLoads.contains(LI))
            NonVectorized.push_back(LI);
        }
        return Results;
      };
  auto ProcessGatheredLoads =
      [&, &TTI = *TTI](
          ArrayRef<SmallVector<std::pair<LoadInst *, int64_t>>> GatheredLoads,
          bool Final = false) {
        SmallVector<LoadInst *> NonVectorized;
        for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
             GatheredLoads) {
          if (LoadsDists.size() <= 1) {
            NonVectorized.push_back(LoadsDists.back().first);
            continue;
          }
          SmallVector<std::pair<LoadInst *, int64_t>> LocalLoadsDists(
              LoadsDists);
          SmallVector<LoadInst *> OriginalLoads(LocalLoadsDists.size());
          transform(LoadsDists, OriginalLoads.begin(),
                    [](const std::pair<LoadInst *, int64_t> &L) {
                      return L.first;
                    });
          stable_sort(LocalLoadsDists, LoadSorter);
          SmallVector<LoadInst *> Loads;
          unsigned MaxConsecutiveDistance = 0;
          unsigned CurrentConsecutiveDist = 1;
          int64_t LastDist = LocalLoadsDists.front().second;
          bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
          for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
            if (isVectorized(L.first))
              continue;
            assert(LastDist >= L.second &&
                   "Expected first distance always not less than second");
            if (static_cast<uint64_t>(LastDist - L.second) ==
                CurrentConsecutiveDist) {
              ++CurrentConsecutiveDist;
              MaxConsecutiveDistance =
                  std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
              Loads.push_back(L.first);
              continue;
            }
            if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
                !Loads.empty())
              Loads.pop_back();
            CurrentConsecutiveDist = 1;
            LastDist = L.second;
            Loads.push_back(L.first);
          }
          if (Loads.size() <= 1)
            continue;
          if (AllowMaskedGather)
            MaxConsecutiveDistance = Loads.size();
          else if (MaxConsecutiveDistance < 2)
            continue;
          BoUpSLP::ValueSet VectorizedLoads;
          SmallVector<LoadInst *> SortedNonVectorized;
          SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
              GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
                                  Final, MaxConsecutiveDistance);
          if (!Results.empty() && !SortedNonVectorized.empty() &&
              OriginalLoads.size() == Loads.size() &&
              MaxConsecutiveDistance == Loads.size() &&
              /*...*/ true) {
            VectorizedLoads.clear();
            SmallVector<LoadInst *> UnsortedNonVectorized;
            SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
                UnsortedResults =
                    GetVectorizedRanges(OriginalLoads, VectorizedLoads,
                                        UnsortedNonVectorized, Final,
                                        OriginalLoads.size());
            if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
              SortedNonVectorized.swap(UnsortedNonVectorized);
              Results.swap(UnsortedResults);
            }
          }
          for (auto [Slice, _] : Results) {
            LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
                              << Slice.size() << ")\n");
            // ... (already-vectorized scalars are filtered out here)
            for (Value *L : Slice)
              (void)L;
            unsigned MaxVF = Slice.size();
            unsigned UserMaxVF = 0;
            unsigned InterleaveFactor = 0;
            if (MaxVF == 2) {
              UserMaxVF = MaxVF;
            } else {
              // Found distance between segments of the interleaved loads.
              std::optional<unsigned> InterleavedLoadsDistance = 0;
              unsigned Order = 0;
              std::optional<unsigned> CommonVF = 0;
              DenseMap<const TreeEntry *, unsigned> EntryToPosition;
              SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
              for (auto [Idx, V] : enumerate(Slice)) {
                for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
                  UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
                  unsigned Pos =
                      EntryToPosition.try_emplace(E, Idx).first->second;
                  UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
                  if (CommonVF) {
                    if (*CommonVF == 0) {
                      CommonVF = E->Scalars.size();
                      continue;
                    }
                    if (*CommonVF != E->Scalars.size())
                      CommonVF.reset();
                  }
                  // Check if the load is the part of the interleaved load.
                  if (Pos != Idx && InterleavedLoadsDistance) {
                    if (!DeinterleavedNodes.contains(E) &&
                        any_of(E->Scalars, [&, Slice = Slice](Value *V) {
                          if (isa<Constant>(V))
                            return false;
                          if (isVectorized(V))
                            return true;
                          const auto &Nodes = ValueToGatherNodes.at(V);
                          return (Nodes.size() != 1 || !Nodes.contains(E)) &&
                                 !is_contained(Slice, V);
                        })) {
                      InterleavedLoadsDistance.reset();
                      continue;
                    }
                    DeinterleavedNodes.insert(E);
                    if (*InterleavedLoadsDistance == 0) {
                      InterleavedLoadsDistance = Idx - Pos;
                      continue;
                    }
                    if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
                        (Idx - Pos) / *InterleavedLoadsDistance < Order)
                      InterleavedLoadsDistance.reset();
                    Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
                  }
                }
              }
              DeinterleavedNodes.clear();
              // Check if the large load represents an interleaved load
              // operation.
              if (InterleavedLoadsDistance.value_or(0) > 1 &&
                  CommonVF.value_or(0) != 0) {
                InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
                unsigned VF = *CommonVF;
                OrdersType Order;
                SmallVector<Value *> PointerOps;
                StridedPtrInfo SPtrInfo;
                // Segmented load detected - vectorize at maximum vector
                // factor.
                if (InterleaveFactor <= Slice.size() &&
                    TTI.isLegalInterleavedAccessType(
                        getWidenedType(Slice.front()->getType(), VF),
                        InterleaveFactor,
                        cast<LoadInst>(Slice.front())->getAlign(),
                        cast<LoadInst>(Slice.front())
                            ->getPointerAddressSpace()) &&
                    canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
                                      SPtrInfo) == LoadsState::Vectorize) {
                  UserMaxVF = InterleaveFactor * VF;
                } else {
                  InterleaveFactor = 0;
                }
              }
              // Cannot represent the loads as consecutive vectorizable nodes -
              // just exit.
              unsigned ConsecutiveNodesSize = 0;
              if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
                  any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                         [&, Slice = Slice](const auto &P) {
                           const auto *It = find_if(Slice, [&](Value *V) {
                             return std::get<1>(P).contains(V);
                           });
                           if (It == Slice.end())
                             return false;
                           const TreeEntry &TE =
                               *VectorizableTree[std::get<0>(P)];
                           ArrayRef<Value *> VL = TE.Scalars;
                           OrdersType Order;
                           SmallVector<Value *> PointerOps;
                           StridedPtrInfo SPtrInfo;
                           (void)canVectorizeLoads(
                               VL, VL.front(), Order, PointerOps, SPtrInfo);
                           ConsecutiveNodesSize += VL.size();
                           size_t Start = std::distance(Slice.begin(), It);
                           size_t Sz = Slice.size() - Start;
                           return Sz < VL.size() ||
                                  Slice.slice(Start, VL.size()) != VL;
                         }))
                continue;
              // Try to build long masked gather loads.
              UserMaxVF = bit_ceil(UserMaxVF);
              if (InterleaveFactor == 0 &&
                  any_of(seq<unsigned>(Slice.size() / UserMaxVF),
                         [&, Slice = Slice](unsigned Idx) {
                           OrdersType Order;
                           SmallVector<Value *> PointerOps;
                           StridedPtrInfo SPtrInfo;
                           return canVectorizeLoads(
                                      Slice.slice(Idx * UserMaxVF, UserMaxVF),
                                      Slice[Idx * UserMaxVF], Order,
                                      PointerOps, SPtrInfo) ==
                                  LoadsState::ScatterVectorize;
                         }))
                UserMaxVF = MaxVF;
              if (Slice.size() != ConsecutiveNodesSize)
                MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
            }
            for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
              bool IsVectorized = true;
              for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
                ArrayRef<Value *> SubSlice =
                    Slice.slice(I, std::min(VF, E - I));
                if (isVectorized(SubSlice.front()))
                  continue;
                // Check if the subslice is a to-be-vectorized entry that is
                // not equal to this entry.
                if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                           [&](const auto &P) {
                             return !SubSlice.equals(
                                        VectorizableTree[std::get<0>(P)]
                                            ->Scalars) &&
                                    set_is_subset(SubSlice, std::get<1>(P));
                           }))
                  continue;
                unsigned Sz = VectorizableTree.size();
                buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
                if (Sz == VectorizableTree.size()) {
                  IsVectorized = false;
                  // Try non-interleaved vectorization with a smaller vector
                  // factor.
                  if (InterleaveFactor > 0) {
                    VF = 2 * (MaxVF / InterleaveFactor);
                    InterleaveFactor = 0;
                  }
                  continue;
                }
              }
              if (IsVectorized)
                break;
            }
          }
          NonVectorized.append(SortedNonVectorized);
        }
        return NonVectorized;
      };
  for (const auto &GLs : GatheredLoads) {
    const auto &Ref = GLs.second;
    SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
    if (!Ref.empty() && !NonVectorized.empty() &&
        std::accumulate(
            Ref.begin(), Ref.end(), 0u,
            [](unsigned S,
               ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
                -> unsigned { return S + LoadsDists.size(); }) !=
            NonVectorized.size() &&
        IsMaskedGatherSupported(NonVectorized)) {
      SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>
          FinalGatheredLoads;
      for (LoadInst *LI : NonVectorized) {
        // Reinsert non-vectorized loads into the list of loads with the same
        // base pointers.
        gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
                                        FinalGatheredLoads,
                                        /*AddNew=*/false);
      }
      // Final attempt to vectorize non-vectorized loads.
      (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
    }
  }
  // Try to vectorize postponed load entries, previously marked as gathered.
  for (unsigned Idx : LoadEntriesToVectorize) {
    const TreeEntry &E = *VectorizableTree[Idx];
    SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
    // Avoid reordering, if possible.
    if (!E.ReorderIndices.empty()) {
      // Build a mask out of the reorder indices and reorder scalars per this
      // mask.
      SmallVector<int> ReorderMask;
      inversePermutation(E.ReorderIndices, ReorderMask);
      reorderScalars(GatheredScalars, ReorderMask);
    }
    buildTreeRec(GatheredScalars, 0, EdgeInfo());
  }
  // If no new entries were created, consider it as no gathered loads entries
  // to be handled.
  if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
      VectorizableTree.size())
    GatheredLoadsEntriesFirst.reset();
}
static std::pair<size_t, size_t> generateKeySubkey(
    Value *V, const TargetLibraryInfo *TLI,
    function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
    bool AllowAlternate) {
  // ... (most of the hashing logic is elided in this listing)
  //   ... isValidForAlternation(I->getOpcode())) {
  //   std::pair<size_t, size_t> OpVals = ...
  //   if (CI->isCommutative()) ...
  //   SubKey = hash_value(Gep->getPointerOperand());
  return std::make_pair(Key, SubKey);
}
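/// NOTE (inferred summary, not in the original source):
/// areAltOperandsProfitable() estimates whether building an alternate
/// (two-opcode) node is cheaper than gathering: it accepts target-legal alt
/// instructions outright, otherwise compares a rough count of unique
/// opcodes, non-instruction operands, and extra shuffles against the scalar
/// operand count.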
static bool isAlternateInstruction(const Instruction *I,
                                   const Instruction *MainOp,
                                   Instruction *AltOp,
                                   const TargetLibraryInfo &TLI);

bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
                                       ArrayRef<Value *> VL) const {
  Type *ScalarTy = S.getMainOp()->getType();
  unsigned Opcode0 = S.getOpcode();
  unsigned Opcode1 = S.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
  // If this pattern is supported by the target then consider it profitable.
  if (TTI->isLegalAltInstr(getWidenedType(ScalarTy, VL.size()), Opcode0,
                           Opcode1, OpcodeMask))
    return true;
  SmallVector<ValueList> Operands;
  for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
    Operands.emplace_back();
    // Prepare the operand vector.
    for (Value *V : VL) {
      // ...
    }
  }
  if (Operands.size() == 2) {
    // Try to find the best operand candidates.
    for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
      // ...
      switch (Res.value_or(0)) {
      case 0:
        break;
      case 1:
        std::swap(Operands[0][I + 1], Operands[1][I + 1]);
        break;
      case 2:
        std::swap(Operands[0][I], Operands[1][I]);
        break;
      default:
        llvm_unreachable("Unexpected index.");
      }
    }
  }
  DenseSet<unsigned> UniqueOpcodes;
  constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
  unsigned NonInstCnt = 0;
  // Estimate the number of instructions required for the vectorized node and
  // for the buildvector node.
  unsigned UndefCnt = 0;
  // Count the number of extra shuffles required for vector nodes.
  unsigned ExtraShuffleInsts = 0;
  if (Operands.size() == 2) {
    // Do not count the same operands twice.
    if (Operands.front() == Operands.back()) {
      Operands.erase(Operands.begin());
    } else if (!allConstant(Operands.front()) &&
               all_of(Operands.front(), [&](Value *V) {
                 return is_contained(Operands.back(), V);
               })) {
      Operands.erase(Operands.begin());
      ++ExtraShuffleInsts;
    }
  }
  const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
  // Vectorize the node, if:
  // 1. at least a single operand is constant or splat.
  // 2. operands have many loop invariants (the instructions are not loop
  //    invariants).
  // 3. at least a single unique operand is supposed to be vectorized.
  return none_of(Operands,
                 [&](ArrayRef<Value *> Op) {
                   if (allConstant(Op) || isSplat(Op))
                     return false;
                   DenseMap<Value *, unsigned> Uniques;
                   for (Value *V : Op) {
                     if (isa<Constant, ExtractElementInst>(V) ||
                         isVectorized(V) || (L && L->isLoopInvariant(V))) {
                       if (isa<UndefValue>(V))
                         ++UndefCnt;
                       continue;
                     }
                     auto Res = Uniques.try_emplace(V, 0);
                     // Found first duplicate - need to add a shuffle.
                     if (!Res.second && Res.first->second == 1)
                       ++ExtraShuffleInsts;
                     ++Res.first->getSecond();
                     if (auto *I = dyn_cast<Instruction>(V))
                       UniqueOpcodes.insert(I->getOpcode());
                     else if (Res.second)
                       ++NonInstCnt;
                   }
                   return none_of(Uniques, [&](const auto &P) {
                     return P.first->hasNUsesOrMore(P.second + 1) &&
                            none_of(P.first->users(), [&](User *U) {
                              return isVectorized(U) || Uniques.contains(U);
                            });
                   });
                 }) ||
         (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
           NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
}
// ... (a helper whose signature includes `const unsigned VF, unsigned MinBW,`
//      is elided in this listing)

static std::pair<InstructionCost, InstructionCost>
getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
                   TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
                   ArrayRef<Type *> ArgTys) {
  // ...
  FastMathFlags FMF;
  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
    FMF = FPCI->getFastMathFlags();
  // ... (the vector library cost is clamped:
  //      `LibCost.isValid() ? LibCost : ScalarLimit`)
}
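/// NOTE (inferred summary, not in the original source):
/// getScalarsVectorizationState() classifies a candidate bundle per opcode
/// and returns the TreeEntry state to build: Vectorize, one of the
/// strided/scatter/compress load states, or NeedToGather when the scalars
/// cannot form a profitable vector node.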
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
    const InstructionsState &S, ArrayRef<Value *> VL,
    bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
  assert(S.getMainOp() &&
         "Expected instructions with same/alternate opcodes only.");
  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  Instruction *VL0 = S.getMainOp();
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Too many operands - gather, most probably won't be vectorized.
    if (VL0->getNumOperands() > MaxPHINumOperands)
      return TreeEntry::NeedToGather;
    // Check for terminator values (e.g. invoke).
    for (Value *V : VL) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;
      for (Value *Incoming : PHI->incoming_values()) {
        auto *Term = dyn_cast<Instruction>(Incoming);
        if (Term && Term->isTerminator()) {
          LLVM_DEBUG(dbgs()
                     << "SLP: Need to swizzle PHINodes (terminator use).\n");
          return TreeEntry::NeedToGather;
        }
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ExtractElement:
    // ... (extract-specific checks elided)
    return TreeEntry::NeedToGather;
  case Instruction::ExtractValue: {
    bool Reuse = canReuseExtract(VL, CurrentOrder);
    // ... (additional checks elided)
    if (Reuse || !CurrentOrder.empty())
      return TreeEntry::Vectorize;
    LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::InsertElement: {
    // Check that we have a buildvector and not a shuffle of 2 or more
    // different vectors.
    ValueSet SourceVectors;
    for (Value *V : VL) {
      if (isa<PoisonValue>(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
        return TreeEntry::NeedToGather;
      }
      SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
      assert(getElementIndex(V) != std::nullopt &&
             "Non-constant or undef index?");
    }
    if (count_if(VL, [&SourceVectors](Value *V) {
          return !SourceVectors.contains(V);
        }) >= 2) {
      // Found 2nd source vector - cancel.
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "different source vectors.\n");
      return TreeEntry::NeedToGather;
    }
    if (any_of(VL, [&SourceVectors](Value *V) {
          return SourceVectors.contains(V) && !V->hasOneUse();
        })) {
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "multiple uses.\n");
      return TreeEntry::NeedToGather;
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Load: {
    auto IsGatheredNode = [&]() {
      if (!GatheredLoadsEntriesFirst)
        return false;
      return any_of(VL, [&](Value *V) {
        return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
          return TE->Idx >= *GatheredLoadsEntriesFirst;
        });
      });
    };
    switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
    case LoadsState::Vectorize:
      return TreeEntry::Vectorize;
    case LoadsState::CompressVectorize:
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::CompressVectorize;
    case LoadsState::ScatterVectorize:
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::ScatterVectorize;
    case LoadsState::StridedVectorize:
      if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::StridedVectorize;
    case LoadsState::Gather:
#ifndef NDEBUG
      Type *ScalarTy = VL0->getType();
      if (DL->getTypeSizeInBits(ScalarTy) !=
          DL->getTypeAllocSizeInBits(ScalarTy))
        LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
      else if (any_of(VL, [](Value *V) {
                 auto *LI = dyn_cast<LoadInst>(V);
                 return !LI || !LI->isSimple();
               }))
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
      else
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
#endif // NDEBUG
      registerNonVectorizableLoads(VL);
      return TreeEntry::NeedToGather;
    }
    llvm_unreachable("Unexpected state of loads");
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    Type *SrcTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      if (isa<PoisonValue>(V))
        continue;
      Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
      if (Ty != SrcTy || !isValidElementType(Ty)) {
        LLVM_DEBUG(
            dbgs() << "SLP: Gathering casts with different src types.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpPredicate P0 = cast<CmpInst>(VL0)->getPredicate();
    CmpPredicate SwapP0 = CmpInst::getSwappedPredicate(P0);
    Type *ComparedTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      if (isa<PoisonValue>(V))
        continue;
      auto *Cmp = cast<CmpInst>(V);
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze:
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && I->isBinaryOp() && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::GetElementPtr: {
    // We don't combine GEPs with complicated (nested) indexing.
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I)
        continue;
      if (I->getNumOperands() != 2) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }
    // We can't combine several GEPs into one vector if they operate on
    // different types.
    Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
    for (Value *V : VL) {
      auto *GEP = dyn_cast<GEPOperator>(V);
      if (!GEP)
        continue;
      Type *CurTy = GEP->getSourceElementType();
      if (Ty0 != CurTy) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
        return TreeEntry::NeedToGather;
      }
    }
    // We don't combine GEPs with non-constant indexes.
    Type *Ty1 = VL0->getOperand(1)->getType();
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I)
        continue;
      auto *Op = I->getOperand(1);
      if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
          (Op->getType() != Ty1 &&
           ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
            Op->getType()->getScalarSizeInBits() >
                DL->getIndexSizeInBits(
                    V->getType()->getPointerAddressSpace())))) {
        LLVM_DEBUG(
            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Store: {
    // Check if the stores are consecutive or if we need to swizzle them.
    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
    // Avoid types that are padded when being allocated as scalars, while
    // being packed together in a vector (such as i1).
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    }
    // Make sure all stores in the bundle are simple - we can't vectorize
    // atomic or volatile stores.
    for (Value *V : VL) {
      auto *SI = cast<StoreInst>(V);
      if (!SI->isSimple()) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
        return TreeEntry::NeedToGather;
      }
      PointerOps.push_back(SI->getPointerOperand());
    }
    // Check the order of pointer operands.
    if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
      Value *Ptr0;
      Value *PtrN;
      if (CurrentOrder.empty()) {
        Ptr0 = PointerOps.front();
        PtrN = PointerOps.back();
      } else {
        Ptr0 = PointerOps[CurrentOrder.front()];
        PtrN = PointerOps[CurrentOrder.back()];
      }
      std::optional<int64_t> Dist =
          getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
      // Check that the sorted pointer operands are consecutive.
      if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
        return TreeEntry::Vectorize;
    }
    LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::Call: {
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    CallInst *CI = cast<CallInst>(VL0);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    VFShape Shape = VFShape::get(
        CI->getFunctionType(),
        ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
        /*HasGlobalPred=*/false);
    Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
    if (!VecFunc && !isTriviallyVectorizable(ID)) {
      LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
      return TreeEntry::NeedToGather;
    }
    Function *F = CI->getCalledFunction();
    unsigned NumArgs = CI->arg_size();
    SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
    for (unsigned J = 0; J != NumArgs; ++J)
      if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI))
        ScalarArgs[J] = CI->getArgOperand(J);
    for (Value *V : VL) {
      auto *CI2 = dyn_cast<CallInst>(V);
      if (!CI2 || CI2->getCalledFunction() != F ||
          getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
          (VecFunc &&
           VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
          !CI->hasIdenticalOperandBundleSchema(*CI2)) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
                          << "\n");
        return TreeEntry::NeedToGather;
      }
      // Some intrinsics have scalar arguments and they should be the same in
      // order for them to be vectorized.
      for (unsigned J = 0; J != NumArgs; ++J) {
        if (ScalarArgs[J]) {
          Value *A1J = CI2->getArgOperand(J);
          if (ScalarArgs[J] != A1J) {
            LLVM_DEBUG(dbgs()
                       << "SLP: mismatched arguments in call:" << *CI
                       << " argument " << ScalarArgs[J] << "!=" << A1J
                       << "\n");
            return TreeEntry::NeedToGather;
          }
        }
      }
      // Verify that the bundle operands are identical between the two calls.
      if (CI->hasOperandBundles() &&
          !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
                      CI->op_begin() + CI->getBundleOperandsEndIndex(),
                      CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
                          << "!=" << *V << '\n');
        return TreeEntry::NeedToGather;
      }
    }
    // ...
    auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
    if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  }
  case Instruction::ShuffleVector: {
    if (!S.isAltShuffle()) {
      // REVEC can support non-alternate shuffles.
      if (SLPReVec && getShufflevectorNumGroups(VL))
        return TreeEntry::Vectorize;
      // If this is not an alternate sequence of opcodes like add-sub, then
      // do not vectorize this instruction.
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
    }
    if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
      LLVM_DEBUG(
          dbgs()
          << "SLP: ShuffleVector not vectorized, operands are buildvector and "
             "the whole alt sequence is not profitable.\n");
      return TreeEntry::NeedToGather;
    }
    return TreeEntry::Vectorize;
  }
  default:
    LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
    return TreeEntry::NeedToGather;
  }
}
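/// NOTE (inferred summary, not in the original source): PHIHandler gathers
/// the per-incoming-block operand vectors for a bundle of PHI nodes. A fast
/// path handles PHIs with few incoming values directly; the slow path maps
/// incoming blocks to operand indices first so that PHIs whose incoming
/// blocks appear in different orders are still matched correctly.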
class PHIHandler {
  DominatorTree &DT;
  PHINode *Main = nullptr;
  SmallVector<Value *> Phis;
  SmallVector<SmallVector<Value *>> Operands;

public:
  PHIHandler() = delete;
  PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
      : DT(DT), Main(Main), Phis(Phis),
        Operands(Main->getNumIncomingValues(),
                 SmallVector<Value *>(Phis.size(), nullptr)) {}
  void buildOperands() {
    constexpr unsigned FastLimit = 4;
    if (Main->getNumIncomingValues() <= FastLimit) {
      for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
        BasicBlock *InBB = Main->getIncomingBlock(I);
        // ...
        for (auto [Idx, V] : enumerate(Phis)) {
          auto *P = dyn_cast<PHINode>(V);
          if (!P) {
            assert(isa<PoisonValue>(V) &&
                   "Expected isa instruction or poison value.");
            Operands[I][Idx] = V;
            continue;
          }
          if (P->getIncomingBlock(I) == InBB)
            Operands[I][Idx] = P->getIncomingValue(I);
          else
            Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
        }
      }
      return;
    }
    SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
    // ... (the incoming blocks of Main are recorded in Blocks)
    for (auto [Idx, V] : enumerate(Phis)) {
      auto *P = dyn_cast<PHINode>(V);
      if (!P)
        continue;
      for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
        BasicBlock *InBB = P->getIncomingBlock(I);
        auto *It = Blocks.find(InBB);
        if (It == Blocks.end())
          continue;
        Operands[It->second.front()][Idx] = P->getIncomingValue(I);
      }
    }
    for (const auto &P : Blocks) {
      ArrayRef<unsigned> IncomingValues = P.second;
      if (IncomingValues.size() <= 1)
        continue;
      unsigned BasicI = IncomingValues.front();
      for (unsigned I : IncomingValues) {
        assert(all_of(enumerate(Operands[I]),
                      [&](const auto &Data) {
                        return !Data.value() ||
                               Data.value() == Operands[BasicI][Data.index()];
                      }) &&
               "Expected empty operands list.");
        Operands[I] = Operands[BasicI];
      }
    }
  }
  ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
};
static std::pair<Instruction *, Instruction *>
getMainAltOpsNoStateVL(ArrayRef<Value *> VL) {
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;
  for (Value *V : VL) {
    if (isa<PoisonValue>(V))
      continue;
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      return {};
    if (!MainOp) {
      MainOp = I;
      continue;
    }
    if (MainOp->getOpcode() == I->getOpcode()) {
      if (I->getParent() != MainOp->getParent())
        return {};
      continue;
    }
    if (!AltOp) {
      AltOp = I;
      continue;
    }
    if (AltOp->getOpcode() == I->getOpcode()) {
      if (I->getParent() != AltOp->getParent())
        return {};
      continue;
    }
    return {};
  }
  if (!AltOp)
    return {};
  assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
         "Expected different main and alt instructions.");
  return std::make_pair(MainOp, AltOp);
}
static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
                                SmallVectorImpl<int> &ReuseShuffleIndices,
                                const TargetTransformInfo &TTI,
                                const TargetLibraryInfo &TLI,
                                const InstructionsState &S,
                                const BoUpSLP::EdgeInfo &UserTreeIdx,
                                bool TryPad = false) {
  // Check that every instruction appears once in this bundle.
  SmallVector<Value *> UniqueValues;
  SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
  for (Value *V : VL) {
    if (isConstant(V)) {
      ReuseShuffleIndices.emplace_back(
          isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
      UniqueValues.emplace_back(V);
      continue;
    }
    auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
    ReuseShuffleIndices.emplace_back(Res.first->second);
    if (Res.second)
      UniqueValues.emplace_back(V);
  }
  size_t NumUniqueScalarValues = UniqueValues.size();
  bool IsFullVectors = hasFullVectorsOrPowerOf2(
      TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
  if (NumUniqueScalarValues == VL.size() && (SLPReVec || IsFullVectors)) {
    ReuseShuffleIndices.clear();
  } else {
    // FIXME: Reshuffling scalars is not supported yet for nodes with padding.
    if ((UserTreeIdx.UserTE &&
         UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
        !hasFullVectorsOrPowerOf2(TTI, VL.front()->getType(), VL.size())) {
      LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                           "for nodes with padding.\n");
      ReuseShuffleIndices.clear();
      return false;
    }
    if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
        (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
           return isa<UndefValue>(V) || !isConstant(V);
         }))) {
      if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
          S.getMainOp()->isSafeToRemove() &&
          (S.areInstructionsWithCopyableElements() ||
           all_of(UniqueValues, IsaPred<Instruction, PoisonValue>))) {
        // Find the number of elements, which forms full vectors.
        unsigned PWSz = getFullVectorNumberOfElements(
            TTI, UniqueValues.front()->getType(), UniqueValues.size());
        PWSz = std::min<unsigned>(PWSz, VL.size());
        if (PWSz == VL.size()) {
          // We ended up with the same size after the reshuffling, so keep it
          // as is.
          ReuseShuffleIndices.clear();
        } else {
          // Pad unique values with poison to grow the vector to a full
          // register size.
          SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
                                                  UniqueValues.end());
          PaddedUniqueValues.append(
              PWSz - UniqueValues.size(),
              PoisonValue::get(UniqueValues.front()->getType()));
          // Check that the operations extended with poisons are still valid
          // (e.g. no division by zero).
          if (!S.areInstructionsWithCopyableElements() &&
              !getSameOpcode(PaddedUniqueValues, TLI).valid()) {
            LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
            ReuseShuffleIndices.clear();
            return false;
          }
          VL = std::move(PaddedUniqueValues);
        }
        return true;
      }
      LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
      ReuseShuffleIndices.clear();
      return false;
    }
    VL = std::move(UniqueValues);
  }
  return true;
}
bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
                                const InstructionsState &LocalState,
                                SmallVectorImpl<Value *> &Op1,
                                SmallVectorImpl<Value *> &Op2,
                                OrdersType &ReorderIndices) const {
  constexpr unsigned SmallNodeSize = 4;
  if (VL.size() <= SmallNodeSize ||
      TTI->preferAlternateOpcodeVectorization() ||
      !SplitAlternateInstructions)
    return false;
  // Check if this is a duplicate of another split entry.
  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
                    << ".\n");
  for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
    if (E->isSame(VL)) {
      LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
                        << *LocalState.getMainOp() << ".\n");
      return false;
    }
  }
  ReorderIndices.assign(VL.size(), VL.size());
  SmallBitVector Op1Indices(VL.size());
  for (auto [Idx, V] : enumerate(VL)) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I) {
      Op1.push_back(V);
      Op1Indices.set(Idx);
      continue;
    }
    if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
         I->getOpcode() == LocalState.getOpcode()) ||
        (LocalState.getAltOpcode() == LocalState.getOpcode() &&
         !isAlternateInstruction(I, LocalState.getMainOp(),
                                 LocalState.getAltOp(), *TLI))) {
      Op1.push_back(V);
      Op1Indices.set(Idx);
      continue;
    }
    Op2.push_back(V);
  }
  Type *ScalarTy = getValueType(VL.front());
  VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
  unsigned Opcode0 = LocalState.getOpcode();
  unsigned Opcode1 = LocalState.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
  // Enable split node only if all nodes do not form a legal alternate
  // instruction (like X86 addsub).
  SmallPtrSet<Value *, 4> UOp1(llvm::from_range, Op1);
  SmallPtrSet<Value *, 4> UOp2(llvm::from_range, Op2);
  if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
      TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
      !hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), Op1.size()) ||
      !hasFullVectorsOrPowerOf2(*TTI, Op2.front()->getType(), Op2.size()))
    return false;
  // Build the reorder indices: first all Op1 lanes, then all Op2 lanes.
  unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
  for (unsigned Idx : seq<unsigned>(VL.size())) {
    if (Op1Indices.test(Idx)) {
      ReorderIndices[Op1Cnt] = Idx;
      ++Op1Cnt;
    } else {
      ReorderIndices[Op2Cnt] = Idx;
      ++Op2Cnt;
    }
  }
  if (isIdentityOrder(ReorderIndices))
    ReorderIndices.clear();
  SmallVector<int> Mask;
  if (!ReorderIndices.empty())
    inversePermutation(ReorderIndices, Mask);
  unsigned NumParts = TTI->getNumberOfParts(VecTy);
  // ...
  if (NumParts >= VL.size())
    return false;
  constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
  FixedVectorType *SubVecTy =
      getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
  // ...
  if (!LocalState.isCmpOp() && NumParts <= 1 &&
      (Mask.empty() || InsertCost >= NewShuffleCost))
    return false;
  if ((LocalState.getMainOp()->isBinaryOp() &&
       LocalState.getAltOp()->isBinaryOp() &&
       (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
        LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
      (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
      (LocalState.getMainOp()->isUnaryOp() &&
       LocalState.getAltOp()->isUnaryOp())) {
    InstructionCost OriginalVecOpsCost =
        TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
        TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
    SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
    for (unsigned Idx : seq<unsigned>(VL.size())) {
      if (isa<PoisonValue>(VL[Idx]))
        continue;
      OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
    }
    InstructionCost OriginalCost =
        OriginalVecOpsCost + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
                                              VecTy, OriginalMask, Kind);
    InstructionCost NewVecOpsCost =
        TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
        TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
    InstructionCost NewCost =
        NewVecOpsCost + InsertCost +
        (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
         VectorizableTree.front()->getOpcode() == Instruction::Store
             ? NewShuffleCost
             : 0);
    // If not profitable to split - exit.
    if (NewCost >= OriginalCost)
      return false;
  }
  return true;
}
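/// NOTE (inferred summary, not in the original source):
/// InstructionsCompatibilityAnalysis builds an InstructionsState for bundles
/// that contain "copyable" elements: it picks a main instruction with a
/// supported opcode (currently Add and LShr), substitutes the idempotent
/// value (e.g. 0 for Add) for the copyable lanes, and runs a profitability
/// check before allowing such a node to be vectorized.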
class InstructionsCompatibilityAnalysis {
  DominatorTree &DT;
  const DataLayout &DL;
  const TargetTransformInfo &TTI;
  const TargetLibraryInfo &TLI;
  unsigned MainOpcode = 0;
  Instruction *MainOp = nullptr;

  /// Identifies the best candidate value, which represents the main opcode
  /// operation.
  static bool isSupportedOpcode(const unsigned Opcode) {
    return Opcode == Instruction::Add || Opcode == Instruction::LShr;
  }

  void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
    BasicBlock *Parent = nullptr;
    auto IsSupportedInstruction = [&](Instruction *I) {
      return I && isSupportedOpcode(I->getOpcode()) /*...*/;
    };
    // Exclude operand instructions immediately to improve compile time.
    SmallDenseSet<Value *, 8> Operands;
    SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
    for (Value *V : VL) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (Candidates.empty()) {
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Parent = I->getParent();
        Operands.insert(I->op_begin(), I->op_end());
        continue;
      }
      if (Parent == I->getParent()) {
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Operands.insert(I->op_begin(), I->op_end());
        continue;
      }
      auto *NodeA = DT.getNode(Parent);
      auto *NodeB = DT.getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
        Candidates.clear();
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Parent = I->getParent();
        Operands.clear();
        Operands.insert(I->op_begin(), I->op_end());
      }
    }
    unsigned BestOpcodeNum = 0;
    for (const auto &P : Candidates) {
      if (P.second.size() < BestOpcodeNum)
        continue;
      for (Instruction *I : P.second) {
        if (IsSupportedInstruction(I) && !Operands.contains(I)) {
          MainOp = I;
          MainOpcode = I->getOpcode();
          BestOpcodeNum = P.second.size();
          break;
        }
      }
    }
    // ... (a final same-block check is elided:
    //      `return I && I->getParent() == MainOp->getParent() && ...`)
  }

  /// Returns the idempotent value for the detected MainOpcode (e.g. 0 for
  /// Add/LShr).
  Value *selectBestIdempotentValue() const {
    assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
    // ...
  }

  /// Returns the operands for \p V, considering whether it is an original
  /// instruction or a copyable element.
  SmallVector<Value *> getOperands(const InstructionsState &S,
                                   Value *V) const {
    if (!S.isCopyableElement(V))
      return convertTo(cast<Instruction>(V), S).second;
    assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
    return {V, selectBestIdempotentValue()};
  }

  /// Builds operands for the original instructions.
  void
  buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
                        SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
    unsigned ShuffleOrOp =
        S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector
                         : S.getOpcode();
    Instruction *VL0 = S.getMainOp();
    switch (ShuffleOrOp) {
    case Instruction::PHI: {
      auto *PH = cast<PHINode>(VL0);
      PHIHandler Handler(DT, PH, VL);
      Handler.buildOperands();
      Operands.assign(PH->getNumOperands(), {});
      for (unsigned I : seq<unsigned>(PH->getNumOperands()))
        Operands[I].assign(Handler.getOperands(I).begin(),
                           Handler.getOperands(I).end());
      return;
    }
    case Instruction::ExtractValue:
    case Instruction::ExtractElement:
      // ...
      return;
    case Instruction::InsertElement:
      // ...
      return;
    case Instruction::Load:
      Operands.assign(1, {VL.size(), nullptr});
      for (auto [V, Op] : zip(VL, Operands.back())) {
        auto *LI = dyn_cast<LoadInst>(V);
        if (!LI)
          continue;
        Op = LI->getPointerOperand();
      }
      return;
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast:
    case Instruction::ICmp:
    case Instruction::FCmp:
    case Instruction::Select:
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor:
    case Instruction::Freeze:
    case Instruction::Store:
    case Instruction::ShuffleVector:
      Operands.assign(VL0->getNumOperands(), {VL.size(), nullptr});
      for (auto [Idx, V] : enumerate(VL)) {
        auto *I = dyn_cast<Instruction>(V);
        if (!I) {
          // ...
          continue;
        }
        auto [Op, ConvertedOps] = convertTo(I, S);
        for (auto [OpIdx, Ops] : enumerate(Operands))
          Ops[Idx] = ConvertedOps[OpIdx];
      }
      return;
    case Instruction::GetElementPtr: {
      Operands.assign(2, {VL.size(), nullptr});
      // Need to cast all indices to the same type before vectorization to
      // avoid a crash.
      const unsigned IndexIdx = 1;
      Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
      Type *Ty = all_of(VL,
                        [&](Value *V) {
                          auto *GEP = dyn_cast<GetElementPtrInst>(V);
                          return !GEP ||
                                 VL0Ty == GEP->getOperand(IndexIdx)->getType();
                        })
                     ? VL0Ty
                     : DL.getIndexType(cast<GetElementPtrInst>(VL0)
                                           ->getPointerOperandType()
                                           ->getScalarType());
      for (auto [Idx, V] : enumerate(VL)) {
        auto *GEP = dyn_cast<GetElementPtrInst>(V);
        if (!GEP) {
          Operands[0][Idx] = V;
          Operands[1][Idx] = ConstantInt::getNullValue(Ty);
          continue;
        }
        Operands[0][Idx] = GEP->getPointerOperand();
        auto *Op = GEP->getOperand(IndexIdx);
        auto *CI = dyn_cast<ConstantInt>(Op);
        Operands[1][Idx] =
            CI ? ConstantFoldIntegerCast(CI, Ty,
                                         CI->getValue().isSignBitSet(), DL)
               : Op;
      }
      return;
    }
    case Instruction::Call: {
      auto *CI = cast<CallInst>(VL0);
      // ...
      for (Value *V : VL) {
        auto *I = dyn_cast<Instruction>(V);
        Ops.push_back(I ? I->getOperand(Idx)
                        : PoisonValue::get(VL0->getOperand(Idx)->getType()));
      }
      return;
    }
    default:
      break;
    }
    llvm_unreachable("Unexpected vectorization of the instructions.");
  }
public:
  InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
                                    const TargetTransformInfo &TTI,
                                    const TargetLibraryInfo &TLI)
      : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}

  InstructionsState
  buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
                         bool TryCopyableElementsVectorization,
                         bool WithProfitabilityCheck = false,
                         bool SkipSameCodeCheck = false) {
    InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
                              ? InstructionsState::invalid()
                              : getSameOpcode(VL, TLI);
    if (S)
      return S;
    if (!TryCopyableElementsVectorization)
      return S;
    findAndSetMainInstruction(VL, R);
    if (!MainOp)
      return InstructionsState::invalid();
    S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
    if (!WithProfitabilityCheck)
      return S;
    // Check if it is profitable to vectorize the instruction.
    SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
    auto BuildCandidates =
        [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates,
           Value *V1, Value *V2) {
          auto *I1 = dyn_cast<Instruction>(V1);
          auto *I2 = dyn_cast<Instruction>(V2);
          if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
              I1->getParent() != I2->getParent())
            return;
          Candidates.emplace_back(V1, V2);
        };
    if (VL.size() == 2) {
      // Check if the operands allow better vectorization.
      SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
      BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
      BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
      bool Res = !Candidates1.empty() && !Candidates2.empty() &&
                 R.findBestRootPair(Candidates1) &&
                 R.findBestRootPair(Candidates2);
      if (!Res && isCommutative(MainOp)) {
        Candidates1.clear();
        Candidates2.clear();
        BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
        BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
        Res = !Candidates1.empty() && !Candidates2.empty() &&
              R.findBestRootPair(Candidates1) &&
              R.findBestRootPair(Candidates2);
      }
      if (!Res)
        return InstructionsState::invalid();
    }
    // Compare the vector cost against the scalar cost for the supported
    // opcodes (checks partially elided).
    FixedVectorType *VecTy = getWidenedType(MainOp->getType(), VL.size());
    switch (MainOpcode) {
    case Instruction::Add:
    case Instruction::LShr: {
      // ...
      if (VectorCost > ScalarCost)
        return InstructionsState::invalid();
      break;
    }
    default:
      break;
    }
    assert(Operands.size() == 2 && "Unexpected number of operands!");
    unsigned CopyableNum =
        count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
    if (CopyableNum < VL.size() / 2)
      return S;
    // Too many phi copyables - exit.
    const unsigned Limit = VL.size() / 24;
    if ((CopyableNum >= VL.size() - Limit ||
         (CopyableNum >= VL.size() - 1 && VL.size() > 4) /*...*/))
      return InstructionsState::invalid();
    // ... (another early bailout returning InstructionsState::invalid() is
    //      elided here)
    // Check the profitability of the operands themselves.
    auto CheckOperand = [&](ArrayRef<Value *> Ops) {
      constexpr unsigned Limit = 4;
      if (Operands.front().size() >= Limit) {
        SmallDenseMap<const Value *, unsigned> Counters;
        for (Value *V : Ops)
          ++Counters[V];
        if (any_of(Counters,
                   [](const auto &C) { return C.second == 1; }))
          return false;
      }
      InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
      InstructionsState OpS = Analysis.buildInstructionsState(
          Ops, R, /*TryCopyableElementsVectorization=*/true);
      if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
        return false;
      unsigned CopyableNum =
          count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
      return CopyableNum <= VL.size() / 2;
    };
    if (!CheckOperand(Operands.front()))
      return InstructionsState::invalid();
    return S;
  }

  SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
                                                ArrayRef<Value *> VL) {
    assert(S && "Invalid state!");
    SmallVector<BoUpSLP::ValueList> Operands;
    if (S.areInstructionsWithCopyableElements()) {
      MainOp = S.getMainOp();
      MainOpcode = S.getOpcode();
      Operands.assign(MainOp->getNumOperands(),
                      BoUpSLP::ValueList(VL.size(), nullptr));
      for (auto [Idx, V] : enumerate(VL)) {
        SmallVector<Value *> OperandsForValue = getOperands(S, V);
        for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
          Operands[OperandIdx][Idx] = Operand;
      }
    } else {
      buildOriginalOperands(S, VL, Operands);
    }
    return Operands;
  }
};
BoUpSLP::ScalarsVectorizationLegality
BoUpSLP::getScalarsVectorizationLegality(
    ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
    bool TryCopyableElementsVectorization) const {
  assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(
      VL, *this, TryCopyableElementsVectorization,
      /*WithProfitabilityCheck=*/true, TryCopyableElementsVectorization);
  // ... (catchswitch-block check elided)
  //   return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
  //                                       /*TryToFindDuplicates=*/false);
  // Check if this is a duplicate of another entry.
  if (S) {
    LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp()
                      << ".\n");
    for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
      if (E->isSame(VL)) {
        LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
                          << *S.getMainOp() << ".\n");
        return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
      }
    }
  }
  // Do not vectorize PHI bundles that would cross a loop boundary (the
  // condition is partially elided in this listing).
  if (S && S.getOpcode() == Instruction::PHI &&
      any_of(VL, [&](Value *V) {
        return isa<PHINode>(V) &&
               LI->getLoopFor(S.getMainOp()->getParent()) /*...*/;
      }))
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
  // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext
  // of a load), in which case peek through to include it in the tree without
  // ballooning over-budget.
  if (Depth >= RecursionMaxDepth &&
      !(S && !S.isAltShuffle() && VL.size() >= 4 &&
        /*... load / zext-of-load pattern checks elided ...*/ false)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
  }
  // Don't handle scalable vectors.
  if (S && S.getOpcode() == Instruction::ExtractElement &&
      isa<ScalableVectorType>(
          cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
  }
  // Don't handle vectors.
  if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
                                        /*TryToFindDuplicates=*/false);
  }
  // If all of the operands are identical or constant we have a simple
  // solution. If we deal with insert/extract instructions, they all must
  // have constant indices, otherwise we should gather them, not try to
  // vectorize. If an alternate op node with 2 elements has gathered
  // operands - do not vectorize.
  auto NotProfitableForVectorization = [&S, this,
                                        Depth](ArrayRef<Value *> VL) {
    if (!S || !S.isAltShuffle() || VL.size() > 2)
      return false;
    // ...
    // Check if all operands are extracts, part of a vector node or can build
    // a regular vectorize node.
    SmallVector<unsigned, 8> InstsCount;
    for (Value *V : VL) {
      auto *I = cast<Instruction>(V);
      InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
        return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
      }));
    }
    bool IsCommutative =
        isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
    if ((IsCommutative &&
         std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
        (!IsCommutative &&
         all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
      return true;
    assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    auto *I1 = cast<Instruction>(VL.front());
    auto *I2 = cast<Instruction>(VL.back());
    for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    if (static_cast<unsigned>(count_if(
            Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
              return findBestRootPair(Cand);
            })) >= S.getMainOp()->getNumOperands() / 2)
      return false;
    if (S.getMainOp()->getNumOperands() > 2)
      return true;
    if (IsCommutative) {
      // Check permuted operands.
      Candidates.clear();
      for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
        Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                               I2->getOperand((Op + 1) % E));
      if (any_of(Candidates,
                 [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
                   return findBestRootPair(Cand);
                 }))
        return false;
    }
    return true;
  };
  SmallVector<unsigned> SortedIndices;
  BasicBlock *BB = nullptr;
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  bool AreAllSameBlock = S.valid();
  bool AreScatterAllGEPSameBlock =
      (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
       VL.size() > 2 &&
       /*... all GEPs with 2 operands in the same block ...*/
       sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL,
                       *SE, SortedIndices));
  bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
  if (!AreAllSameInsts || isSplat(VL) ||
      NotProfitableForVectorization(VL)) {
    if (!S) {
      LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
                           "C,S,B,O, small shuffle. \n";
                 /*...*/);
      return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
                                          /*TryToFindDuplicates=*/true,
                                          /*TrySplitVectorize=*/true);
    }
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
               /*...*/);
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
  }
  // Don't vectorize ephemeral values.
  if (S && !EphValues.empty()) {
    for (Value *V : VL) {
      if (EphValues.count(V)) {
        LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                          << ") is ephemeral.\n");
        return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
                                            /*TryToFindDuplicates=*/false);
      }
    }
  }
  // Check that alternate nodes are profitable to vectorize: otherwise it is
  // better to scalarize.
  if (S && S.isAltShuffle()) {
    auto GetNumVectorizedExtracted = [&]() {
      APInt Extracted = APInt::getZero(VL.size());
      APInt Vectorized = APInt::getAllOnes(VL.size());
      for (auto [Idx, V] : enumerate(VL)) {
        auto *I = dyn_cast<Instruction>(V);
        if (!I || all_of(I->operands(), [&](const Use &U) {
              return isa<ExtractElementInst>(U.get());
            }))
          continue;
        if (isVectorized(I))
          Vectorized.clearBit(Idx);
        else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
          Extracted.setBit(Idx);
      }
      return std::make_pair(Vectorized, Extracted);
    };
    auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
    constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
    bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
    if (!Vectorized.isAllOnes() && !PreferScalarize) {
      // Rough cost estimation, comparing extract from the vector and insert
      // into the vector.
      Type *ScalarTy = VL.front()->getType();
      auto *VecTy = getWidenedType(ScalarTy, VL.size());
      InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
          *TTI, ScalarTy, VecTy, Extracted,
          /*Insert=*/false, /*Extract=*/true, Kind);
      InstructionCost VectorizeCostEstimate = ::getScalarizationOverhead(
          *TTI, ScalarTy, VecTy, Vectorized,
          /*Insert=*/true, /*Extract=*/false, Kind, /*ForPoisonSrc=*/false);
      PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
    }
    if (PreferScalarize) {
      LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
                           "node is not profitable.\n");
      return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
    }
  }
  // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
  if (UserIgnoreList && !UserIgnoreList->empty()) {
    for (Value *V : VL) {
      if (UserIgnoreList->contains(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
        return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
      }
    }
  }
  // Special processing for sorted pointers for ScatterVectorize node with
  // constant indices only.
  if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
    assert(VL.front()->getType()->isPointerTy() &&
           count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
           "Expected pointers only.");
    // Reset S to make it GetElementPtr kind of node.
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    assert(It != VL.end() && "Expected at least one GEP.");
    S = getSameOpcode(*It, *TLI);
  }
  // Don't go into unreachable blocks. They may contain instructions with
  // dependency cycles which confuse the final scheduling.
  BasicBlock *EntryBB = S.getMainOp()->getParent();
  if (EntryBB->isEHPad() ||
      isa_and_nonnull<UnreachableInst>(EntryBB->getTerminator()) ||
      !DT->isReachableFromEntry(EntryBB)) {
    LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
  }
  return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
}
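/// NOTE (inferred summary, not in the original source): buildTreeRec() is
/// the recursive tree builder. It first tries to split alternate-opcode
/// bundles into two homogeneous SplitVectorize halves, then runs the
/// legality checks above, deduplicates the scalars, classifies the bundle
/// via getScalarsVectorizationState(), schedules it, and finally creates the
/// per-opcode TreeEntry and recurses into the operands.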
void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
                           const EdgeInfo &UserTreeIdx,
                           unsigned InterleaveFactor) {
  SmallVector<Value *> VL(VLRef.begin(), VLRef.end());
  SmallVector<int> ReuseShuffleIndices;
  // Tries to build a split node, reordering the scalars so that all the
  // main-opcode instructions come first.
  auto TrySplitNode = [&](const InstructionsState &LocalState) {
    SmallVector<Value *> Op1, Op2;
    OrdersType ReorderIndices;
    if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
      return false;
    auto Invalid = ScheduleBundle::invalid();
    auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
                            UserTreeIdx, {}, ReorderIndices);
    auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
      InstructionsState S = getSameOpcode(Op, *TLI);
      if (S && (isa<LoadInst>(S.getMainOp()) ||
                getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
        // Build gather node for loads, they will be gathered later.
        TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
                                                    Idx == 0 ? 0
                                                             : Op1.size());
        (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
      } else {
        TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
                                                    Idx == 0 ? 0
                                                             : Op1.size());
        buildTreeRec(Op, Depth, {TE, Idx});
      }
    };
    AddNode(Op1, 0);
    AddNode(Op2, 1);
    return true;
  };
  auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
    bool AreConsts = false;
    for (Value *V : VL) {
      if (isa<PoisonValue>(V))
        continue;
      if (isa<Constant>(V)) {
        AreConsts = true;
        continue;
      }
      if (!isa<PHINode>(V))
        return false;
    }
    return AreConsts;
  };
  if (AreOnlyConstsWithPHIs(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
    newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
    return;
  }

  ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
      VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
  InstructionsState S = Legality.getInstructionsState();
  if (!Legality.isLegal()) {
    if (Legality.trySplitVectorize()) {
      auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
      // Last chance to try to vectorize an alternate node.
      if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
        return;
    }
    if (!S)
      Legality = getScalarsVectorizationLegality(
          VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
    if (!Legality.isLegal()) {
      if (Legality.tryToFindDuplicates())
        tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
                            UserTreeIdx);
      newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
      return;
    }
    S = Legality.getInstructionsState();
  }

  // FIXME: investigate if there are profitable cases for VL.size() <= 4.
  if (S.isAltShuffle() && TrySplitNode(S))
    return;

  // Check that every instruction appears once in this bundle.
  if (!tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx,
                           /*TryPad=*/true)) {
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    return;
  }

  // Perform specific checks for each particular instruction kind.
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  OrdersType CurrentOrder;
  SmallVector<Value *> PointerOps;
  StridedPtrInfo SPtrInfo;
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
  if (State == TreeEntry::NeedToGather) {
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    return;
  }

  Instruction *VL0 = S.getMainOp();
  BasicBlock *BB = VL0->getParent();
  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef)
    BSRef = std::make_unique<BlockScheduling>(BB);

  BlockScheduling &BS = *BSRef;

  SetVector<Value *> UniqueValues(VL.begin(), VL.end());
  std::optional<ScheduleBundle *> BundlePtr =
      BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
#ifdef EXPENSIVE_CHECKS
  // Make sure we didn't break any internal invariants.
  BS.verify();
#endif
  if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    // Last chance to try to vectorize an alternate node.
    if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
      return;
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    NonScheduledFirst.insert(VL.front());
    if (S.getOpcode() == Instruction::Load &&
        BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
      registerNonVectorizableLoads(VL);
    return;
  }
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
  ScheduleBundle Empty;
  ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
    // Postpone PHI node creation.
    SmallVector<unsigned> PHIOps;
    for (unsigned I : seq<unsigned>(Operands.size())) {
      ArrayRef<Value *> Op = Operands[I];
      if (Op.empty())
        continue;
      InstructionsState S = getSameOpcode(Op, *TLI);
      if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
        buildTreeRec(Op, Depth + 1, {TE, I});
      else
        PHIOps.push_back(I);
    }
    for (unsigned I : PHIOps)
      buildTreeRec(Operands[I], Depth + 1, {TE, I});
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
               TE->dump());
    TE->setOperands(Operands);
    CreateOperandNodes(TE, Operands);
    return;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    if (CurrentOrder.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
    } else {
      LLVM_DEBUG({
        dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
                  "with order";
        for (unsigned Idx : CurrentOrder)
          dbgs() << " " << Idx;
        dbgs() << "\n";
      });
    }
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
                         "(ExtractValueInst/ExtractElementInst).\n";
               TE->dump());
    TE->setOperands(Operands);
    return;
  }
  case Instruction::InsertElement: {
    assert(ReuseShuffleIndices.empty() && "All inserts should be unique");

    auto OrdCompare = [](const std::pair<int, int> &P1,
                         const std::pair<int, int> &P2) {
      return P1.first > P2.first;
    };
    PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
                  decltype(OrdCompare)>
        Indices(OrdCompare);
    for (int I = 0, E = VL.size(); I < E; ++I) {
      unsigned Idx = *getElementIndex(VL[I]);
      Indices.emplace(Idx, I);
    }
    OrdersType CurrentOrder(VL.size(), VL.size());
    bool IsIdentity = true;
    for (int I = 0, E = VL.size(); I < E; ++I) {
      CurrentOrder[Indices.top().second] = I;
      IsIdentity &= Indices.top().second == I;
      Indices.pop();
    }
    if (IsIdentity)
      CurrentOrder.clear();
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 {}, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
               TE->dump());
    TE->setOperands(Operands);
    buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
    return;
  }
  case Instruction::Load: {
    // ...
    TreeEntry *TE = nullptr;
    switch (State) {
    case TreeEntry::Vectorize:
      TE = newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices,
                        CurrentOrder, InterleaveFactor);
      if (CurrentOrder.empty())
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
                   TE->dump());
      else
        LLVM_DEBUG(dbgs()
                       << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
                   TE->dump());
      break;
    case TreeEntry::CompressVectorize:
      TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      LLVM_DEBUG(
          dbgs()
              << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
          TE->dump());
      break;
    case TreeEntry::StridedVectorize:
      TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
                 TE->dump());
      break;
    case TreeEntry::ScatterVectorize:
      TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices);
      LLVM_DEBUG(
          dbgs()
              << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
          TE->dump());
      break;
    case TreeEntry::CombinedVectorize:
    case TreeEntry::SplitVectorize:
    case TreeEntry::NeedToGather:
      llvm_unreachable("Unexpected loads state.");
    }
    if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
      assert(Operands.size() == 1 && "Expected a single operand only");
      SmallVector<int> Mask;
      // ...
    }
    if (State == TreeEntry::ScatterVectorize)
      buildTreeRec(PointerOps, Depth + 1, {TE, 0});
    break;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
        std::make_pair(std::numeric_limits<unsigned>::min(),
                       std::numeric_limits<unsigned>::max()));
    if (ShuffleOrOp == Instruction::ZExt ||
        ShuffleOrOp == Instruction::SExt) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             PrevMaxBW),
          std::min<unsigned>(
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
              PrevMinBW));
    } else if (ShuffleOrOp == Instruction::Trunc) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
              PrevMaxBW),
          std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             PrevMinBW));
    }
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
               TE->dump());

    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    if (ShuffleOrOp == Instruction::Trunc) {
      ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    } else if (ShuffleOrOp == Instruction::SIToFP ||
               ShuffleOrOp == Instruction::UIToFP) {
      unsigned NumSignBits =
          ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
      if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
        APInt Mask = DB->getDemandedBits(OpI);
        NumSignBits = std::max(NumSignBits, Mask.countl_zero());
      }
      if (NumSignBits * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    }
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    // ...
    assert(P0 == CmpInst::getSwappedPredicate(P0) &&
           "Commutative Predicate mismatch");
    // ...
    if (Cmp->getPredicate() != P0) {
      // ...
    }
    // ...
    if (ShuffleOrOp == Instruction::ICmp) {
      unsigned NumSignBits0 =
          ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
      if (NumSignBits0 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      unsigned NumSignBits1 =
          ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
      if (NumSignBits1 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
    }
    break;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(
        dbgs() << "SLP: added a new TreeEntry "
                  "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
        TE->dump());
    // ...
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    break;
  }
  case Instruction::GetElementPtr: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
               TE->dump());
    // ...
    break;
  }
  case Instruction::Store: {
    bool Consecutive = CurrentOrder.empty();
    // ...
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    if (Consecutive)
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
                 TE->dump());
    else
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
          TE->dump());
    buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
    break;
  }
  case Instruction::Call: {
    // ...
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
               TE->dump());
    // ...
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    break;
  }
  case Instruction::ShuffleVector: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    if (S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
                 TE->dump());
    } else {
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
          TE->dump());
    }
    // ... ("Expected different main/alternate predicates.")
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    break;
  }
  default:
    break;
  }
}
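// The excerpt below is from BoUpSLP::canMapToVector: it flattens nested
// homogeneous aggregates (structs, arrays, fixed vectors) into a flat element
// count N and verifies that the widened type fits a vector register.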
  unsigned N = 1;
  Type *EltTy = T;
  while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
    if (auto *ST = dyn_cast<StructType>(EltTy)) {
      // Check that struct is homogeneous.
      for (const auto *Ty : ST->elements())
        if (Ty != *ST->element_begin())
          return 0;
      N *= ST->getNumElements();
      EltTy = *ST->element_begin();
    } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
      N *= AT->getNumElements();
      EltTy = AT->getElementType();
    } else {
      auto *VT = cast<FixedVectorType>(EltTy);
      N *= VT->getNumElements();
      EltTy = VT->getElementType();
    }
  }
  // ...
  size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
    return 0;
  return N;
}
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
                              SmallVectorImpl<unsigned> &CurrentOrder,
                              bool ResizeAllowed) const {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
  assert(It != VL.end() && "Expected at least one extract instruction.");
  auto *E0 = cast<Instruction>(*It);
  // Check if all of the extracts come from the same vector and from the
  // same position.
  Value *Vec = E0->getOperand(0);
  CurrentOrder.clear();
  // ...
  unsigned NElts;
  if (E0->getOpcode() == Instruction::ExtractValue) {
    NElts = canMapToVector(Vec->getType());
    // ...
  } else {
    NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
  }
  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
    return false;
  SmallVector<int> Indices(E, PoisonMaskElem);
  unsigned MinIdx = NElts, MaxIdx = 0;
  for (auto [I, V] : enumerate(VL)) {
    auto *Inst = dyn_cast<Instruction>(V);
    if (!Inst)
      continue;
    if (Inst->getOperand(0) != Vec)
      return false;
    // ...
    std::optional<unsigned> Idx = getExtractIndex(Inst);
    if (!Idx)
      return false;
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
      continue;
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
      MinIdx = ExtIdx;
    if (MaxIdx < ExtIdx)
      MaxIdx = ExtIdx;
  }
  if (MaxIdx - MinIdx + 1 > E)
    return false;
  if (MaxIdx + 1 <= E)
    MinIdx = 0;

  // Check that all of the indices extract from the correct offset.
  bool ShouldKeepOrder = true;
  CurrentOrder.assign(E, E);
  for (unsigned I = 0; I < E; ++I) {
    if (Indices[I] == PoisonMaskElem)
      continue;
    const unsigned ExtIdx = Indices[I] - MinIdx;
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
      return false;
    }
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  }
  if (ShouldKeepOrder)
    CurrentOrder.clear();

  return ShouldKeepOrder;
}
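// Example for canReuseExtract: extracts with indices {1, 0, 3, 2} yield
// CurrentOrder = {1, 0, 3, 2} and a false result (a reordering shuffle is
// still needed), while the identity sequence {0, 1, 2, 3} clears CurrentOrder
// and returns true (the source vector can be reused as-is).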
bool BoUpSLP::areAllUsersVectorized(
    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
  return (I->hasOneUse() &&
          (!VectorizedVals || VectorizedVals->contains(I))) ||
         all_of(I->users(), [this](User *U) {
           return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
         });
}
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {
  unsigned Sz = Scalars.size();
  Mask.assign(Sz, PoisonMaskElem);
  SmallVector<int> OrderMask;
  if (!ReorderIndices.empty())
    inversePermutation(ReorderIndices, OrderMask);
  for (unsigned I = 0; I < Sz; ++I) {
    unsigned Idx = I;
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    // ...
    auto *OpInst = cast<Instruction>(Scalars[Idx]);
    if (IsAltOp(OpInst)) {
      Mask[I] = Sz + Idx;
      if (AltScalars)
        AltScalars->push_back(OpInst);
    } else {
      Mask[I] = Idx;
      if (OpScalars)
        OpScalars->push_back(OpInst);
    }
  }
  if (!ReuseShuffleIndices.empty()) {
    SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
    transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    });
    Mask.swap(NewMask);
  }
}
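// Example: for an alternate <add, sub, add, sub> node with Sz = 4, IsAltOp
// selecting the subs produces Mask = {0, 5, 2, 7}: lanes 0/2 come from the
// "main" (add) vector and lanes 1/3 from the "alternate" (sub) vector of the
// final two-source blend shuffle.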
  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) ==
         MainOp;
}

bool BoUpSLP::isAlternateInstruction(const Instruction *I,
                                     const Instruction *MainOp,
                                     const Instruction *AltOp,
                                     const TargetLibraryInfo &TLI) const {
  if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
    // ...
    assert(MainP != AltP && "Expected different main/alternate predicates.");
    // ...
    CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
    assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
           "CmpInst expected to match either main or alternate predicate or "
           "their swap.");
    return MainP != P && MainP != SwappedP;
  }
  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
}
  const auto *Op0 = Ops.front();
  // ...
  const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
    auto *CI = dyn_cast<ConstantInt>(V);
    return CI && CI->getValue().isPowerOf2();
  });
  const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
    auto *CI = dyn_cast<ConstantInt>(V);
    return CI && CI->getValue().isNegatedPowerOf2();
  });
  // ...
  if (IsConstant && IsUniform)
    VK = TTI::OK_UniformConstantValue;
  else if (IsConstant)
    VK = TTI::OK_NonUniformConstantValue;
  else if (IsUniform)
    VK = TTI::OK_UniformValue;
class BaseShuffleAnalysis {
protected:
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}

  /// V is expected to be a vectorized value. Returns its vectorization factor
  /// in units of ScalarTy.
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    assert(isa<FixedVectorType>(V->getType()) &&
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    unsigned VNumElements =
        cast<FixedVectorType>(V->getType())->getNumElements();
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  }
  /// Checks if the mask is an identity mask.
  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
                             bool IsStrict) {
    int Limit = Mask.size();
    int VF = VecTy->getNumElements();
    // ...
    if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
          ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
          // ...
        }))
      return true;
    // ...
  }
  /// Composes two shuffle masks: the first one is applied to the operand of
  /// the second one.
  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
                           ArrayRef<int> ExtMask) {
    unsigned VF = Mask.size();
    SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      if (ExtMask[I] == PoisonMaskElem)
        continue;
      int MaskedIdx = Mask[ExtMask[I] % VF];
      NewMask[I] =
          MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
    }
    Mask.swap(NewMask);
  }
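  // Worked example: with LocalVF = VF = 4, Mask = {0, 2, 1, 3} and ExtMask =
  // {3, 1, 2, 0}, lane I of the result is Mask[ExtMask[I]], i.e.
  // {Mask[3], Mask[1], Mask[2], Mask[0]} = {3, 2, 1, 0} - shuffling an
  // already-shuffled vector collapses into one combined mask.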
  /// Looks through chains of shufflevector instructions, trying to replace V
  /// with a simpler source vector and a combined mask.
  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
                                  bool SinglePermute) {
    Value *Op = V;
    ShuffleVectorInst *IdentityOp = nullptr;
    SmallVector<int> IdentityMask;
    while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
      // ...
      if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
             // ...
             IdentityMask.size())) {
          IdentityOp = SV;
          IdentityMask.assign(Mask);
        }
      }
      // ...
      if (SV->isZeroEltSplat()) {
        IdentityOp = SV;
        IdentityMask.assign(Mask);
      }
      int LocalVF = Mask.size();
      if (auto *SVOpTy =
              dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
        LocalVF = SVOpTy->getNumElements();
      SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
      for (auto [Idx, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem ||
            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
          continue;
        ExtMask[Idx] = SV->getMaskValue(I);
      }
      // ...
      if (!IsOp1Undef && !IsOp2Undef) {
        // Update mask and mark undef elems.
        for (int &I : Mask) {
          if (I == PoisonMaskElem)
            continue;
          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
              PoisonMaskElem)
            I = PoisonMaskElem;
        }
        break;
      }
      SmallVector<int> ShuffleMask(SV->getShuffleMask());
      combineMasks(LocalVF, ShuffleMask, Mask);
      Mask.swap(ShuffleMask);
      if (IsOp2Undef)
        Op = SV->getOperand(0);
      else
        Op = SV->getOperand(1);
    }
    if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
        !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
        // ...
      if (IdentityOp) {
        V = IdentityOp;
        assert(Mask.size() == IdentityMask.size() &&
               "Expected masks of same sizes.");
        // ...
        Mask.swap(IdentityMask);
        auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
        return SinglePermute &&
               // ...
               (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
                Shuffle->isZeroEltSplat() &&
                // ...
                Shuffle->getShuffleMask()[P.index()] == 0;
      }
      // ...
    }
    // ...
  }
  /// Smart shuffle instruction emission, walking through operand shuffles to
  /// emit the minimal number of real shuffle instructions.
  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder, Type *ScalarTy) {
    assert(V1 && "Expected at least one vector value.");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    SmallVector<int> NewMask(Mask);
    if (ScalarTyNumElements != 1) {
      // ...
    }
    if (V2) {
      Builder.resizeToMatch(V1, V2);
      int VF = Mask.size();
      if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
        VF = FTy->getNumElements();
      // Peek through both operand chains until they stop changing.
      Value *Op1 = V1;
      Value *Op2 = V2;
      SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
      SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (Mask[I] < VF)
          CombinedMask1[I] = Mask[I];
        else
          CombinedMask2[I] = Mask[I] - VF;
      }
      Value *PrevOp1;
      Value *PrevOp2;
      do {
        PrevOp1 = Op1;
        PrevOp2 = Op2;
        (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
        (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
        // Check if we have 2 resizing shuffles - need to peek through operands
        // again.
        if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
          if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
            SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(CombinedMask1)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask1[Idx] = SV1->getMaskValue(I);
            }
            SmallBitVector UseMask1 = buildUseMask(
                cast<FixedVectorType>(SV1->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask1, UseMask::SecondArg);
            SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(CombinedMask2)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask2[Idx] = SV2->getMaskValue(I);
            }
            SmallBitVector UseMask2 = buildUseMask(
                cast<FixedVectorType>(SV2->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask2, UseMask::SecondArg);
            if (SV1->getOperand(0)->getType() ==
                    SV2->getOperand(0)->getType() &&
                SV1->getOperand(0)->getType() != SV1->getType() &&
                // ...
                ) {
              Op1 = SV1->getOperand(0);
              Op2 = SV2->getOperand(0);
              SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
              int LocalVF = ShuffleMask1.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
              CombinedMask1.swap(ShuffleMask1);
              SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
              LocalVF = ShuffleMask2.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
              CombinedMask2.swap(ShuffleMask2);
            }
          }
      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
      Builder.resizeToMatch(Op1, Op2);
      unsigned VF2 = std::max(cast<VectorType>(Op1->getType())
                                  ->getElementCount()
                                  .getKnownMinValue(),
                              cast<VectorType>(Op2->getType())
                                  ->getElementCount()
                                  .getKnownMinValue());
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (CombinedMask2[I] != PoisonMaskElem) {
          assert(CombinedMask1[I] == PoisonMaskElem &&
                 "Expected undefined mask element");
          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF2);
        }
      }
      // ...
      if (Op1 == Op2 /* && identity mask */)
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
          Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
          CombinedMask1);
    }
    if (isa<PoisonValue>(V1))
      return Builder.createPoison(
          cast<VectorType>(V1->getType())->getElementType(), Mask.size());
    bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
    assert(V1 && "Expected non-null value after looking through shuffles.");
    if (!IsIdentity)
      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
  }
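  // Design note: createShuffle is templated over the builder so the same
  // mask-simplification walk serves two clients - an IR-emitting builder
  // (T = Value *) and the cost-estimating ShuffleCostBuilder (T =
  // InstructionCost) defined further below.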
/// Returns the cost of the pointer chains for the scalar and the vectorized
/// forms of a memory operation rooted at BasePtr.
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy) {
  InstructionCost ScalarCost = 0;
  InstructionCost VecCost = 0;
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // Estimate costs when vectorizing to a plain wide load/store: scalar
    // addressing goes away except for pointers that survive into the vector
    // code.
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
        CostKind);
    SmallVector<const Value *> PtrsRetainedInVecCode;
    for (Value *V : Ptrs) {
      if (V == BasePtr) {
        PtrsRetainedInVecCode.push_back(V);
        continue;
      }
      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
      if (!Ptr || !Ptr->hasOneUse())
        PtrsRetainedInVecCode.push_back(V);
    }
    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // ...
    }
    VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
                                       TTI::PointersChainInfo::getKnownStride(),
                                       VecTy, CostKind);
  } else {
    // Pointers are operands of a scatter/gather-style node.
    TTI::PointersChainInfo PtrsInfo =
        all_of(Ptrs,
               [](const Value *V) {
                 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
                 return Ptr && !Ptr->hasAllConstantIndices();
               })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();
    ScalarCost =
        TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
    auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
    if (!BaseGEP) {
      auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
      if (It != Ptrs.end())
        BaseGEP = cast<GEPOperator>(*It);
    }
    if (BaseGEP) {
      SmallVector<const Value *> Indices(BaseGEP->indices());
      VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                               BaseGEP->getPointerOperand(), Indices, VecTy,
                               CostKind);
    }
  }
  return std::make_pair(ScalarCost, VecCost);
}
void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
  assert(TE.isGather() && TE.ReorderIndices.empty() &&
         "Expected gather node without reordering.");
  // ...
  SmallSet<size_t, 2> LoadKeyUsed;

  // Do not reorder nodes that are small or already uniform.
  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
      // ...
      any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
        return VectorizableTree[Idx]->isSame(TE.Scalars);
      }))
    return;

  auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
    // ...
    auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
    if (LIt != LoadsMap.end()) {
      for (LoadInst *RLI : LIt->second) {
        if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                            LI->getType(), LI->getPointerOperand(), *DL, *SE,
                            /*StrictCheck=*/true))
          return hash_value(RLI->getPointerOperand());
      }
      for (LoadInst *RLI : LIt->second) {
        if (arePointersCompatible(RLI->getPointerOperand(),
                                  LI->getPointerOperand(), *TLI))
          return hash_value(RLI->getPointerOperand());
      }
      if (LIt->second.size() > 2)
        return hash_value(LIt->second.back()->getPointerOperand());
    }
    // ...
  };
  MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
  SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
  bool IsOrdered = true;
  unsigned NumInstructions = 0;
  // Try to "cluster" the scalars so equal sub-keys become contiguous.
  for (auto [I, V] : enumerate(TE.Scalars)) {
    size_t Key = 1, Idx = 1;
    // ...
    auto &Container = SortedValues[Key];
    if (IsOrdered && !KeyToIndex.contains(V) &&
        // ...
        ((Container.contains(Idx) &&
          KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
         (!Container.empty() && !Container.contains(Idx) &&
          KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
      IsOrdered = false;
    auto &KTI = KeyToIndex[V];
    if (KTI.empty())
      Container[Idx].push_back(V);
    KTI.push_back(I);
  }
  // ...
  if (!IsOrdered && NumInstructions > 1) {
    unsigned Cnt = 0;
    TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
    for (const auto &D : SortedValues) {
      for (const auto &P : D.second) {
        unsigned Sz = 0;
        for (Value *V : P.second) {
          ArrayRef<unsigned> Indices = KeyToIndex.at(V);
          for (auto [K, Idx] : enumerate(Indices)) {
            TE.ReorderIndices[Cnt + K] = Idx;
            TE.Scalars[Cnt + K] = V;
          }
          Sz += Indices.size();
          Cnt += Indices.size();
        }
        if (Sz > 1 && isa<Instruction>(P.second.front())) {
          const unsigned SubVF = getFloorFullVectorNumberOfElements(
              *TTI, TE.Scalars.front()->getType(), Sz);
          // ...
        } else if (!P.second.empty() && isConstant(P.second.front())) {
          // ...
        }
      }
    }
  }
  // Reuses always require shuffles, so consider them as profitable.
  if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
    return;
  // Do simple cost estimation.
  auto *ScalarTy = TE.Scalars.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
  InstructionCost Cost = 0;
  for (auto [Idx, Sz] : SubVectors) {
    // Cost of inserting each clustered subvector.
    // ...
  }
  int Sz = TE.Scalars.size();
  SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                               TE.ReorderIndices.end());
  for (unsigned I : seq<unsigned>(Sz)) {
    // ...
    ReorderMask[I] = I + TE.ReorderIndices.size();
  }
  Cost += ::getShuffleCost(*TTI,
                           any_of(ReorderMask, [&](int I) { return I >= Sz; })
                               ? TTI::SK_PermuteTwoSrc
                               : TTI::SK_PermuteSingleSrc,
                           VecTy, ReorderMask);
  // Estimate the plain buildvector alternative.
  for (unsigned I : seq<unsigned>(Sz)) {
    // ...
    if (isConstant(V)) {
      DemandedElts.clearBit(I);
      if (!isa<PoisonValue>(V))
        ReorderMask[I] = I;
    } else {
      ReorderMask[I] = I + Sz;
    }
  }
  // ...
  if (!DemandedElts.isAllOnes())
    BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
  if (Cost >= BVCost) {
    SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
    reorderScalars(TE.Scalars, Mask);
    TE.ReorderIndices.clear();
  }
}
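// Trade-off above: Cost models "reorder + insert clustered subvectors" while
// BVCost models a plain buildvector of the gathered scalars; the reordering is
// kept only when it is strictly cheaper (Cost < BVCost).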
12696 const InstructionsState &S,
12702 return V->getType()->getScalarType()->isFloatingPointTy();
12704 "Can only convert to FMA for floating point types");
12705 assert(S.isAddSubLikeOp() &&
"Can only convert to FMA for add/sub");
12710 for (
Value *V : VL) {
12714 if (S.isCopyableElement(
I))
12716 Instruction *MatchingI = S.getMatchingMainOpOrAltOp(
I);
12717 if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
12720 FMF &= FPCI->getFastMathFlags();
12724 if (!CheckForContractable(VL))
12727 InstructionsCompatibilityAnalysis
Analysis(DT,
DL,
TTI, TLI);
12734 if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
12736 if (!CheckForContractable(
Operands.front()))
12744 for (
Value *V : VL) {
12748 if (!S.isCopyableElement(
I))
12750 FMF &= FPCI->getFastMathFlags();
12751 FMulPlusFAddCost +=
TTI.getInstructionCost(
I,
CostKind);
12755 if (S.isCopyableElement(V))
12758 if (!
I || !
I->hasOneUse() || OpS.isCopyableElement(
I)) {
12760 FMACost +=
TTI.getInstructionCost(OpI,
CostKind);
12767 FMF &= FPCI->getFastMathFlags();
12768 FMulPlusFAddCost +=
TTI.getInstructionCost(
I,
CostKind);
  BaseGraphSize = VectorizableTree.size();
  // Turn graph transforming mode on and off, when done.
  class GraphTransformModeRAAI {
    bool &SavedIsGraphTransformMode;

  public:
    GraphTransformModeRAAI(bool &IsGraphTransformMode)
        : SavedIsGraphTransformMode(IsGraphTransformMode) {
      IsGraphTransformMode = true;
    }
    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
  } TransformContext(IsGraphTransformMode);
  // ...
  auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
                                           const InstructionsState &S) {
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    return all_of(
        Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
          return all_of(Cand,
                        [](const std::pair<Value *, Value *> &P) {
                          // ...
                        });
        });
  };
12816 TreeEntry &E = *VectorizableTree[Idx];
12818 reorderGatherNode(E);
12823 constexpr unsigned VFLimit = 16;
12824 bool ForceLoadGather =
12825 count_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
12826 return TE->isGather() && TE->hasState() &&
12827 TE->getOpcode() == Instruction::Load &&
12828 TE->getVectorFactor() < VFLimit;
12834 return TE->isSame(VL) ||
all_of(VL, [&](
Value *V) {
12843 auto CheckForSameVectorNodes = [&](
const TreeEntry &E) {
12844 if (E.hasState()) {
12846 !TEs.
empty() &&
any_of(TEs, [&](
const TreeEntry *TE) {
12847 return AreReusedScalars(TE, E.Scalars, [&](
Value *V) {
12848 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
12849 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12850 return is_contained(TEs, TE);
12857 !TEs.
empty() &&
any_of(TEs, [&](
const TreeEntry *TE) {
12858 return AreReusedScalars(TE, E.Scalars, [&](
Value *V) {
12859 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
12860 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12861 return is_contained(TEs, TE);
12869 if (It != E.Scalars.end()) {
12871 !TEs.empty() &&
any_of(TEs, [&](
const TreeEntry *TE) {
12872 return AreReusedScalars(TE, E.Scalars, [&](
Value *V) {
12873 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
12874 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12875 return is_contained(TEs, TE);
  // The tree is transformed here: try to split gather nodes into smaller
  // vectorizable slices.
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather()) {
      ArrayRef<Value *> VL = E.Scalars;
      const unsigned Sz = getVectorElementSize(VL.front());
      unsigned MinVF = getMinVF(2 * Sz);
      if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
          !(!E.hasState() || E.getOpcode() == Instruction::Load ||
            // ...
        continue;
      if (ForceLoadGather && E.hasState() &&
          E.getOpcode() == Instruction::Load)
        continue;
      // Check if the node is a copy of other vector nodes.
      if (CheckForSameVectorNodes(E))
        continue;
      // Try to find vectorizable sequences and transform them into a series
      // of insertvector instructions.
      unsigned StartIdx = 0;
      unsigned End = VL.size();
      for (unsigned VF = getFloorFullVectorNumberOfElements(
               *TTI, VL.front()->getType(), VL.size() - 1);
           VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
                            *TTI, VL.front()->getType(), VF - 1)) {
        if (StartIdx + VF > End)
          continue;
        SmallVector<std::pair<unsigned, unsigned>> Slices;
        bool AllStrided = true;
        for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
          // If any instruction is vectorized already - do not try again.
          if (getSameValuesTreeEntry(Slice.front(), Slice,
                                     /*SameVF=*/true))
            continue;
          InstructionsState S = getSameOpcode(Slice, *TLI);
          // ...
          bool IsSplat = isSplat(Slice);
          bool IsTwoRegisterSplat = true;
          if (IsSplat && VF == 2) {
            // ...
            IsTwoRegisterSplat = NumRegs2VF == 2;
          }
          if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
              // ...
              (S.getOpcode() == Instruction::Load &&
               // ...
              (S.getOpcode() != Instruction::Load &&
               // ...
            if ((!UserIgnoreList || E.Idx != 0) &&
                TTI->getInstructionCost(S.getMainOp(), CostKind) <
                    TTI::TCC_Expensive) {
              // ...
              if (S.getOpcode() == Instruction::Load) {
                // ...
                StridedPtrInfo SPtrInfo;
                LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order,
                                                   PointerOps, SPtrInfo);
                // ...
                if (UserIgnoreList && E.Idx == 0)
                  continue;
              } else if (S.getOpcode() == Instruction::ExtractElement ||
                         (TTI->getInstructionCost(S.getMainOp(), CostKind) <
                              TTI::TCC_Expensive &&
                          !CheckOperandsProfitability(
                              // ...
                continue;
              }
            }
          }
          Slices.emplace_back(Cnt, Slice.size());
        }
        if (VF == 2 && AllStrided && Slices.size() > 2)
          continue;
        auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
          E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
          if (StartIdx == Cnt)
            StartIdx = Cnt + Sz;
          if (End == Cnt + Sz)
            End = Cnt;
        };
        for (auto [Cnt, Sz] : Slices) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
          const TreeEntry *SameTE = nullptr;
          if (const auto *It = find_if(Slice, IsaPred<Instruction>);
              It != Slice.end()) {
            SameTE = getSameValuesTreeEntry(*It, Slice);
          }
          unsigned PrevSize = VectorizableTree.size();
          [[maybe_unused]] unsigned PrevEntriesSize =
              LoadEntriesToVectorize.size();
          buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
          if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
              VectorizableTree[PrevSize]->isGather() &&
              VectorizableTree[PrevSize]->hasState() &&
              VectorizableTree[PrevSize]->getOpcode() !=
                  Instruction::ExtractElement &&
              !isSplat(Slice)) {
            if (UserIgnoreList && E.Idx == 0 && VF == 2)
              continue;
            VectorizableTree.pop_back();
            assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
                   "LoadEntriesToVectorize expected to remain the same");
            continue;
          }
          AddCombinedNode(PrevSize, Cnt, Sz);
        }
      }
    }
    if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
      SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
      reorderScalars(E.Scalars, Mask);
      E.ReorderIndices.clear();
    }
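    // The slicing loop above carves the gathered scalars into the widest
    // whole-register slices (VF starts at the largest full-vector factor and
    // shrinks) and re-runs buildTreeRec on each slice; a slice that only
    // yields another gather node is popped again, so only genuinely
    // vectorizable subsequences survive as combined entries.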
    if (!E.hasState())
      continue;
    switch (E.getOpcode()) {
    case Instruction::Load: {
      // No need to handle masked gather loads, do not reorder them.
      if (E.State != TreeEntry::Vectorize)
        break;
      Type *ScalarTy = E.getMainOp()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      // Check if profitable to represent consecutive load + reverse as
      // strided load with stride -1.
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseLI = cast<LoadInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
                                 BaseLI->getPointerAddressSpace(), CostKind,
                                 TTI::OperandValueInfo()) +
            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
        if (StridedCost < OriginalVecCost) {
          // Strided load is more profitable than consecutive load + reverse -
          // transform the node to strided load.
          Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
                                                ->getPointerOperand()
                                                ->getType());
          StridedPtrInfo SPtrInfo;
          SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
          SPtrInfo.Ty = VecTy;
          TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
          E.State = TreeEntry::StridedVectorize;
        }
      }
      break;
    }
    case Instruction::Store: {
      // Check if profitable to represent consecutive store + reverse as
      // strided store with stride -1.
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseSI = cast<StoreInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
                                 BaseSI->getPointerAddressSpace(), CostKind,
                                 TTI::OperandValueInfo()) +
            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
        if (StridedCost < OriginalVecCost)
          // Strided store is more profitable than reverse + consecutive
          // store - transform the node to strided store.
          E.State = TreeEntry::StridedVectorize;
      } else if (!E.ReorderIndices.empty()) {
        // Check for interleaved stores.
        auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
          auto *BaseSI = cast<StoreInst>(E.Scalars.front());
          assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
          if (Mask.size() < 4)
            return 0u;
          for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
            if (ShuffleVectorInst::isInterleaveMask(
                    Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
                TTI.isLegalInterleavedAccessType(
                    VecTy, Factor, BaseSI->getAlign(),
                    BaseSI->getPointerAddressSpace()))
              return Factor;
          }
          return 0u;
        };
        SmallVector<int> Mask(E.ReorderIndices.begin(),
                              E.ReorderIndices.end());
        unsigned InterleaveFactor = IsInterleaveMask(Mask);
        if (InterleaveFactor != 0)
          E.setInterleave(InterleaveFactor);
      }
      break;
    }
    case Instruction::Select: {
      if (E.State != TreeEntry::Vectorize)
        break;
      // ...
      // This node is a minmax node.
      E.CombinedOp = TreeEntry::MinMax;
      TreeEntry *CondEntry = getOperandEntry(&E, 0);
      if (SelectOnly && CondEntry->UserTreeIndex &&
          CondEntry->State == TreeEntry::Vectorize) {
        // The condition node is part of the combined minmax node.
        CondEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    case Instruction::FSub:
    case Instruction::FAdd: {
      // Check if possible to convert (a*b)+c to fma.
      if (E.State != TreeEntry::Vectorize ||
          !E.getOperations().isAddSubLikeOp())
        break;
      // ...
      E.CombinedOp = TreeEntry::FMulAdd;
      TreeEntry *FMulEntry = getOperandEntry(&E, 0);
      if (FMulEntry->UserTreeIndex &&
          FMulEntry->State == TreeEntry::Vectorize) {
        // The FMul node is part of the combined fmuladd node.
        FMulEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    default:
      break;
    }
  }
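  // Both memory cases above follow the same pattern: price the plain wide
  // load/store plus the reverse shuffle against a stride -1 strided access,
  // and rewrite the node state only when the strided form wins.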
  if (LoadEntriesToVectorize.empty()) {
    // Single load node - exit.
    if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Load)
      return;
    // Small graph with small VF - exit.
    constexpr unsigned SmallTree = 3;
    constexpr unsigned SmallVF = 2;
    if ((VectorizableTree.size() <= SmallTree &&
         VectorizableTree.front()->Scalars.size() == SmallVF) ||
        (VectorizableTree.size() <= 2 && UserIgnoreList))
      return;

    if (VectorizableTree.front()->isNonPowOf2Vec() &&
        // ...
        count_if(ArrayRef(VectorizableTree).drop_front(),
                 [](const std::unique_ptr<TreeEntry> &TE) {
                   return TE->isGather() && TE->hasState() &&
                          TE->getOpcode() == Instruction::Load &&
                          // ...
                 }))
      return;
  }

  // A list of loads to be gathered during the vectorization process. We can
  // try to vectorize them at the end, if profitable.
  SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
      GatheredLoads;

  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    if (E.isGather() &&
        ((E.hasState() && E.getOpcode() == Instruction::Load) ||
         (!E.hasState() &&
          any_of(E.Scalars,
                 [&](Value *V) {
                   return isa<LoadInst>(V) &&
                          !isVectorized(V) &&
                          !isDeleted(cast<Instruction>(V));
                 }))) &&
        // ...
      for (Value *V : E.Scalars) {
        auto *LI = dyn_cast<LoadInst>(V);
        if (!LI)
          continue;
        // ...
        gatherPossiblyVectorizableLoads(
            *this, V, *DL, *SE, *TTI,
            GatheredLoads[std::make_tuple(
                LI->getParent(),
                getUnderlyingObject(LI->getPointerOperand()),
                LI->getType())]);
      }
  }
  if (!GatheredLoads.empty())
    tryToVectorizeGatheredLoads(GatheredLoads);
  bool IsFinalized = false;
  SmallVector<int> CommonMask;
  SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
  const TargetTransformInfo &TTI;
  InstructionCost Cost = 0;
  SmallDenseSet<Value *> VectorizedVals;
  BoUpSLP &R;
  SmallPtrSetImpl<Value *> &CheckedExtracts;
  constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  bool SameNodesEstimated = true;

  static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
    if (Ty->getScalarType()->isPointerTy()) {
      Constant *Res = ConstantExpr::getIntToPtr(
          ConstantInt::getAllOnesValue(IntegerType::get(
              Ty->getContext(),
              DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
          Ty->getScalarType());
      if (auto *VTy = dyn_cast<VectorType>(Ty))
        Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
      return Res;
    }
    return Constant::getAllOnesValue(Ty);
  }

  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
    if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
      return TTI::TCC_Free;
    auto *VecTy = getWidenedType(ScalarTy, VL.size());
    InstructionCost GatherCost = 0;
    SmallVector<Value *> Gathers(VL);
    if (!Root && isSplat(VL)) {
      // Found the broadcasting of the single scalar, calculate the cost as
      // the broadcast.
      const auto *It = find_if_not(VL, IsaPred<UndefValue>);
      assert(It != VL.end() && "Expected at least one non-undef value.");
      bool NeedShuffle =
          count(VL, *It) > 1 &&
          // ...
      if (!NeedShuffle) {
        if (isa<FixedVectorType>(ScalarTy)) {
          // ...
          return TTI.getShuffleCost(
              TTI::SK_InsertSubvector, VecTy, {}, CostKind,
              std::distance(VL.begin(), It) * getNumElements(ScalarTy),
              cast<FixedVectorType>(ScalarTy));
        }
        return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                      CostKind, std::distance(VL.begin(), It),
                                      PoisonValue::get(VecTy), *It);
      }
      SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
      transform(VL, ShuffleMask.begin(), [](Value *V) {
        return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
      });
      InstructionCost InsertCost =
          TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
                                 0, PoisonValue::get(VecTy), *It);
      return InsertCost + ::getShuffleCost(TTI, TTI::SK_Broadcast,
                                           VecTy, ShuffleMask, CostKind);
    }
    // ...
    return GatherCost +
           (all_of(Gathers, IsaPred<UndefValue>)
                ? TTI::TCC_Free
                : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
                                  ScalarTy));
  }
  /// Compute the cost of creating a vector containing the extracted values
  /// (going from scalar to vector).
  InstructionCost
  computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
                     ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                     unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
    unsigned NumElts =
        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          if (!EE)
            return Sz;
          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
          if (!VecTy)
            return Sz;
          return std::max(Sz, VecTy->getNumElements());
        });
    // ...
    auto CheckPerRegistersShuffle =
        [&](MutableArrayRef<int> Mask, SmallVectorImpl<unsigned> &Indices,
            SmallVectorImpl<unsigned> &SubVecSizes)
        -> std::optional<TTI::ShuffleKind> {
      if (NumElts <= EltsPerVector)
        return std::nullopt;
      int OffsetReg0 =
          alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
                                    [](int S, int I) {
                                      if (I == PoisonMaskElem)
                                        return S;
                                      return std::min(S, I);
                                    }),
                    EltsPerVector);
      int OffsetReg1 = OffsetReg0;
      // Check that if trying to permute same single/2 input vectors.
      TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
      int FirstRegId = -1;
      Indices.assign(1, OffsetReg0);
      // ...
      for (auto [Pos, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem)
          continue;
        int Idx = I - OffsetReg0;
        int RegId =
            (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
        if (FirstRegId < 0)
          FirstRegId = RegId;
        RegIndices.insert(RegId);
        if (RegIndices.size() > 2)
          return std::nullopt;
        if (RegIndices.size() == 2) {
          ShuffleKind = TTI::SK_PermuteTwoSrc;
          if (Indices.size() == 1) {
            OffsetReg1 = alignDown(
                std::accumulate(
                    std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
                    [&](int S, int I) {
                      if (I == PoisonMaskElem)
                        return S;
                      int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
                                  ((I - OffsetReg0) % NumElts) / EltsPerVector;
                      if (RegId == FirstRegId)
                        return S;
                      return std::min(S, I);
                    }),
                EltsPerVector);
            unsigned Index = OffsetReg1 % NumElts;
            Indices.push_back(Index);
            SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
          }
          Idx = I - OffsetReg1;
        }
        I = (Idx % NumElts) % EltsPerVector +
            (RegId == FirstRegId ? 0 : EltsPerVector);
      }
      return ShuffleKind;
    };
    InstructionCost Cost = 0;

    // Process extracts in blocks of EltsPerVector to check if the source
    // vector operand can be re-used directly.
    for (unsigned Part : seq<unsigned>(NumParts)) {
      if (!ShuffleKinds[Part])
        continue;
      ArrayRef<int> MaskSlice = Mask.slice(
          Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
      SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
      copy(MaskSlice, SubMask.begin());
      SmallVector<unsigned, 2> Indices;
      SmallVector<unsigned, 2> SubVecSizes;
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
      if (!RegShuffleKind) {
        if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
            !ShuffleVectorInst::isIdentityMask(
                MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
          Cost +=
              ::getShuffleCost(TTI, *ShuffleKinds[Part],
                               getWidenedType(ScalarTy, NumElts), MaskSlice);
        continue;
      }
      // ...
      const unsigned BaseVF = getFullVectorNumberOfElements(
          *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
      for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
        assert((Idx + SubVecSize) <= BaseVF &&
               "SK_ExtractSubvector index out of range");
        // ...
      }
      // Second attempt to check if just a permute is better estimated than
      // a subvector extract.
      InstructionCost OriginalCost = ::getShuffleCost(
          TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
      if (OriginalCost < Cost)
        Cost = OriginalCost;
    }
    return Cost;
  }
  /// Estimates the cost of shuffling one or two tree-entry nodes.
  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
                                ArrayRef<int> Mask, unsigned Part,
                                unsigned SliceSize) {
    if (SameNodesEstimated) {
      // Delay the cost estimation if the same nodes are reshuffling: merge
      // the sub-mask into CommonMask and price everything once later.
      if ((InVectors.size() == 2 &&
           // ...
        unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
        // ... ("Expected all poisoned elements.")
        copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
        return;
      }
      // Found non-matching nodes - need to estimate the cost for the matched
      // ones and transform the mask.
      Cost += createShuffle(InVectors.front(),
                            InVectors.size() == 1 ? nullptr : InVectors.back(),
                            CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      unsigned VF = E1.getVectorFactor();
      if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
        VF = std::max(VF, getVF(V1));
      } else {
        const auto *E = cast<const TreeEntry *>(InVectors.front());
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(InVectors.front(), &E1, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else {
      auto P = InVectors.front();
      Cost += createShuffle(&E1, E2, Mask);
      unsigned VF = Mask.size();
      if (Value *V1 = P.dyn_cast<Value *>()) {
        VF = std::max(VF, getNumElements(V1->getType()));
      } else {
        const auto *E = cast<const TreeEntry *>(P);
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
      Cost += createShuffle(P, InVectors.front(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
  }
  class ShuffleCostBuilder {
    const TargetTransformInfo &TTI;

    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
      int Index = -1;
      return Mask.empty() ||
             (VF == Mask.size() &&
              ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
             (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
              Index == 0);
    }

  public:
    ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
    ~ShuffleCostBuilder() = default;
    InstructionCost createShuffleVector(Value *V1, Value *,
                                        ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      unsigned VF = getNumElements(V1->getType());
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      unsigned VF = getNumElements(V1->getType());
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
    InstructionCost createPoison(Type *, unsigned) const {
      return TTI::TCC_Free;
    }
    void resizeToMatch(Value *&, Value *&) const {}
  };
  InstructionCost
  createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
                const PointerUnion<Value *, const TreeEntry *> &P2,
                ArrayRef<int> Mask) {
    ShuffleCostBuilder Builder(TTI);
    SmallVector<int> CommonMask(Mask.begin(), Mask.end());
    Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
    unsigned CommonVF = Mask.size();
    InstructionCost ExtraCost = 0;
    auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
                                        unsigned VF) -> InstructionCost {
      // ...
      Type *EScalarTy = E.Scalars.front()->getType();
      bool IsSigned = true;
      if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
        EScalarTy =
            IntegerType::get(EScalarTy->getContext(), It->second.first);
        IsSigned = It->second.second;
      }
      if (EScalarTy != ScalarTy) {
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
                                    getWidenedType(EScalarTy, VF),
                                    TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
      // ...
      auto *VecTy = cast<VectorType>(V->getType());
      Type *EScalarTy = VecTy->getElementType();
      if (EScalarTy != ScalarTy) {
        // ...
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(
            CastOpcode, getWidenedType(ScalarTy, getNumElements(VecTy)),
            VecTy, TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    if (!V1 && !V2 && !P2.isNull()) {
      // Shuffle 2 entry nodes.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E->Scalars.size() == E2->Scalars.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        SmallVector<int> E2Mask = E2->getCommonMask();
        if (!EMask.empty() || !E2Mask.empty()) {
          for (int &Idx : CommonMask) {
            if (Idx == PoisonMaskElem)
              continue;
            if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
              Idx = EMask[Idx];
            else if (Idx >= static_cast<int>(CommonVF))
              Idx = (E2Mask.empty() ? Idx - CommonVF
                                    : E2Mask[Idx - CommonVF]) +
                    E->Scalars.size();
          }
        }
        CommonVF = E->Scalars.size();
        ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
                     GetNodeMinBWAffectedCost(*E2, CommonVF);
      } else {
        ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
                     GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
      }
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && P2.isNull()) {
      // Shuffle single entry node.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      CommonVF = VF;
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        assert(!EMask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx != PoisonMaskElem)
            Idx = EMask[Idx];
        }
        CommonVF = E->Scalars.size();
      } else if (unsigned Factor = E->getInterleaveFactor();
                 Factor > 0 && E->Scalars.size() != Mask.size() &&
                 // ...
        // Deinterleaved nodes are free.
        std::iota(CommonMask.begin(), CommonMask.end(), 0);
      }
      ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      // Check the reorder of the node against the original vector order.
      if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
          CommonVF == CommonMask.size() &&
          any_of(enumerate(CommonMask),
                 [](const auto &&P) {
                   return P.value() != PoisonMaskElem &&
                          static_cast<unsigned>(P.value()) != P.index();
                 })) {
        // ...
      }
    } else if (V1 && P2.isNull()) {
      // Shuffle single vector.
      ExtraCost += GetValueMinBWAffectedCost(V1);
      CommonVF = getVF(V1);
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
    } else if (V1 && !V2) {
      // Shuffle vector and tree node.
      unsigned VF = getVF(V1);
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E2->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E2Mask = E2->getCommonMask();
        assert(!E2Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E2Mask[Idx - CommonVF] + VF;
        }
        CommonVF = VF;
      }
      ExtraCost += GetValueMinBWAffectedCost(V1);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetNodeMinBWAffectedCost(
          *E2, std::min(CommonVF, E2->getVectorFactor()));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && V2) {
      // Shuffle tree node and vector.
      unsigned VF = getVF(V2);
      const TreeEntry *E1 = cast<const TreeEntry *>(P1);
      CommonVF = std::max(VF, E1->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E1->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E1Mask = E1->getCommonMask();
        assert(!E1Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E1Mask[Idx - CommonVF] + VF;
          else
            Idx = E1Mask[Idx];
        }
        CommonVF = VF;
      }
      ExtraCost += GetNodeMinBWAffectedCost(
          *E1, std::min(CommonVF, E1->getVectorFactor()));
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetValueMinBWAffectedCost(V2);
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else {
      assert(V1 && V2 && "Expected both vectors.");
      unsigned VF = getVF(V1);
      CommonVF = std::max(VF, getVF(V2));
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      ExtraCost +=
          GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
      if (V1->getType() != V2->getType()) {
        V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      } else {
        if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
          V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
          V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      }
    }
    InVectors.front() =
        Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    if (InVectors.size() == 2)
      InVectors.pop_back();
    return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
                           V1, V2, CommonMask, Builder, ScalarTy);
  }
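  // Note how the cost variant mirrors the IR-emitting variant: CommonVF is
  // the register shape both inputs are normalized to, and null / all-ones
  // dummy vectors stand in for tree entries that are not materialized yet.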
public:
  ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
                       ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
                       SmallPtrSetImpl<Value *> &CheckedExtracts)
      : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
        CheckedExtracts(CheckedExtracts) {}
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    if (Mask.empty())
      return nullptr;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    // Check if these extracts can be considered reused if the same
    // extractelements were vectorized already.
    bool PrevNodeFound = any_of(
        ArrayRef(R.VectorizableTree).take_front(E->Idx),
        [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((TE->hasState() && !TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  TE->isGather()) &&
                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
                 });
        });
    SmallPtrSet<Value *, 4> UniqueBases;
    SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
    for (unsigned Part : seq<unsigned>(NumParts)) {
      // ...
      ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      for (auto [I, V] :
           enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
        auto *EE = dyn_cast<ExtractElementInst>(V);
        if (!EE)
          continue;
        VecBase = EE->getVectorOperand();
        UniqueBases.insert(VecBase);
        // If it has only one use and it is vectorized already or is used by
        // GEPs with all-vectorized users - skip.
        if (!CheckedExtracts.insert(V).second ||
            !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
            any_of(EE->users(), [&](User *U) {
              return isa<GetElementPtrInst>(U) &&
                     !R.areAllUsersVectorized(cast<Instruction>(U),
                                              &VectorizedVals);
            }))
          continue;
        std::optional<unsigned> EEIdx = getExtractIndex(EE);
        if (!EEIdx)
          continue;
        unsigned Idx = *EEIdx;
        // Take credit for the instructions that become dead.
        if (EE->hasOneUse() || !PrevNodeFound) {
          Instruction *Ext = EE->user_back();
          if (isa<SExtInst, ZExtInst>(Ext) &&
              all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
            // Use getExtractWithExtendCost() to price the extract/ext pair.
            Cost -= TTI.getExtractWithExtendCost(
                Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
                Idx);
            // Add back the cost of s|zext which is subtracted separately.
            Cost += TTI.getCastInstrCost(
                Ext->getOpcode(), Ext->getType(), EE->getType(),
                TTI::getCastContextHint(Ext), CostKind, Ext);
            continue;
          }
        }
        APInt &DemandedElts =
            VectorOpsToExtracts
                .try_emplace(VecBase,
                             APInt::getZero(getNumElements(VecBase->getType())))
                .first->getSecond();
        DemandedElts.setBit(Idx);
      }
    }
    for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
      Cost -= TTI.getScalarizationOverhead(cast<VectorType>(Vec->getType()),
                                           DemandedElts, /*Insert=*/false,
                                           /*Extract=*/true, CostKind);
    // Check that the gather of extractelements can be represented as a plain
    // shuffle of one/two of the source vectors.
    if (!PrevNodeFound)
      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
    // ...
    transformMaskAfterShuffle(CommonMask, CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      UseVecBaseAsInput = true;
      // ...
    }
    return VecBase;
  }
  std::optional<InstructionCost>
  needToDelay(const TreeEntry *,
              ArrayRef<SmallVector<const TreeEntry *>>) const {
    // No need to delay the cost estimation during analysis.
    return std::nullopt;
  }
  void resetForSameNode() {
    IsFinalized = false;
    CommonMask.clear();
    InVectors.clear();
    Cost = 0;
    VectorizedVals.clear();
    SameNodesEstimated = true;
  }
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    // ...
    assert(all_of(Mask,
                  [&](int Idx) {
                    return Idx < static_cast<int>(E1.getVectorFactor());
                  }) &&
           "Expected single vector shuffle mask.");
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign({&E1, &E2});
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    // ...
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
  }
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, &E1);
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    // ...
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
    if (!SameNodesEstimated && InVectors.size() == 1)
      InVectors.emplace_back(&E1);
  }
  /// Adds 2 input vectors (in form of tree entries).
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    // May come only for shuffling of 2 vectors with extractelements.
    assert(InVectors.size() == 1 &&
           all_of(enumerate(CommonMask),
                  [&](auto P) {
                    // ...
                    auto *EI = cast<ExtractElementInst>(
                        cast<const TreeEntry *>(InVectors.front())
                            ->getOrdered(P.index()));
                    return EI->getVectorOperand() == V1 ||
                           EI->getVectorOperand() == V2;
                  }) &&
           "Expected extractelement vectors.");
    // ...
  }
  /// Adds another one input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
    if (InVectors.empty()) {
      assert(CommonMask.empty() && !ForExtracts &&
             "Expected empty input mask/vectors.");
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, V1);
      return;
    }
    if (ForExtracts) {
      // No need to add vectors here, already handled them in adjustExtracts.
      assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors.front()) &&
             !CommonMask.empty() &&
             all_of(enumerate(CommonMask),
                    [&](auto P) {
                      Value *Scalar =
                          cast<const TreeEntry *>(InVectors.front())
                              ->getOrdered(P.index());
                      if (P.value() == PoisonMaskElem)
                        return P.value() == Mask[P.index()] ||
                               isa<UndefValue>(Scalar);
                      auto *EI = cast<ExtractElementInst>(Scalar);
                      return EI->getVectorOperand() == V1;
                    }) &&
             "Expected only tree entry for extractelement vectors.");
      return;
    }
    assert(!InVectors.empty() && !CommonMask.empty() &&
           "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = getVF(V1);
    if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      VF = std::max<unsigned>(VF, CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(VF, InTE->getVectorFactor());
    } else {
      VF = std::max(VF, getNumElements(
                            cast<Value *>(InVectors.front())->getType()));
    }
    InVectors.push_back(V1);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + VF;
  }
  void gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
              Value *Root = nullptr) {
    Cost += getBuildVectorCost(VL, Root);
    if (!Root) {
      // ...
      unsigned VF = VL.size();
      if (MaskVF != 0)
        VF = std::min(VF, MaskVF);
      Type *VLScalarTy = VL.front()->getType();
      // ...
      Vals.push_back(getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
      // ...
    }
  }
  InstructionCost finalize(
      ArrayRef<int> ExtMask,
      ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
      ArrayRef<int> SubVectorsMask, unsigned VF = 0,
      function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    if (Action) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      // ...
      Cost += createShuffle(V1, V2, Mask);
      // ...
      InVectors.front() = V;
    }
    if (!SubVectors.empty()) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      // Add subvectors permutation cost.
      if (!SubVectorsMask.empty()) {
        assert(SubVectorsMask.size() <= CommonMask.size() &&
               "Expected same size of masks for subvectors and common mask.");
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I2 != PoisonMaskElem) {
            assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
            I1 = I2 + CommonMask.size();
          }
        }
        // ...
      }
      for (auto [E, Idx] : SubVectors) {
        Type *EScalarTy = E->Scalars.front()->getType();
        bool IsSigned = true;
        if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
          EScalarTy =
              IntegerType::get(EScalarTy->getContext(), It->second.first);
          IsSigned = It->second.second;
        }
        if (ScalarTy != EScalarTy) {
          unsigned CastOpcode = Instruction::Trunc;
          unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
          if (DstSz > SrcSz)
            CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          Cost += TTI.getCastInstrCost(
              CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
              getWidenedType(EScalarTy, E->getVectorFactor()),
              TTI::CastContextHint::None, CostKind);
        }
        // ...
        if (!CommonMask.empty()) {
          std::iota(std::next(CommonMask.begin(), Idx),
                    std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
                    Idx);
        }
      }
    }
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return Cost;
    }
    return Cost +
           createShuffle(InVectors.front(),
                         InVectors.size() == 2 ? InVectors.back() : nullptr,
                         CommonMask);
  }

  ~ShuffleCostEstimator() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
  assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
  return Op;
}

TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::CompressVectorize)
    return TTI::CastContextHint::Masked;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
      !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    return TTI::CastContextHint::Reversed;
  }
  return TTI::CastContextHint::None;
}
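// The cast-context hint tells TTI how the casted value is produced (gather/
// scatter, masked, plain, or reversed load), which changes e.g. the price of
// a load + sext pair on targets with native extending-load instructions.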
InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                      SmallPtrSetImpl<Value *> &CheckedExtracts) {
  ArrayRef<Value *> VL = E->Scalars;
  Type *ScalarTy = getValueType(VL.front());
  // ...
  auto It = MinBWs.find(E);
  Type *OrigScalarTy = ScalarTy;
  if (It != MinBWs.end()) {
    // ...
  }
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  unsigned EntryVF = E->getVectorFactor();
  // ...
  if (E->isGather()) {
    // ...
    if (isa<InsertElementInst>(VL.front()))
      ScalarTy = VL.front()->getType();
    return processBuildVector<ShuffleCostEstimator, InstructionCost>(
        E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
  }
  if (E->State == TreeEntry::SplitVectorize) {
    assert(E->CombinedEntriesWithIndices.size() == 2 &&
           "Expected exactly 2 combined entries.");
    assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
    InstructionCost VectorCost = 0;
    if (E->ReorderIndices.empty()) {
      VectorCost = ::getShuffleCost(
          *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
          E->CombinedEntriesWithIndices.back().second,
          getWidenedType(
              ScalarTy,
              VectorizableTree[E->CombinedEntriesWithIndices.back().first]
                  ->getVectorFactor()));
    } else {
      unsigned CommonVF =
          std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
                       ->getVectorFactor(),
                   VectorizableTree[E->CombinedEntriesWithIndices.back().first]
                       ->getVectorFactor());
      // ...
    }
    LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0,
                             "Calculated costs for Tree"));
    return VectorCost;
  }
  InstructionCost CommonCost = 0;
  SmallVector<int> Mask;
  if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
      (E->State != TreeEntry::StridedVectorize ||
       !isReverseOrder(E->ReorderIndices))) {
    SmallVector<int> NewMask;
    if (E->getOpcode() == Instruction::Store) {
      // For stores the order is actually a mask.
      NewMask.resize(E->ReorderIndices.size());
      copy(E->ReorderIndices, NewMask.begin());
    } else {
      inversePermutation(E->ReorderIndices, NewMask);
    }
    ::addMask(Mask, NewMask);
  }
  if (!E->ReuseShuffleIndices.empty())
    ::addMask(Mask, E->ReuseShuffleIndices);
  // ...
  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize ||
          E->State == TreeEntry::StridedVectorize ||
          E->State == TreeEntry::CompressVectorize) &&
         "Unhandled state");
  assert((allSameType(VL) ||
          (E->getOpcode() == Instruction::GetElementPtr &&
           E->getMainOp()->getType()->isPointerTy()) ||
          E->hasCopyableElements()) &&
         "Invalid VL");
  Instruction *VL0 = E->getMainOp();
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  if (E->CombinedOp != TreeEntry::NotCombinedOp)
    ShuffleOrOp = E->CombinedOp;
  SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
  const unsigned Sz = UniqueValues.size();
  SmallBitVector UsedScalars(Sz, false);
  for (unsigned I = 0; I < Sz; ++I) {
    if (isa<Instruction>(UniqueValues[I]) &&
        !E->isCopyableElement(UniqueValues[I]) &&
        getTreeEntries(UniqueValues[I]).front() == E)
      continue;
    UsedScalars.set(I);
  }
  auto GetCastContextHint = [&](Value *V) {
    if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
      return getCastContextHint(*OpTEs.front());
    InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
    if (SrcState && SrcState.getOpcode() == Instruction::Load &&
        !SrcState.isAltShuffle())
      return TTI::CastContextHint::GatherScatter;
    return TTI::CastContextHint::None;
  };
  auto GetCostDiff =
      [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
          function_ref<InstructionCost(InstructionCost)> GetVectorCost) {
        // Calculate the cost of this instruction.
        InstructionCost ScalarCost = 0;
        if (isa<CastInst, CallInst>(VL0)) {
          // For some of the instructions no need to calculate cost for each
          // particular instruction, we can use the cost of the single
          // instruction x total number of scalar instructions.
          ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
        } else {
          for (unsigned I = 0; I < Sz; ++I) {
            if (UsedScalars.test(I))
              continue;
            ScalarCost += ScalarEltCost(I);
          }
        }
        InstructionCost VecCost = GetVectorCost(CommonCost);
        // Check if the current node must be resized, if the operand must be
        // converted to match the vector register size.
        if (It != MinBWs.end() &&
            (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
          const EdgeInfo &EI = E->UserTreeIndex;
          if (!EI.UserTE->hasState() ||
              EI.UserTE->getOpcode() != Instruction::Select ||
              EI.EdgeIdx != 0) {
            auto UserBWIt = MinBWs.find(EI.UserTE);
            Type *UserScalarTy =
                (EI.UserTE->isGather() ||
                 EI.UserTE->State == TreeEntry::SplitVectorize)
                    ? EI.UserTE->Scalars.front()->getType()
                    : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
            if (UserBWIt != MinBWs.end())
              UserScalarTy = IntegerType::get(ScalarTy->getContext(),
                                              UserBWIt->second.first);
            if (ScalarTy != UserScalarTy) {
              unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
              unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
              unsigned VecOpcode;
              auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
              if (BWSz > SrcBWSz)
                VecOpcode = Instruction::Trunc;
              else
                VecOpcode =
                    It->second.second ? Instruction::SExt : Instruction::ZExt;
              TTI::CastContextHint CCH = GetCastContextHint(VL0);
              VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
                                               CostKind);
            }
          }
        }
        LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
                                 ScalarCost, "Calculated costs for Tree"));
        return VecCost - ScalarCost;
      };
  // Calculate cost difference from vectorizing set of GEPs.
  // Negative value means vectorizing is profitable.
  auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
    assert((E->State == TreeEntry::Vectorize ||
            E->State == TreeEntry::StridedVectorize ||
            E->State == TreeEntry::CompressVectorize) &&
           "Entry state expected to be Vectorize, StridedVectorize or "
           "MaskedLoadCompressVectorize here.");
    InstructionCost ScalarCost = 0;
    InstructionCost VecCost = 0;
    std::tie(ScalarCost, VecCost) = getGEPCosts(
        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
    LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
                             "Calculated GEPs cost for Tree"));
    return VecCost - ScalarCost;
  };

  auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
    // ...
    Type *CanonicalType = Ty;
    if (CanonicalType->isPtrOrPtrVectorTy())
      CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
          CanonicalType->getContext(),
          DL->getTypeSizeInBits(CanonicalType->getScalarType())));
    IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
                                      {CanonicalType, CanonicalType});
    InstructionCost IntrinsicCost =
        TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
    // If the selects are the only uses of the compares, they will be dead
    // and we can adjust the cost by removing their cost.
    if (VI && SelectOnly) {
      assert((!Ty->isVectorTy() || SLPReVec) &&
             "Expected only for scalar type.");
      auto *CI = cast<CmpInst>(VI->getOperand(0));
      IntrinsicCost -= TTI->getCmpSelInstrCost(
          CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
          CostKind, {TTI::OK_AnyValue, TTI::OP_None},
          {TTI::OK_AnyValue, TTI::OP_None}, CI);
    }
    return IntrinsicCost;
  };
  auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
                                         Instruction *VI) {
    // ...
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Count reused scalars.
    InstructionCost ScalarCost = 0;
    SmallPtrSet<const TreeEntry *, 4> CountedOps;
    for (Value *V : UniqueValues) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;
      ValueList Operands(PHI->getNumIncomingValues(), nullptr);
      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I)
        Operands[I] = PHI->getIncomingValue(I);
      if (const TreeEntry *OpTE =
              getSameValuesTreeEntry(Operands.front(), Operands))
        if (CountedOps.insert(OpTE).second &&
            !OpTE->ReuseShuffleIndices.empty())
          ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
                                          OpTE->Scalars.size());
    }
    return CommonCost - ScalarCost;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    APInt DemandedElts;
    VectorType *SrcVecTy = nullptr;
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      if (ShuffleOrOp == Instruction::ExtractElement) {
        auto *EE = cast<ExtractElementInst>(UniqueValues[Idx]);
        SrcVecTy = EE->getVectorOperandType();
      } else {
        auto *EV = cast<ExtractValueInst>(UniqueValues[Idx]);
        Type *AggregateTy = EV->getAggregateOperand()->getType();
        unsigned NumElts;
        if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
          NumElts = ATy->getNumElements();
        else
          NumElts = AggregateTy->getStructNumElements();
        SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
      }
      auto *I = cast<Instruction>(UniqueValues[Idx]);
      if (I->hasOneUse()) {
        // ...
        Cost -= TTI->getCastInstrCost(
            Ext->getOpcode(), Ext->getType(), I->getType(),
            TTI::getCastContextHint(Ext), CostKind, Ext);
        // ...
      }
      if (DemandedElts.isZero())
        DemandedElts = APInt::getZero(getNumElements(SrcVecTy));
      DemandedElts.setBit(*getExtractIndex(I));
      return InstructionCost(TTI::TCC_Free);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      return CommonCost - (DemandedElts.isZero()
                               ? TTI::TCC_Free
                               : TTI.getScalarizationOverhead(
                                     SrcVecTy, DemandedElts, /*Insert=*/false,
                                     /*Extract=*/true, CostKind));
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
    auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
    unsigned const NumElts = SrcVecTy->getNumElements();
    unsigned const NumScalars = VL.size();
    // Find the lane range [OffsetBeg, OffsetEnd] actually written by the
    // inserts.
    unsigned OffsetBeg = *getElementIndex(VL.front());
    unsigned OffsetEnd = OffsetBeg;
    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    InsertMask[OffsetBeg] = 0;
    for (auto [I, V] : enumerate(VL.drop_front())) {
      unsigned Idx = *getElementIndex(V);
      if (OffsetBeg > Idx)
        OffsetBeg = Idx;
      else if (OffsetEnd < Idx)
        OffsetEnd = Idx;
      InsertMask[Idx] = I + 1;
    }
    unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
    unsigned NumOfParts = ::getNumberOfParts(*TTI, SrcVecTy);
    if (NumOfParts > 0 && NumOfParts < NumElts)
      VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
                     VecScalarsSz;
    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
    unsigned InsertVecSz = std::min<unsigned>(
        PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
    bool IsWholeSubvector =
        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
    // ...
    if (OffsetBeg + InsertVecSz > VecSz) {
      // ...
      InsertVecSz = VecSz;
    }
    // ...
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
      Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
    } else {
      Mask.assign(VecSz, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
      DemandedElts.setBit(InsertIdx);
      IsIdentity &= InsertIdx - OffsetBeg == I;
      Mask[InsertIdx - OffsetBeg] = I;
    }
    assert(Offset < NumElts && "Failed to find vector index offset");
    // ...
    if (!IsIdentity)
      Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                               InsertVecTy, Mask);
    // ...
    SmallBitVector InMask =
        isUndefVector(FirstInsert->getOperand(0),
                      buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
      if (InsertVecSz != VecSz) {
        // ...
      } else {
        for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
          Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
        for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
             I <= End; ++I)
          if (Mask[I] != PoisonMaskElem)
            Mask[I] = I + VecSz;
        for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
          Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
        // ...
      }
    }
    return Cost;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    Type *SrcScalarTy = VL0->getOperand(0)->getType();
    auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
    unsigned Opcode = ShuffleOrOp;
    unsigned VecOpcode = Opcode;
    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
        SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
        SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
      }
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
      assert(Idx == 0 && "Expected 0 index only");
      return TTI->getCastInstrCost(Opcode, VL0->getType(),
                                   VL0->getOperand(0)->getType(),
                                   TTI::getCastContextHint(VL0), CostKind, VL0);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // Do not count the cost of a bitcast that only changes the sign.
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
        return CommonCost;
      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
      TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
      bool IsArithmeticExtendedReduction =
          E->Idx == 0 && UserIgnoreList &&
          all_of(*UserIgnoreList, [](Value *V) {
            auto *I = cast<Instruction>(V);
            return is_contained({Instruction::Add, Instruction::FAdd,
                                 Instruction::Mul, Instruction::FMul,
                                 Instruction::And, Instruction::Or,
                                 Instruction::Xor},
                                I->getOpcode());
          });
      if (IsArithmeticExtendedReduction &&
          (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
        return CommonCost;
      return CommonCost +
             TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
                                   VecOpcode == Opcode ? VI : nullptr);
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
    CmpPredicate VecPred, SwappedVecPred;
    auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
    if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
        match(VL0, MatchCmp))
      SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
    // ...
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      if ((CurrentPred != VecPred && CurrentPred != SwappedVecPred &&
           !match(VI, MatchCmp)) ||
          // ...
      return TTI->getCmpSelInstrCost(
          E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
          CostKind, getOperandInfo(VI->getOperand(0)),
          getOperandInfo(VI->getOperand(1)), VI);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
      InstructionCost VecCost =
          TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
                                  CostKind, getOperandInfo(E->getOperand(0)),
                                  getOperandInfo(E->getOperand(1)), VL0);
      if (auto *SI = dyn_cast<SelectInst>(VL0)) {
        auto *CondType =
            getWidenedType(SI->getCondition()->getType(), VL.size());
        unsigned CondNumElements = CondType->getNumElements();
        unsigned VecTyNumElements = getNumElements(VecTy);
        assert(VecTyNumElements >= CondNumElements &&
               VecTyNumElements % CondNumElements == 0 &&
               "Cannot vectorize Instruction::Select");
        if (CondNumElements != VecTyNumElements) {
          // ...
        }
      }
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::MinMax: {
    auto GetScalarCost = [&](unsigned Idx) {
      return GetMinMaxCost(OrigScalarTy);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecCost = GetMinMaxCost(VecTy);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::FMulAdd: {
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      return GetFMulAddCost(E->getOperations(),
                            cast<Instruction>(UniqueValues[Idx]));
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      FastMathFlags FMF;
      FMF.set();
      for (Value *V : E->Scalars) {
        if (auto *FPCI = dyn_cast<FPMathOperator>(V))
          FMF &= FPCI->getFastMathFlags();
        if (auto *FPCIOp = dyn_cast<FPMathOperator>(
                cast<Instruction>(V)->getOperand(0)))
          FMF &= FPCIOp->getFastMathFlags();
      }
      IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
                                  {VecTy, VecTy, VecTy}, FMF);
      InstructionCost VecCost = TTI->getIntrinsicInstrCost(ICA, CostKind);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
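  // For FMulAdd the vector cost is a single fmuladd intrinsic whose fast-math
  // flags are the intersection of the flags of all fused fmul/fadd scalars,
  // so one scalar lacking "contract" disables the fusion for the whole node.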
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      auto GetScalarCost = [&](unsigned Idx) {
        Value *Op1 = E->getOperand(0)[Idx];
        Value *Op2 = nullptr;
        SmallVector<const Value *, 2> Operands(1, Op1);
        if (ShuffleOrOp != Instruction::FNeg) {
          Op2 = E->getOperand(1)[Idx];
          Operands.push_back(Op2);
        }
        if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
            I && (ShuffleOrOp == Instruction::FAdd ||
                  ShuffleOrOp == Instruction::FSub)) {
          // Contractable FP add/sub may later be fused into fmuladd.
        }
        return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
                                           Op1Info, Op2Info, Operands);
      };
      auto GetVectorCost = [=](InstructionCost CommonCost) {
        // An 'and' whose mask keeps at least as many low ones as the
        // minimized bitwidth is a noop after truncation.
        if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
          if (all_of(E->getOperand(1), [&](Value *V) {
                auto *CI = dyn_cast<ConstantInt>(V);
                return CI && CI->getValue().countr_one() >= It->second.first;
              }))
            return CommonCost;
        }
        return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind,
                                           Op1Info, Op2Info, {}, nullptr,
                                           TLI) +
               CommonCost;
      };
      return GetCostDiff(GetScalarCost, GetVectorCost);
    }
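    // Note: the Instruction::And special case above treats masks that only
    // clear bits already removed by minimum-bitwidth truncation as free,
    // since after truncation the mask is a noop.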
    case Instruction::GetElementPtr: {
      return CommonCost + GetGEPCostDiff(VL, VL0);
    }
    case Instruction::Load: {
      auto GetScalarCost = [&](unsigned Idx) {
        auto *VI = cast<LoadInst>(UniqueValues[Idx]);
        return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
                                    VI->getAlign(),
                                    VI->getPointerAddressSpace(), CostKind,
                                    TTI::OperandValueInfo(), VI);
      };
      auto *LI0 = cast<LoadInst>(VL0);
      auto GetVectorCost = [&](InstructionCost CommonCost) {
        InstructionCost VecLdCost;
        switch (E->State) {
        case TreeEntry::Vectorize:
          if (unsigned Factor = E->getInterleaveFactor()) {
            VecLdCost = TTI->getInterleavedMemoryOpCost(
                Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
                LI0->getPointerAddressSpace(), CostKind);
          } else {
            VecLdCost = TTI->getMemoryOpCost(
                Instruction::Load, VecTy, LI0->getAlign(),
                LI0->getPointerAddressSpace(), CostKind);
          }
          break;
        case TreeEntry::StridedVectorize: {
          Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
          VecLdCost = TTI->getStridedMemoryOpCost(
              Instruction::Load, VecTy, LI0->getPointerOperand(),
              /*VariableMask=*/false, CommonAlignment, CostKind);
          break;
        }
        case TreeEntry::CompressVectorize: {
          bool IsMasked;
          unsigned InterleaveFactor;
          SmallVector<int> CompressMask;
          VectorType *LoadVecTy;
          SmallVector<Value *> Scalars(VL.begin(), VL.end());
          if (!E->ReorderIndices.empty()) {
            SmallVector<int> Mask(E->ReorderIndices.begin(),
                                  E->ReorderIndices.end());
            reorderScalars(Scalars, Mask);
          }
          [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
              Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
              *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
              CompressMask, LoadVecTy);
          assert(IsVectorized && "Failed to vectorize load");
          CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
                                          InterleaveFactor, IsMasked);
          Align CommonAlignment = LI0->getAlign();
          if (InterleaveFactor) {
            VecLdCost = TTI->getInterleavedMemoryOpCost(
                Instruction::Load, LoadVecTy, InterleaveFactor, {},
                CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
          } else if (IsMasked) {
            VecLdCost = TTI->getMaskedMemoryOpCost(
                Instruction::Load, LoadVecTy, CommonAlignment,
                LI0->getPointerAddressSpace(), CostKind);
            VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                                          LoadVecTy, CompressMask, CostKind);
          } else {
            VecLdCost = TTI->getMemoryOpCost(
                Instruction::Load, LoadVecTy, CommonAlignment,
                LI0->getPointerAddressSpace(), CostKind);
            VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                                          LoadVecTy, CompressMask, CostKind);
          }
          break;
        }
        case TreeEntry::ScatterVectorize: {
          Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
          VecLdCost = TTI->getGatherScatterOpCost(
              Instruction::Load, VecTy, LI0->getPointerOperand(),
              /*VariableMask=*/false, CommonAlignment, CostKind);
          break;
        }
        case TreeEntry::CombinedVectorize:
        case TreeEntry::SplitVectorize:
        case TreeEntry::NeedToGather:
          llvm_unreachable("Unexpected vectorization state.");
        }
        return VecLdCost + CommonCost;
      };
      InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
      // If this node was not vectorized with a gathered load, account for the
      // GEPs feeding the pointer operand.
      if (E->State == TreeEntry::ScatterVectorize)
        return Cost;
      return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
    }
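    // Note: load costing dispatches on the entry state: consecutive loads may
    // be costed as plain or interleaved vector loads, strided loads use the
    // strided-memory-op hook, compressed loads combine a (masked) wide load
    // with a compress shuffle, and scattered loads use gather/scatter cost.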
    case Instruction::Store: {
      bool IsReorder = !E->ReorderIndices.empty();
      auto GetScalarCost = [=](unsigned Idx) {
        auto *VI = cast<StoreInst>(VL[Idx]);
        TTI::OperandValueInfo OpInfo = getOperandInfo(VI->getValueOperand());
        return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
                                    VI->getAlign(),
                                    VI->getPointerAddressSpace(), CostKind,
                                    OpInfo, VI);
      };
      auto *BaseSI =
          cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
      auto GetVectorCost = [=](InstructionCost CommonCost) {
        // We know that we can merge the stores. Calculate the cost.
        InstructionCost VecStCost;
        if (E->State == TreeEntry::StridedVectorize) {
          Align CommonAlignment = computeCommonAlignment<StoreInst>(VL);
          VecStCost = TTI->getStridedMemoryOpCost(
              Instruction::Store, VecTy, BaseSI->getPointerOperand(),
              /*VariableMask=*/false, CommonAlignment, CostKind);
        } else {
          assert(E->State == TreeEntry::Vectorize &&
                 "Expected either strided or consecutive stores.");
          if (unsigned Factor = E->getInterleaveFactor()) {
            assert(E->ReuseShuffleIndices.empty() &&
                   !E->ReorderIndices.empty() && "No reused shuffles expected");
            VecStCost = TTI->getInterleavedMemoryOpCost(
                Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
                BaseSI->getPointerAddressSpace(), CostKind);
          } else {
            TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
            VecStCost = TTI->getMemoryOpCost(
                Instruction::Store, VecTy, BaseSI->getAlign(),
                BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
          }
        }
        return VecStCost + CommonCost;
      };
      SmallVector<Value *> PointerOps(VL.size());
      for (auto [I, V] : enumerate(VL)) {
        unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
        PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
      }
      return GetCostDiff(GetScalarCost, GetVectorCost) +
             GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
    }
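    // Note: stores mirror the load logic: strided stores use the
    // strided-memory-op cost, interleaved groups the interleaved cost, and
    // consecutive stores a plain vector store; the cost difference of the
    // pointer GEPs is added on top.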
    case Instruction::Call: {
      auto GetScalarCost = [&](unsigned Idx) {
        auto *CI = cast<CallInst>(UniqueValues[Idx]);
        Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
        IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
        return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
      };
      auto GetVectorCost = [=](InstructionCost CommonCost) {
        SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
            CI, ID, VecTy->getNumElements(),
            It != MinBWs.end() ? It->second.first : 0, TTI);
        auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
        return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
      };
      return GetCostDiff(GetScalarCost, GetVectorCost);
    }
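    // Note: for calls, both the vector-intrinsic and the vector-library
    // lowering are costed and the cheaper one is used, so the decision
    // automatically tracks whichever lowering the target actually provides.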
    case Instruction::ShuffleVector: {
      assert(E->isAltShuffle() &&
             ((Instruction::isBinaryOp(E->getOpcode()) &&
               Instruction::isBinaryOp(E->getAltOpcode())) ||
              (Instruction::isCast(E->getOpcode()) &&
               Instruction::isCast(E->getAltOpcode())) ||
              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
             "Invalid Shuffle Vector Operand");
      // Try to find a previous shuffle node with the same operands and the
      // same (possibly swapped) main/alternate opcodes.
      auto TryFindNodeWithEqualOperands = [=]() {
        for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
          if (TE.get() == E)
            break;
          if (TE->hasState() && TE->isAltShuffle() &&
              ((TE->getOpcode() == E->getOpcode() &&
                TE->getAltOpcode() == E->getAltOpcode()) ||
               (TE->getOpcode() == E->getAltOpcode() &&
                TE->getAltOpcode() == E->getOpcode())) &&
              TE->hasEqualOperands(*E))
            return true;
        }
        return false;
      };
      auto GetScalarCost = [&](unsigned Idx) {
        if (isa<PoisonValue>(UniqueValues[Idx]))
          return InstructionCost(TTI::TCC_Free);
        auto *VI = cast<Instruction>(UniqueValues[Idx]);
        assert(E->getMatchingMainOpOrAltOp(VI) &&
               "Unexpected main/alternate opcode");
        return TTI->getInstructionCost(VI, CostKind);
      };
      auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost CommonCost) {
        InstructionCost VecCost = 0;
        if (TryFindNodeWithEqualOperands()) {
          LLVM_DEBUG({
            dbgs() << "SLP: diamond match for alternate node found.\n";
            E->dump();
          });
          // No need to add new vector costs here since we reuse the same
          // main/alternate vector ops, just with different shuffling.
        } else if (Instruction::isBinaryOp(E->getOpcode())) {
          VecCost =
              TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
          VecCost +=
              TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
        } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
          auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
          VecCost = TTIRef.getCmpSelInstrCost(
              E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
              {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
              VL0);
          VecCost += TTIRef.getCmpSelInstrCost(
              E->getOpcode(), VecTy, MaskTy,
              cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
              {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
              E->getAltOp());
        } else {
          Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
          auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
          if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
            auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
            unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
            unsigned SrcBWSz =
                DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
            if (SrcIt != MinBWs.end()) {
              SrcBWSz = SrcIt->second.first;
              SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
              SrcTy = getWidenedType(SrcSclTy, VL.size());
            }
            if (BWSz <= SrcBWSz) {
              if (BWSz < SrcBWSz)
                VecCost =
                    TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
                                            TTI::CastContextHint::None,
                                            CostKind);
              LLVM_DEBUG({
                dbgs()
                    << "SLP: alternate extension, which should be truncated.\n";
                E->dump();
              });
              return VecCost;
            }
          }
          VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
                                            TTI::CastContextHint::None,
                                            CostKind);
          VecCost +=
              TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
                                      TTI::CastContextHint::None, CostKind);
        }
        SmallVector<int> Mask;
        E->buildAltOpShuffleMask(
            [&](Instruction *I) {
              assert(E->getMatchingMainOpOrAltOp(I) &&
                     "Unexpected main/alternate opcode");
              return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                            *TLI);
            },
            Mask);
        VecCost +=
            ::getShuffleCost(TTIRef, TargetTransformInfo::SK_PermuteTwoSrc,
                             FinalVecTy, Mask, CostKind);
        // Patterns like [fadd,fsub] can be combined into a single instruction
        // on some targets; if the target reports the alternate pair as legal,
        // take the cheaper of the generic and the target-specific cost.
        unsigned Opcode0 = E->getOpcode();
        unsigned Opcode1 = E->getAltOpcode();
        SmallBitVector OpcodeMask(
            getAltInstrMask(E->Scalars, ScalarTy, Opcode0, Opcode1));
        if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
          InstructionCost AltVecCost = TTIRef.getAltInstrCost(
              VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
          return AltVecCost < VecCost ? AltVecCost : VecCost;
        }
        return VecCost;
      };
      if (SLPReVec && !E->isAltShuffle())
        return GetCostDiff(
            GetScalarCost, [&](InstructionCost) -> InstructionCost {
              // If the group extracts subvectors in order, the shufflevector
              // is eliminated by instcombine and costs nothing.
              assert(isa<ShuffleVectorInst>(VL.front()) &&
                     "Not supported shufflevector usage.");
              auto *SV = cast<ShuffleVectorInst>(VL.front());
              unsigned SVNumElements =
                  cast<FixedVectorType>(SV->getOperand(0)->getType())
                      ->getNumElements();
              unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
              for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
                int NextIndex = 0;
                if (!all_of(VL.slice(I, GroupSize), [&](Value *V) {
                      assert(isa<ShuffleVectorInst>(V) &&
                             "Not supported shufflevector usage.");
                      auto *SV = cast<ShuffleVectorInst>(V);
                      int Index;
                      [[maybe_unused]] bool IsExtractSubvectorMask =
                          SV->isExtractSubvectorMask(Index);
                      assert(IsExtractSubvectorMask &&
                             "Not supported shufflevector usage.");
                      if (NextIndex != Index)
                        return false;
                      NextIndex += SV->getShuffleMask().size();
                      return true;
                    }))
                  return ::getShuffleCost(
                      *TTI, TargetTransformInfo::SK_PermuteSingleSrc, VecTy,
                      calculateShufflevectorMask(E->Scalars));
              }
              return TTI::TCC_Free;
            });
      return GetCostDiff(GetScalarCost, GetVectorCost);
    }
    case Instruction::Freeze:
      return CommonCost;
    default:
      llvm_unreachable("Unknown instruction");
    }
}
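// Note on the alternate-node case above: two-opcode nodes are costed as both
// vector opcodes plus the blending shuffle built by buildAltOpShuffleMask; a
// previously built node with equal operands ("diamond match") makes the
// vector ops free, and a target-legal alt-instruction pair can further lower
// the cost.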
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size() << " is fully vectorizable .\n");
  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
    SmallVector<int> Mask;
    return TE->isGather() &&
           !any_of(TE->Scalars,
                   [this](Value *V) { return EphValues.contains(V); }) &&
           (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
            TE->Scalars.size() < Limit ||
            (((TE->hasState() &&
               TE->getOpcode() == Instruction::ExtractElement) ||
              all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
             isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
            (TE->hasState() && TE->getOpcode() == Instruction::Load &&
             !TE->isAltShuffle()) ||
            any_of(TE->Scalars, IsaPred<LoadInst>));
  };
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
       (ForReduction &&
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
    return true;
  if (VectorizableTree.size() != 2)
    return false;
  // Handle splat and all-constants stores. Also try to vectorize tiny trees
  // with the second gather node if it can be vectorized.
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
    return true;
  // Gathering cost would be too much for tiny trees.
  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
       VectorizableTree[0]->State != TreeEntry::CompressVectorize))
    return false;
  return true;
}
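// Note: a "tiny" tree (one or two nodes) counts as fully vectorizable only
// when its gather operands are cheap: constants, splats, extractelement-based
// shuffles, or loads; otherwise the gathering overhead is assumed to outweigh
// the benefit of such a small tree.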
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
                                       TargetTransformInfo *TTI,
                                       bool MustMatchOrInst) {
  // Peek through optional 'or' chains and shift-left-by-multiple-of-8.
  Value *ZextLoad = Root;
  const APInt *ShAmtC;
  bool FoundOr = false;
  while (!isa<ConstantExpr>(ZextLoad) &&
         (match(ZextLoad, m_Or(m_Value(), m_Constant())) ||
          (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
           ShAmtC->urem(8) == 0))) {
    auto *BinOp = cast<BinaryOperator>(ZextLoad);
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)
      FoundOr = true;
  }
  // Check if the input is an extended load of the required expression.
  Value *Load;
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
      !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
    return false;
  // Require that the total load bit width is a legal integer type.
  Type *SrcTy = Load->getType();
  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
  if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
    return false;
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    << *(cast<Instruction>(Root)) << "\n");
  return true;
}

bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
                                    /*MustMatchOrInst=*/false);
}

bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
  unsigned NumElts = Stores.size();
  for (Value *Scalar : Stores) {
    Value *X;
    if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
        !isLoadCombineCandidateImpl(X, NumElts, TTI, /*MustMatchOrInst=*/true))
      return false;
  }
  return true;
}
bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");
    return true;
  }
  // No need to vectorize inserts of gathered values.
  if (VectorizableTree.size() == 2 &&
      isa<InsertElementInst>(VectorizableTree.front()->getMainOp()) &&
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||
         allConstant(VectorizableTree[1]->Scalars))))
    return true;
  // If the graph includes only PHI nodes and gathers, it is definitely not
  // profitable for the vectorization: such graphs produce only shuffles.
  constexpr int Limit = 4;
  if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
      !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                (!TE->hasState() ||
                 TE->getOpcode() != Instruction::ExtractElement) &&
                count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
               (TE->hasState() && TE->getOpcode() == Instruction::PHI);
      }))
    return true;
  // Check whether the graph is only insertelement/PHI nodes plus gathers.
  if (VectorizableTree.size() <= Limit &&
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return (TE->isGather() &&
                       (!TE->hasState() ||
                        TE->getOpcode() != Instruction::ExtractElement)) ||
                      (TE->hasState() &&
                       (TE->getOpcode() == Instruction::InsertElement ||
                        (TE->getOpcode() == Instruction::PHI &&
                         all_of(TE->Scalars, [&](Value *V) {
                           return isa<PoisonValue>(V) ||
                                  MustGather.contains(V);
                         }))));
             }) &&
      any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->State == TreeEntry::Vectorize &&
               TE->getOpcode() == Instruction::PHI;
      }))
    return true;
  // If the tree is dominated by gathers and tiny reshuffled nodes, reject it
  // unless it is mostly made of vectorizable stores/loads.
  SmallVector<const TreeEntry *> StoreLoadNodes;
  unsigned NumGathers = 0;
  constexpr int LimitTreeSize = 36;
  if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               if (!TE->isGather() && TE->hasState() &&
                   (TE->getOpcode() == Instruction::Load ||
                    TE->getOpcode() == Instruction::Store)) {
                 StoreLoadNodes.push_back(TE.get());
                 return true;
               }
               if (TE->isGather())
                 ++NumGathers;
               return TE->State == TreeEntry::SplitVectorize ||
                      (TE->Idx == 0 && TE->Scalars.size() == 2 &&
                       TE->hasState() &&
                       TE->getOpcode() == Instruction::ICmp &&
                       VectorizableTree.size() > LimitTreeSize) ||
                      (TE->hasState() &&
                       (TE->getOpcode() == Instruction::PHI ||
                        (TE->hasCopyableElements() &&
                         static_cast<unsigned>(
                             count_if(TE->Scalars, IsaPred<PHINode>)) >=
                             TE->Scalars.size() / 2) ||
                        ((!TE->ReuseShuffleIndices.empty() ||
                          !TE->ReorderIndices.empty() ||
                          TE->isAltShuffle()) &&
                         TE->Scalars.size() == 2)));
             }) &&
      (StoreLoadNodes.empty() ||
       (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
        (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
           return TE->getOpcode() == Instruction::Store ||
                  all_of(TE->Scalars, [&](Value *V) {
                    return !isa<LoadInst>(V) ||
                           areAllUsersVectorized(cast<Instruction>(V));
                  });
         })))))
    return true;
  // A split root whose operands are all gathers is not profitable either.
  if (VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
      VectorizableTree.size() >= Limit &&
      none_of(ArrayRef(VectorizableTree).drop_front(),
              [&](const std::unique_ptr<TreeEntry> &TE) {
                return !TE->isGather() && TE->UserTreeIndex.UserTE &&
                       TE->UserTreeIndex.UserTE->Idx == 0;
              }))
    return true;
  // Tree of vectorized insertelement + phi nodes with all other gathers is
  // not profitable.
  if (VectorizableTree.size() > 2 &&
      VectorizableTree.front()->State == TreeEntry::Vectorize &&
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
      VectorizableTree[1]->State == TreeEntry::Vectorize &&
      VectorizableTree[1]->getOpcode() == Instruction::PHI &&
      all_of(ArrayRef(VectorizableTree).drop_front(2),
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return TE->isGather();
             }))
    return true;
  // We only handle trees of heights 1 and 2.
  if (isFullyVectorizableTinyTree(ForReduction))
    return false;
  // Check if any of the gather nodes forms an insertelement buildvector
  // somewhere.
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
       allSameBlock(VectorizableTree.front()->Scalars));
  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
                 return isa<ExtractElementInst, Constant>(V) ||
                        (IsAllowedSingleBVNode &&
                         !V->hasNUsesOrMore(UsesLimit) &&
                         any_of(V->users(), IsaPred<InsertElementInst>));
               });
      }))
    return false;
  if (VectorizableTree.back()->isGather() &&
      VectorizableTree.back()->hasState() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      allSameBlock(VectorizableTree.back()->Scalars) &&
      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
      TTI->getScalarizationOverhead(
          getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
                         VectorizableTree.back()->getVectorFactor()),
          APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
          /*Insert=*/true, /*Extract=*/false,
          TTI::TCK_RecipThroughput) > -SLPCostThreshold)
    return false;
  // Non-power-of-2 small trees built around a single non-local gathered load
  // are allowed.
  constexpr unsigned SmallTree = 3;
  if (VectorizableTree.front()->isNonPowOf2Vec() &&
      getCanonicalGraphSize() <= SmallTree &&
      count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return TE->isGather() && TE->hasState() &&
                        TE->getOpcode() == Instruction::Load &&
                        !allSameBlock(TE->Scalars);
               }) == 1)
    return true;
  // Check gathered-load entries at the tail of the graph.
  if (GatheredLoadsEntriesFirst.has_value()) {
    for (unsigned Idx :
         seq<unsigned>(*GatheredLoadsEntriesFirst, VectorizableTree.size())) {
      TreeEntry &E = *VectorizableTree[Idx];
      if (E.State == TreeEntry::SplitVectorize)
        continue;
      if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
          !allSameBlock(E.Scalars))
        return true;
    }
  }
  // Otherwise, the tree is tiny and not fully vectorizable.
  return true;
}
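// Note: the checks above are all heuristics applied before detailed cost
// modeling: graphs consisting only of gathers plus PHIs/insertelements, or
// dominated by reshuffled two-element nodes, are rejected as not profitable
// without computing per-entry costs.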
InstructionCost BoUpSLP::getSpillCost() {
  // Walk the tree from the root and charge the target's
  // cost-of-keeping-live-over-call for every vector value live across a call
  // that is not lowered to a cheap intrinsic.
  InstructionCost Cost = 0;
  const TreeEntry *Root = VectorizableTree.front().get();
  if (Root->isGather())
    return Cost;
  SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
      EntriesToOperands;
  SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
  SmallPtrSet<const Instruction *, 8> LastInstructions;
  for (const auto &TEPtr : VectorizableTree) {
    if (!TEPtr->isGather()) {
      Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
      EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
      LastInstructions.insert(LastInst);
    }
    if (TEPtr->UserTreeIndex)
      EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
  }
  auto NoCallIntrinsic = [this](const Instruction *I) {
    const auto *II = dyn_cast<IntrinsicInst>(I);
    if (!II)
      return false;
    if (II->isAssumeLikeIntrinsic())
      return true;
    IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
    InstructionCost IntrCost =
        TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
    InstructionCost CallCost = TTI->getCallInstrCost(
        nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
    return IntrCost < CallCost;
  };
  // Memoize scan results per last-instruction to keep the walk linear.
  SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
      CheckedInstructions;
  unsigned Budget = 0;
  const unsigned BudgetLimit =
      ScheduleRegionSizeBudget / VectorizableTree.size();
  auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
                                            const Instruction *Last) {
    assert(First->getParent() == Last->getParent() &&
           "Expected instructions in same block.");
    if (auto It = CheckedInstructions.find(Last);
        It != CheckedInstructions.end()) {
      const Instruction *Checked = It->second.getPointer();
      if (Checked == First || Checked->comesBefore(First))
        return It->second.getInt() != 0;
      Last = Checked;
    }
    SmallVector<const Instruction *> LastInstsInRange;
    BasicBlock::reverse_iterator InstIt =
                                     ++First->getIterator().getReverse(),
                                 PrevInstIt = Last->getIterator().getReverse();
    while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
      // Vectorized calls, represented as vector intrinsics, do not impact
      // spill cost.
      if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
          CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
        for (const Instruction *LastInst : LastInstsInRange)
          CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
        return false;
      }
      if (LastInstructions.contains(&*PrevInstIt))
        LastInstsInRange.push_back(&*PrevInstIt);
      ++PrevInstIt;
      ++Budget;
    }
    for (const Instruction *LastInst : LastInstsInRange)
      CheckedInstructions.try_emplace(
          LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
          Budget <= BudgetLimit ? 1 : 0);
    return Budget <= BudgetLimit;
  };
  auto AddCosts = [&](const TreeEntry *Op) {
    Type *ScalarTy = Op->Scalars.front()->getType();
    auto It = MinBWs.find(Op);
    if (It != MinBWs.end())
      ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
    auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
    Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
    if (Op->getVectorFactor() != 1)
      Cost -=
          Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
  };
  SmallDenseMap<std::pair<const BasicBlock *, const BasicBlock *>, bool>
      ParentOpParentToPreds;
  auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
                               BasicBlock *OpParent) {
    auto Key = std::make_pair(Root, OpParent);
    if (auto It = ParentOpParentToPreds.find(Key);
        It != ParentOpParentToPreds.end())
      return It->second;
    SmallVector<BasicBlock *> Worklist;
    if (Pred)
      Worklist.push_back(Pred);
    else
      Worklist.append(pred_begin(Root), pred_end(Root));
    SmallPtrSet<const BasicBlock *, 16> Visited;
    SmallDenseSet<std::pair<const BasicBlock *, const BasicBlock *>>
        ParentsPairsToAdd;
    bool Res = false;
    auto CacheParentsPairs = [&] {
      for (const auto &KeyPair : ParentsPairsToAdd) {
        assert(!ParentOpParentToPreds.contains(KeyPair) &&
               "Should not have been added before.");
        ParentOpParentToPreds.try_emplace(KeyPair, Res);
      }
    };
    while (!Worklist.empty()) {
      BasicBlock *BB = Worklist.pop_back_val();
      if (BB == OpParent || !Visited.insert(BB).second)
        continue;
      auto Pair = std::make_pair(BB, OpParent);
      if (auto It = ParentOpParentToPreds.find(Pair);
          It != ParentOpParentToPreds.end()) {
        Res = It->second;
        CacheParentsPairs();
        return Res;
      }
      ParentsPairsToAdd.insert(Pair);
      if (Budget > BudgetLimit) {
        CacheParentsPairs();
        return Res;
      }
      Worklist.append(pred_begin(BB), pred_end(BB));
    }
    Res = true;
    CacheParentsPairs();
    return Res;
  };
  SmallVector<const TreeEntry *> LiveEntries(1, Root);
  while (!LiveEntries.empty()) {
    const TreeEntry *Entry = LiveEntries.pop_back_val();
    SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
    Instruction *LastInst = EntriesToLastInstruction.at(Entry);
    BasicBlock *Parent = LastInst->getParent();
    for (const TreeEntry *Op : Operands) {
      if (!Op->isGather())
        LiveEntries.push_back(Op);
      if (Entry->State == TreeEntry::SplitVectorize ||
          (Entry->getOpcode() != Instruction::PHI && Op->isGather()))
        continue;
      BasicBlock *Pred = nullptr;
      if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
        Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
      BasicBlock *OpParent;
      Instruction *OpLastInst;
      if (Op->isGather()) {
        assert(Entry->getOpcode() == Instruction::PHI &&
               "Expected phi node only.");
        OpParent = cast<PHINode>(Entry->getMainOp())
                       ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
        OpLastInst = OpParent->getTerminator();
        for (Value *V : Op->Scalars) {
          auto *Inst = dyn_cast<Instruction>(V);
          if (!Inst)
            continue;
          if (isVectorized(V)) {
            OpParent = Inst->getParent();
            OpLastInst = Inst;
            break;
          }
        }
      } else {
        OpLastInst = EntriesToLastInstruction.at(Op);
        OpParent = OpLastInst->getParent();
      }
      // Check the call instructions within the same basic blocks.
      if (OpParent == Parent) {
        if (Entry->getOpcode() == Instruction::PHI) {
          if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
            AddCosts(Op);
          continue;
        }
        if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
          AddCosts(Op);
        continue;
      }
      // Check for calls in between blocks: entry's block to its head, op's
      // block from its end, and the predecessors in between.
      if (Entry->getOpcode() != Instruction::PHI &&
          !CheckForNonVecCallsInSameBlock(
              &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
              LastInst)) {
        AddCosts(Op);
        continue;
      }
      if (!CheckForNonVecCallsInSameBlock(OpLastInst,
                                          OpParent->getTerminator())) {
        AddCosts(Op);
        continue;
      }
      if (!CheckPredecessors(Parent, Pred, OpParent)) {
        AddCosts(Op);
        continue;
      }
    }
  }
  return Cost;
}
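// Note: spill-cost estimation tracks, for every tree entry, where its vector
// value is created, and charges the target's keeping-live-over-call cost for
// each vector value live across a non-intrinsic call. The per-block budget
// (ScheduleRegionSizeBudget divided by the tree size) bounds the instruction
// scans so the walk stays roughly linear.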
  const auto *I1 = IE1;
  const auto *I2 = IE2;
  const InsertElementInst *PrevI1;
  const InsertElementInst *PrevI2;
  do {
    if (I2 == IE1)
      return true;
    if (I1 == IE2)
      return false;
    PrevI1 = I1;
    PrevI2 = I2;
    if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
        getElementIndex(I1).value_or(Idx2) != Idx2)
      I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
    if (I2 && (I2 == IE2 || I2->hasOneUse()) &&
        getElementIndex(I2).value_or(Idx1) != Idx1)
      I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
struct ValueSelect {
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
    return V;
  }
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
    return U();
  }
};
template <typename T>
static T *performExtractsShuffleAction(
    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
    function_ref<unsigned(T *)> GetVF,
    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
    function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  SmallVector<int> Mask(ShuffleMask.begin()->second);
  auto VMIt = std::next(ShuffleMask.begin());
  T *Prev = nullptr;
  SmallBitVector UseMask =
      buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
  SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
  if (!IsBaseUndef.all()) {
    // Base is not undef, need to combine it with the next subvectors.
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
    SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
      if (Mask[Idx] == PoisonMaskElem)
        Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
      else
        Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    }
    [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    // Base is undef and only a single vector is shuffled - perform the action
    // only for that vector.
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
                                            /*ForSingleMask=*/true);
    Prev = Action(Mask, {ShuffleMask.begin()->first});
  } else {
    // Base is undef and at least 2 input vectors are shuffled - perform a
    // two-vector shuffle.
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      // No need to resize the input vectors.
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (SecMask[I] != PoisonMaskElem)
          Mask[I] = SecMask[I] + Vec1VF;
      }
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
    } else {
      // Vectors of different sizes - resize and reshuffle.
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
                                               /*ForSingleMask=*/false);
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (SecMask[I] != PoisonMaskElem)
          Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
        else if (Mask[I] != PoisonMaskElem)
          Mask[I] = (Res1.second ? I : Mask[I]);
      }
      Prev = Action(Mask, {Res1.first, Res2.first});
    }
    VMIt = std::next(VMIt);
  }
  [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
  // Perform the requested actions for the remaining masks/vectors.
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
    ArrayRef<int> SecMask = VMIt->second;
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
      if (SecMask[I] != PoisonMaskElem) {
        assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
               "Multiple uses of scalars.");
        Mask[I] = (Res.second ? I : SecMask[I]) + VF;
      } else if (Mask[I] != PoisonMaskElem) {
        Mask[I] = I;
      }
    }
    Prev = Action(Mask, {Prev, Res.first});
  }
  return Prev;
}
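// Note: performExtractsShuffleAction folds a chain of per-subvector masks
// into at most two live inputs at a time: the base (if not undef) is combined
// first, equal-VF inputs are shuffled directly, differently sized inputs are
// resized via ResizeAction, and every remaining mask is folded into the
// accumulated result through Action.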
template <typename T> struct ShuffledInsertData {
  /// List of insertelements to be replaced by shuffles.
  SmallVector<InsertElementInst *> InsertElements;
  /// The parent vectors and shuffle mask for the given subvectors.
  MapVector<T, SmallVector<int>> ValueMasks;
};
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
  InstructionCost Cost = 0;
  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                    << VectorizableTree.size() << ".\n");
  SmallPtrSet<Value *, 4> CheckedExtracts;
  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
    TreeEntry &TE = *VectorizableTree[I];
    // No need to count the cost for combined entries, they are combined and
    // just skipped.
    if (TE.State == TreeEntry::CombinedVectorize) {
      LLVM_DEBUG(
          dbgs() << "SLP: Skipping cost for combined node that starts with "
                 << *TE.Scalars[0] << ".\n";
          TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
      continue;
    }
    // Gather/split nodes that exactly duplicate an existing entry with the
    // same vector factor are free.
    if (TE.hasState() &&
        (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
      if (const TreeEntry *E =
              getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
          E && E->getVectorFactor() == TE.getVectorFactor()) {
        LLVM_DEBUG(dbgs() << "SLP: Current total cost = " << Cost << "\n");
        continue;
      }
    }
    assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
           "Expected gather nodes with users only.");
    InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
    Cost += C;
    LLVM_DEBUG(dbgs() << "SLP: Current total cost = " << Cost << "\n");
  }
  if (Cost >= -SLPCostThreshold &&
      none_of(ExternalUses, [](const ExternalUser &EU) {
        return isa_and_nonnull<InsertElementInst>(EU.User);
      }))
    return Cost;
  InstructionCost ExtractCost = 0;
  SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
  SmallVector<APInt> DemandedElts;
  SmallDenseSet<Value *, 4> UsedInserts;
  DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
  DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
  SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
  SmallDenseSet<std::pair<Value *, User *>, 8> CheckedScalarUser;
  SmallPtrSet<Value *, 4> ExtractCostCalculated;
  // Keep track of {Scalar, Index, User} tuples: on some targets this lets the
  // extractelement fold into its user and become free.
  SmallVector<std::tuple<Value *, User *, int>> ScalarUserAndIdx;
  for (ExternalUser &EU : ExternalUses)
    ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
  for (ExternalUser &EU : ExternalUses) {
    LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
                      << EU.E.Idx << " in lane " << EU.Lane << "\n");
    LLVM_DEBUG(if (EU.User) dbgs() << "  User: " << *EU.User << "\n";
               else dbgs() << "  User: nullptr\n");
    LLVM_DEBUG(dbgs() << "  Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
    // Uses by ephemeral values are free (the ephemeral value is removed prior
    // to code generation, and so is the extraction).
    if (EphValues.count(EU.User))
      continue;
    // Check if the scalar/user pair was already counted.
    if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
        (EU.User &&
         CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
      continue;
    // Used in unreachable blocks or in EH pads (no extract instruction is
    // emitted for them), or the extract was already counted.
    if (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
        !ExtractCostCalculated.insert(EU.Scalar).second)
      continue;
    // No extract cost for an insertelement user that becomes a shuffle.
    if (auto *VU = dyn_cast_if_present<InsertElementInst>(EU.User)) {
      if (!UsedInserts.insert(VU).second)
        continue;
      std::optional<unsigned> InsertIdx = getElementIndex(VU);
      if (InsertIdx) {
        auto *FTy = cast<FixedVectorType>(VU->getType());
        const TreeEntry *ScalarTE = &EU.E;
        auto *It = find_if(
            ShuffledInserts,
            [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
              // Checks if 2 insertelements are from the same buildvector.
              InsertElementInst *VecInsert = Data.InsertElements.front();
              return areTwoInsertFromSameBuildVector(
                  VU, VecInsert, [this](InsertElementInst *II) -> Value * {
                    Value *Op0 = II->getOperand(0);
                    if (isVectorized(II) && !isVectorized(Op0))
                      return nullptr;
                    return Op0;
                  });
            });
        int VecId = -1;
        if (It == ShuffledInserts.end()) {
          auto &Data = ShuffledInserts.emplace_back();
          Data.InsertElements.emplace_back(VU);
          DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
          VecId = ShuffledInserts.size() - 1;
          auto It = MinBWs.find(ScalarTE);
          if (It != MinBWs.end() &&
              VectorCasts
                  .insert(std::make_pair(ScalarTE, FTy->getElementType()))
                  .second) {
            unsigned BWSz = It->second.first;
            unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
            unsigned VecOpcode;
            if (DstBWSz < BWSz)
              VecOpcode = Instruction::Trunc;
            else
              VecOpcode =
                  It->second.second ? Instruction::SExt : Instruction::ZExt;
            InstructionCost C = TTI->getCastInstrCost(
                VecOpcode, FTy,
                getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
                               FTy->getNumElements()),
                TTI::CastContextHint::None, CostKind);
            LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                              << " for extending externally used vector with "
                                 "non-equal minimum bitwidth.\n");
            Cost += C;
          }
        } else {
          if (isFirstInsertElement(VU, It->InsertElements.front()))
            It->InsertElements.front() = VU;
          VecId = std::distance(ShuffledInserts.begin(), It);
        }
        int InIdx = *InsertIdx;
        SmallVectorImpl<int> &Mask =
            ShuffledInserts[VecId].ValueMasks[ScalarTE];
        if (Mask.empty())
          Mask.assign(FTy->getNumElements(), PoisonMaskElem);
        Mask[InIdx] = EU.Lane;
        DemandedElts[VecId].setBit(InIdx);
        continue;
      }
    }
    InstructionCost ExtraCost = TTI::TCC_Free;
    auto *ScalarTy = EU.Scalar->getType();
    const unsigned BundleWidth = EU.E.getVectorFactor();
    assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
    auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
    const TreeEntry *Entry = &EU.E;
    auto It = MinBWs.find(Entry);
    if (It != MinBWs.end()) {
      unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
                            ? Instruction::ZExt
                            : Instruction::SExt;
      VecTy = getWidenedType(Builder.getIntNTy(It->second.first), BundleWidth);
      ExtraCost =
          getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
      LLVM_DEBUG(dbgs() << "  ExtractExtend cost: " << ExtraCost << "\n");
    } else {
      ExtraCost =
          TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
                                  EU.Lane, EU.Scalar, ScalarUserAndIdx);
      LLVM_DEBUG(dbgs() << "  ExtractElement cost for " << *ScalarTy << " from "
                        << *VecTy << ": " << ExtraCost << "\n");
    }
    // Leave the scalar instruction as is if it is cheaper than the extract.
    if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
        Entry->getOpcode() == Instruction::Load) {
      // Checks if the user of the external scalar is a phi in a loop body.
      auto IsPhiInLoop = [&](const ExternalUser &U) {
        if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
          auto *I = cast<Instruction>(U.Scalar);
          const Loop *L = LI->getLoopFor(Phi->getParent());
          return L && (Phi->getParent() == I->getParent() ||
                       L == LI->getLoopFor(I->getParent()));
        }
        return false;
      };
      if (!ValueToExtUses) {
        ValueToExtUses.emplace();
        for (const auto &P : enumerate(ExternalUses)) {
          // Ignore phis in loops.
          if (IsPhiInLoop(P.value()))
            continue;
          ValueToExtUses->try_emplace(P.value().Scalar, P.index());
        }
      }
      // The original instruction can be reused if no operand is vectorized
      // or every vectorized operand is already marked as externally used.
      auto *Inst = cast<Instruction>(EU.Scalar);
      InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
      auto OperandIsScalar = [&](Value *V) {
        if (!isVectorized(V)) {
          // Some extractelements might be not vectorized, but transformed
          // into a shuffle and removed from the function; consider them here.
          if (auto *EE = dyn_cast<ExtractElementInst>(V))
            return !EE->hasOneUse() || !MustGather.contains(EE);
          return true;
        }
        return ValueToExtUses->contains(V);
      };
      bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
      bool CanBeUsedAsScalarCast = false;
      if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
        if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
            Op && all_of(Op->operands(), OperandIsScalar)) {
          InstructionCost OpCost =
              (isVectorized(Op) && !ValueToExtUses->contains(Op))
                  ? TTI->getInstructionCost(Op, CostKind)
                  : 0;
          if (ScalarCost + OpCost <= ExtraCost) {
            CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
            ScalarCost += OpCost;
          }
        }
      }
      if (CanBeUsedAsScalar) {
        bool KeepScalar = ScalarCost <= ExtraCost;
        // Check whether the extracted value is used by a PHI-rooted tree,
        // where keeping the scalar is likely profitable.
        bool IsProfitablePHIUser =
            (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
                            VectorizableTree.front()->Scalars.size() > 2)) &&
            VectorizableTree.front()->hasState() &&
            VectorizableTree.front()->getOpcode() == Instruction::PHI &&
            !Inst->hasNUsesOrMore(UsesLimit) &&
            none_of(Inst->users(),
                    [&](User *U) {
                      auto *PHIUser = dyn_cast<PHINode>(U);
                      return (!PHIUser ||
                              PHIUser->getParent() !=
                                  cast<Instruction>(
                                      VectorizableTree.front()->getMainOp())
                                      ->getParent()) &&
                             !isVectorized(U);
                    }) &&
            count_if(Entry->Scalars, [&](Value *V) {
              return ValueToExtUses->contains(V);
            }) <= 2;
        if (IsProfitablePHIUser) {
          KeepScalar = true;
        } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
                   ExtraCost - ScalarCost <= TTI::TCC_Basic &&
                   (!GatheredLoadsEntriesFirst.has_value() ||
                    Entry->Idx < *GatheredLoadsEntriesFirst)) {
          unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
            return ValueToExtUses->contains(V);
          });
          auto It = ExtractsCount.find(Entry);
          if (It != ExtractsCount.end()) {
            assert(ScalarUsesCount >= It->getSecond().size() &&
                   "Expected total number of external uses not less than "
                   "number of scalar uses.");
            ScalarUsesCount -= It->getSecond().size();
          }
          // Keep the original scalar if the number of externally used
          // instructions in the same entry is not a power of 2.
          KeepScalar =
              ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
        }
        if (KeepScalar) {
          ExternalUsesAsOriginalScalar.insert(EU.Scalar);
          for (Value *V : Inst->operands()) {
            auto It = ValueToExtUses->find(V);
            if (It != ValueToExtUses->end()) {
              // Replace all uses to avoid a compiler crash.
              ExternalUses[It->second].User = nullptr;
            }
          }
          ExtraCost = ScalarCost;
          if (!IsPhiInLoop(EU))
            ExtractsCount[Entry].insert(Inst);
          if (CanBeUsedAsScalarCast) {
            ScalarOpsFromCasts.insert(Inst->getOperand(0));
            // Update the users of the operands of the cast operand too.
            if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
              for (Value *V : IOp->operands()) {
                auto It = ValueToExtUses->find(V);
                if (It != ValueToExtUses->end())
                  ExternalUses[It->second].User = nullptr;
              }
            }
          }
        }
      }
    }
    ExtractCost += ExtraCost;
  }
  // Insert externally used scalars from casts as original scalars.
  for (Value *V : ScalarOpsFromCasts) {
    ExternalUsesAsOriginalScalar.insert(V);
    if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty())
      ExternalUses.emplace_back(V, nullptr, *TEs.front(),
                                TEs.front()->findLaneForValue(V));
  }
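  // Note: the loop above may decide to keep a scalar alive instead of
  // extracting it from the vector: if the scalar's operands are themselves
  // scalar (or already counted as external uses) and recomputation is
  // cheaper than the extract, the external use is rewritten to the original
  // scalar and the extract cost is replaced by the scalar cost.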
  // If the tree root was demoted to a smaller bitwidth, add the cost of
  // resizing the final vector back to the original width.
  if (!VectorizedVals.empty()) {
    const TreeEntry &Root = *VectorizableTree.front();
    auto BWIt = MinBWs.find(&Root);
    if (BWIt != MinBWs.end()) {
      Type *DstTy = Root.Scalars.front()->getType();
      unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
      unsigned SrcSz =
          ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
      if (OriginalSz != SrcSz) {
        unsigned Opcode = Instruction::Trunc;
        if (OriginalSz > SrcSz)
          Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
        Type *SrcTy = Builder.getIntNTy(SrcSz);
        Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
                                      TTI::CastContextHint::None, CostKind);
      }
    }
  }
  Cost += ExtractCost;
  // Resizes a tree entry's vector to match the mask size, adding the shuffle
  // cost to the total.
  auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
                                    bool ForSingleMask) {
    InstructionCost C = 0;
    unsigned VF = Mask.size();
    unsigned VecVF = TE->getVectorFactor();
    bool HasLargeIndex =
        any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
    if ((VF != VecVF && HasLargeIndex) ||
        !ShuffleVectorInst::isIdentityMask(Mask, VF)) {
      if (HasLargeIndex) {
        SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
        std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
                  OrigMask.begin());
        C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                             getWidenedType(TE->getMainOp()->getType(), VecVF),
                             OrigMask);
        LLVM_DEBUG(
            dbgs() << "SLP: Adding cost " << C
                   << " for final shuffle of insertelement external users.\n";
            TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
        Cost += C;
        return std::make_pair(TE, true);
      }
      if (!ForSingleMask) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        for (unsigned I = 0; I < VF; ++I) {
          if (Mask[I] != PoisonMaskElem)
            ResizeMask[Mask[I]] = Mask[I];
        }
        C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                             getWidenedType(TE->getMainOp()->getType(), VecVF),
                             ResizeMask);
        LLVM_DEBUG(
            dbgs() << "SLP: Adding cost " << C
                   << " for final shuffle of insertelement external users.\n";
            TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
        Cost += C;
      }
    }
    return std::make_pair(TE, false);
  };
  // Calculate the cost of the reshuffled vectors, if any.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    unsigned VF = 0;
    auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
                                    ArrayRef<const TreeEntry *> TEs) {
      assert((TEs.size() == 1 || TEs.size() == 2) &&
             "Expected exactly 1 or 2 tree entries.");
      if (TEs.size() == 1) {
        if (VF == 0)
          VF = TEs.front()->getVectorFactor();
        auto *FTy =
            getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        if (!all_of(enumerate(Mask), [=](const auto &Data) {
              return Data.value() == PoisonMaskElem ||
                     (Data.index() < VF &&
                      static_cast<int>(Data.index()) == Data.value());
            })) {
          InstructionCost C =
              ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
          LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                            << " for final shuffle of insertelement "
                               "external users.\n";
                     TEs.front()->dump();
                     dbgs() << "SLP: Current total cost = " << Cost << "\n");
          Cost += C;
        }
      } else {
        if (VF == 0 &&
            TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
          VF = TEs.front()->getVectorFactor();
        else
          VF = Mask.size();
        auto *FTy =
            getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        InstructionCost C =
            ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                          << " for final shuffle of vector node and external "
                             "insertelement users.\n";
                   if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
        Cost += C;
      }
      return TEs.back();
    };
    (void)performExtractsShuffleAction<const TreeEntry>(
        MutableArrayRef(Vector.data(), Vector.size()), Base,
        [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
        EstimateShufflesCost);
    InstructionCost InsertCost = TTI->getScalarizationOverhead(
        cast<FixedVectorType>(
            ShuffledInserts[I].InsertElements.front()->getType()),
        DemandedElts[I], /*Insert=*/true, /*Extract=*/false, CostKind);
    Cost -= InsertCost;
  }
  // Add the cost for the reduced value resize (if required).
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize) {
        bool IsArithmeticExtendedReduction =
            all_of(*UserIgnoreList, [](Value *V) {
              return is_contained({Instruction::Add, Instruction::FAdd,
                                   Instruction::Mul, Instruction::FMul,
                                   Instruction::And, Instruction::Or,
                                   Instruction::Xor},
                                  cast<Instruction>(V)->getOpcode());
            });
        if (IsArithmeticExtendedReduction)
          Opcode = Instruction::BitCast; // Handled as a free extend.
        else
          Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      }
      if (Opcode != Instruction::BitCast) {
        auto *SrcVecTy =
            getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
        auto *DstVecTy =
            getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
        TTI::CastContextHint CCH = TTI::CastContextHint::None;
        InstructionCost CastCost;
        switch (E.getOpcode()) {
        case Instruction::SExt:
        case Instruction::ZExt:
        case Instruction::Trunc: {
          const TreeEntry *OpTE = getOperandEntry(&E, 0);
          CCH = getCastContextHint(*OpTE);
          break;
        }
        default:
          break;
        }
        CastCost +=
            TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH, CostKind);
        Cost += CastCost;
        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
                          << " for final resize for reduction from " << SrcVecTy
                          << " to " << DstVecTy << "\n";
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
      }
    }
  }
  std::optional<InstructionCost> SpillCost;
  if (Cost < -SLPCostThreshold) {
    SpillCost = getSpillCost();
    Cost += *SpillCost;
  }
#ifndef NDEBUG
  SmallString<256> Str;
  {
    raw_svector_ostream OS(Str);
    OS << "SLP: Spill Cost = ";
    if (SpillCost)
      OS << *SpillCost;
    else
      OS << "<skipped>";
    OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
       << "SLP: Total Cost = " << Cost << ".\n";
  }
  LLVM_DEBUG(dbgs() << Str);
  if (ViewSLPTree)
    ViewGraph(this, "SLP" + F->getName(), false, Str);
#endif
  return Cost;
}
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
    MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
  // Scan the list of gathered scalars for extractelements that can be
  // represented as shuffles.
  MapVector<Value *, SmallVector<int>> VectorOpToIdx;
  SmallVector<int> UndefVectorExtracts;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI)
      continue;
    auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
    if (!VecTy)
      continue;
    std::optional<unsigned> Idx = getExtractIndex(EI);
    // Undefined index.
    if (!Idx) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    if (*Idx >= VecTy->getNumElements()) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    SmallBitVector ExtractMask(VecTy->getNumElements(), true);
    ExtractMask.reset(*Idx);
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  }
  // Sort the vector operands by the maximum number of uses in
  // extractelements.
  SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
      VectorOpToIdx.takeVector();
  stable_sort(Vectors, [](const auto &P1, const auto &P2) {
    return P1.second.size() > P2.second.size();
  });
  // Find the best pair of the vectors or a single vector.
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  unsigned PairMax = 0;
  if (!Vectors.empty()) {
    SingleMax = Vectors.front().second.size() + UndefSz;
    if (Vectors.size() > 1) {
      auto *ItNext = std::next(Vectors.begin());
      PairMax = SingleMax + ItNext->second.size();
    }
  }
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  // Check if it is better to perform a shuffle of 2 vectors or just of a
  // single vector.
  SmallVector<Value *> SavedVL(VL.begin(), VL.end());
  SmallVector<Value *> GatheredExtracts(
      VL.size(), PoisonValue::get(VL.front()->getType()));
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : Vectors.front().second)
      std::swap(GatheredExtracts[Idx], VL[Idx]);
  } else if (!Vectors.empty()) {
    for (unsigned Idx : {0, 1})
      for (int Idx : Vectors[Idx].second)
        std::swap(GatheredExtracts[Idx], VL[Idx]);
  }
  // Add extracts from undefs too.
  for (int Idx : UndefVectorExtracts)
    std::swap(GatheredExtracts[Idx], VL[Idx]);
  // Check that the gather of extractelements can be represented as a shuffle
  // of the single/two source vectors.
  std::optional<TTI::ShuffleKind> Res =
      isFixedVectorShuffle(GatheredExtracts, Mask, AC);
  if (!Res) {
    // Restore the original VL if the attempt was not successful.
    copy(SavedVL, VL.begin());
    return std::nullopt;
  }
  // Restore unused scalars from the mask.
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
        isa<ExtractElementInst>(GatheredExtracts[I]))
      std::swap(VL[I], GatheredExtracts[I]);
  }
  return Res;
}
SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                                    SmallVectorImpl<int> &Mask,
                                    unsigned NumParts) const {
  assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
  SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
  Mask.assign(VL.size(), PoisonMaskElem);
  unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
  for (unsigned Part : seq<unsigned>(NumParts)) {
    // Scan the list of gathered scalars for extractelements that can be
    // represented as shuffles, one register slice at a time.
    MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
        Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
    SmallVector<int> SubMask;
    std::optional<TTI::ShuffleKind> Res =
        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
    ShufflesRes[Part] = Res;
    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
  }
  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
        return Res.has_value();
      }))
    ShufflesRes.clear();
  return ShufflesRes;
}
std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
    bool ForOrder) {
  Entries.clear();
  // Walk past zero-width user edges to the real user entry.
  auto GetUserEntry = [&](const TreeEntry *TE) {
    while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
      TE = TE->UserTreeIndex.UserTE;
    if (TE == VectorizableTree.front().get())
      return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
    return TE->UserTreeIndex;
  };
  auto HasGatherUser = [&](const TreeEntry *TE) {
    while (TE->Idx != 0 && TE->UserTreeIndex) {
      if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
        return true;
      TE = TE->UserTreeIndex.UserTE;
    }
    return false;
  };
  const EdgeInfo TEUseEI = GetUserEntry(TE);
  if (!TEUseEI)
    return std::nullopt;
  const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
  const BasicBlock *TEInsertBlock = nullptr;
  // The main node of a PHI entry keeps the correct order of operands and
  // incoming blocks.
  if (auto *PHI = dyn_cast_or_null<PHINode>(
          TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
      PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
    TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
    TEInsertPt = TEInsertBlock->getTerminator();
  } else {
    TEInsertBlock = TEInsertPt->getParent();
  }
  if (!DT->isReachableFromEntry(TEInsertBlock))
    return std::nullopt;
  auto *NodeUI = DT->getNode(TEInsertBlock);
  assert(NodeUI && "Should only process reachable instructions");
  SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
  auto CheckOrdering = [&](const Instruction *InsertPt) {
    // InsertPt is where vector code for some other tree entry (one sharing
    // scalars with TE) will be generated; check that TE's insertion point
    // properly dominates it.
    const BasicBlock *InsertBlock = InsertPt->getParent();
    auto *NodeEUI = DT->getNode(InsertBlock);
    if (!NodeEUI)
      return false;
    assert((NodeUI == NodeEUI) ==
               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    // Check the order of the gather nodes' users.
    if (TEInsertPt->getParent() != InsertBlock &&
        (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
      return false;
    if (TEInsertPt->getParent() == InsertBlock &&
        TEInsertPt->comesBefore(InsertPt))
      return false;
    return true;
  };
  SmallDenseMap<Value *, int> UsedValuesEntry;
  SmallPtrSet<const Value *, 16> VisitedValue;
  auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
    // Check whether the node can be reused outright.
    if ((TEPtr->getVectorFactor() != VL.size() &&
         TEPtr->Scalars.size() != VL.size()) ||
        (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
      return false;
    Entries.clear();
    Entries.push_back(TEPtr);
    for (Value *V : VL) {
      if (isConstant(V))
        continue;
      UsedValuesEntry.try_emplace(V, 0);
    }
    return true;
  };
  auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
                              unsigned EdgeIdx) {
    const TreeEntry *Ptr1 = User1;
    const TreeEntry *Ptr2 = User2;
    SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
    while (Ptr2) {
      PtrToIdx.try_emplace(Ptr2, EdgeIdx);
      EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
      Ptr2 = Ptr2->UserTreeIndex.UserTE;
    }
    while (Ptr1) {
      unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
      Ptr1 = Ptr1->UserTreeIndex.UserTE;
      if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
        return Idx < It->second;
    }
    return false;
  };
  for (Value *V : VL) {
    if (isConstant(V) || !VisitedValue.insert(V).second)
      continue;
    // Build a list of tree entries where V is used.
    SmallPtrSet<const TreeEntry *, 4> VToTEs;
    for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
      if (TEPtr == TE || TEPtr->Idx == 0)
        continue;
      assert(any_of(TEPtr->Scalars,
                    [&](Value *V) { return GatheredScalars.contains(V); }) &&
             "Must contain at least single gathered value.");
      assert(TEPtr->UserTreeIndex &&
             "Expected only single user of a gather node.");
      const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
      PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
                          UseEI.UserTE->hasState())
                             ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
                             : nullptr;
      const Instruction *InsertPt =
          UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
                  : &getLastInstructionInBundle(UseEI.UserTE);
      if (TEInsertPt == InsertPt) {
        // If the schedulable insertion point is shared, ordering is only
        // known after real scheduling; be conservative.
        if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
            (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
             TEUseEI.UserTE->isAltShuffle())) {
          if (UseEI.UserTE->State != TreeEntry::Vectorize ||
              (UseEI.UserTE->hasState() &&
               UseEI.UserTE->getOpcode() == Instruction::PHI &&
               !UseEI.UserTE->isAltShuffle()))
            continue;
        }
        // If the user instruction appears in different vectorized nodes,
        // make the decision depend on the node index.
        if (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx)
          continue;
      }
      // PHI user entries of different nodes cannot share gathers.
      if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
          TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
          UseEI.UserTE->State == TreeEntry::Vectorize &&
          UseEI.UserTE->getOpcode() == Instruction::PHI &&
          TEUseEI.UserTE != UseEI.UserTE)
        continue;
      if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
        continue;
      if (TEUseEI.UserTE != UseEI.UserTE &&
          (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
           HasGatherUser(TEUseEI.UserTE)))
        continue;
      if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
        continue;
      if (!TEUseEI.UserTE->isGather() && !UserPHI &&
          TEUseEI.UserTE->doesNotNeedToSchedule() !=
              UseEI.UserTE->doesNotNeedToSchedule())
        continue;
      // Check if the user node of TE comes after the user node of TEPtr,
      // otherwise TEPtr depends on TE.
      if ((TEInsertBlock != InsertPt->getParent() ||
           TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
          !CheckOrdering(InsertPt))
        continue;
      if (CheckAndUseSameNode(TEPtr))
        break;
      VToTEs.insert(TEPtr);
    }
    if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
      const auto *It = find_if(
          VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
      if (It != VTEs.end()) {
        const TreeEntry *VTE = *It;
        if (none_of(TE->CombinedEntriesWithIndices,
                    [&](const auto &P) { return P.first == VTE->Idx; })) {
          Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
          if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
            continue;
        }
        if (CheckAndUseSameNode(VTE))
          break;
        VToTEs.insert(VTE);
      }
    }
    if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
      const TreeEntry *VTE = VTEs.front();
      if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
          VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
        VTEs = VTEs.drop_front();
        // Iterate through all vectorized nodes.
        const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
          return MTE->State == TreeEntry::Vectorize;
        });
        if (MIt == VTEs.end())
          continue;
        VTE = *MIt;
      }
      if (none_of(TE->CombinedEntriesWithIndices,
                  [&](const auto &P) { return P.first == VTE->Idx; })) {
        Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
        if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
          continue;
      }
      if (CheckAndUseSameNode(VTE))
        break;
      VToTEs.insert(VTE);
    }
    if (VToTEs.empty())
      continue;
    if (UsedTEs.empty()) {
      // First iteration, just insert the list of nodes.
      UsedTEs.push_back(VToTEs);
      UsedValuesEntry.try_emplace(V, 0);
    } else {
      // Check if there are any previously used tree nodes which also use V;
      // if not, we have another input vector.
      SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
      unsigned Idx = 0;
      for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
        // Do we have a non-empty intersection of entries containing V?
        set_intersect(VToTEs, Set);
        if (!VToTEs.empty()) {
          // Yes, write the new subset and continue with the next scalar.
          Set.swap(VToTEs);
          break;
        }
        VToTEs = SavedVToTEs;
        ++Idx;
      }
      // No non-empty intersection found - need a second set of possible
      // source vectors.
      if (Idx == UsedTEs.size()) {
        // More than 2 input vectors is not a permutation; fall back to a
        // regular gather.
        if (UsedTEs.size() == 2)
          continue;
        UsedTEs.push_back(SavedVToTEs);
        Idx = UsedTEs.size() - 1;
      }
      UsedValuesEntry.try_emplace(V, Idx);
    }
  }
  if (UsedTEs.empty()) {
    Entries.clear();
    return std::nullopt;
  }
  unsigned VF = 0;
  if (UsedTEs.size() == 1) {
    // Keep the order to avoid non-determinism.
    SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
                                                UsedTEs.front().end());
    sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    // Try to find the perfect match in another gather node first.
    auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
      return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
    });
    if (It != FirstEntries.end() &&
        ((*It)->getVectorFactor() == VL.size() ||
         ((*It)->getVectorFactor() == TE->Scalars.size() &&
          TE->ReuseShuffleIndices.size() == VL.size() &&
          (*It)->isSame(TE->Scalars)))) {
      Entries.push_back(*It);
      if ((*It)->getVectorFactor() == VL.size()) {
        std::iota(std::next(Mask.begin(), Part * VL.size()),
                  std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
      } else {
        SmallVector<int> CommonMask = TE->getCommonMask();
        copy(CommonMask, Mask.begin());
      }
      // Clear undef scalars.
      for (unsigned I : seq<unsigned>(VL.size()))
        if (isa<PoisonValue>(VL[I]))
          Mask[Part * VL.size() + I] = PoisonMaskElem;
      return TargetTransformInfo::SK_PermuteSingleSrc;
    }
    // No perfect match, just shuffle; take the first node from the tree.
    Entries.push_back(FirstEntries.front());
    VF = FirstEntries.front()->getVectorFactor();
    for (auto &P : UsedValuesEntry)
      P.second = 0;
  } else {
    // Try to find nodes with the same vector factor.
    assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
    // Keep the order of tree nodes to avoid non-determinism.
    DenseMap<int, const TreeEntry *> VFToTE;
    for (const TreeEntry *TE : UsedTEs.front()) {
      unsigned VF = TE->getVectorFactor();
      auto It = VFToTE.find(VF);
      if (It != VFToTE.end()) {
        if (It->second->Idx > TE->Idx)
          It->getSecond() = TE;
        continue;
      }
      VFToTE.try_emplace(VF, TE);
    }
    SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
                                                 UsedTEs.back().end());
    sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    for (const TreeEntry *TE : SecondEntries) {
      auto It = VFToTE.find(TE->getVectorFactor());
      if (It != VFToTE.end()) {
        VF = It->first;
        Entries.push_back(It->second);
        Entries.push_back(TE);
        break;
      }
    }
    // No 2 source vectors with the same vector factor; just pick the first
    // of each set.
    if (Entries.empty()) {
      Entries.push_back(*llvm::min_element(
          UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
            return TE1->Idx < TE2->Idx;
          }));
      Entries.push_back(SecondEntries.front());
      VF = std::max(Entries.front()->getVectorFactor(),
                    Entries.back()->getVectorFactor());
    } else {
      VF = Entries.front()->getVectorFactor();
    }
  }
  SmallVector<SmallPtrSet<Value *, 4>> ValuesToEntries;
  for (const TreeEntry *E : Entries)
    ValuesToEntries.emplace_back().insert(E->Scalars.begin(),
                                          E->Scalars.end());
  for (auto &P : UsedValuesEntry) {
    unsigned Idx = P.second;
    if (ValuesToEntries[Idx].contains(P.first))
      continue;
  }
  bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
  // Checks if the 2 PHIs are compatible in terms of high possibility to be
  // vectorized.
  auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
    auto *PHI = cast<PHINode>(V);
    auto *PHI1 = cast<PHINode>(V1);
    // Check that all incoming values are compatible/from the same parent (if
    // they are instructions).
    for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
      Value *In = PHI->getIncomingValue(I);
      Value *In1 = PHI1->getIncomingValue(I);
      if (isConstant(In) && isConstant(In1))
        continue;
      if (!getSameOpcode({In, In1}, *TLI))
        return false;
      if (cast<Instruction>(In)->getParent() !=
          cast<Instruction>(In1)->getParent())
        return false;
    }
    return true;
  };
  // Check if a value can be ignored during analysis of shuffled gathers:
  // values that do not form splats, are not vectorized or extractelements,
  // or may form a vector node in the future.
  auto MightBeIgnored = [=](Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    return I && !IsSplatOrUndefs && !isVectorized(I) &&
           !isVectorLikeInstWithConstOps(I) &&
           !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
  };
  // Check that the neighbor may form a full vector node with V: same opcode,
  // same parent and same source of values.
  auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
    Value *V1 = VL[Idx];
    bool UsedInSameVTE = false;
    auto It = UsedValuesEntry.find(V1);
    if (It != UsedValuesEntry.end())
      UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
    return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
           getSameOpcode({V, V1}, *TLI) &&
           cast<Instruction>(V)->getParent() ==
               cast<Instruction>(V1)->getParent() &&
           (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
  };
  // Build a shuffle mask for better cost estimation and vector emission.
  SmallBitVector UsedIdxs(Entries.size());
  SmallVector<std::pair<unsigned, int>> EntryLanes;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    auto It = UsedValuesEntry.find(V);
    if (It == UsedValuesEntry.end())
      continue;
    // Do not try to shuffle scalars that are constants or can be vectorized
    // as part of other vector tree nodes.
    if (isConstant(V) || (MightBeIgnored(V) &&
                          ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
                           (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
      continue;
    unsigned Idx = It->second;
    EntryLanes.emplace_back(Idx, I);
    UsedIdxs.set(Idx);
  }
  // Iterate through all shuffled scalars and select entries which can be used
  // for the final shuffle.
  SmallVector<const TreeEntry *> TempEntries;
  for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
    if (!UsedIdxs.test(I))
      continue;
    // Fix the entry number for the given scalar: at max 2 nodes are selected.
    for (std::pair<unsigned, int> &Pair : EntryLanes)
      if (Pair.first == I)
        Pair.first = TempEntries.size();
    TempEntries.push_back(Entries[I]);
  }
  Entries.swap(TempEntries);
  if (EntryLanes.size() == Entries.size() &&
      !VL.equals(ArrayRef(TE->Scalars)
                     .slice(Part * VL.size(),
                            std::min<int>(VL.size(), TE->Scalars.size())))) {
    // If the number of scalars equals the number of entries, the analysis is
    // not profitable: VL differs from TE->Scalars, so shuffles already exist.
    Entries.clear();
    return std::nullopt;
  }
  // Build the final mask and check for the identity shuffle, if possible.
  bool IsIdentity = Entries.size() == 1;
  // Pair.first is the offset to the vector, Pair.second the index of the
  // scalar in the list.
  for (const std::pair<unsigned, int> &Pair : EntryLanes) {
    unsigned Idx = Part * VL.size() + Pair.second;
    Mask[Idx] =
        Pair.first * VF +
        (ForOrder ? std::distance(
                        Entries[Pair.first]->Scalars.begin(),
                        find(Entries[Pair.first]->Scalars, VL[Pair.second]))
                  : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
    IsIdentity &= Mask[Idx] == Pair.second;
  }
  if (ForOrder || IsIdentity || Entries.empty()) {
    switch (Entries.size()) {
    case 1:
      if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
        return TargetTransformInfo::SK_PermuteSingleSrc;
      break;
    case 2:
      if (EntryLanes.size() > 2 || VL.size() <= 2)
        return TargetTransformInfo::SK_PermuteTwoSrc;
      break;
    default:
      break;
    }
  } else if (!isa<VectorType>(VL.front()->getType()) &&
             (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
    // Estimate whether the shuffle beats a buildvector.
    SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
                             std::next(Mask.begin(), (Part + 1) * VL.size()));
    int MinElement = SubMask.front(), MaxElement = SubMask.front();
    for (int Idx : SubMask) {
      if (Idx == PoisonMaskElem)
        continue;
      if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
        MinElement = Idx;
      if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
        MaxElement = Idx;
    }
    assert(MaxElement >= 0 && MinElement >= 0 &&
           MaxElement % VF >= MinElement % VF &&
           "Expected at least single element.");
    unsigned NewVF = std::max<unsigned>(
        VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
                                                 (MaxElement % VF) -
                                                     (MinElement % VF) + 1));
    if (NewVF < VF) {
      for (int &Idx : SubMask) {
        if (Idx == PoisonMaskElem)
          continue;
        Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
              (Idx >= static_cast<int>(VF) ? NewVF : 0);
      }
    } else {
      NewVF = VF;
    }
    constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
    auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
    auto GetShuffleCost = [&, &TTI = *TTI](ArrayRef<int> Mask,
                                           ArrayRef<const TreeEntry *> Entries,
                                           VectorType *VecTy) {
      if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
          ShuffleVectorInst::isDeInterleaveMaskOfFactor(
              Mask, Entries.front()->getInterleaveFactor()))
        return InstructionCost(TTI::TCC_Free);
      return ::getShuffleCost(TTI,
                              Entries.size() == 1 ? TTI::SK_PermuteSingleSrc
                                                  : TTI::SK_PermuteTwoSrc,
                              VecTy, Mask, CostKind);
    };
    InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
    InstructionCost FirstShuffleCost = 0;
    SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
    if (Entries.size() == 1 || !Entries[0]->isGather()) {
      FirstShuffleCost = ShuffleCost;
    } else {
      // Transform the mask to include only the first entry.
      APInt DemandedElts = APInt::getAllOnes(SubMask.size());
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(FirstMask)) {
        if (Idx >= static_cast<int>(NewVF)) {
          Idx = PoisonMaskElem;
        } else {
          DemandedElts.clearBit(I);
          IsIdentity &= static_cast<int>(I) == Idx;
        }
      }
      if (!IsIdentity)
        FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
      FirstShuffleCost += getScalarizationOverhead(
          *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
          /*Extract=*/false, CostKind);
    }
    InstructionCost SecondShuffleCost = 0;
    SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
    if (Entries.size() == 1 || !Entries[1]->isGather()) {
      SecondShuffleCost = ShuffleCost;
    } else {
      // Transform the mask to include only the second entry.
      APInt DemandedElts = APInt::getAllOnes(SubMask.size());
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(SecondMask)) {
        if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
          Idx = PoisonMaskElem;
        } else {
          DemandedElts.clearBit(I);
          if (Idx != PoisonMaskElem) {
            Idx -= NewVF;
            IsIdentity &= static_cast<int>(I) == Idx;
          }
        }
      }
      if (!IsIdentity)
        SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
      SecondShuffleCost += getScalarizationOverhead(
          *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
          /*Extract=*/false, CostKind);
    }
    APInt DemandedElts = APInt::getAllOnes(SubMask.size());
    for (auto [I, Idx] : enumerate(SubMask))
      if (Idx == PoisonMaskElem)
        DemandedElts.clearBit(I);
    InstructionCost BuildVectorCost = getScalarizationOverhead(
        *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
        /*Extract=*/false, CostKind);
    const TreeEntry *BestEntry = nullptr;
    if (FirstShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx >= static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                    });
      BestEntry = Entries.front();
      ShuffleCost = FirstShuffleCost;
    }
    if (SecondShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx < static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                      else
                        Idx -= VF;
                    });
      BestEntry = Entries[1];
      ShuffleCost = SecondShuffleCost;
    }
    if (BuildVectorCost >= ShuffleCost) {
      if (BestEntry) {
        Entries.clear();
        Entries.push_back(BestEntry);
      }
      return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
                                : TargetTransformInfo::SK_PermuteSingleSrc;
    }
  }
  Entries.clear();
  // Clear the corresponding mask elements.
  std::fill(std::next(Mask.begin(), Part * VL.size()),
            std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
  return std::nullopt;
}
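// Note: isGatherShuffledSingleRegisterEntry tries to serve a gather from at
// most two existing tree entries. It collects candidate entries per scalar,
// intersects them down to at most two sets, builds the combined mask, and
// only reports a shuffle kind when the shuffle (by TTI cost) beats rebuilding
// the vector with insertelements.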
SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
BoUpSLP::isGatherShuffledEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
    unsigned NumParts, bool ForOrder) {
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  Entries.clear();
  // No need to check for the topmost gather node.
  if (TE == VectorizableTree.front().get() &&
      (!GatheredLoadsEntriesFirst.has_value() ||
       none_of(ArrayRef(VectorizableTree).drop_front(),
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather();
               })))
    return {};
  // FIXME: gathering for non-power-of-2 (non whole registers) nodes is not
  // implemented yet.
  if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
    return {};
  Mask.assign(VL.size(), PoisonMaskElem);
  assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
         "Expected only single user of the gather node.");
  assert(VL.size() % NumParts == 0 &&
         "Number of scalars must be divisible by NumParts.");
  if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
      TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
      (TE->Idx == 0 ||
       (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
       isSplat(TE->Scalars) ||
       getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars)))
    return {};
  unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
  SmallVector<std::optional<TTI::ShuffleKind>> Res;
  for (unsigned Part : seq<unsigned>(NumParts)) {
    ArrayRef<Value *> SubVL =
        VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
    SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
                                            ForOrder);
    if (!SubRes)
      SubEntries.clear();
    Res.push_back(SubRes);
    if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
        SubEntries.front()->getVectorFactor() == VL.size() &&
        (SubEntries.front()->isSame(TE->Scalars) ||
         SubEntries.front()->isSame(VL))) {
      SmallVector<const TreeEntry *> LocalSubEntries;
      LocalSubEntries.swap(SubEntries);
      Entries.clear();
      Res.clear();
      std::iota(Mask.begin(), Mask.end(), 0);
      // Clear undef scalars.
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I]))
          Mask[I] = PoisonMaskElem;
      Entries.emplace_back(1, LocalSubEntries.front());
      Res.push_back(TargetTransformInfo::SK_PermuteSingleSrc);
      return Res;
    }
  }
  if (all_of(Res,
             [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
    Entries.clear();
    return {};
  }
  return Res;
}
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                       Type *ScalarTy) const {
  const unsigned VF = VL.size();
  auto *VecTy = getWidenedType(ScalarTy, VF);
  // Find the cost of inserting/extracting values from the vector.
  APInt DemandedElements = APInt::getZero(VF);
  InstructionCost Cost;
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    if (V->getType() != ScalarTy)
      Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
                                    TTI::CastContextHint::None, CostKind);
  };
  SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
  std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
  for (auto [I, V] : enumerate(VL)) {
    // No need to shuffle duplicates for constants.
    if (isConstant(V)) {
      ConstantShuffleMask[I] = I + VF;
      continue;
    }
    DemandedElements.setBit(I);
    EstimateInsertCost(I, V);
  }
  bool IsAnyNonUndefConst = any_of(
      ConstantShuffleMask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
  if (!ForPoisonSrc && IsAnyNonUndefConst)
    Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteTwoSrc, VecTy,
                             ConstantShuffleMask);
  if (!DemandedElements.isZero())
    Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
                                     /*Insert=*/true, /*Extract=*/false,
                                     CostKind,
                                     ForPoisonSrc && !IsAnyNonUndefConst, VL);
  return Cost;
}
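// Note: getGatherCost models a buildvector as scalarization overhead for the
// demanded lanes, plus an optional two-source shuffle when non-undef
// constants can be pre-packed into a constant vector, plus trunc costs for
// scalars whose type differs from the node's scalar type.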
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto It = EntryToLastInstruction.find(E);
  if (It != EntryToLastInstruction.end())
    return *It->second;
  // Get the basic block this bundle is in. All instructions in the bundle
  // should be in this block (except for extractelement-like instructions with
  // constant indices, gathered loads, and copyables).
  Instruction *Front = nullptr;
  unsigned Opcode = 0;
  if (E->hasState()) {
    Front = E->getMainOp();
    Opcode = E->getOpcode();
  } else {
    Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>));
  }
  BasicBlock *BB = Front->getParent();
  assert(
      ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
        E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
       E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
       all_of(E->Scalars,
              [=](Value *V) -> bool {
                if (Opcode == Instruction::GetElementPtr &&
                    !isa<GetElementPtrInst>(V))
                  return true;
                auto *I = dyn_cast<Instruction>(V);
                return !I || !E->getMatchingMainOpOrAltOp(I) ||
                       I->getParent() == BB || isVectorLikeInstWithConstOps(I);
              })) &&
      "Expected gathered loads or GEPs or instructions from same basic "
      "block.");
  auto FindLastInst = [&]() {
    Instruction *LastInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (E->isCopyableElement(I))
        continue;
      if (LastInst->getParent() == I->getParent()) {
        if (LastInst->comesBefore(I))
          LastInst = I;
        continue;
      }
      assert(((Opcode == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              E->State == TreeEntry::SplitVectorize ||
              (isVectorLikeInstWithConstOps(LastInst) &&
               isVectorLikeInstWithConstOps(I)) ||
              (GatheredLoadsEntriesFirst.has_value() &&
               Opcode == Instruction::Load && E->isGather() &&
               E->Idx < *GatheredLoadsEntriesFirst)) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(LastInst->getParent())) {
        LastInst = I;
        continue;
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        continue;
      auto *NodeA = DT->getNode(LastInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
        LastInst = I;
    }
    return LastInst;
  };
  auto FindFirstInst = [&]() {
    Instruction *FirstInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (E->isCopyableElement(I))
        continue;
      if (FirstInst->getParent() == I->getParent()) {
        if (I->comesBefore(FirstInst))
          FirstInst = I;
        continue;
      }
      assert(((Opcode == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              (isVectorLikeInstWithConstOps(FirstInst) &&
               isVectorLikeInstWithConstOps(I))) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(FirstInst->getParent())) {
        FirstInst = I;
        continue;
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        continue;
      auto *NodeA = DT->getNode(FirstInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
        FirstInst = I;
    }
    return FirstInst;
  };
  Instruction *Res = nullptr;
  if (E->State == TreeEntry::SplitVectorize) {
    Res = FindLastInst();
    for (auto *E : Entries) {
      Instruction *I = &getLastInstructionInBundle(E);
      if (Res->comesBefore(I))
        Res = I;
    }
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }
  // Set the insert point for gathered loads to the very first load.
  if (GatheredLoadsEntriesFirst.has_value() &&
      E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
      Opcode == Instruction::Load) {
    Res = FindFirstInst();
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }
  auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
    if (E->isGather())
      return nullptr;
    const auto *It = BlocksSchedules.find(BB);
    if (It == BlocksSchedules.end())
      return nullptr;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
      if (Bundles.empty())
        continue;
      const auto *It = find_if(
          Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
      if (It != Bundles.end())
        return *It;
    }
    return nullptr;
  };
  const ScheduleBundle *Bundle = FindScheduleBundle(E);
  if (!E->isGather() && !Bundle) {
    if ((Opcode == Instruction::GetElementPtr &&
         any_of(E->Scalars,
                [](Value *V) {
                  return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
                })) ||
        all_of(E->Scalars, [&](Value *V) {
          return isa<PoisonValue>(V) ||
                 (E->Idx == 0 && isa<InsertElementInst>(V)) ||
                 E->isCopyableElement(V) ||
                 (!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V));
        }))
      Res = FindLastInst();
    else
      Res = FindFirstInst();
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }
  // If the bundle was scheduled, the last instruction is the last scheduled
  // member of the bundle.
  if (Bundle) {
    assert(!E->isGather() && "Gathered instructions should not be scheduled");
    Res = Bundle->getBundle().back()->getInst();
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }
  // Res can still be null here if buildTree_rec aborted for various reasons
  // (maximum recursion depth, maximum region size, etc.); fall back to a
  // dominator-ordered scan.
  Res = FindLastInst();
  assert(Res && "Failed to find last instruction in bundle");
  EntryToLastInstruction.try_emplace(E, Res);
  return *Res;
}
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  BasicBlock::iterator LastInstIt = LastInst->getIterator();
  // If the instruction is a PHI, set the insert point after all the PHIs.
  bool IsPHI = isa<PHINode>(LastInst);
  if (IsPHI) {
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
    if (LastInstIt != LastInst->getParent()->end() &&
        LastInstIt->getParent()->isLandingPad())
      LastInstIt = std::next(LastInstIt);
  }
  if (IsPHI ||
      (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
       E->doesNotNeedToSchedule()) ||
      (GatheredLoadsEntriesFirst.has_value() &&
       E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
       E->getOpcode() == Instruction::Load)) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  } else {
    // Set the insertion point after the last instruction in the bundle.
    Builder.SetInsertPoint(
        LastInst->getParent(),
        LastInst->getNextNonDebugInstruction()->getIterator());
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
Value *BoUpSLP::gather(
    ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  // Instructions from the current block and/or the blocks that are part of
  // the current loop are inserted at the end, to keep loop-invariant parts of
  // the buildvector hoistable.
  SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(I).second)
        PostponedInsts.emplace_back(Inst, I);
  }
  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      Type *Ty) {
    Value *Scalar = V;
    if (Scalar->getType() != Ty) {
      assert(Scalar->getType()->isIntOrIntVectorTy() &&
             Ty->isIntOrIntVectorTy() && "Expected integer types only.");
      Scalar = Builder.CreateIntCast(
          Scalar, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
    }
    Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
    auto *InsElt = dyn_cast<InsertElementInst>(Vec);
    if (!InsElt)
      return Vec;
    GatherShuffleExtractSeq.insert(InsElt);
    CSEBlocks.insert(InsElt->getParent());
    // Add to our 'need-to-extract' list.
    if (isa<Instruction>(V)) {
      if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V); !Entries.empty()) {
        // Find which lane we need to extract.
        User *UserOp = nullptr;
        if (Scalar != V) {
          if (V->getType()->isVectorTy()) {
            if (auto *SV = dyn_cast<ShuffleVectorInst>(Scalar);
                SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
              // Find the shufflevector caused by the resize.
              auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
                if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
                  if (SV->getOperand(0) == V)
                    return SV;
                  if (SV->getOperand(1) == V)
                    return SV;
                }
                return nullptr;
              };
              if (Instruction *User = FindOperand(SV->getOperand(0), V))
                UserOp = User;
              else if (Instruction *User = FindOperand(SV->getOperand(1), V))
                UserOp = User;
              assert(UserOp &&
                     "Failed to find shufflevector, caused by resize.");
            }
          } else if (auto *CI = dyn_cast<CastInst>(Scalar)) {
            UserOp = CI;
          }
        } else {
          UserOp = InsElt;
        }
        if (UserOp) {
          unsigned FoundLane = Entries.front()->findLaneForValue(V);
          ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
        }
      }
    }
    return Vec;
  };
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  Value *Vec = PoisonValue::get(VecTy);
  SmallVector<int> NonConsts;
  SmallVector<int> Mask(VL.size());
  std::iota(Mask.begin(), Mask.end(), 0);
  Value *OriginalRoot = Root;
  if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
      SV && isa<PoisonValue>(SV->getOperand(1)) &&
      SV->getOperand(0)->getType() == VecTy) {
    Root = SV->getOperand(0);
    Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
  }
  // Insert constant values first.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (PostponedIndices.contains(I))
      continue;
    if (!isConstant(VL[I])) {
      NonConsts.push_back(I);
      continue;
    }
    if (isa<PoisonValue>(VL[I]))
      continue;
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
    Mask[I] = I + E;
  }
  if (Root) {
    if (isa<PoisonValue>(Vec)) {
      Vec = OriginalRoot;
    } else {
      Vec = CreateShuffle(Root, Vec, Mask);
      if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
          OI && OI->use_empty() &&
          none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->VectorizedValue == OI;
          }))
        eraseInstruction(OI);
    }
  }
  // Insert the non-constant values.
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append postponed (possibly loop-resident) instructions at the end.
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
  return Vec;
}
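// Note: gather() inserts constants first (folding them into a constant-vector
// shuffle when a Root vector is given), then the remaining scalars, and
// postpones loop-resident values to the end so that loop-invariant parts of
// the buildvector can still be hoisted.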
  bool IsFinalized = false;

  /// Helper IR builder used by BaseShuffleAnalysis to emit real shuffles.
  class ShuffleIRBuilder {
    // ... (members: Builder, GatherShuffleExtractSeq, CSEBlocks, DL)
  public:
    ShuffleIRBuilder(IRBuilderBase &Builder,
                     SetVector<Instruction *> &GatherShuffleExtractSeq,
                     DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
          CSEBlocks(CSEBlocks), DL(DL) {}
    ~ShuffleIRBuilder() = default;
    /// Creates a shufflevector of the two operands with the given mask.
    Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
      if (V1->getType() != V2->getType()) {
        assert(V1->getType()->isIntOrIntVectorTy() &&
               V2->getType()->isIntOrIntVectorTy() &&
               "Expected integer vector types only.");
        // Widen the narrower operand to the wider element type.
        if (/* ... V2 element width < V1 element width, via
               ->getIntegerBitWidth() ... */)
          V2 = Builder.CreateIntCast(/* ... */);
        else
          V1 = Builder.CreateIntCast(/* ... */);
      }
      Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    /// Creates a single-source shufflevector.
    Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
      // ...
      unsigned VF = Mask.size();
      // ... (skip the shuffle entirely for identity masks)
      Value *Vec = Builder.CreateShuffleVector(V1, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    Value *createIdentity(Value *V) { return V; }
    Value *createPoison(Type *Ty, unsigned VF) {
      // ... (poison vector of VF elements of Ty)
    }
    /// Resizes the two inputs to the same vector factor: the smaller vector
    /// is widened (poison-padded) to the size of the larger one.
    void resizeToMatch(Value *&V1, Value *&V2) {
      // ...
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      SmallVector<int> IdentityMask(VF, PoisonMaskElem);
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
                0);
      Value *&Op = MinVF == V1VF ? V1 : V2;
      Op = Builder.CreateShuffleVector(Op, IdentityMask);
      if (auto *I = dyn_cast<Instruction>(Op)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }
  };

  /// Creates a shuffle of V1 and V2 with the given mask.
  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
                                                       ShuffleBuilder,
                                                       ScalarTy);
  }

  /// Casts \p V to a vector of ScalarTy elements if the element types differ
  /// (needed for minimal-bitwidth-adjusted operands).
  Value *castToScalarTyElem(Value *V,
                            std::optional<bool> IsSigned = std::nullopt) {
    // ...
    if (VecTy->getElementType() == ScalarTy->getScalarType())
      return V;
    return Builder.CreateIntCast(
        V,
        VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
        /* ... signedness ... */);
  }

  Value *getVectorizedValue(const TreeEntry &E) {
    Value *Vec = E.VectorizedValue;
    // ...
    return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
                                return !isa<PoisonValue>(V) &&
                                       !isKnownNonNegative(
                                           V, SimplifyQuery(*R.DL));
                              }));
  }

public:
  ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
      : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
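// Note: castToScalarTyElem and getVectorizedValue exist because of the
// minimal-bitwidth analysis. A tree entry may have been vectorized at a
// narrower integer type than ScalarTy, so every value pulled from another
// entry is re-cast to <N x ScalarTy> before it participates in a shuffle,
// with signedness derived from isKnownNonNegative over the entry's scalars.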
  /// Adjusts extractelements after reusing them instead of rebuilding the
  /// whole vector. Returns the common vector base when every extract shares
  /// it, otherwise switches the builder to multi-base mode.
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    // ...
    Value *VecBase = nullptr;
    // ...
    if (!E->ReorderIndices.empty()) {
      // ... (apply E->ReorderIndices to the mask)
    }
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      // ...
      VecBase = EI->getVectorOperand();
      if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
        VecBase = TEs.front()->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(VecBase);
      // If the extract is still used elsewhere (or may be reused), keep it;
      // otherwise erase the now-redundant scalar extract.
      if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
          (NumParts != 1 && count(VL, EI) > 1) ||
          any_of(EI->users(), [&](User *U) {
            ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
            return UTEs.empty() || UTEs.size() > 1 ||
                   (isa<GetElementPtrInst>(U) &&
                    !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                   count_if(R.VectorizableTree,
                            [&](const std::unique_ptr<TreeEntry> &TE) {
                              return TE->UserTreeIndex.UserTE ==
                                     /* ... */;
                            }) /* ... */;
          }) /* ... */ || is_contained(VL, EI))
        continue;
      R.eraseInstruction(EI);
    }
    if (NumParts == 1 || UniqueBases.size() == 1) {
      assert(VecBase && "Expected vectorized value.");
      return castToScalarTyElem(VecBase);
    }
    UseVecBaseAsInput = true;
    // ...
    // Perform multi-register analysis: handle each NumParts slice of the
    // mask separately and combine the per-part results.
    Value *Vec = nullptr;
    unsigned NewVF = 0;
    for (unsigned Part : seq<unsigned>(NumParts)) {
      // ... (SubVL / SubMask are the slices of VL and Mask for this part)
      constexpr int MaxBases = 2;
      SmallVector<Value *, MaxBases> Bases(MaxBases);
      auto VLMask = zip(SubVL, SubMask);
      const unsigned VF = std::accumulate(
          VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
            if (std::get<1>(D) == PoisonMaskElem)
              return S;
            Value *VecOp =
                cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
            if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
                !TEs.empty())
              VecOp = TEs.front()->VectorizedValue;
            assert(VecOp && "Expected vectorized value.");
            const unsigned Size =
                cast<FixedVectorType>(VecOp->getType())->getNumElements();
            return std::max(S, Size);
          });
      for (const auto [V, I] : VLMask) {
        // ...
        VecOp = TEs.front()->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        VecOp = castToScalarTyElem(VecOp);
        Bases[I / VF] = VecOp;
      }
      if (!Bases.front())
        continue;
      Value *SubVec;
      if (Bases.back()) {
        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
        TransformToIdentity(SubMask);
      } else {
        SubVec = Bases.front();
      }
      // ...
      ArrayRef<int> SubMask = Mask.slice(P * SliceSize, /* ... */);
      assert(all_of(SubMask, [](int Idx) { /* ... */ }) &&
             "Expected first part or all previous parts masked.");
      copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
      // ...
      unsigned SubVecVF =
          cast<FixedVectorType>(SubVec->getType())->getNumElements();
      NewVF = std::max(NewVF, SubVecVF);
      // Adjust SubMask to point past the first input.
      for (int &Idx : SubMask)
        // ...
      copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
      Vec = createShuffle(Vec, SubVec, VecMask);
      TransformToIdentity(VecMask);
    }
    // ...
  }
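// The multi-register path above works part by part: each mask slice picks up
// to two vectorized bases, shuffles them into a SubVec, rewrites the slice to
// an identity mask, and folds SubVec into the accumulated Vec, so at most one
// two-source shuffle per part is emitted.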
  /// Checks whether entry \p E must be delayed because some of its dependency
  /// entries have not been vectorized yet.
  std::optional<Value *>
  needToDelay(const TreeEntry *E,
              ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
    if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
          return all_of(
              TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
        }))
      return std::nullopt;
    // Postpone gather emission: return a stub value, replaced once the
    // dependencies are materialized.
    auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
    return Builder.CreateAlignedLoad(
        ResVecTy, /* ... poison pointer placeholder ... */, MaybeAlign());
  }
  /// Resets the builder so the same node can be re-emitted.
  void resetForSameNode() {
    IsFinalized = false;
    CommonMask.clear();
    // ...
  }
  /// Adds two input vectors (as tree entries) and the mask for their shuffle.
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    Value *V1 = getVectorizedValue(E1);
    Value *V2 = getVectorizedValue(E2);
    add(V1, V2, Mask);
  }
  /// Adds a single input vector (as a tree entry) and its shuffle mask.
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    Value *V1 = getVectorizedValue(E1);
    add(V1, Mask);
  }
  /// Adds two input vectors and the mask for their shuffle.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
    assert(isa<FixedVectorType>(V1->getType()) &&
           isa<FixedVectorType>(V2->getType()) &&
           "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    V2 = castToScalarTyElem(V2);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      InVectors.push_back(V2);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (/* ... Vec narrower than CommonMask ... */) {
      Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    V1 = createShuffle(V1, V2, Mask);
    unsigned VF = std::max(getVF(V1), getVF(Vec));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx + VF;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(V1);
  }
  /// Adds one more input vector and the mask for the shuffle.
  void add(Value *V1, ArrayRef<int> Mask, bool /*ForExtracts*/ = false) {
    assert(isa<FixedVectorType>(V1->getType()) &&
           "castToScalarTyElem expects V1 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    const auto *It = find(InVectors, V1);
    if (It == InVectors.end()) {
      if (InVectors.size() == 2 ||
          InVectors.front()->getType() != V1->getType()) {
        Value *V = InVectors.front();
        if (InVectors.size() == 2) {
          V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        } else if (/* ... V narrower than */ CommonMask.size()) {
          V = createShuffle(InVectors.front(), nullptr, CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        }
        unsigned VF = std::max(CommonMask.size(), Mask.size());
        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
          if (/* ... lane comes from the new input ... */)
            CommonMask[Idx] = V->getType() != V1->getType()
                                  ? /* ... */
                                  : Mask[Idx] + getVF(V1);
        if (V->getType() != V1->getType())
          V1 = createShuffle(V1, nullptr, Mask);
        InVectors.front() = V;
        if (InVectors.size() == 2)
          InVectors.back() = V1;
        else
          InVectors.push_back(V1);
        return;
      }
      // A second slot is only needed when the new mask uses lanes not
      // already covered by the first input.
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (/* ... */) {
          InVectors.push_back(V1);
          break;
        }
    }
    unsigned VF = 0;
    for (Value *V : InVectors)
      VF = std::max(VF, getVF(V));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (/* ... */)
        CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    return R.gather(VL, Root, ScalarTy,
                    [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
                      return createShuffle(V1, V2, Mask);
                    });
  }
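// The builder keeps at most two live inputs in InVectors plus a CommonMask
// over them. Each add() either records a fresh input pair, or folds the
// current pair into one vector with createShuffle and re-expresses
// CommonMask as an identity over the folded result before attaching the new
// input; finalize() below performs the last fold.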
  /// Finalizes the shuffle sequence: folds remaining inputs, inserts the
  /// subvectors, applies the extension mask, and runs the optional action.
  Value *
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           /* ... optional Action callback ... */) {
    IsFinalized = true;
    if (/* Action is set */) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        // ...
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      // ... (resize Vec to VF if it is narrower)
      SmallVector<int> ResizeMask(VF, PoisonMaskElem);
      std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
      Vec = createShuffle(Vec, nullptr, ResizeMask);
      Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
        return createShuffle(V1, V2, Mask);
      });
      InVectors.front() = Vec;
    }
    if (!SubVectors.empty()) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        // ...
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      auto CreateSubVectors = [&](Value *Vec,
                                  SmallVectorImpl<int> &CommonMask) {
        for (auto [E, Idx] : SubVectors) {
          Value *V = getVectorizedValue(*E);
          // ...
          Type *OrigScalarTy = ScalarTy;
          Vec = createInsertVector(
              Builder, Vec, V, InsertionIndex,
              std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1,
                        _2, /* ... */));
          ScalarTy = OrigScalarTy;
          if (!CommonMask.empty()) {
            std::iota(std::next(CommonMask.begin(), Idx),
                      std::next(CommonMask.begin(),
                                Idx + E->getVectorFactor()),
                      /* ... */);
          }
        }
        return Vec;
      };
      if (SubVectorsMask.empty()) {
        Vec = CreateSubVectors(Vec, CommonMask);
      } else {
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (/* ... */)
            I1 = I2 + CommonMask.size();
        }
        // ... (build the subvector side into a poison base)
        Vec = createShuffle(InsertVec, Vec, SVMask);
        transformMaskAfterShuffle(CommonMask, SVMask);
      }
      InVectors.front() = Vec;
    }
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }

  ~ShuffleInstructionBuilder() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
  // ...
}

template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {
  assert(E->isGather() && "Expected gather node.");
  unsigned VF = E->getVectorFactor();
  bool NeedFreeze = false;
  // ...
  // Clear the scalars covered by combined entries; they are rebuilt as
  // subvector inserts.
  for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
    // ... (poison out GatheredScalars
    //      .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()))
  SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
      E->CombinedEntriesWithIndices.size());
  transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
            [&](const auto &P) {
              return std::make_pair(VectorizableTree[P.first].get(), P.second);
            });
  SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                               E->ReorderIndices.end());
  if (!ReorderMask.empty())
    // ... (reorder GatheredScalars per the mask)
  // Keep only lanes in SubVectorsMask that are not already correct.
  if (!SubVectors.empty() && !SubVectorsMask.empty()) {
    for (/* ... each lane I ... */)
      if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
        // ... (mark the lane as poison in SubVectorsMask)
  } else {
    SubVectorsMask.clear();
  }
  // Checks whether a mask slice can reuse an already-emitted splat instead of
  // building a new vector.
  auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
                             unsigned I, unsigned SliceSize,
                             bool IsNotPoisonous) {
    if (/* ... */ [](Value *V) {
          return isa<UndefValue>(V) && !isa<PoisonValue>(V);
        } /* ... */)
      // ...
    TreeEntry *UserTE = E->UserTreeIndex.UserTE;
    unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
    if (UserTE->getNumOperands() != 2)
      return false;
    if (!IsNotPoisonous) {
      // Look for the sibling operand of the same user node.
      auto *It =
          find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
                  [=](const std::unique_ptr<TreeEntry> &TE) {
                    return TE->UserTreeIndex.UserTE == UserTE &&
                           TE->UserTreeIndex.EdgeIdx != EdgeIdx;
                  });
      if (It == VectorizableTree.end())
        return false;
      // ...
      if (!(*It)->ReorderIndices.empty()) {
        // ...
      }
      if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
            Value *V0 = std::get<0>(P);
            Value *V1 = std::get<1>(P);
            // ... (poison/undef compatibility of the two lanes)
          }))
        return false;
    }
    if ((Mask.size() < InputVF &&
         /* ... extract-subvector mask starting at 0 ... */) ||
        (Mask.size() == InputVF &&
         /* ... identity mask ... */)) {
      std::iota(std::next(Mask.begin(), I * SliceSize),
                std::next(Mask.begin(), /* ... */), 0);
    } else {
      // ...
      std::iota(std::next(Mask.begin(), I * SliceSize),
                std::next(Mask.begin(), /* ... */), /* ... */);
    }
    return true;
  };
  BVTy ShuffleBuilder(ScalarTy, Params...);
  ResTy Res = ResTy();
  SmallVector<int> Mask;
  SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
  Value *ExtractVecBase = nullptr;
  bool UseVecBaseAsInput = false;
  SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  Type *OrigScalarTy = GatheredScalars.front()->getType();
  unsigned NumParts = /* ... register-based split of the node ... */;
  bool Resized = false;
  // Check for gathered extracts.
  ExtractShuffles =
      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  if (!ExtractShuffles.empty()) {
    SmallVector<const TreeEntry *> ExtractEntries;
    for (auto [Idx, I] : enumerate(ExtractMask)) {
      if (I == PoisonMaskElem)
        continue;
      // ...
      ExtractEntries.append(TEs.begin(), TEs.end());
    }
    if (std::optional<ResTy> Delayed =
            ShuffleBuilder.needToDelay(E, ExtractEntries)) {
      // Delay emission of gathers which are not ready yet.
      PostponedGathers.insert(E);
      return *Delayed;
    }
    if (Value *VecBase = ShuffleBuilder.adjustExtracts(
            E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
      ExtractVecBase = VecBase;
      // ... (pad GatheredScalars up to the base vector factor)
      if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) {
        Resized = true;
        GatheredScalars.append(VF - GatheredScalars.size(),
                               PoisonValue::get(OrigScalarTy));
      }
    }
  }
  // Look for full-graph matches only when there is a chance they exist.
  if (!ExtractShuffles.empty() || !E->hasState() ||
      E->getOpcode() != Instruction::Load ||
      (((E->hasState() && E->getOpcode() == Instruction::Load) ||
        /* ... any scalar is a load ... */) &&
       any_of(E->Scalars, [this](Value *V) {
         return isa<LoadInst>(V) && isVectorized(V);
       })) ||
      (E->hasState() && E->isAltShuffle()) ||
      all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
      /* ... */ ||
      (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
    GatherShuffles =
        isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
  }
  if (!GatherShuffles.empty()) {
    if (std::optional<ResTy> Delayed = ShuffleBuilder.needToDelay(E, Entries)) {
      // Delay emission of gathers which are not ready yet.
      PostponedGathers.insert(E);
      return *Delayed;
    }
    if (GatherShuffles.size() == 1 &&
        /* ... single-source shuffle ... */ &&
        Entries.front().front()->isSame(E->Scalars)) {
      // Perfect match in the graph: reuse the previously vectorized node.
      LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
                        /* ... */ << ".\n");
      // Restore the mask for previously (partially) matched values.
      Mask.resize(E->Scalars.size());
      const TreeEntry *FrontTE = Entries.front().front();
      if (FrontTE->ReorderIndices.empty() &&
          ((FrontTE->ReuseShuffleIndices.empty() &&
            E->Scalars.size() == FrontTE->Scalars.size()) ||
           (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
        std::iota(Mask.begin(), Mask.end(), 0);
      } else {
        for (auto [I, V] : enumerate(E->Scalars)) {
          // ... (poison lanes stay poison)
          Mask[I] = FrontTE->findLaneForValue(V);
        }
      }
      // Reset the builder to correctly handle perfect-diamond-matched nodes.
      ShuffleBuilder.resetForSameNode();
      ShuffleBuilder.add(*FrontTE, Mask);
      Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
      return Res;
    }
    if (!Resized) {
      if (GatheredScalars.size() != VF &&
          any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
            return any_of(TEs, [&](const TreeEntry *TE) {
              return TE->getVectorFactor() == VF;
            });
          }))
        GatheredScalars.append(VF - GatheredScalars.size(),
                               PoisonValue::get(OrigScalarTy));
    }
    // Remove shuffled elements from the list of gathers.
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      if (Mask[I] != PoisonMaskElem)
        GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
    }
  }
  // Packs the scalars for gathering: collapses repeats and splats into a
  // shuffle mask (ReuseMask) over a compacted scalar list.
  auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
                            SmallVectorImpl<int> &ReuseMask,
                            bool IsRootPoison) {
    // For splats we can emit broadcasts instead of gathers.
    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
                   /* ... */;
    // ...
    SmallVector<int> UndefPos;
    DenseMap<Value *, unsigned> UniquePositions;
    // Gather unique non-constant values and all constant values; repeated
    // values are just re-shuffled.
    int NumNonConsts = 0;
    // ... (per-lane classification: constants keep their lane, undefs are
    //      recorded in UndefPos, non-constants are compacted)
    if (IsSplat) {
      Scalars.front() = OrigV;
      // ...
    } else {
      const auto Res = UniquePositions.try_emplace(OrigV, I);
      Scalars[Res.first->second] = OrigV;
      ReuseMask[I] = Res.first->second;
    }
    if (NumNonConsts == 1) {
      // Restore the single insert element.
      // ...
      if (!UndefPos.empty() && UndefPos.front() == 0)
        // ... (the front lane must stay undef)
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
      // For undef values, try a simple broadcast: legal when the broadcast
      // value is known non-poisonous, or when the value is already reused by
      // the same user operation in another lane.
      auto *It = find_if(Scalars, [this, E](Value *V) {
        return /* ... known non-poison ... */ ||
               (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
                  return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
                         is_contained(E->UserTreeIndex.UserTE->Scalars,
                                      U.getUser());
                }));
      });
      if (It != Scalars.end()) {
        // Replace undefs by the non-poisoned scalar and emit a broadcast.
        int Pos = std::distance(Scalars.begin(), It);
        for (int I : UndefPos) {
          ReuseMask[I] = Pos;
          // ...
        }
      } else {
        // Replace undefs by poisons; the broadcast is then frozen.
        for (int I : UndefPos) {
          // ...
        }
        NeedFreeze = true;
      }
    }
  };
  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = true;
    Value *Vec1 = nullptr;
    if (!ExtractShuffles.empty()) {
      // A gather of extractelements can be represented as a shuffle of the
      // (at most two) vectors the scalars are extracted from.
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
        // ... (lanes already covered by GatherShuffles become poison)
      }
      if (UseVecBaseAsInput) {
        Vec1 = ExtractVecBase;
      } else {
        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
          // ...
          Value *VecOp = EI->getVectorOperand();
          if (/* ... */ !TEs.empty() && TEs.front()->VectorizedValue)
            VecOp = TEs.front()->VectorizedValue;
          if (!Vec1) {
            Vec1 = VecOp;
          } else if (Vec1 != VecOp) {
            assert((!Vec2 || Vec2 == VecOp) &&
                   "Expected only 1 or 2 vectors shuffle.");
            Vec2 = VecOp;
          }
        }
      }
      if (Vec2) {
        IsUsedInExpr = false;
        // ...
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
      } else if (Vec1) {
        // ...
        IsUsedInExpr &= FindReusedSplat(
            ExtractMask, /* ... input VF ... */, /*I=*/0,
            ExtractMask.size(), IsNotPoisonedVec);
        ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
        IsNonPoisoned &= IsNotPoisonedVec;
      } else {
        IsUsedInExpr = false;
        // ... (all-poison extract base)
      }
    }
    if (!GatherShuffles.empty()) {
      unsigned SliceSize = /* ... per-part slice of the mask ... */;
      for (const auto [I, TEs] : enumerate(Entries)) {
        if (TEs.empty()) {
          assert(/* ... */ "No shuffles with empty entries list expected.");
          continue;
        }
        assert((TEs.size() == 1 || TEs.size() == 2) &&
               "Expected shuffle of 1 or 2 entries.");
        // ...
        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
        if (TEs.size() == 1) {
          bool IsNotPoisonedVec =
              TEs.front()->VectorizedValue
                  ? /* ... known non-poison ... */
                  : true;
          IsUsedInExpr &=
              FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
                              SliceSize, IsNotPoisonedVec);
          ShuffleBuilder.add(*TEs.front(), VecMask);
          IsNonPoisoned &= IsNotPoisonedVec;
        } else {
          IsUsedInExpr = false;
          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
            // ... (track poison status of both inputs)
        }
      }
    }
    // Decide between one combined shuffle plus inserts, or several shuffles.
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
    bool IsIdentityShuffle =
        ((UseVecBaseAsInput ||
          all_of(ExtractShuffles,
                 [](const std::optional<TTI::ShuffleKind> &SK) {
                   return /* ... single-source ... */;
                 })) &&
         none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         /* ... identity ExtractMask ... */) ||
        (!GatherShuffles.empty() &&
         all_of(GatherShuffles,
                [](const std::optional<TTI::ShuffleKind> &SK) {
                  return /* ... single-source ... */;
                }) &&
         none_of(Mask, [&](int I) { return I >= MSz; }) &&
         /* ... identity Mask ... */);
    bool EnoughConstsForShuffle =
        /* ... enough constant lanes to justify a separate constant vector,
           unless the shuffle is identity over few scalars ... */
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 && /* ... */));
    // Keep constants in GatheredScalars, move the rest to NonConstants.
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
        // ... (lane stays in the constant vector)
    }
    // ...
    if (/* ... constants should be packed separately ... */) {
      SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
      TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
      Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
      ShuffleBuilder.add(BV, BVMask);
    }
    if (all_of(NonConstants, IsaPred<PoisonValue>) ||
        (IsSingleShuffle && ((IsIdentityShuffle && IsNonPoisoned) ||
                             /* ... */)))
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                    SubVectorsMask);
    else
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, SubVectors, SubVectorsMask,
          E->Scalars.size(),
          [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
            bool IsSplat = isSplat(NonConstants);
            SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
            TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
            auto CheckIfSplatIsProfitable = [&]() {
              // Estimate the cost of broadcast + blend vs. insert + shuffle.
              constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
              Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
              if (isa<ExtractElementInst>(V) || isVectorized(V))
                return false;
              InstructionCost SplatCost = TTI->getVectorInstrCost(
                  Instruction::InsertElement, VecTy, CostKind, /*Index=*/0,
                  PoisonValue::get(VecTy), V);
              SmallVector<int> NewMask(Mask.begin(), Mask.end());
              for (auto [Idx, I] : enumerate(BVMask))
                if (I != PoisonMaskElem)
                  NewMask[Idx] = Mask.size();
              SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
                                            NewMask, CostKind);
              InstructionCost BVCost = TTI->getVectorInstrCost(
                  Instruction::InsertElement, VecTy, CostKind,
                  *find_if(Mask, [](int I) { return I != PoisonMaskElem; }),
                  /* ... */);
              // Add a shuffle when more than one lane is inserted.
              if (count(BVMask, PoisonMaskElem) <
                  static_cast<int>(BVMask.size() - 1)) {
                SmallVector<int> NewMask(Mask.begin(), Mask.end());
                for (auto [Idx, I] : enumerate(BVMask))
                  if (I != PoisonMaskElem)
                    NewMask[Idx] = I;
                BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                                           VecTy, NewMask, CostKind);
              }
              return SplatCost <= BVCost;
            };
            if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
              // ... (plain gather of the non-constant lanes)
              Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
            } else {
              // Broadcast the single value and blend it in.
              // ...
              Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
              SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
              transform(BVMask, SplatMask.begin(), [](int I) {
                return I == PoisonMaskElem ? PoisonMaskElem : 0;
              });
              if (/* ... not an identity splat ... */)
                BV = CreateShuffle(BV, nullptr, SplatMask);
              for (/* ... lanes taken from BV ... */)
                Mask[Idx] = BVMask.size() + Idx;
              Vec = CreateShuffle(Vec, BV, Mask);
            }
          });
  } else if (/* ... not all constant ... */) {
    // Gather unique scalars and all constants.
    SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
    TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  } else {
    // Gather all constants.
    SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
    for (auto [I, V] : enumerate(GatheredScalars)) {
      if (!isa<PoisonValue>(V))
        Mask[I] = I;
    }
    Value *BV = ShuffleBuilder.gather(GatheredScalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  }

  if (NeedFreeze)
    Res = ShuffleBuilder.createFreeze(Res);
  return Res;
}
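// In short, a gather node is emitted through a cascade of fallbacks:
// (1) reuse a perfectly matching vectorized node (free diamond match),
// (2) shuffle one or two existing vectorized entries / extractelement bases,
// (3) build a constants-plus-unique-scalars vector and shuffle the repeats,
// or (4) fall back to a plain build vector; a freeze is appended only when
// undef lanes were replaced by a splat broadcast.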
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
  for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
    (void)vectorizeTree(VectorizableTree[EIdx].get());
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
                                                                Builder,
                                                                *this);
}

// ... (small static helper elided; it walks `for (Value *V : VL)` to collect
// the instructions in VL)
Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
  IRBuilderBase::InsertPointGuard Guard(Builder);

  Value *V = E->Scalars.front();
  Type *ScalarTy = V->getType();
  // ...
  auto It = MinBWs.find(E);
  if (It != MinBWs.end()) {
    // ... (switch ScalarTy to the minimal-bitwidth integer type)
  }
  // ...
  if (E->VectorizedValue)
    return E->VectorizedValue;

  if (E->isGather()) {
    // ...
    if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
      setInsertPointAfterBundle(E);
    Value *Vec = createBuildVector(E, ScalarTy);
    E->VectorizedValue = Vec;
    return Vec;
  }

  if (E->State == TreeEntry::SplitVectorize) {
    assert(E->CombinedEntriesWithIndices.size() == 2 &&
           "Expected exactly 2 combined entries.");
    setInsertPointAfterBundle(E);
    TreeEntry &OpTE1 =
        *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
    assert(OpTE1.isSame(
               ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
           "Expected same first part of scalars.");
    // ...
    TreeEntry &OpTE2 =
        *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
    assert(
        OpTE2.isSame(
            ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
        "Expected same second part of scalars.");
    // ...
    auto GetOperandSignedness = [&](const TreeEntry *OpE) {
      bool IsSigned = false;
      auto It = MinBWs.find(OpE);
      if (It != MinBWs.end())
        IsSigned = It->second.second;
      else
        IsSigned = any_of(OpE->Scalars, [&](Value *V) {
          if (isa<PoisonValue>(V))
            return false;
          return !isKnownNonNegative(R, SimplifyQuery(*DL));
        });
      return IsSigned;
    };
    // Cast the operands to the common (widened) element type if needed.
    if (/* ... Op1 element type differs ... */)
      Op1 = Builder.CreateIntCast(/* ... */, GetOperandSignedness(&OpTE1));
    if (/* ... Op2 element type differs ... */)
      Op2 = Builder.CreateIntCast(/* ... */, GetOperandSignedness(&OpTE2));
    if (E->ReorderIndices.empty()) {
      SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
      std::iota(
          Mask.begin(),
          std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
          0);
      if (ScalarTyNumElements != 1) {
        // ... (scale the mask for revectorized scalar vectors)
      }
      Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
      Vec = /* ... insert Op2 at offset */
          E->CombinedEntriesWithIndices.back().second * ScalarTyNumElements;
      E->VectorizedValue = Vec;
      return Vec;
    }
    unsigned CommonVF =
        std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
    if (/* ... OpTE1 narrower ... */) {
      SmallVector<int> Mask(CommonVF, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()),
                0);
      Op1 = Builder.CreateShuffleVector(Op1, Mask);
    }
    if (/* ... OpTE2 narrower ... */) {
      SmallVector<int> Mask(CommonVF, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()),
                0);
      Op2 = Builder.CreateShuffleVector(Op2, Mask);
    }
    Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
    E->VectorizedValue = Vec;
    return Vec;
  }
  bool IsReverseOrder =
      !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
  auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
    ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
    if (E->getOpcode() == Instruction::Store &&
        E->State == TreeEntry::Vectorize) {
      ArrayRef<int> Mask =
          ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                   E->ReorderIndices.size());
      ShuffleBuilder.add(V, Mask);
    } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
               E->State == TreeEntry::CompressVectorize) {
      ShuffleBuilder.addOrdered(V, {});
    } else {
      ShuffleBuilder.addOrdered(V, E->ReorderIndices);
    }
    SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
        E->CombinedEntriesWithIndices.size());
    transform(
        E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
          return std::make_pair(VectorizableTree[P.first].get(), P.second);
        });
    assert(
        (E->CombinedEntriesWithIndices.empty() ||
         E->ReorderIndices.empty()) &&
        "Expected either combined subnodes or reordering");
    return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
  };

  assert(!E->isGather() && "Unhandled state");
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  Instruction *VL0 = E->getMainOp();
  auto GetOperandSignedness = [&](unsigned Idx) {
    const TreeEntry *OpE = getOperandEntry(E, Idx);
    bool IsSigned = false;
    auto It = MinBWs.find(OpE);
    if (It != MinBWs.end())
      IsSigned = It->second.second;
    else
      IsSigned = any_of(OpE->Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
    return IsSigned;
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
            E != VectorizableTree.front().get() || E->UserTreeIndex) &&
           "PHI reordering is free.");
    auto *PH = cast<PHINode>(VL0);
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstNonPHIIt());
    // ...
    PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
    Value *V = NewPhi;
    // Adjust the insertion point once all PHIs have been generated.
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstInsertionPt());
    // ...
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    // ...
    // PHIs may have multiple entries from the same block; visit each block
    // only once.
    SmallPtrSet<BasicBlock *, 4> VisitedBBs;
    for (/* ... each incoming index I ... */) {
      // ...
      if (/* ... all incoming values already generated ... */) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        break;
      }
      if (!VisitedBBs.insert(IBB).second) {
        // Reuse the incoming value emitted for the first edge from IBB.
        // ...
        TreeEntry *OpTE = getOperandEntry(E, I);
        assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
        OpTE->VectorizedValue = VecOp;
        continue;
      }
      // ...
      Value *Vec = vectorizeOperand(E, I);
      if (VecTy != Vec->getType()) {
        assert((It != MinBWs.end() || /* ... operand gathered ... */ ||
                MinBWs.contains(getOperandEntry(E, I))) &&
               "Expected item in MinBWs.");
        Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
      }
      NewPhi->addIncoming(Vec, IBB);
    }
    assert(/* ... NewPhi fully populated ... */
           "Invalid number of incoming values");
    assert(E->VectorizedValue && "Expected vectorized value.");
    return E->VectorizedValue;
  }
  case Instruction::ExtractElement: {
    Value *V = E->getSingleOperand(0);
    setInsertPointAfterBundle(E);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ExtractValue: {
    auto *LI = cast<LoadInst>(E->getSingleOperand(0));
    Builder.SetInsertPoint(LI);
    Value *Ptr = LI->getPointerOperand();
    LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
    Value *NewV = ::propagateMetadata(V, E->Scalars);
    NewV = FinalShuffle(NewV, E);
    E->VectorizedValue = NewV;
    return NewV;
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
    if (const TreeEntry *OpE = getOperandEntry(E, 1);
        OpE && !OpE->isGather() && OpE->hasState() &&
        !OpE->hasCopyableElements())
      // ... (set the insertion point at the last insertelement)
    else
      setInsertPointAfterBundle(E);
    Value *V = vectorizeOperand(E, 1);
    ArrayRef<Value *> Op = E->getOperand(1);
    Type *ScalarTy = Op.front()->getType();
    if (/* ... vectorized operand has a narrower element type ... */) {
      std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
      assert(Res.first > 0 && "Expected item in MinBWs.");
      V = Builder.CreateIntCast(/* ... widen to ScalarTy ... */);
    }

    // Find the first insert of this buildvector chain (the one whose source
    // is not another insert from the same bundle).
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    const unsigned NumElts =
        cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
    const unsigned NumScalars = E->Scalars.size();

    unsigned Offset = /* ... first insert index ... */;
    assert(Offset < NumElts && "Failed to find vector index offset");

    // Create the resize/reorder mask.
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      // ... (inverse permutation of ReorderIndices)
    } else {
      Mask.assign(NumElts, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      // ... (InsertIdx is the lane written by the I-th scalar insert)
      IsIdentity &= InsertIdx - Offset == I;
      Mask[InsertIdx - Offset] = I;
    }
    if (!IsIdentity || NumElts != NumScalars) {
      Value *V2 = nullptr;
      bool IsVNonPoisonous = /* ... V is known non-poisonous ... */;
      SmallVector<int> InsertMask(Mask);
      if (NumElts != NumScalars && Offset == 0) {
        // Follow the chain of insertelement instructions from the current
        // buildvector sequence to widen InsertMask.
        // ...
        InsertMask[*InsertIdx] = *InsertIdx;
        if (!Ins->hasOneUse())
          break;
        Ins = dyn_cast_or_null<InsertElementInst>(
            Ins->getUniqueUndroppableUser());
        // ...
        SmallBitVector UseMask =
            buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        SmallBitVector IsFirstUndef =
            isUndefVector(FirstInsert->getOperand(0), UseMask);
        if (!IsFirstPoison.all()) {
          unsigned Idx = 0;
          for (unsigned I = 0; I < NumElts; I++) {
            if (/* ... lane unset, first operand undef but not poison ... */
                IsFirstUndef.test(I)) {
              if (IsVNonPoisonous) {
                InsertMask[I] = I < NumScalars ? I : 0;
                continue;
              }
              // ...
              if (Idx >= NumScalars)
                Idx = NumScalars - 1;
              InsertMask[I] = NumScalars + Idx;
              // ...
            }
          }
        }
      }
      // ...
      V = Builder.CreateShuffleVector(V, V2, InsertMask);
      if (auto *I = dyn_cast<Instruction>(V)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    for (unsigned I = 0; I < NumElts; I++) {
      // ... (map the shuffled lanes back to the destination positions)
    }
    SmallBitVector UseMask =
        buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
    SmallBitVector IsFirstUndef =
        isUndefVector(FirstInsert->getOperand(0), UseMask);
    if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
        NumElts != NumScalars) {
      if (IsFirstUndef.all()) {
        // ...
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        if (!IsFirstPoison.all()) {
          for (unsigned I = 0; I < NumElts; I++) {
            // ... (take unset lanes from the original first operand)
          }
        }
        V = Builder.CreateShuffleVector(/* ... */);
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      } else {
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        for (unsigned I = 0; I < NumElts; I++) {
          if (/* ... lane unset ... */)
            // ... (keep poison or take from the first operand)
          else
            InsertMask[I] += NumElts;
        }
        V = Builder.CreateShuffleVector(
            FirstInsert->getOperand(0), V, InsertMask,
            /* ... name of the last scalar insert ... */);
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }
    }

    ++NumVectorInstructions;
    E->VectorizedValue = V;
    return V;
  }
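// The insertelement case above folds a whole chain of scalar inserts into at
// most two shuffles: one that places the vectorized scalars at their insert
// positions, and one that blends them with the pre-existing contents of the
// destination vector; poison/undef-aware masks let the blend be elided
// entirely when the first operand is fully overwritten.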
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    setInsertPointAfterBundle(E);

    Value *InVec = vectorizeOperand(E, 0);
    // ...
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    if (/* ... integer types only ... */
        (SrcIt != MinBWs.end() || It != MinBWs.end() ||
         /* ... source element type was changed ... */)) {
      // Re-derive the cast opcode for the minimal-bitwidth-adjusted types.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
      if (SrcIt != MinBWs.end())
        SrcBWSz = SrcIt->second.first;
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
                   ? InVec
                   : Builder.CreateCast(VecOpcode, InVec, VecTy);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FCmp:
  case Instruction::ICmp: {
    setInsertPointAfterBundle(E);

    Value *L = vectorizeOperand(E, 0);
    Value *R = vectorizeOperand(E, 1);
    if (L->getType() != R->getType()) {
      assert((/* ... an operand is gathered ... */ ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      // Cast the narrower side up to the wider integer element type.
      if (/* ... L element width < R element width, via
             ->getIntegerBitWidth() ... */) {
        Type *CastTy = R->getType();
        L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
      } else {
        Type *CastTy = L->getType();
        R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
      }
    }
    // ...
    Value *V = Builder.CreateCmp(P0, L, R);
    // ...
    if (/* ... minimal bitwidth applied ... */)
      ICmp->setSameSign(/*B=*/false);
    // ...
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Select: {
    setInsertPointAfterBundle(E);

    Value *Cond = vectorizeOperand(E, 0);
    Value *True = vectorizeOperand(E, 1);
    Value *False = vectorizeOperand(E, 2);
    if (True->getType() != VecTy || False->getType() != VecTy) {
      assert((It != MinBWs.end() || /* ... operand gathered ... */ ||
              MinBWs.contains(getOperandEntry(E, 1)) ||
              MinBWs.contains(getOperandEntry(E, 2))) &&
             "Expected item in MinBWs.");
      if (True->getType() != VecTy)
        True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
      if (False->getType() != VecTy)
        False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
    }

    unsigned CondNumElements = getNumElements(Cond->getType());
    unsigned TrueNumElements = getNumElements(True->getType());
    assert(TrueNumElements >= CondNumElements &&
           TrueNumElements % CondNumElements == 0 &&
           "Cannot vectorize Instruction::Select");
    assert(TrueNumElements == getNumElements(False->getType()) &&
           "Cannot vectorize Instruction::Select");
    if (CondNumElements != TrueNumElements) {
      // The condition is narrower than the operands (revectorized scalar
      // vectors): replicate it per element.
      Cond = Builder.CreateShuffleVector(
          Cond, /* ... replicated mask ... */);
    }
    assert(getNumElements(Cond->getType()) == TrueNumElements &&
           "Cannot vectorize Instruction::Select");
    Value *V = Builder.CreateSelect(Cond, True, False);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FNeg: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0);

    Value *V = Builder.CreateUnOp(
        static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
    // ... (propagate IR flags and metadata)
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Freeze: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0);

    if (Op->getType() != VecTy) {
      assert((It != MinBWs.end() || /* ... operand gathered ... */ ||
              MinBWs.contains(getOperandEntry(E, 0))) &&
             "Expected item in MinBWs.");
      Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
    }
    Value *V = Builder.CreateFreeze(Op);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    setInsertPointAfterBundle(E);

    // ...
    if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
      // An `and` with constants whose low It->second.first bits are all ones
      // is a no-op after the minimal-bitwidth truncation; forward the other
      // operand directly.
      // ... (for each operand list, check all_of its constants:)
      return CI && CI->getValue().countr_one() >= It->second.first;
      // ...
      V = FinalShuffle(I == 0 ? RHS : LHS, E);
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
    if (/* ... operand types differ from VecTy ... */) {
      assert((It != MinBWs.end() || /* ... operand gathered ... */ ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
      RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
    }

    Value *V = Builder.CreateBinOp(
        static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
    // ... (propagate IR flags and metadata)
    // Drop nuw flags for sub bundles whose scalars were treated as
    // commutative (lane swapping may otherwise introduce unsigned wraps).
    if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
        any_of(E->Scalars, [](Value *V) {
          return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
        }))
      I->setHasNoUnsignedWrap(/*b=*/false);

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
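// Two details worth noting above: the `and`-with-wide-constants shortcut
// only fires under minimal-bitwidth narrowing, where the truncation already
// discards the masked-off bits; and the nuw drop is required because the
// vectorizer may have commuted individual `sub` lanes, and the poison rules
// for nuw are per-lane.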
  case Instruction::Load: {
    // Loads are inserted at the head of the tree because we don't want to
    // sink them all the way down past store instructions.
    setInsertPointAfterBundle(E);

    LoadInst *LI = cast<LoadInst>(VL0);
    Instruction *NewLI;
    FixedVectorType *StridedLoadTy = nullptr;
    Value *PO = LI->getPointerOperand();
    if (E->State == TreeEntry::Vectorize) {
      NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
    } else if (E->State == TreeEntry::CompressVectorize) {
      auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
          CompressEntryToData.at(E);
      Align CommonAlignment = LI->getAlign();
      if (IsMasked) {
        // Build the lane mask from CompressMask.
        // ...
        for (int I : CompressMask)
          MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
        // ...
        MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
        // ...
        NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
                                         /* ... mask ... */);
      } else {
        NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
      }
      // ... (the compressed lanes are shuffled out below)
    } else if (E->State == TreeEntry::StridedVectorize) {
      // ...
      PO = IsReverseOrder ? PtrN : Ptr0;
      Type *StrideTy = DL->getIndexType(PO->getType());
      Value *StrideVal;
      const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
      StridedLoadTy = SPtrInfo.Ty;
      assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
      unsigned StridedLoadEC =
          /* ... element count of StridedLoadTy ... */;
      Value *Stride = SPtrInfo.StrideVal;
      if (!Stride) {
        const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
        assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
        SCEVExpander Expander(*SE, *DL, "strided-load-vec");
        Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
                                        &*Builder.GetInsertPoint());
      }
      Value *NewStride =
          Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
      StrideVal = Builder.CreateMul(
          NewStride,
          ConstantInt::get(StrideTy,
                           (IsReverseOrder ? -1 : 1) *
                               static_cast<int>(
                                   DL->getTypeAllocSize(ScalarTy))));
      // ...
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_load,
          {StridedLoadTy, PO->getType(), StrideTy},
          {PO, StrideVal, /* ... all-ones mask ... */,
           Builder.getInt32(StridedLoadEC)});
      Inst->addParamAttr(/* ... pointer alignment ... */);
      NewLI = Inst;
    } else {
      assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
      Value *VecPtr = vectorizeOperand(E, 0);
      // ... (use the minimum alignment of the gathered loads)
      if (/* ... the scalar type itself is a vector (revectorization) ... */) {
        unsigned ScalarTyNumElements = /* ... */;
        unsigned VecTyNumElements = /* ... */;
        assert(VecTyNumElements % ScalarTyNumElements == 0 &&
               "Cannot expand getelementptr.");
        unsigned VF = VecTyNumElements / ScalarTyNumElements;
        // Expand every pointer into per-element pointers.
        // ... (indices:)
        return Builder.getInt64(I % ScalarTyNumElements);
        // ...
        VecPtr = Builder.CreateGEP(
            VecTy->getElementType(),
            Builder.CreateShuffleVector(VecPtr, /* ... replicated mask ... */),
            /* ... index vector ... */);
      }
      NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
    }
    // ... (propagate metadata)
    Value *V = E->State == TreeEntry::CompressVectorize
                   ? /* ... apply CompressMask shuffle ... */
                   : /* ... NewLI as is ... */;
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
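// For the strided case, the emitted IR has roughly this shape (illustrative
// only; the exact types depend on SPtrInfo.Ty and the index type):
//   %stride = mul i64 %step, <+/- sizeof(elt)>
//   %v = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
//            ptr align 4 %base, i64 %stride, <4 x i1> splat (i1 true), i32 4)
// i.e. one hardware strided load replaces a run of scalar loads whose
// addresses differ by a constant or runtime stride; a negative stride covers
// reversed access order.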
  case Instruction::Store: {
    auto *SI = cast<StoreInst>(VL0);
    setInsertPointAfterBundle(E);

    Value *VecValue = vectorizeOperand(E, 0);
    if (VecValue->getType() != VecTy)
      VecValue =
          Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
    VecValue = FinalShuffle(VecValue, E);

    Value *Ptr = SI->getPointerOperand();
    Instruction *ST;
    if (E->State == TreeEntry::Vectorize) {
      ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
    } else {
      assert(E->State == TreeEntry::StridedVectorize &&
             "Expected either strided or consecutive stores.");
      if (!E->ReorderIndices.empty()) {
        SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
        Ptr = SI->getPointerOperand();
      }
      // ...
      Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_store,
          {VecTy, Ptr->getType(), StrideTy},
          {VecValue, Ptr,
           ConstantInt::get(
               StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
           Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      Inst->addParamAttr(/* ... pointer alignment ... */);
      ST = Inst;
    }

    // ... (propagate metadata)
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::GetElementPtr: {
    auto *GEP0 = cast<GetElementPtrInst>(VL0);
    setInsertPointAfterBundle(E);

    Value *Op0 = vectorizeOperand(E, 0);
    SmallVector<Value *> OpVecs;
    for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
      Value *OpVec = vectorizeOperand(E, J);
      OpVecs.push_back(OpVec);
    }

    Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
    if (auto *I = dyn_cast<GetElementPtrInst>(V)) {
      SmallVector<Value *> GEPs;
      for (Value *V : E->Scalars) {
        if (isa<GetElementPtrInst>(V))
          GEPs.push_back(V);
      }
      V = ::propagateMetadata(I, GEPs);
    }

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(VL0);
    setInsertPointAfterBundle(E);

    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    SmallVector<Type *> ArgTys = /* ... widened argument types, from */
        (CI, ID, VecTy->getNumElements(),
         It != MinBWs.end() ? It->second.first : 0, TTI);
    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
    bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
                        VecCallCosts.first <= VecCallCosts.second;

    Value *ScalarArg = nullptr;
    SmallVector<Value *> OpVecs;
    auto *CEI = cast<CallInst>(VL0);
    for (/* ... each argument index I ... */) {
      // Some intrinsics have scalar arguments; those must not be vectorized.
      if (/* ... scalar operand at I ... */) {
        ScalarArg = CEI->getArgOperand(I);
        // Some intrinsics have only scalar arguments.
        if (ID == Intrinsic::abs && It != MinBWs.end() &&
            It->second.first < DL->getTypeSizeInBits(CEI->getType()))
          ScalarArg = Builder.getFalse();
        OpVecs.push_back(ScalarArg);
        // ...
        continue;
      }

      Value *OpVec = vectorizeOperand(E, I);
      ScalarArg = CEI->getArgOperand(I);
      if (/* ... element types differ ... */ && It == MinBWs.end()) {
        // ...
        OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
      } else if (It != MinBWs.end()) {
        OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
      }
      OpVecs.push_back(OpVec);
    }

    Function *CF;
    if (!UseIntrinsic) {
      // ... (pick a vector library function via VFShape)
      CF = VFDatabase(*CI).getVectorizedFunction(Shape);
    } else {
      CF = /* ... intrinsic declaration for ID ... */;
    }

    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);

    // ... (propagate IR flags)
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::ShuffleVector: {
    Value *V;
    if (/* ... plain (non-alternate) shufflevector bundle ... */) {
      setInsertPointAfterBundle(E);
      Value *Src = vectorizeOperand(E, 0);
      // ...
      if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
        // Compose this bundle's mask with the source shuffle's mask.
        SmallVector<int> NewMask(ThisMask.size());
        transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
          return SVSrc->getShuffleMask()[Mask];
        });
        V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
                                        SVSrc->getOperand(1), NewMask);
      } else {
        V = Builder.CreateShuffleVector(Src, ThisMask);
      }
      // ...
      V = FinalShuffle(V, E);
    } else {
      assert(/* ... main/alt opcodes are both binops, both casts, or both
                cmps ... */
             "Invalid Shuffle Vector Operand");

      Value *LHS = nullptr, *RHS = nullptr;
      if (/* ... binary op or cmp ... */) {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0);
        RHS = vectorizeOperand(E, 1);
      } else {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0);
      }
      if (/* ... operand types need adjustment ... */) {
        assert((It != MinBWs.end() ||
                getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
                getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
                MinBWs.contains(getOperandEntry(E, 0)) ||
                MinBWs.contains(getOperandEntry(E, 1))) &&
               "Expected item in MinBWs.");
        Type *CastTy = VecTy;
        if (/* ... cmp with differing operand widths: pick the wider type,
               via ->getIntegerBitWidth() ... */)
          // ...
        if (LHS->getType() != CastTy)
          LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
        if (RHS->getType() != CastTy)
          RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
      }

      Value *V0, *V1;
      if (/* ... binary ops ... */) {
        V0 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
        V1 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
        // ...
        V1 = Builder.CreateCmp(AltPred, LHS, RHS);
      } else {
        if (/* ... integer casts under minimal bitwidth ... */) {
          unsigned SrcBWSz = DL->getTypeSizeInBits(
              cast<VectorType>(LHS->getType())->getElementType());
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
              LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
            assert(/* ... */ "Expected same type as operand.");
            // ...
            E->VectorizedValue = LHS;
            ++NumVectorInstructions;
            return LHS;
          }
        }
        V0 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
        V1 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
      }
      // Add V0 and V1 to later analysis to try to find and remove matching
      // instructions, if any.
      for (Value *V : {V0, V1}) {
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }

      // Create a shuffle to take the alternate operations from the vector,
      // and gather up main and alt scalar ops to propagate IR flags.
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [E, this](Instruction *I) {
            assert(E->getMatchingMainOpOrAltOp(I) &&
                   "Unexpected main/alternate opcode");
            // ...
          },
          Mask, &OpScalars, &AltScalars);

      // ... (propagate IR flags from OpScalars/AltScalars)
      auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
        if (auto *I = dyn_cast<Instruction>(Vec);
            I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
            any_of(E->Scalars, [](Value *V) {
              if (isa<PoisonValue>(V))
                return false;
              auto *IV = cast<Instruction>(V);
              return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
            }))
          I->setHasNoUnsignedWrap(/*b=*/false);
      };
      DropNuwFlag(V0, E->getOpcode());
      DropNuwFlag(V1, E->getAltOpcode());

      V = Builder.CreateShuffleVector(V0, V1, Mask);
      if (auto *I = dyn_cast<Instruction>(V)) {
        // ... (propagate metadata)
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
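// Alternate-opcode bundles are emitted as two full-width vector ops (the main
// and the alt opcode over the same operands), blended by one shufflevector
// whose mask picks, per lane, the result matching that lane's original scalar
// opcode; e.g. a mixed add/sub bundle becomes
//   %a = add <4 x i32> %x, %y
//   %s = sub <4 x i32> %x, %y
//   %r = shufflevector %a, %s, <lane-select mask>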
Value *BoUpSLP::vectorizeTree(
    /* ... externally-used values, reduction root, ... */
    ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
  // ...
  // All blocks must be scheduled before any instructions are inserted.
  EntryToLastInstruction.clear();
  // ...
  for (auto &BSIter : BlocksSchedules)
    scheduleBlock(*this, BSIter.second.get());
  // Cache the last instruction per node to avoid side effects (extra uses,
  // etc.) that may appear during vectorization.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->isGather())
      continue;
    (void)getLastInstructionInBundle(TE.get());
  }

  if (ReductionRoot)
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
  else
    Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());

  // Vectorize gather operands of nodes whose scalars are only used outside
  // their block.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
        TE->UserTreeIndex.UserTE->hasState() &&
        TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
        (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
         TE->UserTreeIndex.UserTE->isAltShuffle()) &&
        !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
        all_of(TE->UserTreeIndex.UserTE->Scalars,
               [](Value *V) { return isUsedOutsideBlock(V); })) {
      // ...
      getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
      // ...
    }
  }
  for (auto &Entry : GatherEntries) {
    // ...
    Builder.SetInsertPoint(Entry.second);
    Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
    // ...
  }
  // Emit gathered loads first, to produce better code for their users.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (GatheredLoadsEntriesFirst.has_value() &&
        TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
        (!TE->isGather() || TE->UserTreeIndex)) {
      assert((TE->UserTreeIndex ||
              (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
             "Expected gathered load node.");
      (void)vectorizeTree(TE.get());
    }
  }
  // ...
  // Run through the list of postponed gathers and emit them, replacing the
  // temporary stub values with the actual vector instructions.
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    // ...
    TE->VectorizedValue = nullptr;
    // ...
    if (UI->comesBefore(InsertPt))
      Builder.SetInsertPoint(InsertPt);
    else
      Builder.SetInsertPoint(PrevVec);
    Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
    // ...
    if (auto *VecI = dyn_cast<Instruction>(Vec);
        VecI && VecI->getParent() == Builder.GetInsertBlock() &&
        Builder.GetInsertPoint()->comesBefore(VecI))
      VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
                                 Builder.GetInsertPoint());
    if (Vec->getType() != PrevVec->getType()) {
      assert(/* ... */ PrevVec->getType()->isIntOrIntVectorTy() &&
             "Expected integer vector types only.");
      // Derive the signedness of the replacement from the scalars' entries.
      std::optional<bool> IsSigned;
      for (Value *V : TE->Scalars) {
        // ...
        for (const TreeEntry *MNTE : getTreeEntries(V)) {
          auto It = MinBWs.find(MNTE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            // ...
          }
        }
        if (IsSigned.value_or(false))
          break;
        // Scan through the buildvector nodes too.
        for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
          auto It = MinBWs.find(BVE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            // ...
          }
        }
        if (IsSigned.value_or(false))
          break;
        // ... (extractelement operands:)
        IsSigned = IsSigned.value_or(false) ||
                   /* ... vector operand not known non-negative ... */;
        if (IsSigned.value_or(false))
          break;
      }
      if (IsSigned.value_or(false)) {
        // Final attempt: check the signedness of the user node.
        auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
        if (It != MinBWs.end())
          IsSigned = It->second.second;
      }
      assert(/* ... */
             "Expected user node or perfect diamond match in MinBWs.");
      Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
    }
    PrevVec->replaceAllUsesWith(Vec);
    PostponedValues.try_emplace(Vec).first->second.push_back(TE);
    // Replace the stub value if it was already used by other buildvector
    // nodes.
    auto It = PostponedValues.find(PrevVec);
    if (It != PostponedValues.end()) {
      for (TreeEntry *VTE : It->getSecond())
        VTE->VectorizedValue = Vec;
    }
    // ...
  }
  // Extract all elements with external uses.
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    // ...
    const TreeEntry *E = &ExternalUse.E;
    assert(E && "Invalid scalar");
    assert(!E->isGather() && "Extracting from a gather list");
    // Avoid extracts of GEPs with a vectorized pointer operand: all their
    // users will be vectorized anyway.
    if (E->getOpcode() == Instruction::GetElementPtr &&
        /* ... scalar is not a GEP ... */)
      continue;

    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");

    Value *Lane = Builder.getInt32(ExternalUse.Lane);
    auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
      if (Scalar->getType() != Vec->getType()) {
        Value *Ex = nullptr;
        Value *ExV = nullptr;
        // ...
        bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
        auto It = ScalarToEEs.find(Scalar);
        if (It != ScalarToEEs.end()) {
          // No need to emit many extracts; just move the only one in the
          // current block.
          auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
                                                  : Builder.GetInsertBlock());
          if (EEIt != It->second.end()) {
            Value *PrevV = EEIt->second.first;
            if (auto *I = dyn_cast<Instruction>(PrevV);
                I && !ReplaceInst &&
                Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
                Builder.GetInsertPoint()->comesBefore(I)) {
              I->moveBefore(*Builder.GetInsertPoint()->getParent(),
                            Builder.GetInsertPoint());
              // ...
            }
            Ex = PrevV;
            ExV = EEIt->second.second ? EEIt->second.second : Ex;
          }
        }
        if (!Ex) {
          // Reuse the existing extract (or a clone of the scalar) to improve
          // the final codegen.
          if (ReplaceInst) {
            if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
              IgnoredExtracts.insert(EE);
              Ex = EE;
            } else {
              auto *CloneInst = Inst->clone();
              CloneInst->insertBefore(Inst->getIterator());
              if (Inst->hasName())
                CloneInst->takeName(Inst);
              Ex = CloneInst;
            }
          } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
                     ES && /* ... */) {
            Value *V = ES->getVectorOperand();
            if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
              V = ETEs.front()->VectorizedValue;
            if (auto *IV = dyn_cast<Instruction>(V);
                !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
                IV->comesBefore(IVec))
              Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
            else
              Ex = Builder.CreateExtractElement(Vec, Lane);
          } else if (auto *VecTy =
                         dyn_cast<FixedVectorType>(Scalar->getType())) {
            // Revectorized scalar: extract a whole subvector.
            unsigned VecTyNumElements = VecTy->getNumElements();
            Ex = /* ... extract VecTyNumElements elements starting at */
                ExternalUse.Lane * VecTyNumElements;
          } else {
            Ex = Builder.CreateExtractElement(Vec, Lane);
          }
          // Sign/zero-extend back to the original scalar type if needed.
          ExV = Ex;
          if (Scalar->getType() != Ex->getType())
            ExV = Builder.CreateIntCast(/* ... */);
          // ... (remember the extract per block:)
          ScalarToEEs[Scalar].try_emplace(
              /* ... block or */ &F->getEntryBlock(),
              std::make_pair(Ex, ExV));
        }
        if (auto *ExI = dyn_cast<Instruction>(Ex); ExI /* ... */) {
          GatherShuffleExtractSeq.insert(ExI);
          CSEBlocks.insert(ExI->getParent());
        }
        return ExV;
      }
      assert(/* ... */
             "In-tree scalar of vector type is not insertelement?");
      // ...
      return Vec;
    };
    // If User == nullptr, the Scalar remains as a scalar in vectorized
    // instructions or is used as an extra argument; generate the extract and
    // update ExternallyUsedValues.
    if (!User) {
      if (!ScalarsWithNullptrUser.insert(Scalar).second)
        continue;
      assert(
          (ExternallyUsedValues.count(Scalar) ||
           ExternalUsesWithNonUsers.count(Scalar) ||
           ExternalUsesAsOriginalScalar.contains(Scalar) ||
           any_of(
               Scalar->users(),
               [&](llvm::User *U) {
                 if (ExternalUsesAsOriginalScalar.contains(U))
                   return true;
                 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
                 return !UseEntries.empty() &&
                        (E->State == TreeEntry::Vectorize ||
                         E->State == TreeEntry::StridedVectorize ||
                         E->State == TreeEntry::CompressVectorize) &&
                        any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
                          return (UseEntry->State == TreeEntry::Vectorize ||
                                  UseEntry->State ==
                                      TreeEntry::StridedVectorize ||
                                  UseEntry->State ==
                                      TreeEntry::CompressVectorize) &&
                                 doesInTreeUserNeedToExtract(
                                     Scalar,
                                     getRootEntryInstruction(*UseEntry),
                                     /* ... */);
                        });
               })) &&
          "Scalar with nullptr User must be registered in "
          "ExternallyUsedValues map or remain as scalar in vectorized "
          "instructions");
      if (auto *VecI = dyn_cast<Instruction>(Vec)) {
        if (auto *PHI = dyn_cast<PHINode>(VecI)) {
          if (PHI->getParent()->isLandingPad())
            Builder.SetInsertPoint(
                PHI->getParent(),
                std::next(
                    PHI->getParent()->getLandingPadInst()->getIterator()));
          else
            Builder.SetInsertPoint(PHI->getParent(),
                                   PHI->getParent()->getFirstNonPHIIt());
        } else {
          Builder.SetInsertPoint(VecI->getParent(),
                                 std::next(VecI->getIterator()));
        }
      } else {
        Builder.SetInsertPoint(&F->getEntryBlock(),
                               F->getEntryBlock().begin());
      }
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      if (Scalar != NewInst) {
        assert((!isa<ExtractElementInst>(Scalar) || /* ... */) &&
               "Extractelements should not be replaced.");
        Scalar->replaceAllUsesWith(NewInst);
      }
      continue;
    }
    // Externally-used scalar feeding an insertelement: fold it into the
    // shuffled-inserts bookkeeping instead of emitting an extract.
    if (/* ... User is an insertelement of Scalar ... */) {
      if (!UsedInserts.insert(VU).second)
        continue;
      // Use the original vector if the root was truncated.
      auto BWIt = MinBWs.find(E);
      if (BWIt != MinBWs.end() /* ... and types differ ... */) {
        auto *ScalarTy = FTy->getElementType();
        auto Key = std::make_pair(Vec, ScalarTy);
        auto VecIt = VectorCasts.find(Key);
        if (VecIt == VectorCasts.end()) {
          // ... (pick a legal insertion point for the cast)
          if (IVec->getParent()->isLandingPad())
            Builder.SetInsertPoint(IVec->getParent(),
                                   std::next(IVec->getParent()
                                                 ->getLandingPadInst()
                                                 ->getIterator()));
          else
            Builder.SetInsertPoint(
                IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
          // ... (non-PHI case:)
          Builder.SetInsertPoint(IVec->getNextNode());
          // ...
          Vec = Builder.CreateIntCast(/* ... widened ScalarTy vector ... */,
                                      BWIt->second.second);
          // ... (cache the cast in VectorCasts)
        } else {
          Vec = VecIt->second;
        }
      }

      // Record the lane in the matching buildvector's shuffle mask.
      // ...
      auto *It = find_if(
          ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
            // Checks if two insertelements are from the same buildvector.
            // ...
          });
      unsigned Idx = *InsertIdx;
      if (It == ShuffledInserts.end()) {
        // ... (start a new ShuffledInsertData entry)
        It = std::next(ShuffledInserts.begin(), ShuffledInserts.size() - 1);
        // ...
      }
      // ...
      Mask[Idx] = ExternalUse.Lane;
      // ...
      continue;
    }

    // Generate extracts for out-of-tree users and find the insertion point
    // for the extractelement lane.
    if (auto *VecI = dyn_cast<Instruction>(Vec)) {
      if (auto *PH = dyn_cast<PHINode>(User)) {
        // Find which incoming edges use this scalar.
        for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
          if (PH->getIncomingValue(I) == Scalar) {
            Instruction *IncomingTerminator =
                PH->getIncomingBlock(I)->getTerminator();
            if (/* ... terminator is a catchswitch ... */)
              Builder.SetInsertPoint(VecI->getParent(),
                                     std::next(VecI->getIterator()));
            else
              Builder.SetInsertPoint(
                  PH->getIncomingBlock(I)->getTerminator());
            Value *NewInst = ExtractAndExtendIfNeeded(Vec);
            PH->setOperand(I, NewInst);
          }
        }
      } else {
        // ...
        Value *NewInst = ExtractAndExtendIfNeeded(Vec);
        // ... (replace the use in User)
      }
    } else {
      Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      // ... (replace the use in User)
    }
  }
  // Rebuild the buildvector chains recorded in ShuffledInserts as shuffles.
  auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
    SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
    SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
    int VF = /* ... number of elements of V1 ... */;
    for (int I = 0, E = Mask.size(); I < E; ++I) {
      if (Mask[I] < VF)
        CombinedMask1[I] = Mask[I];
      else
        CombinedMask2[I] = Mask[I] - VF;
    }
    ShuffleInstructionBuilder ShuffleBuilder(
        /* ... element type of V1 ... */, Builder, *this);
    ShuffleBuilder.add(V1, CombinedMask1);
    if (V2)
      ShuffleBuilder.add(V2, CombinedMask2);
    return ShuffleBuilder.finalize({}, {}, {});
  };

  auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
                                       bool ForSingleMask) {
    unsigned VF = Mask.size();
    // ... (compare with the vector factor of Vec)
    if (any_of(Mask,
               [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
      Vec = CreateShuffle(Vec, nullptr, Mask);
      return std::make_pair(Vec, true);
    }
    if (!ForSingleMask) {
      SmallVector<int> ResizeMask(VF, PoisonMaskElem);
      for (unsigned I = 0; I < VF; ++I) {
        // ... (identity over the used lanes)
      }
      Vec = CreateShuffle(Vec, nullptr, ResizeMask);
    }
    return std::make_pair(Vec, false);
  };

  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    // Find the first and last instruction in the list of insertelements.
    InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
    InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
    Builder.SetInsertPoint(LastInsert);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    // ...
    auto GetVF = [](Value *Vec) {
      return cast<VectorType>(Vec->getType())
          ->getElementCount()
          .getKnownMinValue();
    };
    // Combine all recorded (value, mask) pairs into one shuffled value.
    Value *NewInst = /* ... shuffle-combine over Vector, using: */
        [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
                                      ArrayRef<Value *> Vals) {
          assert((Vals.size() == 1 || Vals.size() == 2) &&
                 "Expected exactly 1 or 2 input values.");
          if (Vals.size() == 1) {
            // Skip the shuffle for simple identity, non-resizing masks.
            if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
                                   ->getNumElements() ||
                !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
              return CreateShuffle(Vals.front(), nullptr, Mask);
            return Vals.front();
          }
          return CreateShuffle(Vals.front() ? Vals.front()
                                            : FirstInsert->getOperand(0),
                               Vals.back(), Mask);
        };
    // Rewire the insertelement chain onto the shuffled value.
    auto It = ShuffledInserts[I].InsertElements.rbegin();
    InsertElementInst *II = nullptr;
    if (It != ShuffledInserts[I].InsertElements.rend())
      II = *It;
    SmallVector<Instruction *> Inserts;
    while (It != ShuffledInserts[I].InsertElements.rend()) {
      assert(II && "Must be an insertelement instruction.");
      // ...
    }
    for (Instruction *II : reverse(Inserts)) {
      II->replaceUsesOfWith(II->getOperand(0), NewInst);
      if (auto *NewI = dyn_cast<Instruction>(NewInst))
        if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
          II->moveAfter(NewI);
      NewInst = II;
    }
    // ...
    for (InsertElementInst *IE :
         reverse(ShuffledInserts[I].InsertElements)) {
      IE->replaceUsesOfWith(IE->getOperand(0),
                            /* ... poison of the same type ... */);
      IE->replaceUsesOfWith(IE->getOperand(1),
                            /* ... poison of the same type ... */);
      // ... (erase the dead insert)
    }
    CSEBlocks.insert(LastInsert->getParent());
  }
  // For each vectorized value, erase the now-dead scalars.
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
      continue;

    assert(Entry->VectorizedValue && "Can't find vectorizable value");

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      // ...
      if (Entry->getOpcode() == Instruction::GetElementPtr &&
          /* ... scalar is not a GEP ... */)
        continue;
      if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
          EE && IgnoredExtracts.contains(EE))
        continue;
      // ...
      for (User *U : Scalar->users()) {
        // ... (debug-only validation)
        assert((/* ... U vectorized ... */ ||
                (UserIgnoreList && UserIgnoreList->contains(U)) ||
                /* ... U already deleted ... */) &&
               "Deleting out-of-tree value");
      }
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      // ... (queue the scalar in RemovedInsts)
    }
  }

  // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
  // new vector instruction.
  if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
    V->mergeDIAssignID(RemovedInsts);

  // Clear up reduction references, if any.
  if (UserIgnoreList) {
    for (Instruction *I : RemovedInsts) {
      const TreeEntry *IE = getTreeEntries(I).front();
      if (IE->Idx != 0 &&
          !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
            (ValueToGatherNodes.lookup(I).contains(
                 VectorizableTree.front().get()) ||
             (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
              IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
          !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
            IE->UserTreeIndex && /* ... */) &&
          !(GatheredLoadsEntriesFirst.has_value() &&
            IE->Idx >= *GatheredLoadsEntriesFirst &&
            VectorizableTree.front()->isGather() && /* ... */) &&
          !(!VectorizableTree.front()->isGather() &&
            VectorizableTree.front()->isCopyableElement(I)))
        continue;
      SmallVector<SelectInst *> LogicalOpSelects;
      // Replace remaining uses with poison, except for conditions of
      // logical and/or selects, where poison would change semantics.
      I->replaceUsesWithIf(/* ... poison ... */, [&](Use &U) {
        bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
                                    (match(U.getUser(), m_LogicalAnd()) ||
                                     match(U.getUser(), m_LogicalOr())) &&
                                    U.getOperandNo() == 0;
        if (IsPoisoningLogicalOp) {
          LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
          return false;
        }
        return UserIgnoreList->contains(U.getUser());
      });
      // Replace the conditions of poisoning logical ops with a non-poison
      // constant instead.
      for (SelectInst *SI : LogicalOpSelects)
        // ... (set a null condition)
    }
  }
  Builder.ClearInsertionPoint();
  InstrElementSize.clear();

  const TreeEntry &RootTE = *VectorizableTree.front();
  Value *Vec = RootTE.VectorizedValue;
  if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
                                      It != MinBWs.end() &&
                                      ReductionBitWidth != It->second.first) {
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
    Vec = Builder.CreateIntCast(
        Vec, /* ... vector of ReductionBitWidth integers ... */,
        It->second.second);
  }
  // ...
  return Vec;
}
void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequences instructions.\n");
  // LICM InsertElementInst sequences.
  for (Instruction *I : GatherShuffleExtractSeq) {
    if (isDeleted(I))
      continue;

    // Check if this block is inside a loop.
    Loop *L = LI->getLoopFor(I->getParent());
    if (!L)
      continue;

    // Check if it has a preheader.
    BasicBlock *PreHeader = L->getLoopPreheader();
    if (!PreHeader)
      continue;

    // If the vector or the element that we insert into it are instructions
    // that are defined in this basic block then we can't hoist this
    // instruction.
    if (any_of(I->operands(), [L](Value *V) {
          auto *OpI = dyn_cast<Instruction>(V);
          return OpI && L->contains(OpI);
        }))
      continue;

    // We can hoist this instruction. Move it to the pre-header.
    I->moveBefore(PreHeader->getTerminator()->getIterator());
    CSEBlocks.insert(PreHeader);
  }

  // Make a list of all reachable blocks in our CSE queue.
  SmallVector<const DomTreeNode *, 8> CSEWorkList;
  CSEWorkList.reserve(CSEBlocks.size());
  for (BasicBlock *BB : CSEBlocks)
    if (DomTreeNode *N = DT->getNode(BB)) {
      assert(DT->isReachableFromEntry(N));
      CSEWorkList.push_back(N);
    }

  // Sort blocks by domination. This ensures we visit a block after all blocks
  // dominating it are visited.
  llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
  });

  // Between two shuffles, one is less defined if it has the same vector
  // operands and its mask indices are the same as in the other one, or undefs.
  auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
                                                Instruction *I2,
                                                SmallVectorImpl<int> &NewMask) {
    if (I1->getType() != I2->getType())
      return false;
    auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
    auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
    if (!SI1 || !SI2)
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
      return true;
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
        return false;
    // Check if the second instruction is more defined than the first one.
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    ArrayRef<int> SM1 = SI1->getShuffleMask();
    // Count trailing undefs in the mask to check the final number of used
    // registers.
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
      if (SM1[I] == PoisonMaskElem)
        ++LastUndefsCnt;
      else
        LastUndefsCnt = 0;
      if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
          NewMask[I] != SM1[I])
        return false;
      if (NewMask[I] == PoisonMaskElem)
        NewMask[I] = SM1[I];
    }
    // Check if the last undefs actually change the final number of used vector
    // registers.
    return SM1.size() - LastUndefsCnt > 1 &&
           TTI->getNumberOfParts(SI1->getType()) ==
               TTI->getNumberOfParts(
                   getWidenedType(SI1->getType()->getElementType(),
                                  SM1.size() - LastUndefsCnt));
  };
  // Perform O(N^2) search over the gather/shuffle sequences and merge
  // identical instructions.
  SmallVector<Instruction *, 16> Visited;
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert(*I &&
           (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");
    BasicBlock *BB = (*I)->getBlock();
    // For all instructions in blocks containing gather sequences:
    for (Instruction &In : llvm::make_early_inc_range(*BB)) {
      if (isDeleted(&In))
        continue;
      if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
          !GatherShuffleExtractSeq.contains(&In))
        continue;

      // Check if we can replace this instruction with any of the visited
      // instructions.
      bool Replaced = false;
      for (Instruction *&V : Visited) {
        SmallVector<int> NewMask;
        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
            DT->dominates(V->getParent(), In.getParent())) {
          // If we get here, the two instructions are identical or the first
          // one is less defined.
          In.replaceAllUsesWith(V);
          eraseInstruction(&In);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          Replaced = true;
          break;
        }
        if (isa<ShuffleVectorInst>(In) &&
            GatherShuffleExtractSeq.contains(V) &&
            IsIdenticalOrLessDefined(V, &In, NewMask) &&
            DT->dominates(In.getParent(), V->getParent())) {
          In.moveAfter(V);
          V->replaceAllUsesWith(&In);
          eraseInstruction(V);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          V = &In;
          Replaced = true;
          break;
        }
      }
      if (!Replaced) {
        assert(!is_contained(Visited, &In));
        Visited.push_back(&In);
      }
    }
  }
  CSEBlocks.clear();
  GatherShuffleExtractSeq.clear();
}
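// Build a ScheduleBundle for the scalars in VL: copyable elements get
// dedicated ScheduleCopyableData entries, all other members reuse the
// per-instruction ScheduleData.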
BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
    ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
  auto &BundlePtr =
      ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    auto *I = cast<Instruction>(V);
    if (S.isCopyableElement(V)) {
      // Add a copyable-element placeholder instead of regular ScheduleData.
      ScheduleCopyableData &SD =
          addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
      BundlePtr->add(&SD);
      continue;
    }
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember && "no ScheduleData for bundle member "
                           "(maybe not in same basic block)");
    BundlePtr->add(BundleMember);
    ScheduledBundlesList.back();
    ScheduledBundles.try_emplace(I).first->getSecond().push_back(
        BundlePtr.get());
  }
  assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
  return *BundlePtr;
}
std::optional<BoUpSLP::ScheduleBundle *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S,
                                            const EdgeInfo &EI) {
  bool HasCopyables = S.areInstructionsWithCopyableElements();
  if (doesNotNeedToSchedule(VL) ||
      (HasCopyables &&
       all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) {
    // If some operands got replaced by copyable elements, the direct
    // dependencies of the corresponding instructions are no longer valid and
    // must be recalculated.
    SmallVector<ScheduleData *> ControlDependentMembers;
    for (Value *V : VL) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I || (HasCopyables && S.isCopyableElement(V)))
        continue;
      SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
      for (const Use &U : I->operands()) {
        unsigned &NumOps =
            UserOpToNumOps.try_emplace(std::make_pair(I, U.get()), 0)
                .first->getSecond();
        ++NumOps;
        if (auto *Op = dyn_cast<Instruction>(U.get());
            Op && areAllOperandsReplacedByCopyableData(I, Op, *SLP, NumOps)) {
          if (ScheduleData *OpSD = getScheduleData(Op);
              OpSD && OpSD->hasValidDependencies()) {
            OpSD->clearDirectDependencies();
            if (RegionHasStackSave ||
                !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
              ControlDependentMembers.push_back(OpSD);
          }
        }
      }
    }
    if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, /*InsertInReadyList=*/true, SLP,
                            ControlDependentMembers);
    }
    return nullptr;
  }

  // Initialize the instruction bundle.
  Instruction *OldScheduleEnd = ScheduleEnd;
  LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");

  auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
    // Members whose operands were replaced by copyable data have to get
    // their direct dependencies recalculated.
    SmallVector<ScheduleData *> ControlDependentMembers;
    auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
      SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
      for (ScheduleEntity *SE : Bundle.getBundle()) {
        if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
          if (ScheduleData *BundleMember = getScheduleData(CD->getInst());
              BundleMember && BundleMember->hasValidDependencies()) {
            BundleMember->clearDirectDependencies();
            if (RegionHasStackSave ||
                !isGuaranteedToTransferExecutionToSuccessor(
                    BundleMember->getInst()))
              ControlDependentMembers.push_back(BundleMember);
          }
          continue;
        }
        auto *SD = cast<ScheduleData>(SE);
        if (SD->hasValidDependencies() &&
            (!S.areInstructionsWithCopyableElements() ||
             !S.isCopyableElement(SD->getInst())) &&
            !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
            EI.UserTE->hasState() &&
            (!EI.UserTE->hasCopyableElements() ||
             !EI.UserTE->isCopyableElement(SD->getInst())))
          SD->clearDirectDependencies();
        for (const Use &U : SD->getInst()->operands()) {
          unsigned &NumOps =
              UserOpToNumOps
                  .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
                  .first->getSecond();
          ++NumOps;
          if (auto *Op = dyn_cast<Instruction>(U.get());
              Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
                                                         *SLP, NumOps)) {
            if (ScheduleData *OpSD = getScheduleData(Op);
                OpSD && OpSD->hasValidDependencies()) {
              OpSD->clearDirectDependencies();
              if (RegionHasStackSave ||
                  !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
                ControlDependentMembers.push_back(OpSD);
            }
          }
        }
      }
    };
    // The scheduling region got new instructions at the lower end (or it is a
    // new region for the first bundle). This makes it necessary to
    // recalculate all dependencies.
    if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
      for_each(ScheduleDataMap, [&](auto &P) {
        if (BB != P.first->getParent())
          return;
        ScheduleData *SD = P.second;
        if (isInSchedulingRegion(*SD))
          SD->clearDependencies();
      });
      for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
        for_each(P.second, [&](ScheduleCopyableData *SD) {
          if (isInSchedulingRegion(*SD))
            SD->clearDependencies();
        });
      });
      ReSchedule = true;
    }
    if (Bundle && !Bundle.getBundle().empty()) {
      if (S.areInstructionsWithCopyableElements() ||
          !ScheduleCopyableDataMap.empty())
        CheckIfNeedToClearDeps(Bundle);
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
                        << BB->getName() << "\n");
      calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
                            ControlDependentMembers);
    } else if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
                            ControlDependentMembers);
    }

    if (ReSchedule) {
      resetSchedule();
      initialFillReadyList(ReadyInsts);
    }

    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready" it means that there are
    // no cyclic dependencies and we can schedule it. Note that it's important
    // that we don't "schedule" the bundle yet.
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleEntity *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isReady() && "must be ready to schedule");
      schedule(*SLP, S, EI, Picked, ReadyInsts);
      if (Picked == &Bundle)
        break;
    }
  };

  // Make sure that the scheduling region contains all instructions of the
  // bundle.
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    if (!extendSchedulingRegion(V, S)) {
      // If the scheduling region got new instructions at the lower end (or it
      // is a new region for the first bundle), it is necessary to recalculate
      // all dependencies. Otherwise the compiler may crash trying to
      // incorrectly calculate dependencies and emit instructions in the wrong
      // order at the actual scheduling.
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
      return std::nullopt;
    }
  }

  bool ReSchedule = false;
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    ArrayRef<ScheduleCopyableData *> CopyableData =
        getScheduleCopyableData(cast<Instruction>(V));
    if (!CopyableData.empty()) {
      for (ScheduleCopyableData *SD : CopyableData)
        ReadyInsts.remove(SD);
    }
    ScheduleData *BundleMember = getScheduleData(V);
    assert((BundleMember || S.isCopyableElement(V)) &&
           "no ScheduleData for bundle member (maybe not in same basic block)");
    if (!BundleMember)
      continue;

    // Make sure we don't leave the pieces of the bundle in the ready list when
    // the whole bundle might not be ready.
    ReadyInsts.remove(BundleMember);
    if (ArrayRef<ScheduleBundle *> Bundles =
            getScheduleBundles(BundleMember->getInst());
        !Bundles.empty()) {
      for (ScheduleBundle *B : Bundles)
        ReadyInsts.remove(B);
    }

    if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
      continue;
    // A bundle member was scheduled as single instruction before and now needs
    // to be scheduled as part of the bundle. We just get rid of the existing
    // schedule.
    LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
                      << " was already scheduled\n");
    ReSchedule = true;
  }

  ScheduleBundle &Bundle = buildBundle(VL, S, EI);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle.isReady()) {
    for (ScheduleEntity *BD : Bundle.getBundle()) {
      // Put the schedulable pieces of the failed bundle back into the ready
      // list.
      if (BD->isReady()) {
        ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
        if (Bundles.empty()) {
          ReadyInsts.insert(BD);
          continue;
        }
        for (ScheduleBundle *B : Bundles)
          if (B->isReady())
            ReadyInsts.insert(B);
      }
    }
    ScheduledBundlesList.pop_back();
    SmallVector<ScheduleData *> ControlDependentMembers;
    SmallPtrSet<Instruction *, 4> Visited;
    for (Value *V : VL) {
      if (S.isNonSchedulable(V))
        continue;
      auto *I = cast<Instruction>(V);
      if (S.isCopyableElement(I)) {
        // Remove the copyable data for the failed bundle.
        auto KV = std::make_pair(EI, I);
        const EdgeInfo UserEI = EI.UserTE ? EI.UserTE->UserTreeIndex : EdgeInfo();
        assert(ScheduleCopyableDataMap.contains(KV) &&
               "no ScheduleCopyableData for copyable element");
        ScheduleCopyableData *SD =
            ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
        ScheduleCopyableDataMapByUsers[I].remove(SD);
        if (EI.UserTE) {
          ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
          const auto *It = find(Op, I);
          assert(It != Op.end() && "Lane not set");
          SmallPtrSet<Instruction *, 4> Visited;
          do {
            int Lane = std::distance(Op.begin(), It);
            assert(Lane >= 0 && "Lane not set");
            if (!EI.UserTE->ReorderIndices.empty())
              Lane = EI.UserTE->ReorderIndices[Lane];
            assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                   "Couldn't find extract lane");
            auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
            if (!Visited.insert(In).second) {
              It = find(std::next(It), Op.end(), I);
              continue;
            }
            ScheduleCopyableDataMapByInstUser
                [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
                    .pop_back();
            It = find(std::next(It), Op.end(), I);
          } while (It != Op.end());
        }
        if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
          ScheduleCopyableDataMapByUsers[I].insert(UserCD);
        if (ScheduleCopyableDataMapByUsers[I].empty())
          ScheduleCopyableDataMapByUsers.erase(I);
        ScheduleCopyableDataMap.erase(KV);
        if (ScheduleData *OpSD = getScheduleData(I);
            OpSD && OpSD->hasValidDependencies()) {
          OpSD->clearDirectDependencies();
          if (RegionHasStackSave ||
              !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
            ControlDependentMembers.push_back(OpSD);
        }
        continue;
      }
      ScheduledBundles.find(I)->getSecond().pop_back();
    }
    if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
                            ControlDependentMembers);
    }
    return std::nullopt;
  }
  return &Bundle;
}
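// ScheduleData objects are allocated in fixed-size chunks to amortize the
// allocation cost across many instructions.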
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new ScheduleData for the instruction.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
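// Extend the scheduling region up or down so that it contains V, searching in
// both directions at once and giving up once ScheduleRegionSizeLimit is hit.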
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  auto *I = dyn_cast<Instruction>(V);
  assert(I && "bundle member must be an instruction");
  if (getScheduleData(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region. Ignore
  // debug info and other "assume-like" intrinsics so they don't count against
  // the region size budget.
  BasicBlock::reverse_iterator UpIter =
      ++ScheduleStart->getIterator().getReverse();
  BasicBlock::reverse_iterator UpperEnd = BB->rend();
  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
  BasicBlock::iterator LowerEnd = BB->end();
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
      return false;
    }

    ++UpIter;
    ++DownIter;

    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    ScheduleStart = I;
    LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
                      << "\n");
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                   nullptr);
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
  return true;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    }
    assert(!isInSchedulingRegion(*SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);

    if (I->mayReadOrWriteMemory() &&
        (!isa<IntrinsicInst>(I) ||
         (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->setNextLoadStore(SD);
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }

    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->setNextLoadStore(NextLoadStore);
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
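// (Re)compute the def-use, control and memory dependencies for the members of
// the given bundle and for everything they transitively make ready.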
void BoUpSLP::BlockScheduling::calculateDependencies(
    ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
    ArrayRef<ScheduleData *> ControlDeps) {
  SmallVector<ScheduleEntity *> WorkList;
  auto ProcessNode = [&](ScheduleEntity *SE) {
    if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
      if (CD->hasValidDependencies())
        return;
      LLVM_DEBUG(dbgs() << "SLP: update deps of " << *CD << "\n");
      CD->initDependencies();
      CD->resetUnscheduledDeps();
      const EdgeInfo &EI = CD->getEdgeInfo();
      if (EI.UserTE) {
        ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
        const auto *It = find(Op, CD->getInst());
        assert(It != Op.end() && "Lane not set");
        SmallPtrSet<Instruction *, 4> Visited;
        do {
          int Lane = std::distance(Op.begin(), It);
          assert(Lane >= 0 && "Lane not set");
          if (!EI.UserTE->ReorderIndices.empty())
            Lane = EI.UserTE->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                 "Couldn't find extract lane");
          auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
          if (EI.UserTE->isCopyableElement(In)) {
            // The user is a copyable element itself.
            if (ScheduleCopyableData *UseSD =
                    getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
              CD->incDependencies();
              if (!UseSD->isScheduled())
                CD->incrementUnscheduledDeps(1);
              if (!UseSD->hasValidDependencies() ||
                  (InsertInReadyList && UseSD->isReady()))
                WorkList.push_back(UseSD);
            }
          } else if (Visited.insert(In).second) {
            if (ScheduleData *UseSD = getScheduleData(In)) {
              CD->incDependencies();
              if (!UseSD->isScheduled())
                CD->incrementUnscheduledDeps(1);
              if (!UseSD->hasValidDependencies() ||
                  (InsertInReadyList && UseSD->isReady()))
                WorkList.push_back(UseSD);
            }
          }
          It = find(std::next(It), Op.end(), CD->getInst());
        } while (It != Op.end());
      }
      if (CD->isReady() && CD->getDependencies() == 0 &&
          (EI.UserTE->hasState() &&
           (EI.UserTE->getMainOp()->getParent() !=
                CD->getInst()->getParent() ||
            (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
             any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
               auto *IU = dyn_cast<Instruction>(U);
               if (!IU)
                 return true;
               return IU->getParent() == EI.UserTE->getMainOp()->getParent();
             }))))) {
        // No real uses found in the block, mark the element with a pseudo-use
        // so it cannot be scheduled prematurely.
        CD->incDependencies();
        CD->incrementUnscheduledDeps(1);
      }
      return;
    }
    auto *BundleMember = cast<ScheduleData>(SE);
    if (BundleMember->hasValidDependencies())
      return;
    LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
    BundleMember->initDependencies();
    BundleMember->resetUnscheduledDeps();
    // Handle def-use chain dependencies.
    SmallDenseMap<Value *, unsigned> UserToNumOps;
    for (User *U : BundleMember->getInst()->users()) {
      if (ScheduleData *UseSD = getScheduleData(U)) {
        unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
        ++NumOps;
        if (areAllOperandsReplacedByCopyableData(
                cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
          continue;
        BundleMember->incDependencies();
        if (!UseSD->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!UseSD->hasValidDependencies() ||
            (InsertInReadyList && UseSD->isReady()))
          WorkList.push_back(UseSD);
      }
    }
    for (ScheduleCopyableData *UseSD :
         getScheduleCopyableDataUsers(BundleMember->getInst())) {
      BundleMember->incDependencies();
      if (!UseSD->isScheduled())
        BundleMember->incrementUnscheduledDeps(1);
      if (!UseSD->hasValidDependencies() ||
          (InsertInReadyList && UseSD->isReady()))
        WorkList.push_back(UseSD);
    }

    SmallPtrSet<const Instruction *, 4> Visited;
    auto MakeControlDependent = [&](Instruction *I) {
      if (!Visited.insert(I).second)
        return;
      auto *DepDest = getScheduleData(I);
      assert(DepDest && "must be in schedule window");
      DepDest->addControlDependency(BundleMember);
      BundleMember->incDependencies();
      if (!DepDest->isScheduled())
        BundleMember->incrementUnscheduledDeps(1);
      if (!DepDest->hasValidDependencies() ||
          (InsertInReadyList && DepDest->isReady()))
        WorkList.push_back(DepDest);
    };

    // Any instruction which isn't safe to speculate at the beginning of the
    // block is control dependent on any early exit or non-willreturn call
    // which precedes it.
    if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->getInst())) {
      for (Instruction *I = BundleMember->getInst()->getNextNode();
           I != ScheduleEnd; I = I->getNextNode()) {
        if (isSafeToSpeculativelyExecute(I))
          continue;
        // Add the dependency.
        MakeControlDependent(I);
        if (!isGuaranteedToTransferExecutionToSuccessor(I))
          // Everything past here must be control dependent on I.
          break;
      }
    }

    if (RegionHasStackSave) {
      // If we have an inalloca alloca instruction, it needs to be scheduled
      // after any preceding stacksave. We also need to prevent any alloca
      // from reordering above a preceding stackrestore.
      if (match(BundleMember->getInst(),
                m_Intrinsic<Intrinsic::stacksave>()) ||
          match(BundleMember->getInst(),
                m_Intrinsic<Intrinsic::stackrestore>())) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
              match(I, m_Intrinsic<Intrinsic::stackrestore>()))
            break;
          if (!isa<AllocaInst>(I))
            continue;
          // Add the dependency.
          MakeControlDependent(I);
        }
      }

      // Prevent allocas and loads/stores from moving below a stacksave or a
      // stackrestore.
      if (isa<AllocaInst>(BundleMember->getInst()) ||
          BundleMember->getInst()->mayReadOrWriteMemory()) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
              !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
            continue;
          // Add the dependency.
          MakeControlDependent(I);
          break;
        }
      }
    }

    // Handle the memory dependencies (if any).
    ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
    if (!NextLoadStore)
      return;
    Instruction *SrcInst = BundleMember->getInst();
    assert(SrcInst->mayReadOrWriteMemory() &&
           "NextLoadStore list for non memory effecting bundle?");
    MemoryLocation SrcLoc = getLocation(SrcInst);
    bool SrcMayWrite = SrcInst->mayWriteToMemory();
    unsigned NumAliased = 0;
    unsigned DistToSrc = 1;
    bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);

    for (ScheduleData *DepDest = NextLoadStore; DepDest;
         DepDest = DepDest->getNextLoadStore()) {
      assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");

      // We limit the alias checks with AliasedCheckLimit (the expensive part
      // of this loop) and abort everything at MaxMemDepDistance (a heuristic
      // for very large blocks).
      if (DistToSrc >= MaxMemDepDistance ||
          ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
           (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
            SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
        // Count only actually aliased locations to balance runtime and
        // dependency accuracy.
        NumAliased++;

        DepDest->addMemoryDependency(BundleMember);
        BundleMember->incDependencies();
        if (!DepDest->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!DepDest->hasValidDependencies() ||
            (InsertInReadyList && DepDest->isReady()))
          WorkList.push_back(DepDest);
      }
      DistToSrc++;
    }
  };

  assert((Bundle || !ControlDeps.empty()) &&
         "expected at least one instruction to schedule");
  if (Bundle)
    WorkList.push_back(Bundle.getBundle().front());
  WorkList.append(ControlDeps.begin(), ControlDeps.end());
  SmallPtrSet<ScheduleBundle *, 16> Visited;
  while (!WorkList.empty()) {
    ScheduleEntity *SD = WorkList.pop_back_val();
    SmallVector<ScheduleBundle *, 1> CopyableBundle;
    ArrayRef<ScheduleBundle *> Bundles;
    if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) {
      CopyableBundle.push_back(&CD->getBundle());
      Bundles = CopyableBundle;
    } else {
      Bundles = getScheduleBundles(SD->getInst());
    }
    if (Bundles.empty()) {
      if (!SD->hasValidDependencies())
        ProcessNode(SD);
      if (InsertInReadyList && SD->isReady()) {
        ReadyInsts.insert(SD);
        LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
      }
      continue;
    }
    for (ScheduleBundle *Bundle : Bundles) {
      if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
        continue;
      assert(isInSchedulingRegion(*Bundle) &&
             "ScheduleData not in scheduling region");
      for_each(Bundle->getBundle(), ProcessNode);
    }
    if (InsertInReadyList && SD->isReady()) {
      for (ScheduleBundle *Bundle : Bundles) {
        assert(isInSchedulingRegion(*Bundle) &&
               "ScheduleData not in scheduling region");
        if (!Bundle->isReady())
          continue;
        ReadyInsts.insert(Bundle);
        LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *Bundle << "\n");
      }
    }
  }
}
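// Drop all scheduling decisions in the region so the block can be scheduled
// again from scratch.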
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for_each(ScheduleDataMap, [&](auto &P) {
    if (BB != P.first->getParent())
      return;
    ScheduleData *SD = P.second;
    if (isInSchedulingRegion(*SD)) {
      SD->setScheduled(/*Scheduled=*/false);
      SD->resetUnscheduledDeps();
    }
  });
  for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
    for_each(P.second, [&](ScheduleCopyableData *SD) {
      if (isInSchedulingRegion(*SD)) {
        SD->setScheduled(false);
        SD->resetUnscheduledDeps();
      }
    });
  });
  for_each(ScheduledBundles, [&](auto &P) {
    for_each(P.second, [&](ScheduleBundle *Bundle) {
      if (isInSchedulingRegion(*Bundle))
        Bundle->setScheduled(false);
    });
  });
  for (auto &P : ScheduleCopyableDataMap) {
    if (isInSchedulingRegion(*P.second)) {
      P.second->setScheduled(false);
      P.second->resetUnscheduledDeps();
    }
  }
  ReadyInsts.clear();
}
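// List-schedule the block: assign priorities by source order, make sure all
// dependencies are calculated, then greedily pop ready entities and move the
// picked instructions into place.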
void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  // If we got here, pre-scheduling already found a valid scheduling of the
  // sub-graph consisting of all vector bundles and their transitive users.
  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
  struct ScheduleDataCompare {
    bool operator()(const ScheduleEntity *SD1,
                    const ScheduleEntity *SD2) const {
      return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
    }
  };
  std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated (for nodes in the sub-graph)
  // and fill the ready-list with initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
    if (!Bundles.empty()) {
      for (ScheduleBundle *Bundle : Bundles) {
        Bundle->setSchedulingPriority(Idx++);
        if (!Bundle->hasValidDependencies())
          BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this);
      }
      ArrayRef<ScheduleCopyableData *> SDs =
          BS->getScheduleCopyableDataUsers(I);
      for (ScheduleCopyableData *SD : reverse(SDs)) {
        ScheduleBundle &Bundle = SD->getBundle();
        Bundle.setSchedulingPriority(Idx++);
        if (!Bundle.hasValidDependencies())
          BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
      }
      continue;
    }
    ArrayRef<ScheduleCopyableData *> CopyableData =
        BS->getScheduleCopyableDataUsers(I);
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
      assert((SDTEs.empty() || SDTEs.front()->doesNotNeedToSchedule() ||
              isVectorLikeInstWithConstOps(SD->getInst())) &&
             "scheduler and vectorizer bundle mismatch");
      SD->setSchedulingPriority(Idx++);
      if (!SD->hasValidDependencies() &&
          (!CopyableData.empty() ||
           any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
             assert(TE->isGather() && "expected gather node");
             return TE->hasState() && TE->hasCopyableElements() &&
                    TE->isCopyableElement(I);
           }))) {
        // Need dependencies for these nodes to correctly handle copyable
        // elements, which may cause reordering.
        ScheduleBundle Bundle;
        Bundle.add(SD);
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
      }
    }
    for (ScheduleCopyableData *SD : reverse(CopyableData)) {
      ScheduleBundle &Bundle = SD->getBundle();
      Bundle.setSchedulingPriority(Idx++);
      if (!Bundle.hasValidDependencies())
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
    }
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  SmallPtrSet<Instruction *, 16> Scheduled;
  while (!ReadyInsts.empty()) {
    auto *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
      for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
        Instruction *PickedInst = BundleMember->getInst();
        bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
        if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
            (!IsCopyable && !Scheduled.insert(PickedInst).second))
          continue;
        if (PickedInst->getNextNode() != LastScheduledInst)
          PickedInst->moveBefore(LastScheduledInst->getIterator());
        LastScheduledInst = PickedInst;
      }
      EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
                                         LastScheduledInst);
    } else {
      auto *SD = cast<ScheduleData>(Picked);
      Instruction *PickedInst = SD->getInst();
      if (PickedInst->getNextNode() != LastScheduledInst)
        PickedInst->moveBefore(LastScheduledInst->getIterator());
      LastScheduledInst = PickedInst;
    }
    auto Invalid = InstructionsState::invalid();
    BS->schedule(*this, Invalid, EdgeInfo(), Picked, ReadyInsts);
  }

  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
  // Additional expensive verification of the scheduling region.
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled.
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
    assert(all_of(Bundles,
                  [](const ScheduleBundle *Bundle) {
                    return Bundle->isScheduled();
                  }) &&
           "must be scheduled at this point");
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
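// Compute the width of the widest element type feeding V (preferring the
// width of loads/extracts found while walking the expression bottom-up);
// results are memoized in InstrElementSize.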
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;

  // Otherwise traverse the expression tree bottom-up looking for loads whose
  // width is a better basis for the element size than V's own type.
  SmallVector<std::pair<Instruction *, BasicBlock *>, 16> Worklist;
  SmallPtrSet<Instruction *, 16> Visited;
  if (auto *I = dyn_cast<Instruction>(V)) {
    Worklist.emplace_back(I, I->getParent());
    Visited.insert(I);
  }

  unsigned Width = 0;
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    auto [I, Parent] = Worklist.pop_back_val();

    // We should only be looking at scalar instructions here. If the current
    // instruction has a vector type, skip.
    auto *Ty = I->getType();
    if (isa<VectorType>(Ty))
      continue;
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      FirstNonBool = I;

    // If the current instruction is a load or extract, update Width to
    // reflect the width of the produced value.
    if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
      Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
    // Otherwise visit the operands of the interesting instruction kinds.
    else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
                 BinaryOperator, UnaryOperator>(I)) {
      for (Use &U : I->operands()) {
        if (auto *J = dyn_cast<Instruction>(U.get())) {
          if (Visited.insert(J).second &&
              (isa<PHINode>(I) || J->getParent() == Parent)) {
            Worklist.emplace_back(J, J->getParent());
            continue;
          }
        }
        if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
          FirstNonBool = U.get();
      }
    } else {
      break;
    }
  }

  // If we didn't encounter a memory access in the expression tree, just
  // return the width of V, otherwise the maximum width found.
  if (!Width) {
    if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
      V = FirstNonBool;
    Width = DL->getTypeSizeInBits(V->getType());
  }

  for (Instruction *I : Visited)
    InstrElementSize[I] = Width;

  return Width;
}
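// Check whether the scalars of tree entry E can be computed in a type of
// width BitWidth without changing the result, recursing into operand entries
// and recording demotable entries in ToDemote.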
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
    const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
  // We can always demote constants.
  if (all_of(E.Scalars, IsaPred<Constant>))
    return true;

  unsigned OrigBitWidth =
      DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
  if (OrigBitWidth == BitWidth) {
    MaxDepthLevel = 1;
    return true;
  }

  // Check if the node was analyzed already and must keep its original
  // bitwidth.
  if (NodesToKeepBWs.contains(E.Idx))
    return false;

  bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
    if (isa<PoisonValue>(R))
      return false;
    return !isKnownNonNegative(R, SimplifyQuery(*DL));
  });
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    if (isa<PoisonValue>(V))
      return true;
    if (getTreeEntries(V).size() > 1)
      return false;
    bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
      if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
        return true;
    }
    unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
    unsigned BitWidth1 = OrigBitWidth - NumSignBits;
    if (IsSignedVal)
      ++BitWidth1;
    APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
    unsigned BitWidth2 =
        std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
    while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
      APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
      if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
        break;
      BitWidth2 *= 2;
    }
    BitWidth1 = std::min(BitWidth1, BitWidth2);
    BitWidth = std::max(BitWidth, BitWidth1);
    return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
  };
  auto FinalAnalysis = [&, TTI = TTI]() {
    if (!IsProfitableToDemote)
      return false;
    bool Res = all_of(
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    // Demote gathers.
    if (Res && E.isGather()) {
      if (E.hasState()) {
        if (const TreeEntry *SameTE =
                getSameValuesTreeEntry(E.getMainOp(), E.Scalars);
            SameTE)
          if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
                                    ToDemote, Visited, NodesToKeepBWs,
                                    MaxDepthLevel, IsProfitableToDemote,
                                    IsTruncRoot)) {
            ToDemote.push_back(E.Idx);
            return true;
          }
      }
      // Check possible extractelement instruction bases and the final vector
      // length.
      SmallPtrSet<Value *, 4> UniqueBases;
      for (Value *V : E.Scalars) {
        auto *EE = dyn_cast<ExtractElementInst>(V);
        if (!EE)
          continue;
        UniqueBases.insert(EE->getVectorOperand());
      }
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      if (UniqueBases.size() <= 2 ||
          ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) ==
              ::getNumberOfParts(
                  *TTI,
                  getWidenedType(
                      IntegerType::get(OrigScalarTy->getContext(), BitWidth),
                      VF)))
        ToDemote.push_back(E.Idx);
    }
    return Res;
  };
  if (E.isGather() || !Visited.insert(&E).second ||
      any_of(E.Scalars, [&](Value *V) {
        return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !isVectorized(U);
        });
      }))
    return FinalAnalysis();

  if (any_of(E.Scalars, [&](Value *V) {
        return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
          return isVectorized(U) ||
                 (E.Idx == 0 && UserIgnoreList &&
                  UserIgnoreList->contains(U)) ||
                 (!isa<CmpInst>(U) && U->getType()->isSized() &&
                  !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
      }))
    return false;

  auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, NodesToKeepBWs, Level,
                                 IsProfitableToDemote, IsTruncRoot)) {
        if (!IsProfitableToDemote)
          return false;
        NeedToExit = true;
        if (!FinalAnalysis())
          return false;
        continue;
      }
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
    }
    return true;
  };
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidths less than OrigBitWidth.
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
        for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
          if (Checker(BitWidth, OrigBitWidth))
            return true;
          if (BestFailBitwidth == 0 && FinalAnalysis())
            BestFailBitwidth = BitWidth;
        }
        if (BestFailBitwidth == 0) {
          BitWidth = OrigBitWidth;
          return false;
        }
        MaxDepthLevel = 1;
        BitWidth = BestFailBitwidth;
        NeedToExit = true;
        return true;
      };
  auto TryProcessInstruction =
      [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        if (Operands.empty()) {
          if (!IsTruncRoot)
            MaxDepthLevel = 1;
          for (Value *V : E.Scalars)
            (void)IsPotentiallyTruncated(V, BitWidth);
        } else {
          // Several vectorized uses? Check if we can truncate it, otherwise
          // exit.
          if (E.UserTreeIndex && any_of(E.Scalars, [&](Value *V) {
                return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
              }))
            return false;
          bool NeedToExit = false;
          if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
          if (!ProcessOperands(Operands, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
        }

        ++MaxDepthLevel;
        // Record the entry that we can demote.
        ToDemote.push_back(E.Idx);
        return IsProfitableToDemote;
      };

  if (E.State == TreeEntry::SplitVectorize)
    return TryProcessInstruction(
        BitWidth,
        {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
         VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
  switch (E.getOpcode()) {

  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
        E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
      return false;
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  }
  case Instruction::Freeze:
    return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
  case Instruction::Shl: {
    // If we are truncating the result of this SHL, and if it's a shift of an
    // in-range amount, we can always perform a SHL in a smaller type.
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
  }
  case Instruction::LShr: {
    // We can truncate a logical shr to a smaller lshr iff we know that the
    // bits we would otherwise be shifting in are already zeros.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        if (E.isCopyableElement(V))
          return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    // We can truncate an arithmetic shr to a smaller ashr iff the bits from
    // the sign bit of the original type down to the sign bit of the truncated
    // type are all sign bits.
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ShiftedBits <
                   ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  }

  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  }

  // We can demote phis if we can demote all their incoming operands.
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
    SmallVector<const TreeEntry *> Ops(NumOps);
    transform(seq<unsigned>(0, NumOps), Ops.begin(),
              [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
    return TryProcessInstruction(BitWidth, Ops);
  }

  case Instruction::Call: {
    auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
    if (!IC)
      break;
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
      break;
    SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
    function_ref<bool(unsigned, unsigned)> CallChecker;
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
        unsigned Op1SignBits =
            ComputeNumSignBits(I->getOperand(1), *DL, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask,
                                  SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
                 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
      });
    };
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
      });
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    } else {
      CallChecker = AbsChecker;
    }
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned BestBitWidth = BitWidth;
    unsigned VF = E.Scalars.size();
    // Choose the best bitwidth based on cost estimations.
    auto Checker = [&](unsigned BitWidth, unsigned) {
      auto *VecTy = getWidenedType(
          IntegerType::get(IC->getContext(), PowerOf2Ceil(BitWidth)), VF);
      InstructionCost Cost = TTI->getIntrinsicInstrCost(
          IntrinsicCostAttributes(ID, VecTy, {VecTy, VecTy}),
          TTI::TCK_RecipThroughput);
      if (Cost < BestCost) {
        BestCost = Cost;
        BestBitWidth = BitWidth;
      }
      return false;
    };
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    BitWidth = BestBitWidth;
    return TryProcessInstruction(BitWidth, Operands, CallChecker);
  }

  // Otherwise, conservatively give up.
  default:
    break;
  }
  MaxDepthLevel = 1;
  return FinalAnalysis();
}
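// Analyze the whole tree and compute, per entry, the minimum bitwidth the
// scalars can be demoted to (MinBWs), plus the reduction bitwidth if the
// graph feeds a reduction.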
void BoUpSLP::computeMinimumValueSizes() {
  // We only attempt to truncate integer expressions.
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->hasState() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;

  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
    NodeIdx = 1;

  // Ensure the roots of the vectorizable tree don't form a cycle.
  assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
          !VectorizableTree[NodeIdx]->UserTreeIndex) &&
         "Unexpected tree is graph.");

  // The first value node for store/insertelement is sext/zext/trunc? Skip it,
  // it is already handled.
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  SmallVector<unsigned> RootDemotes;
  SmallDenseSet<unsigned, 8> NodesToKeepBWs;
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    RootDemotes.push_back(NodeIdx);
    IsProfitableToDemoteRoot = true;
    ++NodeIdx;
  }

  // Analyzed the reduction already and not profitable - exit.
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;

  SmallVector<unsigned> ToDemote;
  auto ComputeMaxBitWidth =
      [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
          unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
    ToDemote.clear();
    // Check if the root is trunc and the next node is gather/buildvector,
    // then keep trunc in scalars, which is free in most cases.
    if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
        !NodesToKeepBWs.contains(E.Idx) &&
        E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
        all_of(E.Scalars, [&](Value *V) {
          return V->hasOneUse() || isa<Constant>(V) ||
                 (!V->hasNUsesOrMore(UsesLimit) &&
                  none_of(V->users(), [&](User *U) {
                    ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
                    const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
                    if (TEs.empty() || is_contained(TEs, UserTE))
                      return false;
                    if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                             SelectInst>(U) ||
                        isa<SIToFPInst, UIToFPInst>(U) ||
                        (UserTE->hasState() &&
                         (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                               SelectInst>(UserTE->getMainOp()) ||
                          isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
                      return true;
                    unsigned UserTESz = DL->getTypeSizeInBits(
                        UserTE->Scalars.front()->getType());
                    if (all_of(TEs, [&](const TreeEntry *TE) {
                          auto It = MinBWs.find(TE);
                          return It != MinBWs.end() &&
                                 It->second.first > UserTESz;
                        }))
                      return true;
                    return DL->getTypeSizeInBits(U->getType()) > UserTESz;
                  }));
        })) {
      ToDemote.push_back(E.Idx);
      const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
        MaxBitWidth = 8;
      return MaxBitWidth;
    }

    if (!E.hasState())
      return 0u;

    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
    if (!TreeRootIT)
      return 0u;

    if (any_of(E.Scalars,
               [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
      return 0u;

    // The maximum bit width required to represent all the values that can be
    // demoted without loss of precision.
    unsigned MaxBitWidth = 1u;

    // True if the roots can be zero-extended back to their original type,
    // rather than sign-extended. We know that if the leading bit is not
    // demanded, we can safely zero-extend, so we initialize IsKnownPositive
    // optimistically and check the sign bit of every root.
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      if (isa<PoisonValue>(R))
        return true;
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });

    if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
        E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
      MaxBitWidth =
          std::min(DL->getTypeSizeInBits(
                       E.UserTreeIndex.UserTE->Scalars.front()->getType()),
                   DL->getTypeSizeInBits(ScalarTy));

    // We first check if all the bits of the roots are demanded. If they're
    // not, we can truncate the roots to this narrower type.
    for (Value *Root : E.Scalars) {
      if (isa<PoisonValue>(Root))
        continue;
      unsigned NumSignBits = ComputeNumSignBits(Root, *DL, AC, nullptr, DT);
      TypeSize NumTypeBits =
          DL->getTypeSizeInBits(Root->getType()->getScalarType());
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      // If we can't prove that the sign bit is zero, we must add one to the
      // maximum bit width to account for the unknown sign bit. This preserves
      // the existing sign bit so we can safely sign-extend the root back to
      // the original type.
      if (!IsKnownPositive)
        ++BitWidth1;

      APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      MaxBitWidth =
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    }

    if (MaxBitWidth < 8 && MaxBitWidth > 1)
      MaxBitWidth = 8;

    // If demoting does not reduce the number of vector parts, there is
    // nothing to gain.
    unsigned NumParts = ::getNumberOfParts(
        *TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
    if (NumParts > 1 &&
        NumParts ==
            ::getNumberOfParts(
                *TTI, getWidenedType(IntegerType::get(TreeRootIT->getContext(),
                                                      bit_ceil(MaxBitWidth)),
                                     VF)))
      return 0u;

    unsigned Opcode = E.getOpcode();
    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression. Collect the values that can be demoted in ToDemote.
    DenseSet<const TreeEntry *> Visited;
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;

    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
                               NeedToDemote, IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(
                         E.getMainOp()->getOperand(0)->getType()) >
                 2)))))
      return 0u;
    // Round MaxBitWidth up to the next power-of-two.
    MaxBitWidth = bit_ceil(MaxBitWidth);

    return MaxBitWidth;
  };

  // Add reduction ops sizes, if any.
  if (UserIgnoreList &&
      isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
    // Convert vector_reduce_add(ZExt(<n x i1>)) to
    // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
    if (all_of(*UserIgnoreList,
               [](Value *V) {
                 return isa<PoisonValue>(V) ||
                        cast<Instruction>(V)->getOpcode() == Instruction::Add;
               }) &&
        VectorizableTree.front()->State == TreeEntry::Vectorize &&
        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
            Builder.getInt1Ty()) {
      ReductionBitWidth = 1;
    } else {
      for (Value *V : *UserIgnoreList) {
        if (isa<PoisonValue>(V))
          continue;
        unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
        TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
        unsigned BitWidth1 = NumTypeBits - NumSignBits;
        if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
          ++BitWidth1;
        unsigned BitWidth2 = BitWidth1;
        if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
          APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
          BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
        }
        ReductionBitWidth =
            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
      }
      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
        ReductionBitWidth = 8;

      ReductionBitWidth = bit_ceil(ReductionBitWidth);
    }
  }
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    RootDemotes.push_back(NodeIdx);
    ++NodeIdx;
    IsTruncRoot = true;
  }
  bool IsSignedCmp = false;
  if (UserIgnoreList &&
      any_of(*UserIgnoreList, [](Value *V) {
        auto *IC = dyn_cast<ICmpInst>(V);
        return IC && IC->isSigned();
      }))
    IsSignedCmp = true;
  while (NodeIdx < VectorizableTree.size()) {
    ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
    unsigned Limit = 2;
    if (IsTopRoot &&
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      Limit = 3;
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
        IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    }

    for (unsigned Idx : RootDemotes) {
      if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
            uint32_t OrigBitWidth =
                DL->getTypeSizeInBits(V->getType()->getScalarType());
            if (OrigBitWidth > MaxBitWidth) {
              APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
              return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
            }
            return false;
          }))
        ToDemote.push_back(Idx);
    }
    RootDemotes.clear();
    IsTopRoot = false;
    IsProfitableToDemoteRoot = true;

    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
    } else {
      unsigned NewIdx = 0;
      do {
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
      NodeIdx = NewIdx;
      IsTruncRoot =
          NodeIdx < VectorizableTree.size() &&
          VectorizableTree[NodeIdx]->UserTreeIndex &&
          VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
              Instruction::Trunc &&
          !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
      IsSignedCmp =
          NodeIdx < VectorizableTree.size() &&
          VectorizableTree[NodeIdx]->UserTreeIndex &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
              Instruction::ICmp &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
                 [&](Value *V) {
                   auto *IC = dyn_cast<ICmpInst>(V);
                   return IC && (IC->isSigned() ||
                                 !isKnownNonNegative(IC->getOperand(0),
                                                     SimplifyQuery(*DL)) ||
                                 !isKnownNonNegative(IC->getOperand(1),
                                                     SimplifyQuery(*DL)));
                 });
    }

    // If the maximum bit width we compute is less than the width of the
    // roots' type, we can proceed with the narrowing. Otherwise, do nothing.
    if (MaxBitWidth == 0 ||
        MaxBitWidth >=
            cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
                ->getBitWidth()) {
      if (UserIgnoreList)
        AnalyzedMinBWVals.insert_range(TreeRoot);
      NodesToKeepBWs.insert_range(ToDemote);
      continue;
    }

    // Finally, map the values we can demote to the maximum bit width we
    // computed.
    for (unsigned Idx : ToDemote) {
      TreeEntry *TE = VectorizableTree[Idx].get();
      if (MinBWs.contains(TE))
        continue;
      bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
      MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
    }
  }
}
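// Entry point of the pass for a single function: collect seed instructions
// per block and run the store-chain, block and GEP vectorization drivers.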
bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
                                TargetTransformInfo *TTI_,
                                TargetLibraryInfo *TLI_, AAResults *AA_,
                                LoopInfo *LI_, DominatorTree *DT_,
                                AssumptionCache *AC_, DemandedBits *DB_,
                                OptimizationRemarkEmitter *ORE_) {
  SE = SE_;
  TTI = TTI_;
  TLI = TLI_;
  AA = AA_;
  LI = LI_;
  DT = DT_;
  AC = AC_;
  DB = DB_;
  DL = &F.getDataLayout();

  Stores.clear();
  GEPs.clear();
  bool Changed = false;

  // If the target claims to have no vector registers don't attempt
  // vectorization.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
    LLVM_DEBUG(
        dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
    return false;
  }

  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

  // Use the bottom up slp vectorizer to construct chains that start with
  // store instructions.
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);

  // Update DFS numbers now so that we can use them for ordering.
  DT->updateDFSNumbers();

  // Scan the blocks in the function in post order.
  for (auto *BB : post_order(&F.getEntryBlock())) {
    // Start new block - clear the list of reduction roots.
    R.clearReductionData();
    collectSeedInstructions(BB);

    // Vectorize trees that end at stores.
    if (!Stores.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
                        << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    }

    // Vectorize trees that end at reductions.
    Changed |= vectorizeChainsInBlock(BB, R);

    // Vectorize the index computations of getelementptr instructions. This
    // is primarily intended to catch gather-like idioms ending at
    // non-consecutive loads.
    if (!GEPs.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
                        << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
    }
  }

  if (Changed) {
    R.optimizeGatherSequence();
    LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
  }
  return Changed;
}
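// Try to vectorize a single chain of consecutive stores, returning
// std::nullopt if the chain could not even be scheduled.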
std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();

  if (!hasFullVectorsOrPowerOf2(*TTI, Chain[0]->getType(), VF) ||
      VF < 2 || VF < MinVF) {
    Size = 0;
    return false;
  }

  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                    << "\n");

  SetVector<Value *> ValOps;
  for (Value *V : Chain)
    ValOps.insert(cast<StoreInst>(V)->getValueOperand());
  // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(
      ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
  bool IsAllowedSize =
      hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
                               ValOps.size()) ||
      (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
  if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
       (!S.getMainOp()->isSafeToRemove() ||
        any_of(ValOps.getArrayRef(), [&](Value *V) {
          return !isa<ExtractElementInst>(V) &&
                 (V->getNumUses() > Chain.size() ||
                  any_of(V->users(), [&](User *U) {
                    return !Stores.contains(U);
                  }));
        }))) ||
      (ValOps.size() > Chain.size() / 2 && !S)) {
    Size = (!IsAllowedSize && S) ? 1 : 2;
    return false;
  }
  if (R.isLoadCombineCandidate(Chain))
    return true;
  R.buildTree(Chain);
  // Check if tree tiny and store itself or its value is not vectorized.
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
        R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
      return std::nullopt;
    Size = R.getCanonicalGraphSize();
    return false;
  }
  if (R.isProfitableToReorder()) {
    R.reorderTopToBottom();
    R.reorderBottomToTop();
  }
  R.transformNodes();
  R.buildExternalUses();

  R.computeMinimumValueSizes();

  Size = R.getCanonicalGraphSize();
  if (S && S.getOpcode() == Instruction::Load)
    Size = 2; // cut off masked gather small trees
  InstructionCost Cost = R.getTreeCost();

  LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF
                    << "\n");
  if (Cost < -SLPCostThreshold) {
    LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");

    using namespace ore;

    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                        cast<StoreInst>(Chain[0]))
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));

    R.vectorizeTree();
    return true;
  }

  return false;
}
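// Heuristic over the recorded (first, second) tree sizes: accept only if the
// relative deviation from the mean size is negligible.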
static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
                           bool First) {
  unsigned Num = 0;
  uint64_t Sum = std::accumulate(
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned Size = First ? Val.first : Val.second;
        if (Size == 1)
          return V;
        ++Num;
        return V + Size;
      });
  if (Num == 0)
    return true;
  uint64_t Mean = Sum / Num;
  if (Mean == 0)
    return true;
  uint64_t Dev = std::accumulate(
                     Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
                     [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
                       unsigned P = First ? Val.first : Val.second;
                       if (P == 1)
                         return V;
                       return V + (P - Mean) * (P - Mean);
                     }) /
                 Num;
  return Dev * 96 / (Mean * Mean) == 0;
}
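// Helper that groups the stores of a block by the distance of their pointer
// operands from a common base store.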
class RelatedStoreInsts {
public:
  RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
      : AllStores(AllStores) {
    reset(BaseInstrIdx);
  }

  void reset(unsigned NewBaseInstr) {
    assert(NewBaseInstr < AllStores.size() &&
           "Instruction index out of bounds");
    BaseInstrIdx = NewBaseInstr;
    Instrs.clear();
    insertOrLookup(NewBaseInstr, 0);
  }

  /// Tries to insert \p InstrIdx as the store with a pointer distance of
  /// \p PtrDist. Does nothing if there is already a store with that distance.
  /// \returns The previously associated instruction index, or std::nullopt.
  std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
    auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
    return Inserted ? std::nullopt : std::make_optional(It->second);
  }

  using DistToInstMap = std::map<int64_t, unsigned>;
  const DistToInstMap &getStores() const { return Instrs; }

  /// If \p SI is related to this group of stores, return the distance of its
  /// pointer operand to the one of the base store.
  std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
                                        ScalarEvolution &SE) const {
    StoreInst &BaseStore = *AllStores[BaseInstrIdx];
    return getPointersDiff(
        BaseStore.getValueOperand()->getType(), BaseStore.getPointerOperand(),
        SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
        /*StrictCheck=*/true);
  }

  /// Recompute the pointer distances to be based on \p NewBaseInstIdx,
  /// keeping only the stores with instruction index >= \p MinSafeIdx.
  void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
              int64_t DistFromCurBase) {
    DistToInstMap PrevSet = std::move(Instrs);
    reset(NewBaseInstIdx);

    // Re-insert stores that come after MinSafeIdx to try to vectorize them
    // again. Their distance is rebased to use NewBaseInstIdx as reference.
    for (auto [Dist, InstIdx] : PrevSet) {
      if (InstIdx >= MinSafeIdx)
        insertOrLookup(InstIdx, Dist - DistFromCurBase);
    }
  }

  /// Remove all the stores that have already been vectorized from this group.
  void clearVectorizedStores(const DenseSet<StoreInst *> &VectorizedStores) {
    DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
        reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
          return VectorizedStores.contains(AllStores[DistAndIdx.second]);
        });

    // Get a forward iterator pointing after the last vectorized store and
    // erase all stores before it so we don't try to vectorize them again.
    DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
    Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
  }

private:
  /// The index of the base instruction in AllStores.
  unsigned BaseInstrIdx;

  /// Maps the pointer distance from the base store to the store index.
  DistToInstMap Instrs;

  /// Reference to all the stores in the BB being analyzed.
  ArrayRef<StoreInst *> AllStores;
};
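// Driver for store vectorization: group the stores by pointer distance and
// repeatedly try decreasing vectorization factors on each consecutive slice.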
bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
        &Visited) {
  // We may run into multiple chains that merge into a single chain. We mark
  // the stores that we vectorized so that we don't visit the same store twice.
  DenseSet<StoreInst *> VectorizedStores;
  bool Changed = false;

  auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
    int64_t PrevDist = -1;
    BoUpSLP::ValueList Operands;
    // Collect the chain into a list.
    for (auto [Idx, Data] : enumerate(StoreSeq)) {
      auto &[Dist, InstIdx] = Data;
      if (Operands.empty() || Dist - PrevDist == 1) {
        Operands.push_back(Stores[InstIdx]);
        PrevDist = Dist;
        if (Idx != StoreSeq.size() - 1)
          continue;
      }
      auto E = make_scope_exit([&, &Dist = Dist, &InstIdx = InstIdx]() {
        Operands.clear();
        Operands.push_back(Stores[InstIdx]);
        PrevDist = Dist;
      });

      if (Operands.size() <= 1 ||
          !Visited
               .insert({Operands.front(),
                        cast<StoreInst>(Operands.front())->getValueOperand(),
                        Operands.back(),
                        cast<StoreInst>(Operands.back())->getValueOperand(),
                        Operands.size()})
               .second)
        continue;

      unsigned MaxVecRegSize = R.getMaxVecRegSize();
      unsigned EltSize = R.getVectorElementSize(Operands[0]);
      unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);

      unsigned MaxVF =
          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
      auto *Store = cast<StoreInst>(Operands[0]);
      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
      if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
        ValueTy = Trunc->getSrcTy();
      Type *StoreScalarTy = StoreTy->getScalarType();
      unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
          R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
          ValueTy));
      MinVF = std::max<unsigned>(2, MinVF);

      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < "
                          << "MinVF (" << MinVF << ")\n");
        continue;
      }

      unsigned NonPowerOf2VF = 0;
      if (VectorizeNonPowerOf2) {
        // First try vectorizing with a non-power-of-2 VF. At the moment, only
        // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
        // lanes are used.
        unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
        if (has_single_bit(CandVF + 1)) {
          NonPowerOf2VF = CandVF;
          assert(NonPowerOf2VF != MaxVF &&
                 "Non-power-of-2 VF should not be equal to MaxVF");
        }
      }

      unsigned MaxRegVF = MaxVF;
      MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < "
                          << "MinVF (" << MinVF << ")\n");
        continue;
      }
      SmallVector<unsigned> CandidateVFs;
      for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
           VF = divideCeil(VF, 2))
        CandidateVFs.push_back(VF);

      unsigned End = Operands.size();
      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
      OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
      for (std::pair<unsigned, unsigned> &P : RangeSizes)
        P.first = P.second = 1;
      DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
      auto IsNotVectorized = [](bool First,
                                const std::pair<unsigned, unsigned> &P) {
        return First ? P.first > 0 : P.second > 0;
      };
      auto IsVectorized = [](bool First,
                             const std::pair<unsigned, unsigned> &P) {
        return First ? P.first == 0 : P.second == 0;
      };
      auto VFIsProfitable = [](bool First, unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
        return First ? Size >= P.first : Size >= P.second;
      };
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
      };
      while (true) {
        ++Repeat;
        bool RepeatChanged = false;
        bool AnyProfitableGraph = false;
        for (unsigned VF : CandidateVFs) {
          AnyProfitableGraph = false;
          unsigned FirstUnvecStore =
              std::distance(RangeSizes.begin(),
                            find_if(RangeSizes, std::bind(IsNotVectorized,
                                                          VF >= MaxRegVF, _1)));
          while (FirstUnvecStore < End) {
            unsigned FirstVecStore = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(FirstUnvecStore),
                        std::bind(IsVectorized, VF >= MaxRegVF, _1)));
            unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
            for (unsigned SliceStartIdx = FirstUnvecStore;
                 SliceStartIdx + VF <= MaxSliceEnd;) {
              ArrayRef<Value *> Slice =
                  ArrayRef(Operands).slice(SliceStartIdx, VF);
              assert(all_of(Slice,
                            [&](Value *V) {
                              return cast<StoreInst>(V)
                                         ->getValueOperand()
                                         ->getType() ==
                                     cast<StoreInst>(Slice.front())
                                         ->getValueOperand()
                                         ->getType();
                            }) &&
                     "Expected all operands of same type.");
              if (!NonSchedulable.empty()) {
                auto [NonSchedSizeMax, NonSchedSizeMin] =
                    NonSchedulable.lookup(Slice.front());
                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
                  // Skip the slice, known to end up non-schedulable.
                  SliceStartIdx += NonSchedSizeMax;
                  continue;
                }
              }
              unsigned TreeSize;
              std::optional<bool> Res =
                  vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
              if (!Res) {
                // Update the range of non-schedulable store sizes.
                NonSchedulable
                    .try_emplace(Slice.front(), std::make_pair(VF, VF))
                    .first->getSecond()
                    .second = VF;
              } else if (*Res) {
                // Mark the vectorized stores so that we don't vectorize them
                // again.
                VectorizedStores.insert(Slice.begin(), Slice.end());
                AnyProfitableGraph = RepeatChanged = Changed = true;
                // If we vectorized the initial block, no need to try it again.
                for (std::pair<unsigned, unsigned> &P :
                     RangeSizes.slice(SliceStartIdx, VF))
                  P.first = P.second = 0;
                if (SliceStartIdx < FirstUnvecStore + MinVF) {
                  for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
                           FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
                    P.first = P.second = 0;
                  FirstUnvecStore = SliceStartIdx + VF;
                }
                if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
                  for (std::pair<unsigned, unsigned> &P :
                       RangeSizes.slice(SliceStartIdx + VF,
                                        MaxSliceEnd - (SliceStartIdx + VF)))
                    P.first = P.second = 0;
                  if (MaxSliceEnd == End)
                    End = SliceStartIdx;
                  MaxSliceEnd = SliceStartIdx;
                }
                SliceStartIdx += VF;
                continue;
              }
              if (VF > 2 && Res &&
                  !all_of(RangeSizes.slice(SliceStartIdx, VF),
                          std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
                                    _1))) {
                SliceStartIdx += VF;
                continue;
              }
              // Check for the very big VFs that we're not rebuilding the same
              // trees, just with a larger number of elements.
              if (VF > MaxRegVF && TreeSize > 1 &&
                  all_of(RangeSizes.slice(SliceStartIdx, VF),
                         std::bind(FirstSizeSame, TreeSize, _1))) {
                SliceStartIdx += VF;
                while (SliceStartIdx != MaxSliceEnd &&
                       RangeSizes[SliceStartIdx].first == TreeSize)
                  ++SliceStartIdx;
                continue;
              }
              if (TreeSize > 1) {
                for (std::pair<unsigned, unsigned> &P :
                     RangeSizes.slice(SliceStartIdx, VF)) {
                  if (VF >= MaxRegVF)
                    P.second = std::max(P.second, TreeSize);
                  else
                    P.first = std::max(P.first, TreeSize);
                }
              }
              ++SliceStartIdx;
              AnyProfitableGraph = true;
            }
            if (FirstUnvecStore >= End)
              break;
            if (MaxSliceEnd - FirstUnvecStore < VF &&
                MaxSliceEnd - FirstUnvecStore >= MinVF)
              AnyProfitableGraph = true;
            FirstUnvecStore = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(MaxSliceEnd),
                        std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
          }
          if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
            break;
        }
        // All the stores vectorized - break out.
        if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
              return P.first == 0 && P.second == 0;
            }))
          break;
        if (Repeat >= MaxAttempts ||
            (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
          break;
        constexpr unsigned StoresLimit = 64;
        const unsigned MaxTotalNum = std::min<unsigned>(
            Operands.size(),
            static_cast<unsigned>(
                End -
                std::distance(
                    RangeSizes.begin(),
                    find_if(RangeSizes, std::bind(IsNotVectorized, true, _1))) +
                1));
        unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
        unsigned Limit =
            getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
        CandidateVFs.clear();
        if (bit_floor(Limit) == VF)
          CandidateVFs.push_back(Limit);
        if (VF > MaxTotalNum || VF >= StoresLimit)
          break;
        for (std::pair<unsigned, unsigned> &P : RangeSizes) {
          if (P.first != 0)
            P.first = std::max(P.second, P.first);
        }
        // Last attempt to vectorize the maximal number of elements, if all
        // previous attempts were unsuccessful because of the cost issues.
        CandidateVFs.push_back(VF);
      }
    }
  };
  // SortedStores keeps groups of related stores, each indexed by the pointer
  // distance of its members to a base store.
  SmallVector<RelatedStoreInsts> SortedStores;

  // Inserts the store SI with the given index Idx into the set of stores. If
  // a store with the same distance is found already, stop insertion and try
  // to vectorize the already found stores.
  auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
    std::optional<int64_t> PtrDist;
    auto *RelatedStores = find_if(
        SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
          PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
          return PtrDist.has_value();
        });

    // We did not find a comparable store, start a new group.
    if (RelatedStores == SortedStores.end()) {
      SortedStores.emplace_back(Idx, Stores);
      return;
    }

    // If there is already a store in the group with the same distance, try to
    // vectorize the existing instructions before adding the current store.
    if (std::optional<unsigned> PrevInst =
            RelatedStores->insertOrLookup(Idx, *PtrDist)) {
      TryToVectorize(RelatedStores->getStores());
      RelatedStores->clearVectorizedStores(VectorizedStores);
      RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
                            /*NewBaseInstIdx=*/Idx,
                            /*DistFromCurBase=*/*PtrDist);
    }
  };
  Type *PrevValTy = nullptr;
  for (auto [I, SI] : enumerate(Stores)) {
    if (R.isDeleted(SI))
      continue;
    if (!PrevValTy)
      PrevValTy = SI->getValueOperand()->getType();
    // Check that we do not try to vectorize stores of different types.
    if (PrevValTy != SI->getValueOperand()->getType()) {
      for (RelatedStoreInsts &StoreSeq : SortedStores)
        TryToVectorize(StoreSeq.getStores());
      SortedStores.clear();
      PrevValTy = SI->getValueOperand()->getType();
    }
    FillStoresSet(I, SI);
  }

  // Final vectorization attempt.
  for (RelatedStoreInsts &StoreSeq : SortedStores)
    TryToVectorize(StoreSeq.getStores());

  return Changed;
}
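// Scan the block once and bucket the seed instructions (simple stores and
// single-index, non-constant GEPs) for the vectorization drivers above.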
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // Initialize the collections. We will make a single pass over the block.
  Stores.clear();
  GEPs.clear();

  for (Instruction &I : *BB) {
    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      if (!isValidElementType(SI->getValueOperand()->getType()))
        continue;
      Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
      continue;
    }

    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a pointer operand that doesn't point to a scalar
    // type.
    if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (GEP->getNumIndices() != 1)
        continue;
      Value *Idx = GEP->idx_begin()->get();
      if (isa<Constant>(Idx))
        continue;
      if (!isValidElementType(Idx->getType()))
        continue;
      if (GEP->getType()->isVectorTy())
        continue;
      GEPs[GEP->getPointerOperand()].push_back(GEP);
    }
  }
}
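// Try to vectorize an arbitrary list of scalars, trying progressively smaller
// vectorization factors over sliding sub-lists.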
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           bool MaxVFOnly) {
  if (VL.size() < 2)
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");

  // Check that all of the parts are instructions of the same type, we permit
  // an alternate opcode via InstructionsState.
  InstructionsState S = getSameOpcode(VL, *TLI);
  if (!S)
    return false;

  Instruction *I0 = S.getMainOp();
  // Make sure invalid types (including vector type) are rejected before
  // determining the vectorization factor for scalar instructions.
  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
      // NOTE: the following will give user internal llvm type name, which may
      // not be useful.
      R.getORE()->emit([&]() {
        std::string TypeStr;
        llvm::raw_string_ostream OS(TypeStr);
        Ty->print(OS);
        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
               << "Cannot SLP vectorize list: type "
               << TypeStr + " is unsupported by vectorizer";
      });
      return false;
    }
  }

  Type *ScalarTy = getValueType(VL[0]);
  unsigned Sz = R.getVectorElementSize(I0);
  unsigned MinVF = R.getMinVF(Sz);
  unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
  if (MaxVF < 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
    });
    return false;
  }

  bool Changed = false;
  bool CandidateFound = false;
  InstructionCost MinCost = SLPCostThreshold.getValue();

  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
       VF = getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VF - 1)) {
    // No actual vectorization should happen, if the number of parts is the
    // same as the provided vectorization factor (i.e. the scalar type is used
    // for the vector code during codegen).
    auto *VecTy = getWidenedType(ScalarTy, VF);
    if (TTI->getNumberOfParts(VecTy) == VF)
      continue;
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned ActualVF = std::min(MaxInst - I, VF);

      if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
        continue;

      if (MaxVFOnly && ActualVF < MaxVF)
        break;
      if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
        break;

      SmallVector<Value *> Ops(ActualVF, nullptr);
      unsigned Idx = 0;
      for (Value *V : VL.drop_front(I)) {
        // Check that a previous iteration of this loop did not delete the
        // Value.
        if (auto *Inst = dyn_cast<Instruction>(V);
            !Inst || !R.isDeleted(Inst)) {
          Ops[Idx] = V;
          ++Idx;
          if (Idx == ActualVF)
            break;
        }
      }
      // Not enough vectorizable instructions - exit.
      if (Idx != ActualVF)
        break;

      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                        << "\n");

      R.buildTree(Ops);
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;
      if (R.isProfitableToReorder()) {
        R.reorderTopToBottom();
        R.reorderBottomToTop();
      }
      R.transformNodes();
      R.buildExternalUses();

      R.computeMinimumValueSizes();
      InstructionCost Cost = R.getTreeCost();
      CandidateFound = true;
      MinCost = std::min(MinCost, Cost);

      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                        << " for VF=" << ActualVF << "\n");
      if (Cost < -SLPCostThreshold) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
                                            cast<Instruction>(Ops[0]))
                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                         << " and with tree size "
                         << ore::NV("TreeSize", R.getTreeSize()));

        R.vectorizeTree();
        // Move to the next bundle.
        I += VF - 1;
        NextInst = I + 1;
        Changed = true;
      }
    }
  }

  if (!Changed && CandidateFound) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Threshold", -SLPCostThreshold);
    });
  } else if (!Changed) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
    });
  }
  return Changed;
}
class HorizontalReduction {
  using ReductionOpsType = SmallVector<Value *, 16>;
  using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
  ReductionOpsListType ReductionOps;
  /// List of possibly reduced values.
  SmallVector<SmallVector<Value *>> ReducedVals;
  /// Maps reduced value to the corresponding reduction operation.
  SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
  WeakTrackingVH ReductionRoot;
  /// The type of reduction operation.
  RecurKind RdxKind;
  /// The minimum number of the reduced values.
  const unsigned ReductionLimit = 4;
  /// Checks if the optimization of original scalar identity operations on
  /// matched horizontal reductions is enabled and allowed.
  bool IsSupportedHorRdxIdentityOp = false;

  static bool isCmpSelMinMax(Instruction *I) {
    return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
           RecurrenceDescriptor::isIntMinMaxRecurrenceKind(getRdxKind(I));
  }

  static bool isBoolLogicOp(Instruction *I) {
    return isa<SelectInst>(I) &&
           (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
  }

  /// Checks if instruction is associative and can be vectorized.
  static bool isVectorizable(RecurKind Kind, Instruction *I,
                             bool TwoElementReduction = false) {
    if (Kind == RecurKind::None)
      return false;

    // Integer ops that map to select instructions or intrinsics are fine.
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
        isBoolLogicOp(I))
      return true;

    // No need to check for associativity, if only 2 elements are reduced.
    if (TwoElementReduction)
      return true;

    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
      // FP min/max are associative except for NaN and -0.0.
      return I->getFastMathFlags().noNaNs();
    }

    if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
      return true;

    return I->isAssociative();
  }

  static Value *getRdxOperand(Instruction *I, unsigned Index) {
    // Poison-safe 'or' takes the form: select X, true, Y. To make that work
    // with the normal operand processing, we skip the true value operand.
    if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
      return I->getOperand(2);
    return I->getOperand(Index);
  }

  /// Creates reduction operation with the current opcode.
  static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
                         Value *RHS, const Twine &Name, bool UseSelect) {
    switch (Kind) {
    case RecurKind::Or: {
      if (UseSelect &&
          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
        return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    }
    case RecurKind::And: {
      if (UseSelect &&
          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
        return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    }
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    }
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
      if (UseSelect) {
        CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind);
        Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      [[fallthrough]];
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum: {
      Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(Kind);
      return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
    }
    default:
      llvm_unreachable("Unknown reduction operation.");
    }
  }

  /// Creates reduction operation with the current opcode with the IR flags
  /// from \p ReductionOps, dropping nuw/nsw flags.
  static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
                         Value *RHS, const Twine &Name,
                         const ReductionOpsListType &ReductionOps) {
    bool UseSelect = ReductionOps.size() == 2 ||
                     // Logical or/and.
                     (ReductionOps.size() == 1 &&
                      any_of(ReductionOps.front(), IsaPred<SelectInst>));
    assert((!UseSelect || ReductionOps.size() != 2 ||
            isa<SelectInst>(ReductionOps[1][0])) &&
           "Expected cmp + select pairs for reduction");
    Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
      if (auto *Sel = dyn_cast<SelectInst>(Op)) {
        propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
                         /*IncludeWrapFlags=*/false);
        propagateIRFlags(Op, ReductionOps[1], nullptr,
                         /*IncludeWrapFlags=*/false);
        return Op;
      }
    }
    propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
    return Op;
  }
  static RecurKind getRdxKind(Instruction *I) {
    // (The pattern-match guards below did not survive extraction; only the
    // resulting kinds did.)
    if (/* ... */)
      return RecurKind::None;
    if (/* ... add */)
      return RecurKind::Add;
    if (/* ... mul */)
      return RecurKind::Mul;
    if (/* ... and */)
      return RecurKind::And;
    if (/* ... or */)
      return RecurKind::Or;
    if (/* ... xor */)
      return RecurKind::Xor;
    if (/* ... fadd */)
      return RecurKind::FAdd;
    if (/* ... fmul */)
      return RecurKind::FMul;
    if (/* ... fmax */)
      return RecurKind::FMax;
    if (/* ... fmin */)
      return RecurKind::FMin;
    if (/* ... fmaximum */)
      return RecurKind::FMaximum;
    if (/* ... fminimum */)
      return RecurKind::FMinimum;
    // cmp + select min/max forms; several bail-out paths yield None:
    if (/* ... */)
      return RecurKind::None;
    if (/* ... */)
      return RecurKind::None;
    if (/* ... */)
      return RecurKind::None;
    if (/* ... */)
      return RecurKind::None;
    if (/* ... */)
      return RecurKind::None;
    if (/* ... */)
      return RecurKind::SMax;
    if (/* ... */)
      return RecurKind::SMin;
    if (/* ... */)
      return RecurKind::UMax;
    if (/* ... */)
      return RecurKind::UMin;
    return RecurKind::None;
  }
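// Example (added for clarity): a chain such as
//   %s = fadd fast float %x, %y
// maps to RecurKind::FAdd, while the cmp + select pair
//   %c = icmp slt i32 %x, %y
//   %m = select i1 %c, i32 %x, i32 %y
// maps to RecurKind::SMin; anything unrecognized yields RecurKind::None.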
  static unsigned getFirstOperandIndex(Instruction *I) {
    return isCmpSelMinMax(I) ? 1 : 0;
  }

  static unsigned getNumberOfOperands(Instruction *I) {
    return isCmpSelMinMax(I) ? 3 : 2;
  }

  static bool hasSameParent(Instruction *I, BasicBlock *BB) {
    if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
      // ...
      return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
    }
    return I->getParent() == BB;
  }

  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
    if (IsCmpSelMinMax) {
      // ...
      return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
    }
    return I->hasNUses(2);
  }

  void initReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I))
      ReductionOps.assign(2, ReductionOpsType());
    else
      ReductionOps.assign(1, ReductionOpsType());
  }

  void addReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I)) {
      // ...
      ReductionOps[1].emplace_back(I);
    } else {
      ReductionOps[0].emplace_back(I);
    }
  }

  static bool isGoodForReduction(ArrayRef<Value *> Data) {
    int Sz = Data.size();
    // ...
  }
  HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
      : ReductionRoot(I), ReductionLimit(2) {
    RdxKind = HorizontalReduction::getRdxKind(I);
    ReductionOps.emplace_back().push_back(I);
    // ...
    for (Value *V : Ops)
      ReducedValsToOps[V].push_back(I);
  }

  bool matchReductionForOperands() const {
    // ...
    assert(ReductionRoot && "Reduction root is not set!");
    // ...
    return Ops.size() == 2;
  }
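// Added note: this constructor seeds a two-element reduction (ReductionLimit
// is 2) directly from a root and its operand pair, so even a single
//   %sum = fadd fast float %a, %b
// can be treated as a reduction of {%a, %b} when the cost model favors it.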
  bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                 ScalarEvolution &SE, const DataLayout &DL,
                                 const TargetLibraryInfo &TLI) {
    RdxKind = HorizontalReduction::getRdxKind(Root);
    if (!isVectorizable(RdxKind, Root))
      return false;
    // ...
    if (!Sel->getCondition()->hasOneUse())
      return false;
    ReductionRoot = Root;
    // ...
    bool IsCmpSelMinMax = isCmpSelMinMax(Root);
    SmallVector<std::pair<Instruction *, unsigned>> Worklist(
        1, std::make_pair(Root, 0));
    auto CheckOperands = [&](Instruction *TreeN,
                             SmallVectorImpl<Value *> &PossibleReducedVals,
                             SmallVectorImpl<Instruction *> &ReductionOps,
                             unsigned Level) {
      for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
                                    getNumberOfOperands(TreeN)))) {
        Value *EdgeVal = getRdxOperand(TreeN, I);
        ReducedValsToOps[EdgeVal].push_back(TreeN);
        // ...
        if (/* ... */ ||
            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
            !isVectorizable(RdxKind, EdgeInst) ||
            (R.isAnalyzedReductionRoot(EdgeInst) && /* ... */)) {
          PossibleReducedVals.push_back(EdgeVal);
          continue;
        }
        ReductionOps.push_back(EdgeInst);
      }
    };
    // ...
    SmallMapVector<
        size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
        /* ... */>
        PossibleReducedVals;
    initReductionOps(Root);
    // ...
    SmallSet<size_t, 2> LoadKeyUsed;

    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      // ...
      auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
      if (LIt != LoadsMap.end()) {
        for (LoadInst *RLI : LIt->second) {
          // ...
        }
        for (LoadInst *RLI : LIt->second) {
          // ...
        }
        if (LIt->second.size() > 2) {
          // ...
              hash_value(LIt->second.back()->getPointerOperand());
          // ...
        }
      }
      // ...
          .first->second.push_back(LI);
      // ...
    };

    while (!Worklist.empty()) {
      auto [TreeN, Level] = Worklist.pop_back_val();
      // ...
      CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
      addReductionOps(TreeN);
      // ...
      for (Value *V : PossibleRedVals) {
        // ...
        ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
      }
      for (Instruction *I : reverse(PossibleReductionOps))
        Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
    }
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // ...
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      // ...
      for (auto &Slice : PossibleRedVals) {
        // ...
        auto RedValsVect = Slice.second.takeVector();
        // ...
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(Data.second, Data.first);
      }
      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
      });
      // ... (per-group appending; the leading branch is elided)
      } else if (!isGoodForReduction(Data)) {
        // ...
        if (!LI || !LastLI || /* ... */)
          // ...
      }
      // ...
      ReducedVals.back().append(Data.rbegin(), Data.rend());
    }
    stable_sort(ReducedVals, [](/* ... */ P1, /* ... */ P2) {
      return P1.size() > P2.size();
    });
    // ...
  }
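// Example (added): the shape of chain matchAssociativeReduction() walks:
//   %r1 = add i32 %x0, %x1
//   %r2 = add i32 %r1, %x2
//   %r3 = add i32 %r2, %x3   ; Root
// decomposes into reduction ops {%r1, %r2, %r3} and reduced values
// {%x0..%x3}; the key/subkey grouping above then clusters similar values
// (e.g. loads off the same base pointer) into the same slice of ReducedVals.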
  Value *tryToReduce(BoUpSLP &V, const DataLayout &DL,
                     TargetTransformInfo *TTI, const TargetLibraryInfo &TLI,
                     AssumptionCache *AC, DominatorTree &DT) {
    constexpr unsigned RegMaxNumber = 4;
    constexpr unsigned RedValsMaxNumber = 128;
    // ...
    if (unsigned NumReducedVals = std::accumulate(
            ReducedVals.begin(), ReducedVals.end(), 0,
            [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
              if (!isGoodForReduction(Vals))
                return Num;
              return Num + Vals.size();
            });
        NumReducedVals < ReductionLimit && /* ... */) {
      for (ReductionOpsType &RdxOps : ReductionOps)
        for (Value *RdxOp : RdxOps)
          // ...
      return nullptr;
    }
    IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
                                    /* ... */);
    // ...
    DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
                                                  ReducedVals.front().size());
    // ...
    auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
      assert(/* ... */ &&
             "Expected min/max reduction to have select root instruction");
      assert(/* ... */ &&
             "Expected min/max reduction to have compare condition");
      // ...
    };

    bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
      return isBoolLogicOp(cast<Instruction>(V));
    });

    auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
      if (VectorizedTree) {
        // ...
        if (AnyBoolLogicOp) {
          auto It = ReducedValsToOps.find(VectorizedTree);
          auto It1 = ReducedValsToOps.find(Res);
          if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
              /* ... */
              (It != ReducedValsToOps.end() &&
               any_of(It->getSecond(), [&](Instruction *I) {
                 return isBoolLogicOp(I) &&
                        getRdxOperand(I, 0) == VectorizedTree;
               })) ||
              /* ... */
              (It1 != ReducedValsToOps.end() &&
               any_of(It1->getSecond(), [&](Instruction *I) {
                 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
               })))
            /* ... */;
          else
            VectorizedTree = Builder.CreateFreeze(VectorizedTree);
        }
        // ...
        return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
                        /* ... */);
      }
      // ...
    };
    // ...
    SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
                                      ReductionOps.front().size());
    for (ReductionOpsType &RdxOps : ReductionOps)
      for (Value *RdxOp : RdxOps) {
        // ...
        IgnoreList.insert(RdxOp);
      }
    // ...
    FastMathFlags RdxFMF;
    // ...
    for (Value *U : IgnoreList)
      if (auto *FPMO = dyn_cast<FPMathOperator>(U))
        RdxFMF &= FPMO->getFastMathFlags();
    // ...
    for (Value *V : Candidates)
      TrackedVals.try_emplace(V, V);

    auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
                 Value *V) -> unsigned & {
      auto *It = MV.find(V);
      assert(It != MV.end() && "Unable to find given key.");
      return It->second;
    };

    DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
    // ...
    SmallPtrSet<Value *, 4> RequiredExtract;
    WeakTrackingVH VectorizedTree = nullptr;
    bool CheckForReusedReductionOps = false;
    for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
      // ...
      InstructionsState S = States[I];
      // ...
      DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
      for (Value *ReducedVal : OrigReducedVals) {
        Value *RdxVal = TrackedVals.at(ReducedVal);
        // ...
        if (/* ... */ (!S || !S.getMatchingMainOpOrAltOp(Inst)) ||
            /* ... */) {
          // ...
        }
        TrackedToOrig.try_emplace(RdxVal, ReducedVal);
      }
      bool ShuffledExtracts = false;
      // ...
      if (S && S.getOpcode() == Instruction::ExtractElement &&
          !S.isAltShuffle() && I + 1 < E) {
        // ...
        for (Value *RV : ReducedVals[I + 1]) {
          Value *RdxVal = TrackedVals.at(RV);
          // ...
          CommonCandidates.push_back(RdxVal);
          TrackedToOrig.try_emplace(RdxVal, RV);
        }
        SmallVector<int> Mask;
        // ...
        Candidates.swap(CommonCandidates);
        ShuffledExtracts = true;
        // ...
      }
      // ... (constant folding of the candidate list; separate scopes)
        Value *OrigV = TrackedToOrig.at(Candidates.front());
        ++VectorizedVals.try_emplace(OrigV).first->getSecond();
      // ...
        Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
        Value *OrigV = TrackedToOrig.at(VC);
        ++VectorizedVals.try_emplace(OrigV).first->getSecond();
      // ...
        V.analyzedReductionRoot(ResI);
      // ...
        VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
      // ...

      unsigned NumReducedVals = Candidates.size();
      if (NumReducedVals < ReductionLimit &&
          (NumReducedVals < 2 || !isSplat(Candidates)))
        continue;
      // ...
      IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
                                    RdxKind != RecurKind::FMul &&
                                    RdxKind != RecurKind::FMulAdd;
      // ...
      SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
      if (IsSupportedHorRdxIdentityOp)
        for (Value *V : Candidates) {
          Value *OrigV = TrackedToOrig.at(V);
          ++SameValuesCounter.try_emplace(OrigV).first->second;
        }
      // ...
      bool SameScaleFactor = false;
      bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
                              SameValuesCounter.size() != Candidates.size();
      // ...
      if (OptReusedScalars) {
        SameScaleFactor =
            (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
             RdxKind == RecurKind::Xor) &&
            all_of(/* ... */,
                   [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
                     return P.second == SameValuesCounter.front().second;
                   });
        Candidates.resize(SameValuesCounter.size());
        transform(SameValuesCounter, Candidates.begin(),
                  [&](const auto &P) { return TrackedVals.at(P.first); });
        NumReducedVals = Candidates.size();
        // ...
        if (NumReducedVals == 1) {
          Value *OrigV = TrackedToOrig.at(Candidates.front());
          unsigned Cnt = At(SameValuesCounter, OrigV);
          Value *RedVal =
              emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          VectorizedVals.try_emplace(OrigV, Cnt);
          ExternallyUsedValues.insert(OrigV);
          continue;
        }
      }

      unsigned MaxVecRegSize = V.getMaxVecRegSize();
      unsigned EltSize = V.getVectorElementSize(Candidates[0]);
      const unsigned MaxElts = std::clamp<unsigned>(
          /* ... */, /* ... */,
          RegMaxNumber * RedValsMaxNumber);
      unsigned ReduxWidth = NumReducedVals;
      auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
        unsigned NumParts, NumRegs;
        Type *ScalarTy = Candidates.front()->getType();
        // ...
        while (NumParts > NumRegs) {
          assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
          ReduxWidth = bit_floor(ReduxWidth - 1);
          // ...
        }
        // ...
        if (NumParts > NumRegs / 2)
          // ...
        return ReduxWidth;
      };
      // ...
      ReduxWidth = GetVectorFactor(ReduxWidth);
      ReduxWidth = std::min(ReduxWidth, MaxElts);

      unsigned Start = 0;
      unsigned Pos = Start;
      // ...
      unsigned PrevReduxWidth = ReduxWidth;
      bool CheckForReusedReductionOpsLocal = false;
      auto AdjustReducedVals = [&](bool IgnoreVL = false) {
        bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
        if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
          // ...
          CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
        }
        // ...
        if (Pos < NumReducedVals - ReduxWidth + 1)
          return IsAnyRedOpGathered;
        // ...
        if (ReduxWidth > 1)
          ReduxWidth = GetVectorFactor(ReduxWidth);
        return IsAnyRedOpGathered;
      };
      bool AnyVectorized = false;
      SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
      while (Pos < NumReducedVals - ReduxWidth + 1 &&
             ReduxWidth >= ReductionLimit) {
        // ...
        if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
            /* ... */) {
          CheckForReusedReductionOps = true;
          // ...
        }
        PrevReduxWidth = ReduxWidth;
        // ...
        if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
            /* ... */ std::make_pair(Pos, bit_floor(ReduxWidth)) /* ... */ ||
            /* ... */ std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
                                     /* ... */) ||
            V.areAnalyzedReductionVals(VL)) {
          (void)AdjustReducedVals(/*IgnoreVL=*/true);
          // ...
        }
        // ... (deleted-value check; lambda body:)
        //       return RedValI && V.isDeleted(RedValI);
        // ...
        V.buildTree(VL, IgnoreList);
        if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        if (V.isLoadCombineReductionCandidate(RdxKind)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        V.reorderTopToBottom();
        // ...
        /* ... */(/* ... */
            VL.front()->getType()->isIntOrIntVectorTy() ||
            ReductionLimit > 2);
        // ...
        /* ... */(ExternallyUsedValues);
        // ...
        LocalExternallyUsedValues.insert(ReductionRoot);
        for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
          if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
            continue;
          for (Value *V : ReducedVals[Cnt])
            // ...
            LocalExternallyUsedValues.insert(TrackedVals[V]);
        }
        if (!IsSupportedHorRdxIdentityOp) {
          // ...
          assert(/* ... */ && "Reused values counter map is not empty");
          for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
            if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
              continue;
            Value *V = Candidates[Cnt];
            Value *OrigV = TrackedToOrig.at(V);
            ++SameValuesCounter.try_emplace(OrigV).first->second;
          }
        }
        V.transformNodes();
        // ...
        SmallPtrSet<Value *, 4> Visited;
        for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
          if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
            continue;
          Value *RdxVal = Candidates[Cnt];
          if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
            RdxVal = It->second;
          if (!Visited.insert(RdxVal).second)
            continue;
          // ...
          if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
            LocalExternallyUsedValues.insert(RdxVal);
            continue;
          }
          Value *OrigV = TrackedToOrig.at(RdxVal);
          unsigned NumOps =
              VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
          if (NumOps != ReducedValsToOps.at(OrigV).size())
            LocalExternallyUsedValues.insert(RdxVal);
        }
        if (!IsSupportedHorRdxIdentityOp)
          SameValuesCounter.clear();
        for (Value *RdxVal : VL)
          if (RequiredExtract.contains(RdxVal))
            LocalExternallyUsedValues.insert(RdxVal);
        V.buildExternalUses(LocalExternallyUsedValues);

        V.computeMinimumValueSizes();
        // ...
        InstructionCost Cost = getReductionCost(TTI, VL, IsCmpSelMinMax,
                                                RdxFMF, V, DT, DL, TLI);
        LLVM_DEBUG(dbgs() << /* ... */ << " for reduction\n");
        // ...
        V.getORE()->emit([&]() {
          return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
                                          ReducedValsToOps.at(VL[0]).front())
                 << "Vectorizing horizontal reduction is possible "
                 << "but not beneficial with cost " << ore::NV("Cost", Cost)
                 << " and threshold "
                 /* ... */;
        });
        if (!AdjustReducedVals()) {
          V.analyzedReductionVals(VL);
          // ...
          if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
            // ...
            for (unsigned VF = /* ... */(
                     *TTI, VL.front()->getType(), ReduxWidth - 1);
                 VF >= ReductionLimit;
                 VF = /* ... */(
                     *TTI, VL.front()->getType(), VF - 1)) {
              // ...
              if (/* ... */ V.getCanonicalGraphSize() != V.getTreeSize())
                continue;
              // ...
              IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
            }
          }
        }
        // ...
        LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
                          << Cost << ". (HorRdx)\n");
        V.getORE()->emit([&]() {
          return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
                                    ReducedValsToOps.at(VL[0]).front())
                 << "Vectorized horizontal reduction with cost "
                 << ore::NV("Cost", Cost) << " and with tree size "
                 << ore::NV("TreeSize", V.getTreeSize());
        });
        // ...
        if (IsCmpSelMinMax)
          InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
        // ...
        Value *VectorizedRoot = V.vectorizeTree(
            LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
        // ...
        for (Value *RdxVal : Candidates) {
          Value *OrigVal = TrackedToOrig.at(RdxVal);
          Value *TransformedRdxVal = TrackedVals.at(OrigVal);
          if (TransformedRdxVal != RdxVal)
            TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
        }
        // ...
        VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
        // ...
        if (OptReusedScalars && !SameScaleFactor) {
          VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
                                         SameValuesCounter, TrackedToOrig);
        }
        // ...
        Type *ScalarTy = VL.front()->getType();
        // ...
            OptReusedScalars && SameScaleFactor
                ? SameValuesCounter.front().second
                : /* ... */,
            /* ... */ ? V.isSignedMinBitwidthRootNode() : /* ... */);
        // ...
        for (Value *RdxVal : VL) {
          Value *OrigV = TrackedToOrig.at(RdxVal);
          if (IsSupportedHorRdxIdentityOp) {
            VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
            continue;
          }
          ++VectorizedVals.try_emplace(OrigV).first->getSecond();
          if (!V.isVectorized(RdxVal))
            RequiredExtract.insert(RdxVal);
          // ...
        }
        // ...
        ReduxWidth = NumReducedVals - Pos;
        if (ReduxWidth > 1)
          ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
        AnyVectorized = true;
      }
      if (OptReusedScalars && !AnyVectorized) {
        for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
          Value *RdxVal = TrackedVals.at(P.first);
          Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          VectorizedVals.try_emplace(P.first, P.second);
        }
      }
    }
    if (!VectorValuesAndScales.empty())
      VectorizedTree = GetNewVectorizedTree(
          VectorizedTree,
          emitReduction(Builder, *TTI, ReductionRoot->getType()));
    if (!VectorizedTree) {
      if (!CheckForReusedReductionOps) {
        for (ReductionOpsType &RdxOps : ReductionOps)
          for (Value *RdxOp : RdxOps)
            // ...
      }
      // ...
    }
    // ...
    auto FixBoolLogicalOps = [&](Value *&LHS, Value *&RHS,
                                 Instruction *RedOp1, Instruction *RedOp2,
                                 bool InitStep) {
      if (!AnyBoolLogicOp)
        return;
      if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
                                    getRdxOperand(RedOp1, 0) == LHS ||
                                    /* ... */))
        return;
      if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
                                    getRdxOperand(RedOp2, 0) == RHS ||
                                    /* ... */)) {
        // ...
      }
      if (LHS != VectorizedTree)
        // ...
    };
    // ...
    auto FinalGen = [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
                        bool InitStep) {
      unsigned Sz = InstVals.size();
      // ...
      for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
        // ...
        Value *RdxVal1 = InstVals[I].second;
        Value *StableRdxVal1 = RdxVal1;
        auto It1 = TrackedVals.find(RdxVal1);
        if (It1 != TrackedVals.end())
          StableRdxVal1 = It1->second;
        Value *RdxVal2 = InstVals[I + 1].second;
        Value *StableRdxVal2 = RdxVal2;
        auto It2 = TrackedVals.find(RdxVal2);
        if (It2 != TrackedVals.end())
          StableRdxVal2 = It2->second;
        // ...
        FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
                          /* ... */);
        Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
                                   StableRdxVal2, "op.rdx", ReductionOps);
        ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
      }
      if (Sz % 2 == 1)
        ExtraReds[Sz / 2] = InstVals.back();
      return ExtraReds;
    };
    // ...
    SmallPtrSet<Value *, 8> Visited;
    for (Value *RdxVal : Candidates) {
      if (!Visited.insert(RdxVal).second)
        continue;
      unsigned NumOps = VectorizedVals.lookup(RdxVal);
      for (Instruction *RedOp : /* ... */) {
        // ...
      }
    }
    // ...
    bool InitStep = true;
    while (ExtraReductions.size() > 1) {
      // ...
      SmallVector<std::pair<Instruction *, Value *>> NewReds =
          FinalGen(ExtraReductions, InitStep);
      ExtraReductions.swap(NewReds);
      InitStep = false;
    }
    VectorizedTree = ExtraReductions.front().second;
    // ...
    ReductionRoot->replaceAllUsesWith(VectorizedTree);
    // ...
    SmallPtrSet<Value *, 4> IgnoreSet;
    // ...
    for (auto *U : Ignore->users()) {
      assert(/* ... */ &&
             "All users must be either in the reduction ops list.");
      // ...
    }
    if (!Ignore->use_empty()) {
      // ...
      Ignore->replaceAllUsesWith(P);
    }
    // ...
    V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
    // ...
    return VectorizedTree;
  }
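// Overall effect (added sketch, simplified): for a matched chain of adds over
// eight i32 values, tryToReduce() conceptually emits
//   %rdx = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %vec)
// and replaces the scalar reduction root with %rdx, folding any leftover
// unvectorized scalars back in via extra "op.rdx" operations.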
  Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
                        Value *Vec, unsigned Scale, bool IsSigned,
                        Type *DestTy) {
    // ...
          Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
    // ...
      Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
    // ...
    if (Rdx->getType() != DestTy)
      // ...
    // ...
      Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
    // ...
  }
  InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                   ArrayRef<Value *> ReducedVals,
                                   bool IsCmpSelMinMax, FastMathFlags FMF,
                                   const BoUpSLP &R, DominatorTree &DT,
                                   const DataLayout &DL,
                                   const TargetLibraryInfo &TLI) {
    // ...
    Type *ScalarTy = ReducedVals.front()->getType();
    unsigned ReduxWidth = ReducedVals.size();
    FixedVectorType *VectorTy = R.getReductionType();
    // ...
    auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
      // ...
      int Cnt = ReducedVals.size();
      for (Value *RdxVal : ReducedVals) {
        // ...
          Cost += GenCostFn();
        // ...
        for (User *U : RdxVal->users()) {
          // ...
          if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
            if (RdxKind == RecurKind::FAdd) {
              // ...
              FMACost -= FMulCost;
              // ...
              ScalarCost += FMACost;
            }
            // ...
          }
          // ...
            ScalarCost = InstructionCost::getInvalid();
          // ...
        }
      }
      // ...
      Cost += ScalarCost;
      // ...
        Cost += GenCostFn();
      // ...
      return Cost;
    };
    // ...
    bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
    switch (RdxKind) {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Or:
    case RecurKind::And:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      // ...
      if (DoesRequireReductionOp) {
        // ...
        unsigned ScalarTyNumElements = VecTy->getNumElements();
        // ...
                                  ReducedVals.size()),
        // ...
        auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
            std::make_pair(RedTy, true));
        if (RType == RedTy) {
          // ...
              RdxOpcode, !IsSigned, RedTy,
          // ...
        }
      } else {
        auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
            std::make_pair(RedTy, true));
        // ...
        if (RdxKind == RecurKind::FAdd) {
          // ...
          for (Value *RdxVal : ReducedVals) {
            // ...
              FMF &= FPCI->getFastMathFlags();
            // ...
          }
          if (!Ops.empty()) {
            // ...
            IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
                                        {RVecTy, RVecTy, RVecTy}, FMF);
            // ...
            InstructionCost FMulCost =
                /* ... */(Instruction::FMul, RVecTy, CostKind);
            LLVM_DEBUG(dbgs() << "Minus vector FMul cost: " << FMulCost
                              << "\n");
            FMACost -= FMulCost;
            // ...
          }
          if (FMACost.isValid())
            VectorCost += FMACost;
        }
        // ...
        if (RType != RedTy) {
          unsigned Opcode = Instruction::Trunc;
          if (/* ... */)
            Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          // ...
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        // ...
      });
      break;
    }
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin: {
      // ...
      if (DoesRequireReductionOp) {
        // ...
      } else {
        auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
            std::make_pair(RedTy, true));
        // ...
        IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
        // ...
        if (RType != RedTy) {
          unsigned Opcode = Instruction::Trunc;
          if (/* ... */)
            Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          // ...
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
        // ...
      });
      break;
    }
    // ...
    }

    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                      << /* ... */ " (It is a splitting reduction)\n");
    return VectorCost - ScalarCost;
  }
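// Added note: the returned value is VectorCost - ScalarCost, so negative
// means the vector form wins. For example, if eight values cost 7 scalar adds
// and the vector form costs 3 (reduction plus shuffles), -4 is returned and
// the caller compares it against the SLP cost threshold.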
  Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
                       Type *DestTy) {
    Value *ReducedSubTree = nullptr;
    // ...
    auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
      Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
      if (ReducedSubTree)
        ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
                                  "op.rdx", ReductionOps);
      else
        ReducedSubTree = Rdx;
    };
    if (VectorValuesAndScales.size() == 1) {
      const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
      CreateSingleOp(Vec, Scale, IsSigned);
      return ReducedSubTree;
    }
    // ...
    Value *VecRes = nullptr;
    bool VecResSignedness = false;
    auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
      // ...
      switch (RdxKind) {
      case RecurKind::Add: {
        if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
          // ...
          LLVM_DEBUG(dbgs() << /* ... */ << ". (HorRdx)\n");
          // ...
            std::iota(std::next(Mask.begin(), VF * I),
                      std::next(Mask.begin(), VF * (I + 1)), 0);
          ++NumVectorInstructions;
          // ...
        }
        // ...
        LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
                          << ". (HorRdx)\n");
        ++NumVectorInstructions;
        // ...
        break;
      }
      case RecurKind::Xor: {
        // ...
        LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << Vec
                          << ". (HorRdx)\n");
        // ...
        break;
      }
      case RecurKind::FAdd: {
        // ...
        LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
                          << ". (HorRdx)\n");
        ++NumVectorInstructions;
        // ...
        break;
      }
      case RecurKind::And:
      case RecurKind::Or:
      case RecurKind::SMax:
      case RecurKind::SMin:
      case RecurKind::UMax:
      case RecurKind::UMin:
      case RecurKind::FMax:
      case RecurKind::FMin:
      case RecurKind::FMaximum:
      case RecurKind::FMinimum:
        // ...
        break;
      case RecurKind::Sub:
      case RecurKind::AddChainWithSubs:
      case RecurKind::Mul:
      case RecurKind::FMul:
      case RecurKind::FMulAdd:
      case RecurKind::AnyOf:
      case RecurKind::FindFirstIVSMin:
      case RecurKind::FindFirstIVUMin:
      case RecurKind::FindLastIVSMax:
      case RecurKind::FindLastIVUMax:
      case RecurKind::FMaxNum:
      case RecurKind::FMinNum:
      case RecurKind::FMaximumNum:
      case RecurKind::FMinimumNum:
      case RecurKind::None:
        // ...
      }
      // ...
      VecResSignedness = IsSigned;
      // ...
      ++NumVectorInstructions;
      if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
          /* ... */) {
        // ...
        std::iota(Mask.begin(), Mask.end(), 0);
        // ...
      }
      if (VecResVF < VecVF) {
        // ...
      }
      if (VecResVF != VecVF) {
        // ...
        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
        // ...
      }
      // ...
      if (VecResVF < VecVF) {
        // ...
      }
      if (VecResVF != VecVF)
        // ...
      Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
      if (VecResVF != VecVF)
        // ...
      // ...
    };
    for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
      CreateVecOp(Vec, Scale, IsSigned);
    CreateSingleOp(VecRes, 1, false);

    return ReducedSubTree;
  }
  Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
                       const TargetTransformInfo *TTI, Type *DestTy) {
    assert(VectorizedValue && "Need to have a vectorized tree node");
    assert(RdxKind != RecurKind::FMulAdd &&
           "A call to the llvm.fmuladd intrinsic is not handled yet");
    // ...
    if (FTy->getScalarType() == Builder.getInt1Ty() &&
        RdxKind == RecurKind::Add &&
        /* ... */) {
      // ...
          VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
      ++NumVectorInstructions;
      // ...
    }
    ++NumVectorInstructions;
    // ...
  }
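// Added note: the i1 special case above avoids a reduce intrinsic for boolean
// add reductions; conceptually
//   %int = bitcast <8 x i1> %v to i8
//   %cnt = call i8 @llvm.ctpop.i8(i8 %int)
// counts the set lanes, which equals the sum of the i1 elements.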
  Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                               unsigned Cnt) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    if (/* ... */)
      return VectorizedValue;
    switch (RdxKind) {
    case RecurKind::Add: {
      Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << /* ... */ << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::Xor: {
      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
                        << ". (HorRdx)\n");
      // ...
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << /* ... */ << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // ...
      return VectorizedValue;
    case RecurKind::Sub:
    case RecurKind::AddChainWithSubs:
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::AnyOf:
    case RecurKind::FindFirstIVSMin:
    case RecurKind::FindFirstIVUMin:
    case RecurKind::FindLastIVSMax:
    case RecurKind::FindLastIVUMax:
    case RecurKind::FMaxNum:
    case RecurKind::FMinNum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum:
    case RecurKind::None:
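// Added example: the identities exploited above when one scalar occurs Cnt
// times: for RecurKind::Add, x + x + ... + x == x * Cnt; for RecurKind::FAdd
// the same with fmul; for RecurKind::Xor only the parity of Cnt matters
// (x ^ x == 0); and/or/min/max are idempotent, so the value is returned as-is.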
  Value *
  emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
                const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
                const DenseMap<Value *, Value *> &TrackedToOrig) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    // ...
    if (VTy->getElementType() != VL.front()->getType()) {
      // ...
          R.isSignedMinBitwidthRootNode());
    }
    switch (RdxKind) {
    case RecurKind::Add: {
      // ...
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantInt::get(V->getType(), Cnt, false));
      }
      // ...
      LLVM_DEBUG(dbgs() << /* ... */ << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
      // ...
      LLVM_DEBUG(dbgs() << /* ... */ << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // ...
      LLVM_DEBUG(dbgs() << /* ... */ << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::Xor: {
      // ...
      SmallVector<int> Mask(
          /* ... */);
      std::iota(Mask.begin(), Mask.end(), 0);
      bool NeedShuffle = false;
      for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
        // ...
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        if (Cnt % 2 == 0) {
          // ...
          NeedShuffle = true;
        }
      }
      LLVM_DEBUG(/* ... */ dbgs() << "> of " << VectorizedValue
                                  << ". (HorRdx)\n");
      if (NeedShuffle)
        VectorizedValue = Builder.CreateShuffleVector(
            VectorizedValue,
            ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // ...
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantFP::get(V->getType(), Cnt));
      }
      // ...
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::Sub:
    case RecurKind::AddChainWithSubs:
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::AnyOf:
    case RecurKind::FindFirstIVSMin:
    case RecurKind::FindFirstIVUMin:
    case RecurKind::FindLastIVSMax:
    case RecurKind::FindLastIVUMax:
    case RecurKind::FMaxNum:
    case RecurKind::FMinNum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum:
    case RecurKind::None:
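// Added example: the vector counterpart of emitScaleForReusedOps(). For
// RecurKind::Add with per-lane repeat counts {2, 3} the code multiplies by a
// constant vector before the final reduction:
//   %scaled = mul <2 x i32> %vec, <i32 2, i32 3>
// For RecurKind::Xor, lanes with an even count are zeroed via the shuffle
// with a null vector above, since x ^ x == 0.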
static RecurKind getRdxKind(Value *V) {
  // ...
  return HorizontalReduction::getRdxKind(V);
}

static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
  // ...
  unsigned AggregateSize = 1;
  auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  do {
    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
      for (auto *Elt : ST->elements())
        if (Elt != ST->getElementType(0))
          return std::nullopt;
      AggregateSize *= ST->getNumElements();
      CurrentType = ST->getElementType(0);
    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      AggregateSize *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
      AggregateSize *= VT->getNumElements();
      return AggregateSize;
    } else if (CurrentType->isSingleValueType()) {
      return AggregateSize;
    } else {
      return std::nullopt;
    }
  } while (true);
}
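// Added example: the aggregate-size computation above flattens homogeneous
// nested aggregates, e.g. [2 x <4 x float>] yields 2 * 4 = 8 elements, while
// a struct whose element types differ yields std::nullopt.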
static void findBuildAggregateRec(Instruction *LastInsertInst,
                                  TargetTransformInfo *TTI,
                                  SmallVectorImpl<Value *> &BuildVectorOpds,
                                  SmallVectorImpl<Value *> &InsertElts,
                                  unsigned OperandOffset, const BoUpSLP &R) {
  do {
    // ...
    std::optional<unsigned> OperandIndex = /* ... */;
    if (!OperandIndex || R.isDeleted(LastInsertInst))
      return;
    if (/* ... */) {
      findBuildAggregateRec(/* ... */, TTI, BuildVectorOpds, InsertElts,
                            *OperandIndex, R);
    } else {
      BuildVectorOpds[*OperandIndex] = InsertedOperand;
      InsertElts[*OperandIndex] = LastInsertInst;
    }
    // ...
  } while (LastInsertInst != nullptr &&
           /* ... */);
}

static bool findBuildAggregate(Instruction *LastInsertInst,
                               TargetTransformInfo *TTI,
                               SmallVectorImpl<Value *> &BuildVectorOpds,
                               SmallVectorImpl<Value *> &InsertElts,
                               const BoUpSLP &R) {
  assert(/* ... */ && "Expected insertelement or insertvalue instruction!");
  assert(/* ... */ && "Expected empty result vectors!");
  // ...
  if (!AggregateSize)
    return false;
  BuildVectorOpds.resize(*AggregateSize);
  InsertElts.resize(*AggregateSize);
  // ...
  if (BuildVectorOpds.size() >= 2)
    return true;
  return false;
}
  // ... (enclosing helper's header elided in extraction)
  auto DominatedReduxValue = [&](Value *R) {
    // ...
  };
  // ...
  if (P->getIncomingBlock(0) == ParentBB) {
    // ...
  } else if (P->getIncomingBlock(1) == ParentBB) {
    // ...
  }
  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;
  // ...
  if (P->getIncomingBlock(0) == BBLatch) {
    // ...
  } else if (P->getIncomingBlock(1) == BBLatch) {
    // ...
  }
  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;
  // ...
  assert(/* ... */ &&
         "Expected binop, select, or intrinsic for reduction matching");
  Value *LHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
  Value *RHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
  // ...
  Value *Op0 = nullptr;
  Value *Op1 = nullptr;
  // ...
  Value *B0 = nullptr, *B1 = nullptr;
  // ...
bool SLPVectorizerPass::vectorizeHorReduction(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
    SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
  // ...
  auto SelectRoot = [&]() {
    // ...
  };
  // ...
  std::queue<std::pair<Instruction *, unsigned>> Stack;
  Stack.emplace(SelectRoot(), 0);
  SmallPtrSet<Value *, 8> VisitedInstrs;
  // ...
  auto TryToReduce = [&](Instruction *Inst) -> Value * {
    if (R.isAnalyzedReductionRoot(Inst))
      return nullptr;
    // ...
    HorizontalReduction HorRdx;
    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
      return nullptr;
    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
  };
  auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
    if (TryOperandsAsNewSeeds && FutureSeed == Root) {
      // ...
    }
    // ...
  };
  while (!Stack.empty()) {
    Instruction *Inst;
    unsigned Level;
    std::tie(Inst, Level) = Stack.front();
    // ...
    if (R.isDeleted(Inst))
      continue;
    if (Value *VectorizedV = TryToReduce(Inst)) {
      // ...
        Stack.emplace(I, Level);
      // ...
    }
    if (R.isDeleted(Inst))
      continue;
    // ...
    if (!TryAppendToPostponedInsts(Inst)) {
      // ...
    }
    // ...
    if (VisitedInstrs.insert(Op).second)
      if (/* ... */ !R.isDeleted(I) && I->getParent() == BB)
        Stack.emplace(I, Level);
  }
  // ...
}
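// Added note: the traversal above is iterative (std::queue-based) rather than
// recursive: when an instruction cannot serve as a reduction root, its
// operands are queued so deeper candidate roots are still visited, and
// vectorizable users are deferred into PostponedInsts instead of being
// vectorized in place.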
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  // ...
  if ((I->getOpcode() == Instruction::FAdd ||
       I->getOpcode() == Instruction::FSub) &&
      /* ... */) {
    // ...
  }
  // ...
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(Op0) || R.isDeleted(Op1))
    return false;
  // ...
  if (A && B && B->hasOneUse()) {
    // ...
    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
      // ...
    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
      // ...
  }
  if (B && A && A->hasOneUse()) {
    // ...
    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
      // ...
    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
      // ...
  }
  auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
                                             ArrayRef<Value *> Ops) {
    // ...
    Type *Ty = Inst->getType();
    // ...
    HorizontalReduction HorRdx(Inst, Ops);
    if (!HorRdx.matchReductionForOperands())
      return false;
    // ...
        TTI.getScalarizationOverhead(
            /* ... */);
    // ...
        TTI.getInstructionCost(Inst, CostKind);
    // ...
      FMF = FPCI->getFastMathFlags();
    RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
                                             /* ... */);
    // ...
    if (RedCost >= ScalarCost)
      return false;
    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
  };
  if (Candidates.size() == 1)
    return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
  // ...
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return (*BestCandidate == 0 &&
          TryToReduce(I, {Candidates[*BestCandidate].first,
                          Candidates[*BestCandidate].second})) ||
         tryToVectorizeList({Candidates[*BestCandidate].first,
                             Candidates[*BestCandidate].second},
                            R);
}
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
                                                 BasicBlock *BB, BoUpSLP &R) {
  // ...
  bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
  Res |= tryToVectorize(PostponedInsts, R);
  return Res;
}

bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
                                       BoUpSLP &R) {
  bool Res = false;
  for (Value *V : Insts)
    if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
      Res |= tryToVectorize(Inst, R);
  return Res;
}
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 bool MaxVFOnly) {
  // ...
  if (!R.canMapToVector(IVI->getType()))
    return false;

  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<Value *, 16> BuildVectorInsts;
  // ...
  if (MaxVFOnly && BuildVectorOpds.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
             << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // ...
  return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
}

bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R,
                                                   bool MaxVFOnly) {
  // ...
  SmallVector<Value *, 16> BuildVectorInsts;
  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<int> Mask;
  // ...
  if (MaxVFOnly && BuildVectorInsts.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
             << "Cannot SLP vectorize list: only 2 elements of buildvector, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
}
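// Added example: the buildvector pattern vectorizeInsertElementInst() targets:
//   %v0 = insertelement <4 x float> poison, float %s0, i32 0
//   %v1 = insertelement <4 x float> %v0, float %s1, i32 1
//   %v2 = insertelement <4 x float> %v1, float %s2, i32 2
//   %v3 = insertelement <4 x float> %v2, float %s3, i32 3
// findBuildAggregate() collects {%s0..%s3}, and tryToVectorizeList() then
// tries to compute those scalars as one vector.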
template <typename T>
static bool tryToVectorizeSequence(
    /* ... */,
    bool MaxVFOnly, BoUpSLP &R) {
  // ...
    if (!I || R.isDeleted(I)) {
      // ...
    }
    auto *SameTypeIt = IncIt;
    while (/* ... */
           AreCompatible(VL, *SameTypeIt)) {
      // ...
      if (I && !R.isDeleted(I))
        // ...
    }

    unsigned NumElts = VL.size();
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    // ...
    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      // ...
      VL.swap(Candidates);
      Candidates.clear();
      // ...
    }
    // ...
    auto GetMinNumElements = [&R](Value *V) {
      unsigned EltSize = R.getVectorElementSize(V);
      return std::max(2U, R.getMaxVecRegSize() / EltSize);
    };
    if (NumElts < GetMinNumElements(*IncIt) &&
        (Candidates.empty() ||
         Candidates.front()->getType() == (*IncIt)->getType())) {
      // ...
    }
    // ...
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, false)) {
        // ...
      } else if (MaxVFOnly) {
        // ...
        for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
             /* ... */) {
          if (!I || R.isDeleted(I)) {
            // ...
          }
          auto *SameTypeIt = It;
          while (SameTypeIt != End &&
                 (/* ... */
                  AreCompatible(*SameTypeIt, *It))) {
            // ...
            if (I && !R.isDeleted(I))
              // ...
          }
          unsigned NumElts = VL.size();
          if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
                                                  /* ... */))
            // ...
        }
      }
      Candidates.clear();
    }
    // ...
    IncIt = SameTypeIt;
  // ...
}
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
                       const DominatorTree &DT) {
  // ...
  assert(/* ... */ && "Expected valid element types only.");
  if (/* ... */)
    return IsCompatibility;
  // ...
  if (CI1->getOperand(0)->getType()->getTypeID() <
      CI2->getOperand(0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getTypeID() >
      CI2->getOperand(0)->getType()->getTypeID())
    return false;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return false;
  // ...
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
    // ...
    if (/* ... */)
      return !IsCompatibility;
    // ...
  }
  if (IsCompatibility) {
    if (I1->getParent() != I2->getParent())
      return false;
  } else {
    // ...
    if (/* ... */)
      return NodeI2 != nullptr;
    // ...
    assert((NodeI1 == NodeI2) ==
           /* ... */ &&
           "Different nodes should have different DFS numbers");
    if (NodeI1 != NodeI2)
      // ...
  }
  // ...
  if (S && (IsCompatibility || !S.isAltShuffle()))
    // ...
  if (IsCompatibility)
    return false;
  if (I1->getOpcode() != I2->getOpcode())
    return I1->getOpcode() < I2->getOpcode();
  // ...
  return IsCompatibility;
}
template <typename ItT>
bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
                                          BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // ...
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    // ...
      Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
    if (R.isDeleted(I))
      continue;
    // ...
  }
  // ...
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    // ...
  }
  // ...
  for (Instruction *V : CmpInsts)
    // ...
  if (Vals.size() <= 1)
    return Changed;
  Changed |= tryToVectorizeSequence<Value>(
      Vals, CompareSorter, AreCompatibleCompares,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        // ...
        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
          return any_of(V->users(), [V](User *U) {
            auto *Select = dyn_cast<SelectInst>(U);
            return Select &&
                   Select->getParent() != cast<Instruction>(V)->getParent();
          });
        });
        if (ArePossiblyReducedInOtherBlock)
          return false;
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
      },
      /* ... */);
  return Changed;
}
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
  assert(/* ... */ &&
         "This function only accepts Insert instructions");
  bool OpsChanged = false;
  SmallVector<WeakTrackingVH> PostponedInsts;
  for (auto *I : reverse(Instructions)) {
    // ...
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, true);
    // ...
      OpsChanged |=
          vectorizeInsertElementInst(LastInsertElem, BB, R, true);
    // ...
    if (R.isDeleted(I))
      continue;
    OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
  }
  // ...
    OpsChanged |=
        vectorizeInsertValueInst(LastInsertValue, BB, R, false);
  // ...
    OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                             false);
  // ...
  OpsChanged |= tryToVectorize(PostponedInsts, R);
  // ...
  return OpsChanged;
}
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // ...
  SmallPtrSet<Value *, 16> VisitedInstrs;
  // ...
  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
    assert(/* ... */ && "Expected vectorizable types only.");
    // ...
    if (/* ... */ <
        V2->getType()->getScalarSizeInBits())
      return true;
    if (/* ... */ >
        V2->getType()->getScalarSizeInBits())
      return false;
    // ...
    if (Opcodes1.size() < Opcodes2.size())
      return true;
    if (Opcodes1.size() > Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // ...
        if (/* ... */)
          return NodeI2 != nullptr;
        // ...
        assert((NodeI1 == NodeI2) ==
               /* ... */ &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          // ...
        if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
          // ...
              DT->getNode(V1->getParent());
          // ...
              DT->getNode(V2->getParent());
          if (/* ... */)
            return NodeI2 != nullptr;
          // ...
          assert((NodeI1 == NodeI2) ==
                 /* ... */ &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            // ...
          return V1->comesBefore(V2);
        }
      // ...
        return *Id1 < *Id2;
      // ...
      if (I1->getOpcode() == I2->getOpcode())
        continue;
      return I1->getOpcode() < I2->getOpcode();
      // ...
      auto ValID1 = Opcodes1[I]->getValueID();
      auto ValID2 = Opcodes2[I]->getValueID();
      if (ValID1 == ValID2)
        continue;
      if (ValID1 < ValID2)
        return true;
      if (ValID1 > ValID2)
        return false;
      // ...
      assert(U1 && U2 && "The only thing left should be undef & undef.");
      // ...
    }
    // ...
  };
  auto AreCompatiblePHIs = [&](ArrayRef<Value *> VL, Value *V1) {
    if (VL.empty() || V1 == VL.back())
      return true;
    Value *V2 = VL.back();
    // ...
    if (Opcodes1.size() != Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // ...
      if (R.isDeleted(I1) || R.isDeleted(I2))
        return false;
      if (I1->getParent() != I2->getParent())
        return false;
      // ...
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
        return false;
    }
    return true;
  };

  bool HaveVectorizedPhiNodes = false;
  do {
    // ...
    for (Instruction &I : *BB) {
      // ...
      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
          /* ... */)
        // ...
    }
    if (Incoming.size() <= 1)
      break;
    // ...
    for (Value *V : Incoming) {
      SmallVectorImpl<Value *> &Opcodes =
          /* ... */;
      if (!Opcodes.empty())
        continue;
      SmallPtrSet<Value *, 4> Visited;
      while (!Nodes.empty()) {
        // ...
        for (Value *V : PHI->incoming_values()) {
          // ...
            Nodes.push_back(PHI1);
          // ...
        }
      }
    }
    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, PHICompare, AreCompatiblePHIs,
        [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
          return tryToVectorizeList(Candidates, R, MaxVFOnly);
        },
        /* ... */);
    Changed |= HaveVectorizedPhiNodes;
    if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
          // ...
          return !PHI || R.isDeleted(PHI);
        }))
      PHIToOpcodes.clear();
    // ...
  } while (HaveVectorizedPhiNodes);

  VisitedInstrs.clear();
  InstSetVector PostProcessInserts;
  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // ...
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      // ...
      PostProcessCmps.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // ...
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
    return /* ... */
           PostProcessInserts.contains(I);
  };
  // ...
  auto HasNoUsers = [](Instruction *I) {
    return I->use_empty() &&
           /* ... */;
  };
  for (/* ... */) {
    if (R.isDeleted(&*It))
      continue;
    // ...
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(It->isTerminator())) {
        // ...
      }
      continue;
    }
    // ...
    if (auto *P = dyn_cast<PHINode>(&*It)) {
      if (P->getNumIncomingValues() == 2) {
        // ...
        if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
          // ...
        }
      }
      // ...
      for (unsigned I : /* ... */) {
        if (BB == P->getIncomingBlock(I) ||
            !DT->isReachableFromEntry(P->getIncomingBlock(I)))
          continue;
        // ...
        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
            PI && !IsInPostProcessInstrs(PI)) {
          bool Res =
              vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
          Changed |= Res;
          if (Res && R.isDeleted(P)) {
            // ...
          }
        }
      }
      continue;
    }

    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      // ...
      TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                            SI->getValueOperand()->hasOneUse();
      // ...
      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          // ...
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            // ...
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
        }
      }
      // ...
      OpsChanged |= VectorizeInsertsAndCmps(It->isTerminator());
      // ...
    }
    // ...
    PostProcessInserts.insert(&*It);
    // ...
  }
  // ...
  return Changed;
}
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // ...
  for (auto &Entry : GEPs) {
    // ...
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");
    // ...
    auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
      return !R.isDeleted(GEP);
    });
    if (It == Entry.second.end())
      continue;
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      // ...
      Candidates.remove_if([&R](Value *I) {
        // ...
      });
      // ...
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1;
           ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
          if (/* ... */) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }
      // ...
      if (Candidates.size() < 2)
        continue;
      // ...
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        // ...
        auto *GEPIdx = GEP->idx_begin()->get();
        // ...
        Bundle[BundleIndex++] = GEPIdx;
      }
      // ...
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}
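// Added example: the GEP-index bundles formed above. Given
//   %g0 = getelementptr inbounds i32, ptr %base, i64 %i0
//   %g1 = getelementptr inbounds i32, ptr %base, i64 %i1
// the bundle is {%i0, %i1}: it is the index computations, not the GEPs
// themselves, that are handed to tryToVectorizeList().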
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // ...
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return true;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return false;
    // ...
      DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
          DT->getNode(I1->getParent());
      DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
          DT->getNode(I2->getParent());
      assert(NodeI1 && "Should only process reachable instructions");
      assert(NodeI2 && "Should only process reachable instructions");
      assert((NodeI1 == NodeI2) ==
             /* ... */ &&
             "Different nodes should have different DFS numbers");
      if (NodeI1 != NodeI2)
        // ...
      return I1->getOpcode() < I2->getOpcode();
    // ...
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };
  // ...
  bool SameParent = true;
  // ...
    StoreInst *V2 = VL.back();
    // ...
    SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
    // ...
    for (auto [SI, V] : zip(VL, NewVL))
      V = SI->getValueOperand();
    NewVL.back() = V1->getValueOperand();
    InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
    InstructionsState S = Analysis.buildInstructionsState(
        /* ... */);
    // ...
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  // ...

  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;
    LLVM_DEBUG(dbgs() << /* ... */ << Pair.second.size() << ".\n");
    // ...
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R, Attempted);
        },
        /* ... */);
  }
  return Changed;
}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
ReachingDefInfo InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
static bool runImpl(Function &F, const TargetLowering &TLI, AssumptionCache *AC)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
static std::optional< OperandInfo > getOperandInfo(const MachineOperand &MO)
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static cl::opt< bool > SplitAlternateInstructions("slp-split-alternate-instructions", cl::init(true), cl::Hidden, cl::desc("Improve the code quality by splitting alternate instructions"))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static bool isMaskedLoadCompress(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, const DominatorTree &DT, const TargetLibraryInfo &TLI, const function_ref< bool(Value *)> AreAllUsersVectorized, bool &IsMasked, unsigned &InterleaveFactor, SmallVectorImpl< int > &CompressMask, VectorType *&LoadVecTy)
Checks if the VL can be transformed to a (masked)load + compress or (masked) interleaved load.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< bool > VectorizeCopyableElements("slp-copyable-elements", cl::init(true), cl::Hidden, cl::desc("Try to replace values with the idempotent instructions for " "better vectorization."))
Enables vectorization of copyable elements.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool isCommutative(Instruction *I, Value *ValWithUses)
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert fadd/fsub sequence to FMAD.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static const SCEV * calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static cl::opt< bool > ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden, cl::desc("Generate strided loads even if they are not " "profitable. Used for testing only."))
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is a main operation for the given MainOp and AltOp instruction...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
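Spelled out, the stated condition is: sqrt((1/n) * sum_i (s_i - mean)^2) < 0.9 * mean, where s_i are the recorded tree sizes, mean is their average, and n is the number of samples.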
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds compress-like mask for shuffles for the given PointerOps, ordered with Order.
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of vectorized intrinsic (if possible) and vectorized function (if possible) call...
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try to get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and vector costs of vectorizing a set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType,...
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find a subvector of loads and builds a new vector of only loads if it can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(ArrayRef< T * >, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and if not, packs them, building ReuseShuffleI...
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates the minimal alignment of the values as their common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
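A hand-worked sketch, assuming the usual mask-composition semantics NewMask[i] = Mask[SubMask[i]] (an assumption based on the brief, not quoted from the helper):
  // Mask    = {2, 3, 0, 1}
  // SubMask = {1, 1, 2, 2}
  // Result  = {Mask[1], Mask[1], Mask[2], Mask[2]} = {3, 3, 0, 0}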
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned the special value (size), which is out of bounds.
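An illustrative fix-up, assuming unset slots are filled with the unused indices in increasing order (hypothetical values):
  // Order = {1, 3, 4, 4} with size == 4; entries equal to 4 are unset.
  // After fixup: {1, 3, 0, 2} -- the unused indices 0 and 2 fill the unset slots.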
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates a subvector extract using a default shuffle.
static cl::opt< bool > DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden, cl::desc("Disable tree reordering even if it is " "profitable. Used for testing only."))
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType,...
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
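For example, {3, 2, 1, 0} is a reverse order for four lanes, while {0, 1, 2, 3} is the identity order.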
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds a single input vector (in the form of a tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in the form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
~ShuffleInstructionBuilder()
void resetForSameNode()
Reset the builder to handle perfect diamond match.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
uint64_t getZExtValue() const
Get zero extended value.
void setBit(unsigned BitPosition)
Set the bit at position bitPosition to 1.
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
unsigned getBitWidth() const
Return the number of bits in the APInt.
bool ult(const APInt &RHS) const
Unsigned less than comparison.
void clearAllBits()
Set every bit to 0.
void negate()
Negate this APInt in place.
unsigned logBase2() const
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
bool isOne() const
Determine if this is a value of 1.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
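A minimal usage sketch of the APInt members listed above (values chosen purely for illustration):
  APInt DemandedElts = APInt::getZero(8);   // 8 bits, all clear
  DemandedElts.setBits(2, 5);               // sets bits 2, 3, 4 (hiBit is exclusive)
  DemandedElts.setBit(7);
  uint64_t Raw = DemandedElts.getZExtValue();            // 0x9C
  bool IsPow2 = APInt::getOneBitSet(8, 3).isPowerOf2();  // true: the value is 8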
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
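A short sketch of the ArrayRef views described above (no data is copied):
  int Data[] = {1, 2, 3, 4, 5};
  ArrayRef<int> A(Data);
  ArrayRef<int> Tail = A.drop_front(2);    // {3, 4, 5}
  ArrayRef<int> Head = A.take_front(2);    // {1, 2}
  ArrayRef<int> Mid  = A.slice(1, 3);      // {2, 3, 4}
  bool Eq = Head.equals(A.take_front(2));  // true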
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::reverse_iterator reverse_iterator
InstListType::iterator iterator
Instruction iterators...
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
InstListType::const_reverse_iterator const_reverse_iterator
bool isEHPad() const
Return true if this basic block is an exception handling block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst). Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
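For illustration, swapping reorders the operands while inversion negates the condition:
  CmpInst::Predicate P = CmpInst::ICMP_SLT;
  CmpInst::Predicate Swapped = CmpInst::getSwappedPredicate(P);  // ICMP_SGT
  CmpInst::Predicate Inverse = CmpInst::getInversePredicate(P);  // ICMP_SGE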
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
const APInt & getValue() const
Return the constant as an APInt value reference.
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
static bool shouldExecute(unsigned CounterName)
static DebugLoc getUnknown()
An analysis that produces DemandedBits for a function.
LLVM_ABI APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
Base class for the actual dominator tree node.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
bool allowReassoc() const
Flag queries.
bool allowContract() const
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
For the node iterator we just need to turn the TreeEntry iterator into a TreeEntry* iterator so that ...
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
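A hedged sketch of the IRBuilder members above; emitShuffledMul, V1, and V2 are placeholder names invented for this example:
  static Value *emitShuffledMul(IRBuilderBase &Builder, Value *V1, Value *V2) {
    // Assuming <2 x i32> inputs, the mask {0, 2, 1, 3} interleaves V1 and V2.
    Value *Shuf = Builder.CreateShuffleVector(V1, V2, {0, 2, 1, 3});
    return Builder.CreateMul(Shuf, Shuf, "mul");  // square the interleaved vector
  }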
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
T & front() const
front - Get the first element.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
const value_type & front() const
Return the first element of the SetVector.
void insert_range(Range &&R)
Vector takeVector()
Clear the SetVector and return the underlying vector.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents a "clustered" mask of size VF, i.e.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
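Hand-checked examples for the mask predicates above:
  int Identity[] = {0, 1, 2, 3};
  bool IsId = ShuffleVectorInst::isIdentityMask(Identity, /*NumSrcElts=*/4);  // true
  int Reverse[] = {3, 2, 1, 0};
  bool IsRev = ShuffleVectorInst::isReverseMask(Reverse, 4);                  // true
  int Sub[] = {2, 3};
  int Index = 0;
  bool IsExt = ShuffleVectorInst::isExtractSubvectorMask(Sub, 4, Index);      // true, Index == 2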
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
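A small sketch of the query members above (set() is the standard BitVector-style mutator, not listed here):
  SmallBitVector BV(8);             // 8 bits, initially clear
  BV.set(1);
  BV.set(4);
  int First = BV.find_first();      // 1
  int Next  = BV.find_next(First);  // 4
  bool AnySet = BV.any();           // true
  unsigned Ones = BV.count();       // 2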
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
The instances of the Type class are immutable: once they are created, they are never changed.
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
bool isVectorTy() const
True if this is an instance of VectorType.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
iterator_range< use_iterator > uses()
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
void insert_range(Range &&R)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator I
iterator_adaptor_base()=default
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from the same vector and consecutive indices.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from the same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads by increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
InstructionCost getSpillCost()
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
SmallVector< StoreInst *, 8 > StoreList
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
unsigned getTreeSize() const
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to the list of values already checked for vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
SmallVector< Instruction *, 16 > InstrList
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns it signedn...
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, StridedPtrInfo &SPtrInfo, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
bool isStridedLoad(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align Alignment, const int64_t Diff, Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order?
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns the reduction type after min-bitwidth analysis.
unsigned getMaxVecRegSize() const
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score...
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
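Putting the members above together, a hedged sketch of a typical driver sequence (legality and profitability details omitted; variable names are placeholders and this is not the pass's exact control flow):
  BoUpSLP R(F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE);
  R.buildTree(Roots, UserIgnoreList);
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;
  R.reorderTopToBottom();
  R.reorderBottomToTop();
  R.buildExternalUses();
  R.computeMinimumValueSizes();
  InstructionCost Cost = R.getTreeCost();
  if (Cost < -SLPCostThreshold)
    R.vectorizeTree();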
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is a bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
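A minimal PatternMatch sketch using the matchers above; matchSingleUseAdd is a name invented for this example:
  using namespace llvm::PatternMatch;
  // Returns true iff V is a single-use add, capturing its operands.
  static bool matchSingleUseAdd(Value *V, Value *&A, Value *&B) {
    return match(V, m_OneUse(m_Add(m_Value(A), m_Value(B))));
  }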
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
friend class Instruction
Iterator for Instructions in a BasicBlock.
LLVM_ABI iterator begin() const
LLVM_ABI Instruction & front() const
A private "module" namespace for types and utilities used by this pass.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
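A short usage sketch (Builder and Vec are placeholders); given Vec of type <4 x i32>:
  Value *Sum = createSimpleReduction(Builder, Vec, RecurKind::Add);  // scalar i32 sum of all lanes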
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
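A sketch of llvm::enumerate, which pairs each element with its index so no manual counter is needed; the mask contents are illustrative:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"

// Returns the position of the first non-identity mask element, or -1.
int firstNonIdentity(llvm::ArrayRef<int> Mask) {
  for (auto [Idx, Elem] : llvm::enumerate(Mask))
    if (Elem != int(Idx))
      return int(Idx);
  return -1;
}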
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
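The standard LLVM casting idiom around isa<>, cast<> and dyn_cast<>; V is any llvm::Value* and the function is illustrative:

#include "llvm/IR/Instructions.h"

void classify(llvm::Value *V) {
  if (auto *SI = llvm::dyn_cast<llvm::StoreInst>(V)) {
    // SI is non-null only when V really is a StoreInst.
    (void)SI->getValueOperand();
  } else if (llvm::isa<llvm::LoadInst>(V)) {
    // isa<> answers the type question without producing a typed pointer.
  }
}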
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
constexpr from_range_t from_range
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff every element of A is also in B.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
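The canonical erase-while-iterating pattern built on make_early_inc_range: the iterator is advanced before the current element is deleted, so removal is safe. A sketch, using isInstructionTriviallyDead from this same list:

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/Transforms/Utils/Local.h"

void deleteDeadInBlock(llvm::BasicBlock &BB) {
  for (llvm::Instruction &I : llvm::make_early_inc_range(BB))
    if (llvm::isInstructionTriviallyDead(&I))
      I.eraseFromParent();
}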
auto cast_or_null(const Y &Val)
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
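Worked values for alignDown (illustrative constants): it returns the largest value no greater than Value that is congruent to Skew modulo Align.

#include "llvm/Support/MathExtras.h"

static_assert(llvm::alignDown(10u, 4u) == 8u, "largest multiple of 4 <= 10");
static_assert(llvm::alignDown(10u, 4u, 1u) == 9u,
              "largest x <= 10 with x % 4 == 1");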
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool isa_and_nonnull(const Y &Val)
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicit...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
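Worked values for the power-of-two helpers listed here (illustrative constants):

#include "llvm/ADT/bit.h"

static_assert(llvm::bit_ceil(5u) == 8u, "smallest power of two >= 5");
static_assert(llvm::bit_floor(5u) == 4u, "largest power of two <= 5");
static_assert(llvm::has_single_bit(8u), "8 is a power of two");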
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
DomTreeNodeBase< BasicBlock > DomTreeNode
auto dyn_cast_or_null(const Y &Val)
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
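What the two mask builders produce, per their documented shapes; the arguments are illustrative:

#include "llvm/Analysis/VectorUtils.h"

// createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4)   -> {0, 2, 4, 6}
llvm::SmallVector<int, 16> Strided = llvm::createStrideMask(0, 2, 4);
// createReplicatedMask(/*ReplicationFactor=*/3, /*VF=*/2) -> {0, 0, 0, 1, 1, 1}
llvm::SmallVector<int, 16> Replicated = llvm::createReplicatedMask(3, 2);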
auto find_if_not(R &&Range, UnaryPredicate P)
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
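The typical debug-output idiom around dbgs(); the DEBUG_TYPE tag and function here are illustrative, not this pass's:

#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#define DEBUG_TYPE "my-pass" // illustrative; a pass defines its own tag

void traceCount(unsigned NumScalars) {
  // Compiles away in release builds; enabled with -debug-only=my-pass.
  LLVM_DEBUG(llvm::dbgs() << "considering " << NumScalars << " scalars\n");
}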
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
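A hedged sketch of a pointer-distance query with this signature; the wrapper function is illustrative, and the distance is in units of ElemTy elements:

#include "llvm/Analysis/LoopAccessAnalysis.h"
#include <optional>

bool areConsecutive(llvm::Type *ElemTy, llvm::Value *PtrA, llvm::Value *PtrB,
                    const llvm::DataLayout &DL, llvm::ScalarEvolution &SE) {
  std::optional<int64_t> Diff =
      llvm::getPointersDiff(ElemTy, PtrA, ElemTy, PtrB, DL, SE);
  // A distance of exactly one element means PtrB directly follows PtrA.
  return Diff && *Diff == 1;
}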
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
@ LLVM_MARK_AS_BITMASK_ENUM
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceiling of Numerator / Denominator.
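Worked values (illustrative constants): divideCeil rounds the quotient up.

#include "llvm/Support/MathExtras.h"

static_assert(llvm::divideCeil(7u, 2u) == 4u, "ceil(7/2) == 4");
static_assert(llvm::divideCeil(8u, 2u) == 4u, "exact division is unchanged");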
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ And
Bitwise or logical AND of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
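A small sketch of alignTo, which rounds a byte count up to the next multiple of the alignment; the wrapper is illustrative. For example, alignTo(10, Align(8)) == 16 while alignTo(16, Align(8)) == 16.

#include "llvm/Support/Alignment.h"
#include <cstdint>

uint64_t padTo8(uint64_t Size) { return llvm::alignTo(Size, llvm::Align(8)); }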
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
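A sketch over a container of pairs (the data is illustrative): these helpers project out just the keys or just the values without .first/.second loops.

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <utility>

int sumKeysAndValues() {
  llvm::SmallVector<std::pair<int, int>> Entries = {{1, 10}, {2, 20}};
  int Sum = 0;
  for (int K : llvm::make_first_range(Entries))
    Sum += K; // keys: 1, 2
  for (int V : llvm::make_second_range(Entries))
    Sum += V; // values: 10, 20
  return Sum; // 33
}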
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns the number of parts the type VecTy will be split into at the codegen phase.
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
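A sketch of llvm::seq (llvm/ADT/Sequence.h) as a replacement for manual index loops; the function is illustrative:

#include "llvm/ADT/Sequence.h"

unsigned countEvenLanes(unsigned N) {
  unsigned Count = 0;
  // Half-open integer range [0, N).
  for (unsigned I : llvm::seq<unsigned>(0, N))
    Count += (I % 2 == 0);
  return Count;
}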
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
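A sketch of folding several fields into one hash value, roughly as a DenseMapInfo::getHashValue implementation like the EdgeInfo one below might do; the function and its parameters are illustrative:

#include "llvm/ADT/Hashing.h"

unsigned hashEdge(void *UserTE, unsigned EdgeIdx) {
  return static_cast<unsigned>(llvm::hash_combine(UserTE, EdgeIdx));
}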
template class LLVM_TEMPLATE_ABI DomTreeNodeBase< BasicBlock >
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits(bool simple=false)
DenseMapInfo< BoUpSLP::TreeEntry * > FirstInfo
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static BoUpSLP::EdgeInfo getEmptyKey()
DenseMapInfo< unsigned > SecondInfo
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given valu...
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
BoUpSLP::TreeEntry::VecTreeTy ContainerTy
Incoming for lane mask phi as machine instruction; incoming register Reg and incoming block Block are...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
TargetTransformInfo * TTI
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Function object to check whether the second component of a container supported by std::get (like std:...
This structure holds any data we need about the edges being traversed during buildTreeRec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const