#ifdef EXPENSIVE_CHECKS

using namespace std::placeholders;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions,
          "Number of vector instructions generated");
116 "Controls which SLP graphs should be vectorized.");
120 cl::desc(
"Run the SLP vectorization passes"));
124 cl::desc(
"Enable vectorization for wider vector utilization"));
128 cl::desc(
"Only vectorize if you gain more than this "
133 cl::desc(
"When true, SLP vectorizer bypasses profitability checks based on "
134 "heuristics and makes vectorization decision via cost modeling."));
138 cl::desc(
"Attempt to vectorize horizontal reductions"));
143 "Attempt to vectorize horizontal reductions feeding into a store"));
147 cl::desc(
"Improve the code quality by splitting alternate instructions"));
151 cl::desc(
"Attempt to vectorize for this register size in bits"));
155 cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
163 cl::desc(
"Limit the size of the SLP scheduling region per block"));
167 cl::desc(
"Attempt to vectorize for this register size in bits"));
171 cl::desc(
"Limit the recursion depth when building a vectorizable tree"));
175 cl::desc(
"Only vectorize small trees if they are fully vectorizable"));
181 cl::desc(
"The maximum look-ahead depth for operand reordering scores"));
190 cl::desc(
"The maximum look-ahead depth for searching best rooting option"));
194 cl::desc(
"The minimum number of loads, which should be considered strided, "
195 "if the stride is > 1 or is runtime value"));
199 cl::desc(
"The maximum stride, considered to be profitable."));
203 cl::desc(
"Disable tree reordering even if it is "
204 "profitable. Used for testing only."));
208 cl::desc(
"Generate strided loads even if they are not "
209 "profitable. Used for testing only."));
213 cl::desc(
"Display the SLP trees with Graphviz"));
217 cl::desc(
"Try to vectorize with non-power-of-2 number of elements."));
222 cl::desc(
"Try to replace values with the idempotent instructions for "
223 "better vectorization."));
  Ty = Ty->getScalarType();
         !Ty->isPPC_FP128Ty();

    return SI->getValueOperand()->getType();
    return CI->getOperand(0)->getType();
    return IE->getOperand(1)->getType();

         "ScalableVectorType is not supported.");
  return VecTy->getNumElements();
                                                 Type *Ty, unsigned Sz) {
  if (NumParts == 0 || NumParts >= Sz)

  if (NumParts == 0 || NumParts >= Sz)
  return (Sz / RegVF) * RegVF;
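// Illustrative sketch (not part of the original file): rounding a scalar
// count down to a whole number of vector registers. With Sz = 7 scalars and
// RegVF = 4 lanes per register, (Sz / RegVF) * RegVF = (7 / 4) * 4 = 4: one
// full 4-wide register, with 3 scalars left over.
static unsigned floorToFullRegisters(unsigned Sz, unsigned RegVF) {
  return (Sz / RegVF) * RegVF; // integer division drops the partial register
}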
                    I * VecTyNumElements, VecTyNumElements)))
                  : Mask[I] * VecTyNumElements + J;
  unsigned SVNumElements =
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    Value *Src = SV->getOperand(0);
      if (SV->getOperand(0) != Src)
      if (!SV->isExtractSubvectorMask(Index))
      ExpectedIndex.set(Index / ShuffleMaskSize);
    if (!ExpectedIndex.all())
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
  unsigned SVNumElements =
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    for (int M : SV->getShuffleMask())
                          : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
  OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
    if (BB != II->getParent())
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (!FirstNonUndef) {
    if (V != FirstNonUndef)
  return FirstNonUndef != nullptr;
    return Cmp->isCommutative();
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
              if (match(U.getUser(),
                        m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                  (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::abs>(
                               m_Specific(U.get()), m_ConstantInt(Flag))) &&
                     (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
           (BO->getOpcode() == Instruction::FSub &&
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
  return I->isCommutative();
    constexpr unsigned IntrinsicNumOperands = 2;
    return IntrinsicNumOperands;
  return I->getNumOperands();
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
    if (CI->getValue().uge(VT->getNumElements()))
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
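// Illustrative sketch (not part of the original file): the loop above
// linearizes a chain of insertvalue indices into one flat index by treating
// each aggregate level as a digit of a mixed-radix number. For a struct of
// two arrays of four elements, the index path {1, 2} flattens to
// 0 * 2 + 1 = 1, then 1 * 4 + 2 = 6.
static unsigned flattenAggregateIndex(const unsigned Idx[],
                                      const unsigned LevelSize[],
                                      unsigned Depth) {
  unsigned Flat = 0;
  for (unsigned I = 0; I != Depth; ++I)
    Flat = Flat * LevelSize[I] + Idx[I]; // same Index *= N; Index += I shape
  return Flat;
}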
  return std::all_of(It, VL.end(), [&](Value *V) {
    if (auto *CI = dyn_cast<CmpInst>(V))
      return BasePred == CI->getPredicate();
    if (auto *I = dyn_cast<Instruction>(V))
      return I->getOpcode() == Opcode;
    return isa<PoisonValue>(V);
      if (MaskArg == UseMask::UndefsAsMask)
      if (MaskArg == UseMask::FirstArg && Value < VF)
        UseMask.reset(Value);
      else if (MaskArg == UseMask::SecondArg && Value >= VF)
        UseMask.reset(Value - VF);
template <bool IsPoisonOnly = false>
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (!UseMask.empty()) {
      if (*Idx < UseMask.size() && !UseMask.test(*Idx))
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
static std::optional<TargetTransformInfo::ShuffleKind>
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        return std::max(S, VTy->getNumElements());
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
    Value *Vec = EE->getVectorOperand();
  ShuffleMode CommonShuffleMode = Unknown;
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    auto *Vec = EI->getVectorOperand();
    if (Idx->getValue().uge(Size))
    unsigned IntIdx = Idx->getValue().getZExtValue();
    if (!Vec1 || Vec1 == Vec) {
    } else if (!Vec2 || Vec2 == Vec) {
    if (CommonShuffleMode == Permute)
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
    CommonShuffleMode = Select;
  if (CommonShuffleMode == Select && Vec2)
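// Illustrative sketch (not part of the original file): the Select/Permute
// classification above on concrete masks. With two 4-wide sources, the mask
// {0, 5, 2, 7} satisfies Mask[I] % Size == I on every lane (each output lane
// keeps its position and only the source vector varies), so it stays a
// "select" shuffle; {1, 5, 2, 7} moves lane 0, demoting it to a permute.
enum class ShuffleClass { Select, Permute };
static ShuffleClass classifyTwoSourceMask(const int Mask[], unsigned NumLanes,
                                          unsigned Size) {
  for (unsigned I = 0; I != NumLanes; ++I)
    if (Mask[I] % Size != I) // lane moved within its source vector
      return ShuffleClass::Permute;
  return ShuffleClass::Select;
}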
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
      return CI->getZExtValue();
  if (EI->getNumIndices() != 1)
  return *EI->idx_begin();
bool isValidForAlternation(unsigned Opcode) {
class BinOpSameOpcodeHelper {
  using MaskType = std::uint_fast16_t;
  constexpr static std::initializer_list<unsigned> SupportedOp = {
      Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
      Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
    MainOpBIT = 0b100000000,
  static std::pair<ConstantInt *, unsigned>
  isBinOpWithConstantInt(const Instruction *I) {
    unsigned Opcode = I->getOpcode();
    if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
        Opcode == Instruction::AShr)
  struct InterchangeableInfo {
    MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
                    MulBIT | AShrBIT | ShlBIT;
    MaskType SeenBefore = 0;
    InterchangeableInfo(const Instruction *I) : I(I) {}
    bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
      if (Mask & InterchangeableMask) {
        SeenBefore |= OpcodeInMaskForm;
        Mask &= InterchangeableMask;
    bool equal(unsigned Opcode) {
      return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
      MaskType Candidate = Mask & SeenBefore;
      if (Candidate & MainOpBIT)
        return I->getOpcode();
      if (Candidate & ShlBIT)
        return Instruction::Shl;
      if (Candidate & AShrBIT)
        return Instruction::AShr;
      if (Candidate & MulBIT)
        return Instruction::Mul;
      if (Candidate & AddBIT)
        return Instruction::Add;
      if (Candidate & SubBIT)
        return Instruction::Sub;
      if (Candidate & AndBIT)
        return Instruction::And;
      if (Candidate & OrBIT)
        return Instruction::Or;
      if (Candidate & XorBIT)
        return Instruction::Xor;
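// Illustrative sketch (not part of the original file): because the set of
// opcodes an instruction may be rewritten into is a plain bitmask, combining
// the constraints of two lanes is a single AND. E.g. `shl x, 1` can act as
// SHL or MUL (ShlBIT | MulBIT) while `mul y, 3` can only act as MUL (MulBIT);
// the intersection leaves MulBIT, so both lanes can share one vector mul.
static unsigned commonOpcodeMask(unsigned MaskA, unsigned MaskB) {
  return MaskA & MaskB; // nonzero -> a shared opcode interpretation exists
}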
    bool hasCandidateOpcode(unsigned Opcode) const {
      MaskType Candidate = Mask & SeenBefore;
      case Instruction::Shl:
        return Candidate & ShlBIT;
      case Instruction::AShr:
        return Candidate & AShrBIT;
      case Instruction::Mul:
        return Candidate & MulBIT;
      case Instruction::Add:
        return Candidate & AddBIT;
      case Instruction::Sub:
        return Candidate & SubBIT;
      case Instruction::And:
        return Candidate & AndBIT;
      case Instruction::Or:
        return Candidate & OrBIT;
      case Instruction::Xor:
        return Candidate & XorBIT;
      case Instruction::LShr:
      case Instruction::FAdd:
      case Instruction::FSub:
      case Instruction::FMul:
      case Instruction::SDiv:
      case Instruction::UDiv:
      case Instruction::FDiv:
      case Instruction::SRem:
      case Instruction::URem:
      case Instruction::FRem:
      unsigned FromOpcode = I->getOpcode();
      if (FromOpcode == ToOpcode)
      auto [CI, Pos] = isBinOpWithConstantInt(I);
      const APInt &FromCIValue = CI->getValue();
      unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
      switch (FromOpcode) {
      case Instruction::Shl:
        if (ToOpcode == Instruction::Mul) {
        assert(FromCIValue.isZero() && "Cannot convert the instruction.");
        ToCIValue = ToOpcode == Instruction::And
                        : APInt::getZero(FromCIValueBitWidth);
      case Instruction::Mul:
        if (ToOpcode == Instruction::Shl) {
          ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
          assert(FromCIValue.isOne() && "Cannot convert the instruction.");
          ToCIValue = ToOpcode == Instruction::And
                          : APInt::getZero(FromCIValueBitWidth);
      case Instruction::Add:
      case Instruction::Sub:
        if (FromCIValue.isZero()) {
                 "Cannot convert the instruction.");
          ToCIValue = FromCIValue;
      case Instruction::And:
        ToCIValue = ToOpcode == Instruction::Mul
                        : APInt::getZero(FromCIValueBitWidth);
        assert(FromCIValue.isZero() && "Cannot convert the instruction.");
      Value *LHS = I->getOperand(1 - Pos);
          ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
          (FromOpcode == Instruction::Add && ToOpcode == Instruction::Sub))
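// Illustrative sketch (not part of the original file): the constant rewrite
// done by the switch above for the shl <-> mul pair. `shl x, C` equals
// `mul x, (1 << C)`, so shl -> mul exponentiates the constant, while
// mul -> shl (legal only for a power-of-two multiplier) takes logBase2,
// exactly as the Instruction::Mul case computes ToCIValue.
static uint64_t shlToMulConstant(uint64_t C) {
  return uint64_t(1) << C; // e.g. `shl x, 2` becomes `mul x, 4`
}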
  InterchangeableInfo MainOp;
  InterchangeableInfo AltOp;
  bool isValidForAlternation(const Instruction *I) const {
    return ::isValidForAlternation(MainOp.I->getOpcode()) &&
           ::isValidForAlternation(I->getOpcode());
  bool initializeAltOp(const Instruction *I) {
    if (!isValidForAlternation(I))
  BinOpSameOpcodeHelper(const Instruction *MainOp,
                        const Instruction *AltOp = nullptr)
      : MainOp(MainOp), AltOp(AltOp) {
  bool add(const Instruction *I) {
           "BinOpSameOpcodeHelper only accepts BinaryOperator.");
    unsigned Opcode = I->getOpcode();
    MaskType OpcodeInMaskForm;
    case Instruction::Shl:
      OpcodeInMaskForm = ShlBIT;
    case Instruction::AShr:
      OpcodeInMaskForm = AShrBIT;
    case Instruction::Mul:
      OpcodeInMaskForm = MulBIT;
    case Instruction::Add:
      OpcodeInMaskForm = AddBIT;
    case Instruction::Sub:
      OpcodeInMaskForm = SubBIT;
    case Instruction::And:
      OpcodeInMaskForm = AndBIT;
    case Instruction::Or:
      OpcodeInMaskForm = OrBIT;
    case Instruction::Xor:
      OpcodeInMaskForm = XorBIT;
      return MainOp.equal(Opcode) ||
             (initializeAltOp(I) && AltOp.equal(Opcode));
    MaskType InterchangeableMask = OpcodeInMaskForm;
    ConstantInt *CI = isBinOpWithConstantInt(I).first;
      constexpr MaskType CanBeAll =
          XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
      const APInt &CIValue = CI->getValue();
      case Instruction::Shl:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
      case Instruction::Mul:
        if (CIValue.isOne()) {
          InterchangeableMask = CanBeAll;
        InterchangeableMask = MulBIT | ShlBIT;
      case Instruction::Add:
      case Instruction::Sub:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
      case Instruction::And:
        InterchangeableMask = CanBeAll;
        InterchangeableMask = CanBeAll;
    return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
           (initializeAltOp(I) &&
            AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
  unsigned getMainOpcode() const { return MainOp.getOpcode(); }
  bool hasCandidateOpcode(unsigned Opcode) const {
    return MainOp.hasCandidateOpcode(Opcode);
  bool hasAltOp() const { return AltOp.I; }
  unsigned getAltOpcode() const {
    return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
    return MainOp.getOperand(I);
class InstructionsState {
  bool HasCopyables = false;
    assert(valid() && "InstructionsState is invalid.");
    assert(valid() && "InstructionsState is invalid.");
  unsigned getOpcode() const { return getMainOp()->getOpcode(); }
  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }
  Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
    assert(MainOp && "MainOp cannot be nullptr.");
    if (I->getOpcode() == MainOp->getOpcode())
    assert(AltOp && "AltOp cannot be nullptr.");
    if (I->getOpcode() == AltOp->getOpcode())
    if (!I->isBinaryOp())
    BinOpSameOpcodeHelper Converter(MainOp);
    if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
      BinOpSameOpcodeHelper AltConverter(AltOp);
      if (AltConverter.add(I) && AltConverter.add(AltOp) &&
          AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
    if (Converter.hasAltOp() && !isAltShuffle())
    return Converter.hasAltOp() ? AltOp : MainOp;
  bool isShiftOp() const {
    return getMainOp()->isShift() && getAltOp()->isShift();
    return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
  bool isMulDivLikeOp() const {
    constexpr std::array<unsigned, 8> MulDiv = {
        Instruction::Mul, Instruction::FMul, Instruction::SDiv,
        Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
        Instruction::URem, Instruction::FRem};
  bool isAddSubLikeOp() const {
    constexpr std::array<unsigned, 4> AddSub = {
        Instruction::Add, Instruction::Sub, Instruction::FAdd,
  bool isCmpOp() const {
    return (getOpcode() == Instruction::ICmp ||
  bool valid() const { return MainOp && AltOp; }
  explicit operator bool() const { return valid(); }
  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp,
                    bool HasCopyables = false)
      : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }
  bool isCopyableElement(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
    if (I->getParent() != MainOp->getParent() &&
    if (I->getOpcode() == MainOp->getOpcode())
    if (!I->isBinaryOp())
    BinOpSameOpcodeHelper Converter(MainOp);
  bool isNonSchedulable(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (getMainOp() == V)
    if (isCopyableElement(V)) {
      auto IsNonSchedulableCopyableElement = [this](Value *V) {
        return !I || isa<PHINode>(I) ||
               I->getParent() != MainOp->getParent() ||
                !MainOp->comesBefore(I));
      return IsNonSchedulableCopyableElement(V);
  bool areInstructionsWithCopyableElements() const {
    assert(valid() && "InstructionsState is invalid.");
    return HasCopyables;
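// Illustrative note (not part of the original file): what MainOp/AltOp
// capture. For the bundle {a + b, c - d, e + f, g - h} the state is
// MainOp = add, AltOp = sub, isAltShuffle() == true, and codegen emits one
// vector add, one vector sub, and a per-lane blend of the two. For a uniform
// bundle such as {a + b, c + d}, MainOp == AltOp and isAltShuffle() is false.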
std::pair<Instruction *, SmallVector<Value *>>
  Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
  assert(SelectedOp && "Cannot convert the instruction.");
  if (I->isBinaryOp()) {
    return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
  for (Value *V : VL) {
    if (Inst->getOpcode() == Opcode)
          BaseOp0 == Op0 || BaseOp1 == Op1 ||
         "Assessing comparisons of different types?");
  return (BasePred == Pred &&
         (BasePred == SwappedPred &&
    return InstructionsState::invalid();
    return InstructionsState::invalid();
      (VL.size() == 2 && InstCnt < 2))
    return InstructionsState::invalid();
  unsigned AltOpcode = Opcode;
  BinOpSameOpcodeHelper BinOpHelper(MainOp);
  bool SwappedPredsCompatible = IsCmpOp && [&]() {
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
    return InstructionsState::invalid();
  bool AnyPoison = InstCnt != VL.size();
    if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return InstructionsState::invalid();
    unsigned InstOpcode = I->getOpcode();
      if (BinOpHelper.add(I))
      Value *Op1 = I->getOperand(0);
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
      if (Opcode == AltOpcode) {
        assert(isValidForAlternation(Opcode) &&
               isValidForAlternation(InstOpcode) &&
               "Cast isn't safe for alternation, logic needs to be updated!");
        AltOpcode = InstOpcode;
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        assert(InstOpcode == AltOpcode &&
               "Alternate instructions are only supported by BinaryOperator "
        if ((VL.size() == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
        if (MainOp != AltOp) {
        } else if (BasePred != CurrentPred) {
                 isValidForAlternation(InstOpcode) &&
                 "CmpInst isn't safe for alternation, logic needs to be updated!");
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
    } else if (InstOpcode == Opcode) {
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator and "
        if (Gep->getNumOperands() != 2 ||
          return InstructionsState::invalid();
          return InstructionsState::invalid();
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState::invalid();
          return InstructionsState::invalid();
        if (Call->hasOperandBundles() &&
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
          return InstructionsState::invalid();
          return InstructionsState::invalid();
        if (Mappings.size() != BaseMappings.size() ||
            Mappings.front().ISA != BaseMappings.front().ISA ||
            Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
            Mappings.front().VectorName != BaseMappings.front().VectorName ||
            Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
            Mappings.front().Shape.Parameters !=
                BaseMappings.front().Shape.Parameters)
          return InstructionsState::invalid();
      return InstructionsState::invalid();
  assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
  assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
         "Incorrect implementation of allSameOpcode.");
  InstructionsState S(MainOp, AltOp);
         "Invalid InstructionsState.");

  return all_of(VL, [&](Value *V) { return V->getType() == Ty; });
  unsigned Opcode = UserInst->getOpcode();
  case Instruction::Load: {
  case Instruction::Store: {
    return (SI->getPointerOperand() == Scalar);
  case Instruction::Call: {
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
             Arg.value().get() == Scalar;
    return LI->isSimple();
    return SI->isSimple();
    return !MI->isVolatile();
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
         (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
          "SubMask with many inputs support must be larger than the mask.");
    Mask.append(SubMask.begin(), SubMask.end());
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
    NewMask[I] = Mask[SubMask[I]];
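// Illustrative sketch (not part of the original file): addMask composes two
// shuffles via NewMask[I] = Mask[SubMask[I]]. E.g. with Mask = {2, 0, 1}
// applied first and SubMask = {1, 2, 0} applied on top, the combined mask is
// {Mask[1], Mask[2], Mask[0]} = {0, 1, 2}: the two shuffles cancel out.
static void composeMasks(const int Mask[], const int SubMask[], int NewMask[],
                         unsigned NumLanes) {
  for (unsigned I = 0; I != NumLanes; ++I)
    NewMask[I] = Mask[SubMask[I]]; // result lane I reads SubMask[I], which in
                                   // turn reads Mask[SubMask[I]]
}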
  const size_t Sz = Order.size();
  for (unsigned I = 0; I < Sz; ++I) {
      UnusedIndices.reset(Order[I]);
      MaskedIndices.set(I);
  if (MaskedIndices.none())
         "Non-synced masked/available indices.");
    assert(Idx >= 0 && "Indices must be synced.");
                                unsigned Opcode0, unsigned Opcode1) {
      OpcodeMask.set(Lane * ScalarTyNumElements,
                     Lane * ScalarTyNumElements + ScalarTyNumElements);
         "Expected scalar constants.");
    std::fill_n(NewVal.begin() + I * VF, VF, V);
  const unsigned E = Indices.size();
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
  assert(!Mask.empty() && "Expected non-empty mask.");
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
      Scalars[Mask[I]] = Prev[I];
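// Illustrative note (not part of the original file): inversePermutation turns
// a desired order into the mask that realizes it. For Order = {2, 0, 1}
// (position 0 should hold element 2, and so on), Mask[Order[I]] = I yields
// Mask = {1, 2, 0}; reorderScalars then stores Prev[I] into Scalars[Mask[I]],
// so Scalars becomes {Prev[2], Prev[0], Prev[1]}, which is exactly
// Out[I] = Prev[Order[I]] for every position I.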
    auto *IO = dyn_cast<Instruction>(V);
    return isa<PHINode>(IO) || IO->getParent() != I->getParent();
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
           auto *IU = dyn_cast<Instruction>(U);
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);
  return !VL.empty() &&
  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
                    const unsigned Limit = std::numeric_limits<unsigned>::max()) {
  unsigned NumParts = TTI.getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= Limit)
  if (NumParts >= Sz || Sz % NumParts != 0 ||
  class ScheduleEntity;
  class ScheduleCopyableData;
  class ScheduleBundle;

  struct StridedPtrInfo {
    Value *StrideVal = nullptr;
    const SCEV *StrideSCEV = nullptr;
      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),
    MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;
    const TreeEntry &Root = *VectorizableTree.front();
    if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
        !Root.Scalars.front()->getType()->isIntegerTy())
      return std::nullopt;
    auto It = MinBWs.find(&Root);
    if (It != MinBWs.end())
    if (Root.getOpcode() == Instruction::ZExt ||
        Root.getOpcode() == Instruction::SExt)
      return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
                            Root.getOpcode() == Instruction::SExt);
    return std::nullopt;
    return MinBWs.at(VectorizableTree.front().get()).second;
    if (ReductionBitWidth == 0 ||
        !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
        ReductionBitWidth >=
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
          VectorizableTree.front()->Scalars.front()->getType(),
          VectorizableTree.front()->getVectorFactor());
          VectorizableTree.front()->Scalars.front()->getContext(),
          VectorizableTree.front()->getVectorFactor());
    VectorizableTree.clear();
    ScalarToTreeEntries.clear();
    OperandsToTreeEntry.clear();
    ScalarsInSplitNodes.clear();
    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    LoadEntriesToVectorize.clear();
    IsGraphTransformMode = false;
    GatheredLoadsEntriesFirst.reset();
    CompressEntryToData.clear();
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    ExternalUsesWithNonUsers.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
    ReductionBitWidth = 0;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
    assert(!Order.empty() && "expected non-empty order");
    const unsigned Sz = Order.size();
      return P.value() == P.index() || P.value() == Sz;
                             bool IgnoreReorder);
  std::optional<OrdersType>
    return MaxVecRegSize;
    return MinVecRegSize;
    unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
                         MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
    return MaxVF ? MaxVF : UINT_MAX;
                     const int64_t Diff, StridedPtrInfo &SPtrInfo) const;
                    StridedPtrInfo &SPtrInfo,
                    unsigned *BestVF = nullptr,
                    bool TryRecursiveCheck = true) const;
    ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
  template <typename T>
    return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";
        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}
      auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
        auto AllUsersVectorized = [U1, U2, this](Value *V) {
            return U == U1 || U == U2 || R.isVectorized(U);
        return AllUsersVectorized(V1) && AllUsersVectorized(V2);
      if (R.TTI->isLegalBroadcastLoad(V1->getType(),
          ((int)V1->getNumUses() == NumLanes ||
           AllUsersAreInternal(V1, V2)))
      auto CheckSameEntryOrFail = [&]() {
            any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
        if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
          return CheckSameEntryOrFail();
            LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
            LI2->getPointerOperand(), DL, SE, true);
        if (!Dist || *Dist == 0) {
              R.TTI->isLegalMaskedGather(
            return CheckSameEntryOrFail();
        if (std::abs(*Dist) > NumLanes / 2)
      Value *EV2 = nullptr;
        int Dist = Idx2 - Idx1;
        if (std::abs(Dist) == 0)
        if (std::abs(Dist) > NumLanes / 2)
        return CheckSameEntryOrFail();
        if (I1->getParent() != I2->getParent())
          return CheckSameEntryOrFail();
            (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
             !S.isAltShuffle()) &&
            S.getMainOp()->getNumOperands();
      return CheckSameEntryOrFail();
    int ShallowScoreAtThisLevel =
    if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
        (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
        ShallowScoreAtThisLevel))
      return ShallowScoreAtThisLevel;
    assert(I1 && I2 && "Should have early exited.");
    for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
         OpIdx1 != NumOperands1; ++OpIdx1) {
      int MaxTmpScore = 0;
      unsigned MaxOpIdx2 = 0;
      bool FoundBest = false;
          ? I2->getNumOperands()
          : std::min(I2->getNumOperands(), OpIdx1 + 1);
      assert(FromIdx <= ToIdx && "Bad index");
      for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
        if (Op2Used.count(OpIdx2))
            I1, I2, CurrLevel + 1, {});
            TmpScore > MaxTmpScore) {
          MaxTmpScore = TmpScore;
        Op2Used.insert(MaxOpIdx2);
        ShallowScoreAtThisLevel += MaxTmpScore;
    return ShallowScoreAtThisLevel;
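// Illustrative note (not part of the original file): the loop above performs
// a greedy bipartite matching between the operands of I1 and I2. For each
// operand of I1 it recurses one level deeper (CurrLevel + 1) to score every
// still-unmatched operand of I2, keeps the best-scoring pair, marks that
// operand as consumed in Op2Used, and adds the pair's score to
// ShallowScoreAtThisLevel. Recursion stops at MaxLevel, so this is a bounded
// look-ahead heuristic rather than a full tree-isomorphism check.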
    struct OperandData {
      OperandData() = default;
      OperandData(Value *V, bool APO, bool IsUsed)
          : V(V), APO(APO), IsUsed(IsUsed) {}
      bool IsUsed = false;
    enum class ReorderingMode {
    unsigned ArgSize = 0;
    const Loop *L = nullptr;
    OperandData &getData(unsigned OpIdx, unsigned Lane) {
      return OpsVec[OpIdx][Lane];
    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
      return OpsVec[OpIdx][Lane];
      for (unsigned OpIdx = 0, NumOperands = getNumOperands();
        for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
          OpsVec[OpIdx][Lane].IsUsed = false;
    void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
      std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
      Value *IdxLaneV = getData(Idx, Lane).V;
      unsigned UniquesCount = Uniques.size();
      auto IdxIt = Uniques.find(IdxLaneV);
      unsigned UniquesCntWithIdxLaneV =
          IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      auto OpIdxIt = Uniques.find(OpIdxLaneV);
      unsigned UniquesCntWithOpIdxLaneV =
          OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
      return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
                          UniquesCntWithOpIdxLaneV,
                      UniquesCntWithOpIdxLaneV -
             ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
                  ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
                  : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
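// Illustrative note (not part of the original file): bit_ceil/bit_floor above
// measure how far a unique-value count is from a power of two, since shuffles
// are cheapest at power-of-two sizes. E.g. with 5 unique values,
// bit_ceil(5) - 5 == 3 and 5 - bit_floor(5) == 1, so the score prefers the
// candidate that moves the count toward 4 rather than toward 8.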
    int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      return R.areAllUsersVectorized(IdxLaneI)
    static const int ScoreScaleFactor = 10;
                          int Lane, unsigned OpIdx, unsigned Idx,
        int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
        if (Score <= -SplatScore) {
          Score += SplatScore;
        Score *= ScoreScaleFactor;
        Score += getExternalUseScore(Lane, OpIdx, Idx);
    std::optional<unsigned>
    getBestOperand(unsigned OpIdx, int Lane, int LastLane,
      unsigned NumOperands = getNumOperands();
      Value *OpLastLane = getData(OpIdx, LastLane).V;
      ReorderingMode RMode = ReorderingModes[OpIdx];
      if (RMode == ReorderingMode::Failed)
        return std::nullopt;
      bool OpIdxAPO = getData(OpIdx, Lane).APO;
      std::optional<unsigned> Idx;
          BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
      bool IsUsed = RMode == ReorderingMode::Splat ||
                    RMode == ReorderingMode::Constant ||
                    RMode == ReorderingMode::Load;
      for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
        OperandData &OpData = getData(Idx, Lane);
        bool OpAPO = OpData.APO;
        if (OpAPO != OpIdxAPO)
        case ReorderingMode::Load:
        case ReorderingMode::Opcode: {
          bool LeftToRight = Lane > LastLane;
          Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
          Value *OpRight = (LeftToRight) ? Op : OpLastLane;
          int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                        OpIdx, Idx, IsUsed, UsedLanes);
          if (Score > static_cast<int>(BestOp.Score) ||
              (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
            BestOp.Score = Score;
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
        case ReorderingMode::Constant:
            (!BestOp.Score && L && L->isLoopInvariant(Op))) {
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
        case ReorderingMode::Splat:
          IsUsed = Op == OpLastLane;
          if (Op == OpLastLane) {
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
        case ReorderingMode::Failed:
        getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
      return std::nullopt;
    unsigned getBestLaneToStartReordering() const {
      unsigned Min = UINT_MAX;
      unsigned SameOpNumber = 0;
      for (int I = getNumLanes(); I > 0; --I) {
        unsigned Lane = I - 1;
        OperandsOrderData NumFreeOpsHash =
            getMaxNumOperandsThatCanBeReordered(Lane);
        if (NumFreeOpsHash.NumOfAPOs < Min) {
          Min = NumFreeOpsHash.NumOfAPOs;
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
          auto [It, Inserted] =
              HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
      unsigned BestLane = 0;
      unsigned CntMin = UINT_MAX;
        if (Data.second.first < CntMin) {
          CntMin = Data.second.first;
          BestLane = Data.second.second;
    struct OperandsOrderData {
      unsigned NumOfAPOs = UINT_MAX;
      unsigned NumOpsWithSameOpcodeParent = 0;
    OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
      unsigned CntTrue = 0;
      unsigned NumOperands = getNumOperands();
      bool AllUndefs = true;
      unsigned NumOpsWithSameOpcodeParent = 0;
        const OperandData &OpData = getData(OpIdx, Lane);
            I->getParent() != Parent) {
          if (NumOpsWithSameOpcodeParent == 0) {
            NumOpsWithSameOpcodeParent = 1;
          Parent = I->getParent();
          --NumOpsWithSameOpcodeParent;
          ++NumOpsWithSameOpcodeParent;
      OperandsOrderData Data;
      Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
      Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
                        const InstructionsState &S) {
        return VL.size() == getNumLanes();
             "Expected same number of lanes");
      assert(S.valid() && "InstructionsState is invalid.");
      OpsVec.resize(ArgSize);
      unsigned NumLanes = VL.size();
      for (OperandDataVec &Ops : OpsVec)
        Ops.resize(NumLanes);
        bool IsInverseOperation = false;
        if (S.isCopyableElement(VL[Lane])) {
          assert(I && "Expected instruction");
          auto [SelectedOp, Ops] = convertTo(I, S);
          bool APO = (OpIdx == 0) ? false : IsInverseOperation;
    unsigned getNumOperands() const { return ArgSize; }
    unsigned getNumLanes() const { return OpsVec[0].size(); }
    Value *getValue(unsigned OpIdx, unsigned Lane) const {
      return getData(OpIdx, Lane).V;
    bool empty() const { return OpsVec.empty(); }
    void clear() { OpsVec.clear(); }
    bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
             "Op is expected to be getValue(OpIdx, Lane).");
      bool OpAPO = getData(OpIdx, Lane).APO;
      bool IsInvariant = L && L->isLoopInvariant(Op);
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        bool FoundCandidate = false;
        for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
          OperandData &Data = getData(OpI, Ln);
          if (Data.APO != OpAPO || Data.IsUsed)
          Value *OpILane = getValue(OpI, Lane);
              L->isLoopInvariant(Data.V))) {
            FoundCandidate = true;
        if (!FoundCandidate)
      return getNumLanes() == 2 || Cnt > 1;
3196 "Op is expected to be getValue(OpIdx, Lane).");
3197 bool OpAPO = getData(
OpIdx, Lane).APO;
3198 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3202 const OperandData &
Data = getData(OpI, Ln);
3203 if (
Data.APO != OpAPO ||
Data.IsUsed)
3205 Value *OpILn = getValue(OpI, Ln);
3206 return (L && L->isLoopInvariant(OpILn)) ||
               const InstructionsState &S, const BoUpSLP &R)
        : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
          L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
      appendOperands(RootVL, Operands, S);
             "Expected same num of lanes across all operands");
      for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
        OpVL[Lane] = OpsVec[OpIdx][Lane].V;
      unsigned NumOperands = getNumOperands();
      unsigned NumLanes = getNumLanes();
      unsigned FirstLane = getBestLaneToStartReordering();
        if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
            !canBeVectorized(OpILane0, OpIdx, FirstLane))
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
          ReorderingModes[OpIdx] = ReorderingMode::Load;
          ReorderingModes[OpIdx] = ReorderingMode::Opcode;
          ReorderingModes[OpIdx] = ReorderingMode::Constant;
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
      auto &&SkipReordering = [this]() {
        for (const OperandData &Data : Op0)
             ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
          if (any_of(Op, [&UniqueValues](const OperandData &Data) {
        return UniqueValues.size() != 2 &&
                       UniqueValues.size());
      if (SkipReordering())
      bool StrategyFailed = false;
      for (unsigned I = 0; I < NumOperands; ++I)
        MainAltOps[I].push_back(getData(I, FirstLane).V);
      UsedLanes.set(FirstLane);
      for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
        for (int Direction : {+1, -1}) {
          int Lane = FirstLane + Direction * Distance;
          if (Lane < 0 || Lane >= (int)NumLanes)
          UsedLanes.set(Lane);
          int LastLane = Lane - Direction;
          assert(LastLane >= 0 && LastLane < (int)NumLanes &&
            std::optional<unsigned> BestIdx =
                getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
                               MainAltOps[OpIdx], UsedLanes);
              swap(OpIdx, *BestIdx, Lane);
              StrategyFailed = true;
              OperandData &AltOp = getData(OpIdx, Lane);
              InstructionsState OpS =
              if (OpS && OpS.isAltShuffle())
      if (!StrategyFailed)
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
      case ReorderingMode::Load:
      case ReorderingMode::Opcode:
      case ReorderingMode::Constant:
      case ReorderingMode::Splat:
      case ReorderingMode::Failed:
      const unsigned Indent = 2;
      for (const OperandDataVec &OpDataVec : OpsVec) {
        OS << "Operand " << Cnt++ << "\n";
        for (const OperandData &OpData : OpDataVec) {
          OS.indent(Indent) << "{";
          if (Value *V = OpData.V)
          OS << ", APO:" << OpData.APO << "}\n";
      int BestScore = Limit;
      std::optional<int> Index;
      for (int I : seq<int>(0, Candidates.size())) {
                                 Candidates[I].second,
        if (Score > BestScore) {
    DeletedInstructions.insert(I);
  template <typename T>
      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
    for (T *V : DeadVals) {
    for (T *V : DeadVals) {
      if (!V || !Processed.insert(V).second)
      for (Use &U : I->operands()) {
            OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
               return Entry->VectorizedValue == OpI;
      I->dropAllReferences();
    for (T *V : DeadVals) {
      if (!I->getParent())
            cast<Instruction>(U.getUser()));
             "trying to erase instruction with users.");
      I->removeFromParent();
    while (!DeadInsts.empty()) {
      if (!VI || !VI->getParent())
             "Live instruction found in dead worklist!");
      assert(VI->use_empty() && "Instructions with uses are not dead.");
      for (Use &OpU : VI->operands()) {
        Value *OpV = OpU.get();
        if (!DeletedInstructions.contains(OpI) &&
            (!OpI->getType()->isVectorTy() ||
             none_of(VectorValuesAndScales,
                     [&](const std::tuple<Value *, unsigned, bool> &V) {
                       return std::get<0>(V) == OpI;
      VI->removeFromParent();
      SE->forgetValue(VI);
    return AnalyzedReductionsRoots.count(I);
    AnalyzedReductionsRoots.insert(I);
    return AnalyzedReductionVals.contains(hash_value(VL));
    AnalyzedReductionVals.insert(hash_value(VL));
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();
    return MustGather.contains(V);
    return NonScheduledFirst.contains(V);
    assert(V && "V cannot be nullptr.");
    return ScalarToTreeEntries.contains(V);
  bool collectValuesToDemote(
      const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
      bool &IsProfitableToDemote, bool IsTruncRoot) const;
  void buildReorderableOperands(
  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
  bool areAllUsersVectorized(
  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
  TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
    return const_cast<TreeEntry *>(
        getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
  Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
  getCastContextHint(const TreeEntry &TE) const;
                 const InstructionsState &LocalState,
                 unsigned InterleaveFactor = 0);
                 bool ResizeAllowed = false) const;
  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
  Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
  Instruction &getLastInstructionInBundle(const TreeEntry *E);
  std::optional<TargetTransformInfo::ShuffleKind>
                                  unsigned NumParts) const;
  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(
  isGatherShuffledEntry(
      unsigned NumParts, bool ForOrder = false);
                                Type *ScalarTy) const;
  void setInsertPointAfterBundle(const TreeEntry *E);
  bool isFullyVectorizableTinyTree(bool ForReduction) const;
  void tryToVectorizeGatheredLoads(
      std::tuple<BasicBlock *, Value *, Type *>,
  collectUserStores(const BoUpSLP::TreeEntry *TE) const;
  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
  void reorderGatherNode(TreeEntry &TE);
    TreeEntry(VecTreeTy &Container) : Container(Container) {}
      if (State == TreeEntry::SplitVectorize)
      SmallVector<int> Mask;
    SmallVector<int> getSplitMask() const {
      assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
             "Expected only split vectorize node.");
      unsigned CommonVF = std::max<unsigned>(
          CombinedEntriesWithIndices.back().second,
          Scalars.size() - CombinedEntriesWithIndices.back().second);
      for (auto [Idx, I] : enumerate(ReorderIndices))
            Idx + (Idx >= CombinedEntriesWithIndices.back().second
                       ? CommonVF - CombinedEntriesWithIndices.back().second
    void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
                          ArrayRef<int> MaskOrder);
      if (Mask.size() != VL.size() && VL.size() == Scalars.size())
        return std::equal(VL.begin(), VL.end(), Scalars.begin());
                    [Scalars](Value *V, int Idx) {
                      return (isa<UndefValue>(V) &&
                              Idx == PoisonMaskElem) ||
                             (Idx != PoisonMaskElem && V == Scalars[Idx]);
      if (!ReorderIndices.empty()) {
        SmallVector<int> Mask;
        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          return IsSame(Scalars, Mask);
      return IsSame(Scalars, ReuseShuffleIndices);
    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
      SmallBitVector Used(getNumOperands());
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (getOperand(K) == TE.getOperand(I)) {
        if (PrevCount == Used.count())
    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    bool isGather() const { return State == NeedToGather; }
    WeakTrackingVH VectorizedValue = nullptr;
    enum CombinedOpcode {
      MinMax = Instruction::OtherOpsEnd + 1,
    CombinedOpcode CombinedOp = NotCombinedOp;
    SmallVector<int, 4> ReuseShuffleIndices;
    SmallVector<unsigned, 4> ReorderIndices;
    VecTreeTy &Container;
    EdgeInfo UserTreeIndex;
    SmallPtrSet<const Value *, 4> CopyableElements;
    InstructionsState S = InstructionsState::invalid();
    unsigned InterleaveFactor = 0;
    bool DoesNotNeedToSchedule = false;
      if (Operands.size() < OpIdx + 1)
        Operands.resize(OpIdx + 1);
             "Number of operands is greater than the number of scalars.");
    unsigned getInterleaveFactor() const { return InterleaveFactor; }
    void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
    void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
    bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
        setOperand(I, Operands[I]);
    void reorderOperands(ArrayRef<int> Mask) {
      return Operands[OpIdx];
      return Operands[OpIdx];
    unsigned getNumOperands() const { return Operands.size(); }
    Value *getSingleOperand(unsigned OpIdx) const {
      return Operands[OpIdx][0];
    bool isAltShuffle() const { return S.isAltShuffle(); }
    Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
      return S.getMatchingMainOpOrAltOp(I);
      if (I && getMatchingMainOpOrAltOp(I))
      return S.getMainOp();
    void setOperations(const InstructionsState &S) {
      assert(S && "InstructionsState is invalid.");
    Instruction *getMainOp() const { return S.getMainOp(); }
    Instruction *getAltOp() const { return S.getAltOp(); }
    unsigned getOpcode() const { return S.getOpcode(); }
    unsigned getAltOpcode() const { return S.getAltOpcode(); }
    bool hasState() const { return S.valid(); }
    void addCopyableElement(Value *V) {
      assert(S.isCopyableElement(V) && "Not a copyable element.");
      CopyableElements.insert(V);
    bool isCopyableElement(Value *V) const {
      return CopyableElements.contains(V);
    bool hasCopyableElements() const { return !CopyableElements.empty(); }
    const InstructionsState &getOperations() const { return S; }
    unsigned findLaneForValue(Value *V) const {
      unsigned FoundLane = getVectorFactor();
      for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
           std::advance(It, 1)) {
        FoundLane = std::distance(Scalars.begin(), It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())
        if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
      assert(FoundLane < getVectorFactor() && "Unable to find given value.");
    buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
                          SmallVectorImpl<int> &Mask,
                          SmallVectorImpl<Value *> *OpScalars = nullptr,
                          SmallVectorImpl<Value *> *AltScalars = nullptr) const;
    bool isNonPowOf2Vec() const {
      return IsNonPowerOf2;
    hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;
    Value *getOrdered(unsigned Idx) const {
      assert(isGather() && "Must be used only for buildvectors/gathers.");
      if (ReorderIndices.empty())
        return Scalars[Idx];
      SmallVector<int> Mask;
      return Scalars[Mask[Idx]];
      dbgs() << Idx << ".\n";
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
      dbgs() << "State: ";
      if (S && hasCopyableElements())
        dbgs() << "[[Copyable]] ";
        if (InterleaveFactor > 0) {
          dbgs() << "Vectorize with interleave factor " << InterleaveFactor
          dbgs() << "Vectorize\n";
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
      case CompressVectorize:
        dbgs() << "CompressVectorize\n";
        dbgs() << "NeedToGather\n";
      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";
      case SplitVectorize:
        dbgs() << "SplitVectorize\n";
        dbgs() << "MainOp: " << *S.getMainOp() << "\n";
        dbgs() << "AltOp: " << *S.getAltOp() << "\n";
        dbgs() << "MainOp: NULL\n";
        dbgs() << "AltOp: NULL\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
      for (int ReuseIdx : ReuseShuffleIndices)
        dbgs() << ReuseIdx << ", ";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "UserTreeIndex: ";
        dbgs() << UserTreeIndex;
        dbgs() << "<invalid>";
      if (!CombinedEntriesWithIndices.empty()) {
        dbgs() << "Combined entries: ";
          dbgs() << "Entry index " << P.first << " with offset " << P.second;
                     StringRef Banner) const {
    dbgs() << "SLP: " << Banner << ":\n";
    dbgs() << "SLP: Costs:\n";
    dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
    dbgs() << "SLP: VectorCost = " << VecCost << "\n";
    dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
    dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
           << ReuseShuffleCost + VecCost - ScalarCost << "\n";
                          const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {}) {
    auto Invalid = ScheduleBundle::invalid();
    return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);
                          const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {},
                          unsigned InterleaveFactor = 0) {
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                                ReuseShuffleIndices, ReorderIndices);
    if (E && InterleaveFactor > 0)
      E->setInterleave(InterleaveFactor);
                          TreeEntry::EntryState EntryState,
                          ScheduleBundle &Bundle, const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {}) {
    assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
                         EntryState == TreeEntry::SplitVectorize)) ||
            (Bundle && EntryState != TreeEntry::NeedToGather &&
             EntryState != TreeEntry::SplitVectorize)) &&
           "Need to vectorize gather entry?");
    if (GatheredLoadsEntriesFirst.has_value() &&
        EntryState == TreeEntry::NeedToGather && S &&
        S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
        !UserTreeIdx.UserTE)
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    if (UserTreeIdx.UserTE)
      OperandsToTreeEntry.try_emplace(
          std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
            ReuseShuffleIndices.empty()) &&
           "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->setOperations(S);
      Last->Scalars.assign(VL.size(), nullptr);
                [VL](unsigned Idx) -> Value * {
                  if (Idx >= VL.size())
                    return UndefValue::get(VL.front()->getType());
      Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    if (EntryState == TreeEntry::SplitVectorize) {
      assert(S && "Split nodes must have operations.");
      Last->setOperations(S);
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        auto It = ScalarsInSplitNodes.find(V);
        if (It == ScalarsInSplitNodes.end()) {
          ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
    } else if (!Last->isGather()) {
          (!S.areInstructionsWithCopyableElements() &&
          all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
        Last->setDoesNotNeedToSchedule();
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        if (S.isCopyableElement(V)) {
          Last->addCopyableElement(V);
        auto It = ScalarToTreeEntries.find(V);
        if (It == ScalarToTreeEntries.end()) {
          ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
      assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
             "Bundle and VL out of sync");
      if (!Bundle.getBundle().empty()) {
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
        auto *BundleMember = Bundle.getBundle().begin();
        SmallPtrSet<Value *, 4> Processed;
        for (Value *V : VL) {
          if (S.isNonSchedulable(V) || !Processed.insert(V).second)
        assert(BundleMember == Bundle.getBundle().end() &&
               "Bundle and VL out of sync");
        Bundle.setTreeEntry(Last);
      bool AllConstsOrCasts = true;
      for (Value *V : VL) {
        if (S && S.areInstructionsWithCopyableElements() &&
            S.isCopyableElement(V))
          Last->addCopyableElement(V);
        AllConstsOrCasts &= I && I->getType()->isIntegerTy();
        if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
            !UserTreeIdx.UserTE->isGather())
          ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
      if (AllConstsOrCasts)
          std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert_range(VL);
    if (UserTreeIdx.UserTE)
      Last->UserTreeIndex = UserTreeIdx;
  TreeEntry::VecTreeTy VectorizableTree;
    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();
    assert(V && "V cannot be nullptr.");
    auto It = ScalarToTreeEntries.find(V);
    if (It == ScalarToTreeEntries.end())
    return It->getSecond();
    assert(V && "V cannot be nullptr.");
    auto It = ScalarsInSplitNodes.find(V);
    if (It == ScalarsInSplitNodes.end())
    return It->getSecond();
                                      bool SameVF = false) const {
    assert(V && "V cannot be nullptr.");
    for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
      if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
  bool areAltOperandsProfitable(const InstructionsState &S,

  class ScalarsVectorizationLegality {
    InstructionsState S;
    bool TryToFindDuplicates;
    bool TrySplitVectorize;
    ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
                                 bool TryToFindDuplicates = true,
                                 bool TrySplitVectorize = false)
        : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
          TrySplitVectorize(TrySplitVectorize) {
      assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
             "Inconsistent state");
    const InstructionsState &getInstructionsState() const { return S; };
    bool isLegal() const { return IsLegal; }
    bool tryToFindDuplicates() const { return TryToFindDuplicates; }
    bool trySplitVectorize() const { return TrySplitVectorize; }

  ScalarsVectorizationLegality
                              bool TryCopyableElementsVectorization) const;
  TreeEntry::EntryState getScalarsVectorizationState(
      bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
      SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
  SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
      OperandsToTreeEntry;
  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
  SmallDenseMap<Value *, unsigned> InstrElementSize;
  SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
  SetVector<const TreeEntry *> PostponedGathers;
  using ValueToGatherNodesMap =
      DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
  ValueToGatherNodesMap ValueToGatherNodes;
  SetVector<unsigned> LoadEntriesToVectorize;
  bool IsGraphTransformMode = false;
  std::optional<unsigned> GatheredLoadsEntriesFirst;
  SmallDenseMap<const TreeEntry *,
                std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
      CompressEntryToData;
  struct ExternalUser {
    ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
        : Scalar(S), User(U), E(E), Lane(L) {}
    Value *Scalar = nullptr;
    llvm::User *User = nullptr;
  using UserList = SmallVector<ExternalUser, 16>;
  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
                 Instruction *Inst2) {
    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
    auto Res = AliasCache.try_emplace(Key);
      return Res.first->second;
    bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
    Res.first->getSecond() = Aliased;
  using AliasCacheKey = std::pair<Instruction *, Instruction *>;
  SmallDenseMap<AliasCacheKey, bool> AliasCache;
  BatchAAResults BatchAA;
4610 DenseSet<Instruction *> DeletedInstructions;
4613 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4616 DenseSet<size_t> AnalyzedReductionVals;
4620 DenseSet<Value *> AnalyzedMinBWVals;
4626 UserList ExternalUses;
4630 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4634 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
4637 SmallPtrSet<const Value *, 32> EphValues;
4641 SetVector<Instruction *> GatherShuffleExtractSeq;
4644 DenseSet<BasicBlock *> CSEBlocks;
4647 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4654 class ScheduleEntity {
4655 friend class ScheduleBundle;
4656 friend class ScheduleData;
4657 friend class ScheduleCopyableData;
4660 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
4661 Kind getKind()
const {
return K; }
4662 ScheduleEntity(Kind K) : K(K) {}
4666 int SchedulingPriority = 0;
4669 bool IsScheduled =
false;
4671 const Kind K = Kind::ScheduleData;
4674 ScheduleEntity() =
delete;
4676 void setSchedulingPriority(
int Priority) { SchedulingPriority = Priority; }
4677 int getSchedulingPriority()
const {
return SchedulingPriority; }
4678 bool isReady()
const {
4680 return SD->isReady();
4682 return CD->isReady();
4688 bool hasValidDependencies()
const {
4690 return SD->hasValidDependencies();
4692 return CD->hasValidDependencies();
4696 int getUnscheduledDeps()
const {
4698 return SD->getUnscheduledDeps();
4700 return CD->getUnscheduledDeps();
4704 int incrementUnscheduledDeps(
int Incr) {
4706 return SD->incrementUnscheduledDeps(Incr);
4710 int getDependencies()
const {
4712 return SD->getDependencies();
4718 return SD->getInst();
4723 bool isScheduled()
const {
return IsScheduled; }
4724 void setScheduled(
bool Scheduled) { IsScheduled = Scheduled; }
4726 static bool classof(
const ScheduleEntity *) {
return true; }
4728#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4729 void dump(raw_ostream &OS)
const {
4731 return SD->dump(OS);
4733 return CD->dump(OS);
4744#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4746 const BoUpSLP::ScheduleEntity &SE) {
4756 class ScheduleData final :
public ScheduleEntity {
4760 enum { InvalidDeps = -1 };
4762 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4763 static bool classof(
const ScheduleEntity *Entity) {
4764 return Entity->getKind() == Kind::ScheduleData;
4767 void init(
int BlockSchedulingRegionID, Instruction *
I) {
4768 NextLoadStore =
nullptr;
4769 IsScheduled =
false;
4770 SchedulingRegionID = BlockSchedulingRegionID;
4771 clearDependencies();
4777 if (hasValidDependencies()) {
4778 assert(UnscheduledDeps <= Dependencies &&
"invariant");
4780 assert(UnscheduledDeps == Dependencies &&
"invariant");
4784 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4785 "unexpected scheduled state");
4792 bool hasValidDependencies()
const {
return Dependencies != InvalidDeps; }
4796 bool isReady()
const {
return UnscheduledDeps == 0 && !IsScheduled; }
4801 int incrementUnscheduledDeps(
int Incr) {
4802 assert(hasValidDependencies() &&
4803 "increment of unscheduled deps would be meaningless");
4804 UnscheduledDeps += Incr;
4805 return UnscheduledDeps;
4810 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4813 void clearDependencies() {
4814 clearDirectDependencies();
4815 MemoryDependencies.clear();
4816 ControlDependencies.clear();
4823 void clearDirectDependencies() {
4824 Dependencies = InvalidDeps;
4825 resetUnscheduledDeps();
4826 IsScheduled =
false;
4830 int getUnscheduledDeps()
const {
return UnscheduledDeps; }
4832 int getDependencies()
const {
return Dependencies; }
4834 void initDependencies() { Dependencies = 0; }
4836 void incDependencies() { Dependencies++; }
4839 int getSchedulingRegionID()
const {
return SchedulingRegionID; }
4846 return MemoryDependencies;
4849 void addMemoryDependency(ScheduleData *Dep) {
4850 MemoryDependencies.push_back(Dep);
4854 return ControlDependencies;
4857 void addControlDependency(ScheduleData *Dep) {
4858 ControlDependencies.push_back(Dep);
4861 ScheduleData *getNextLoadStore()
const {
return NextLoadStore; }
4862 void setNextLoadStore(ScheduleData *
Next) { NextLoadStore =
Next; }
4864 void dump(raw_ostream &OS)
const { OS << *Inst; }
4876 ScheduleData *NextLoadStore =
nullptr;
4880 SmallVector<ScheduleData *> MemoryDependencies;
4886 SmallVector<ScheduleData *> ControlDependencies;
4890 int SchedulingRegionID = 0;
4896 int Dependencies = InvalidDeps;
4902 int UnscheduledDeps = InvalidDeps;
4907 const BoUpSLP::ScheduleData &SD) {
4913 class ScheduleBundle final :
public ScheduleEntity {
4917 bool IsValid =
true;
4919 TreeEntry *TE =
nullptr;
4920 ScheduleBundle(
bool IsValid)
4921 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
4924 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
4925 static bool classof(
const ScheduleEntity *Entity) {
4926 return Entity->getKind() == Kind::ScheduleBundle;
4931 for (
const ScheduleEntity *SD : Bundle) {
4932 if (SD->hasValidDependencies()) {
4933 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
4936 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
4940 if (isScheduled()) {
4941 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
4942 "unexpected scheduled state");
4948 int unscheduledDepsInBundle()
const {
4949 assert(*
this &&
"bundle must not be empty");
4951 for (
const ScheduleEntity *BundleMember : Bundle) {
4952 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
4953 return ScheduleData::InvalidDeps;
4954 Sum += BundleMember->getUnscheduledDeps();
4962 bool hasValidDependencies()
const {
4963 return all_of(Bundle, [](
const ScheduleEntity *SD) {
4964 return SD->hasValidDependencies();
4970 bool isReady()
const {
4971 assert(*
this &&
"bundle must not be empty");
4972 return unscheduledDepsInBundle() == 0 && !isScheduled();
4980 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
4983 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
4984 TreeEntry *getTreeEntry()
const {
return TE; }
4986 static ScheduleBundle invalid() {
return {
false}; }
4988 operator bool()
const {
return IsValid; }
4991 void dump(raw_ostream &OS)
const {
5000 OS << *SD->getInst();
5014 const BoUpSLP::ScheduleBundle &Bundle) {
5025 class ScheduleCopyableData final :
public ScheduleEntity {
5032 int SchedulingRegionID = 0;
5034 ScheduleBundle &Bundle;
5037 ScheduleCopyableData(
int BlockSchedulingRegionID,
Instruction *
I,
5038 const EdgeInfo &EI, ScheduleBundle &Bundle)
5039 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(
I), EI(EI),
5040 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5041 static bool classof(
const ScheduleEntity *Entity) {
5042 return Entity->getKind() == Kind::ScheduleCopyableData;
5047 if (hasValidDependencies()) {
5048 assert(UnscheduledDeps <= Dependencies &&
"invariant");
5050 assert(UnscheduledDeps == Dependencies &&
"invariant");
5054 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5055 "unexpected scheduled state");
5062 bool hasValidDependencies()
const {
5063 return Dependencies != ScheduleData::InvalidDeps;
5068 bool isReady()
const {
return UnscheduledDeps == 0 && !IsScheduled; }
5073 int incrementUnscheduledDeps(
int Incr) {
5074 assert(hasValidDependencies() &&
5075 "increment of unscheduled deps would be meaningless");
5076 UnscheduledDeps += Incr;
5077 assert(UnscheduledDeps >= 0 &&
"invariant");
5078 return UnscheduledDeps;
5083 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5086 int getUnscheduledDeps()
const {
return UnscheduledDeps; }
5088 int getDependencies()
const {
return Dependencies; }
5090 void initDependencies() { Dependencies = 0; }
5092 void incDependencies() { Dependencies++; }
5095 int getSchedulingRegionID()
const {
return SchedulingRegionID; }
5101 void clearDependencies() {
5102 Dependencies = ScheduleData::InvalidDeps;
5103 UnscheduledDeps = ScheduleData::InvalidDeps;
5104 IsScheduled =
false;
5108 const EdgeInfo &getEdgeInfo()
const {
return EI; }
5111 ScheduleBundle &getBundle() {
return Bundle; }
5112 const ScheduleBundle &getBundle()
const {
return Bundle; }
5114#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5115 void dump(raw_ostream &OS)
const { OS <<
"[Copyable]" << *getInst(); }
5126 int Dependencies = ScheduleData::InvalidDeps;
5132 int UnscheduledDeps = ScheduleData::InvalidDeps;
5162 struct BlockScheduling {
5164 : BB(BB), ChunkSize(BB->
size()), ChunkPos(ChunkSize) {}
5167 ScheduledBundles.clear();
5168 ScheduledBundlesList.
clear();
5169 ScheduleCopyableDataMap.clear();
5170 ScheduleCopyableDataMapByInst.clear();
5171 ScheduleCopyableDataMapByInstUser.clear();
5172 ScheduleCopyableDataMapByUsers.clear();
5174 ScheduleStart =
nullptr;
5175 ScheduleEnd =
nullptr;
5176 FirstLoadStoreInRegion =
nullptr;
5177 LastLoadStoreInRegion =
nullptr;
5178 RegionHasStackSave =
false;
5182 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5185 ScheduleRegionSize = 0;
5189 ++SchedulingRegionID;
5195 if (BB !=
I->getParent())
5198 ScheduleData *SD = ScheduleDataMap.lookup(
I);
5199 if (SD && isInSchedulingRegion(*SD))
5204 ScheduleData *getScheduleData(
Value *V) {
5210 ScheduleCopyableData *getScheduleCopyableData(
const EdgeInfo &EI,
5211 const Value *V)
const {
5212 if (ScheduleCopyableDataMap.empty())
5214 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5215 if (It == ScheduleCopyableDataMap.end())
5217 ScheduleCopyableData *SD = It->getSecond().get();
5218 if (!isInSchedulingRegion(*SD))
5226 getScheduleCopyableData(
const Value *User,
unsigned OperandIdx,
5228 if (ScheduleCopyableDataMapByInstUser.empty())
5230 const auto It = ScheduleCopyableDataMapByInstUser.find(
5231 std::make_pair(std::make_pair(User, OperandIdx), V));
5232 if (It == ScheduleCopyableDataMapByInstUser.end())
5235 for (ScheduleCopyableData *SD : It->getSecond()) {
5236 if (isInSchedulingRegion(*SD))
5250 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5254 if (ScheduleCopyableDataMap.empty())
5256 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5257 SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
5258 for (
const Use &U :
User->operands()) {
5262 if (Entries.
empty())
5266 for (TreeEntry *TE : Entries) {
5272 bool IsCommutativeUser =
5275 EdgeInfo EI(TE,
U.getOperandNo());
5278 OrderedEntriesCount.
try_emplace(TE, 0).first->getSecond();
5279 if (!getScheduleCopyableData(EI,
Op) && OpCnt <
NumOps)
5285 ++PotentiallyReorderedEntriesCount.
try_emplace(TE, 0)
5286 .first->getSecond();
5290 if (!PotentiallyReorderedEntriesCount.
empty()) {
5291 for (
auto &
P : PotentiallyReorderedEntriesCount) {
5292 auto *It =
find(
P.first->Scalars, User);
5293 assert(It !=
P.first->Scalars.end() &&
5294 "User is not in the tree entry");
5295 int Lane = std::distance(
P.first->Scalars.begin(), It);
5296 assert(Lane >= 0 &&
"Lane is not found");
5298 Lane =
P.first->ReorderIndices[Lane];
5299 assert(Lane <
static_cast<int>(
P.first->Scalars.size()) &&
5300 "Couldn't find extract lane");
5301 SmallVector<unsigned> OpIndices;
5302 for (
unsigned OpIdx :
5304 P.first->getMainOp()))) {
5305 if (
P.first->getOperand(
OpIdx)[Lane] ==
Op &&
5306 getScheduleCopyableData(EdgeInfo(
P.first,
OpIdx),
Op))
5310 return all_of(PotentiallyReorderedEntriesCount,
5311 [&](
const std::pair<const TreeEntry *, unsigned> &
P) {
5312 return P.second ==
NumOps - 1;
5319 getScheduleCopyableData(
const Instruction *
I)
const {
5320 if (ScheduleCopyableDataMapByInst.empty())
5322 const auto It = ScheduleCopyableDataMapByInst.find(
I);
5323 if (It == ScheduleCopyableDataMapByInst.end())
5326 for (ScheduleCopyableData *SD : It->getSecond()) {
5327 if (isInSchedulingRegion(*SD))
5334 getScheduleCopyableDataUsers(
const Instruction *User)
const {
5335 if (ScheduleCopyableDataMapByUsers.empty())
5337 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5338 if (It == ScheduleCopyableDataMapByUsers.end())
5341 for (ScheduleCopyableData *SD : It->getSecond()) {
5342 if (isInSchedulingRegion(*SD))
5348 ScheduleCopyableData &addScheduleCopyableData(
const EdgeInfo &EI,
5350 int SchedulingRegionID,
5351 ScheduleBundle &Bundle) {
5352 assert(!getScheduleCopyableData(EI,
I) &&
"already in the map");
5353 ScheduleCopyableData *CD =
5354 ScheduleCopyableDataMap
5355 .try_emplace(std::make_pair(EI,
I),
5356 std::make_unique<ScheduleCopyableData>(
5357 SchedulingRegionID,
I, EI, Bundle))
5360 ScheduleCopyableDataMapByInst[
I].push_back(CD);
5364 assert(It !=
Op.end() &&
"Lane not set");
5365 SmallPtrSet<Instruction *, 4> Visited;
5367 int Lane = std::distance(
Op.begin(), It);
5368 assert(Lane >= 0 &&
"Lane not set");
5370 !EI.UserTE->ReorderIndices.empty())
5371 Lane = EI.UserTE->ReorderIndices[Lane];
5372 assert(Lane <
static_cast<int>(EI.UserTE->Scalars.size()) &&
5373 "Couldn't find extract lane");
5375 if (!Visited.
insert(In).second) {
5379 ScheduleCopyableDataMapByInstUser
5380 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx),
I))
5383 ScheduleCopyableDataMapByUsers.try_emplace(
I)
5390 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5391 if (ScheduleCopyableData *UserCD =
5392 getScheduleCopyableData(UserEI, In))
5393 ScheduleCopyableDataMapByUsers[
I].remove(UserCD);
5396 }
while (It !=
Op.end());
5398 ScheduleCopyableDataMapByUsers.try_emplace(
I).first->getSecond().insert(
5408 auto It = ScheduledBundles.find(
I);
5409 if (It == ScheduledBundles.end())
5411 return It->getSecond();
5415 bool isInSchedulingRegion(
const ScheduleEntity &SD)
const {
5417 return Data->getSchedulingRegionID() == SchedulingRegionID;
5419 return CD->getSchedulingRegionID() == SchedulingRegionID;
5421 [&](
const ScheduleEntity *BundleMember) {
5422 return isInSchedulingRegion(*BundleMember);
5428 template <
typename ReadyListType>
5429 void schedule(
const BoUpSLP &R,
const InstructionsState &S,
5430 const EdgeInfo &EI, ScheduleEntity *
Data,
5431 ReadyListType &ReadyList) {
5432 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5437 auto DecrUnsched = [&](
auto *
Data,
bool IsControl =
false) {
5438 if ((IsControl ||
Data->hasValidDependencies()) &&
5439 Data->incrementUnscheduledDeps(-1) == 0) {
5446 CopyableBundle.
push_back(&CD->getBundle());
5447 Bundles = CopyableBundle;
5449 Bundles = getScheduleBundles(
Data->getInst());
5451 if (!Bundles.
empty()) {
5452 for (ScheduleBundle *Bundle : Bundles) {
5453 if (Bundle->unscheduledDepsInBundle() == 0) {
5454 assert(!Bundle->isScheduled() &&
5455 "already scheduled bundle gets ready");
5456 ReadyList.insert(Bundle);
5458 <<
"SLP: gets ready: " << *Bundle <<
"\n");
5464 "already scheduled bundle gets ready");
5466 "Expected non-copyable data");
5467 ReadyList.insert(
Data);
5474 if (!ScheduleCopyableDataMap.empty()) {
5476 getScheduleCopyableData(User,
OpIdx,
I);
5477 for (ScheduleCopyableData *CD : CopyableData)
5478 DecrUnsched(CD,
false);
5479 if (!CopyableData.empty())
5482 if (ScheduleData *OpSD = getScheduleData(
I))
5483 DecrUnsched(OpSD,
false);
5489 if (!Bundles.empty()) {
5490 auto *
In = BundleMember->getInst();
5492 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5493 unsigned TotalOpCount = 0;
5496 TotalOpCount = OperandsUses[
In] = 1;
5498 for (
const Use &U :
In->operands()) {
5501 ++Res.first->getSecond();
5508 auto DecrUnschedForInst = [&](
Instruction *
I, TreeEntry *UserTE,
5510 if (!ScheduleCopyableDataMap.empty()) {
5511 const EdgeInfo EI = {UserTE,
OpIdx};
5512 if (ScheduleCopyableData *CD = getScheduleCopyableData(EI,
I)) {
5513 DecrUnsched(CD,
false);
5517 auto It = OperandsUses.
find(
I);
5518 assert(It != OperandsUses.
end() &&
"Operand not found");
5519 if (It->second > 0) {
5521 assert(TotalOpCount > 0 &&
"No more operands to decrement");
5523 if (ScheduleData *OpSD = getScheduleData(
I))
5524 DecrUnsched(OpSD,
false);
5528 for (ScheduleBundle *Bundle : Bundles) {
5529 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5533 int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(),
5534 find(Bundle->getTreeEntry()->Scalars, In));
5535 assert(Lane >= 0 &&
"Lane not set");
5537 !Bundle->getTreeEntry()->ReorderIndices.empty())
5538 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5539 assert(Lane <
static_cast<int>(
5540 Bundle->getTreeEntry()->Scalars.size()) &&
5541 "Couldn't find extract lane");
5551 In->getNumOperands() ==
5552 Bundle->getTreeEntry()->getNumOperands() ||
5553 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5554 "Missed TreeEntry operands?");
5556 for (
unsigned OpIdx :
5559 Bundle->getTreeEntry()->getOperand(
OpIdx)[Lane])) {
5562 DecrUnschedForInst(
I, Bundle->getTreeEntry(),
OpIdx);
5568 for (Use &U : BundleMember->getInst()->operands()) {
5571 <<
"SLP: check for readiness (def): " << *
I <<
"\n");
5572 DecrUnschedForInst(BundleMember->getInst(),
U.getOperandNo(),
I);
5580 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
5581 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5582 if (!VisitedMemory.
insert(MemoryDep).second)
5587 << *MemoryDep <<
"\n");
5588 DecrUnsched(MemoryDep);
5591 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
5592 for (ScheduleData *Dep : SD->getControlDependencies()) {
5593 if (!VisitedControl.
insert(Dep).second)
5598 <<
"SLP: check for readiness (ctrl): " << *Dep <<
"\n");
5599 DecrUnsched(Dep,
true);
5603 SD->setScheduled(
true);
5608 if (
R.isVectorized(In)) {
5610 for (TreeEntry *TE : Entries) {
5612 In->getNumOperands() !=
TE->getNumOperands())
5615 PseudoBundles.
emplace_back(std::make_unique<ScheduleBundle>());
5616 BundlePtr->setTreeEntry(TE);
5621 ProcessBundleMember(SD, Bundles);
5624 Bundle.setScheduled(
true);
5626 auto AreAllBundlesScheduled =
5627 [&](
const ScheduleEntity *SD,
5631 return !SDBundles.empty() &&
5632 all_of(SDBundles, [&](
const ScheduleBundle *SDBundle) {
5633 return SDBundle->isScheduled();
5636 for (ScheduleEntity *SD : Bundle.getBundle()) {
5639 SDBundles = getScheduleBundles(SD->getInst());
5640 if (AreAllBundlesScheduled(SD, SDBundles)) {
5641 SD->setScheduled(
true);
5654 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5655 ScheduleStart->comesBefore(ScheduleEnd) &&
5656 "Not a valid scheduling region?");
5658 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
5660 if (!Bundles.
empty()) {
5661 for (ScheduleBundle *Bundle : Bundles) {
5662 assert(isInSchedulingRegion(*Bundle) &&
5663 "primary schedule data not in window?");
5668 auto *SD = getScheduleData(
I);
5671 assert(isInSchedulingRegion(*SD) &&
5672 "primary schedule data not in window?");
5677 [](
const ScheduleEntity *Bundle) {
5678 return Bundle->isReady();
5680 "item in ready list not ready?");
5684 template <
typename ReadyListType>
5685 void initialFillReadyList(ReadyListType &ReadyList) {
5686 SmallPtrSet<ScheduleBundle *, 16> Visited;
5687 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
5688 ScheduleData *SD = getScheduleData(
I);
5689 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5692 for (ScheduleBundle *Bundle : Bundles) {
5693 if (!Visited.
insert(Bundle).second)
5695 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5696 ReadyList.insert(Bundle);
5698 << *Bundle <<
"\n");
5703 ReadyList.insert(SD);
5705 <<
"SLP: initially in ready list: " << *SD <<
"\n");
5716 const InstructionsState &S,
const EdgeInfo &EI);
5723 std::optional<ScheduleBundle *>
5725 const InstructionsState &S,
const EdgeInfo &EI);
5728 ScheduleData *allocateScheduleDataChunks();
5732 bool extendSchedulingRegion(
Value *V,
const InstructionsState &S);
5736 void initScheduleData(Instruction *FromI, Instruction *ToI,
5737 ScheduleData *PrevLoadStore,
5738 ScheduleData *NextLoadStore);
5742 void calculateDependencies(ScheduleBundle &Bundle,
bool InsertInReadyList,
5747 void resetSchedule();
5764 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
5768 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
5769 std::unique_ptr<ScheduleCopyableData>>
5770 ScheduleCopyableDataMap;
5776 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
5777 ScheduleCopyableDataMapByInst;
5783 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>,
const Value *>,
5785 ScheduleCopyableDataMapByInstUser;
5805 SmallSetVector<ScheduleCopyableData *, 4>>
5806 ScheduleCopyableDataMapByUsers;
5809 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
5815 SetVector<ScheduleEntity *> ReadyInsts;
5825 ScheduleData *FirstLoadStoreInRegion =
nullptr;
5829 ScheduleData *LastLoadStoreInRegion =
nullptr;
5834 bool RegionHasStackSave =
false;
5837 int ScheduleRegionSize = 0;
5846 int SchedulingRegionID = 1;
5850 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
5854 void scheduleBlock(
const BoUpSLP &R, BlockScheduling *BS);
5857 const SmallDenseSet<Value *> *UserIgnoreList =
nullptr;
5861 struct OrdersTypeDenseMapInfo {
5874 static unsigned getHashValue(
const OrdersType &V) {
5885 ScalarEvolution *SE;
5886 TargetTransformInfo *TTI;
5887 TargetLibraryInfo *TLI;
5890 AssumptionCache *AC;
5892 const DataLayout *DL;
5893 OptimizationRemarkEmitter *ORE;
5895 unsigned MaxVecRegSize;
5896 unsigned MinVecRegSize;
5899 IRBuilder<TargetFolder> Builder;
5906 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
5911 unsigned ReductionBitWidth = 0;
5914 unsigned BaseGraphSize = 1;
5918 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
5922 DenseSet<unsigned> ExtraBitWidthNodes;
5932 SecondInfo::getEmptyKey());
5937 SecondInfo::getTombstoneKey());
5942 SecondInfo::getHashValue(Val.
EdgeIdx));
5963 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
5974 return R.VectorizableTree[0].get();
5978 return {&
N->UserTreeIndex,
N->Container};
5982 return {&
N->UserTreeIndex + 1,
N->Container};
6009 static unsigned size(
BoUpSLP *R) {
return R->VectorizableTree.size(); }
6020 OS << Entry->Idx <<
".\n";
6023 for (
auto *V : Entry->Scalars) {
6025 if (
llvm::any_of(R->ExternalUses, [&](
const BoUpSLP::ExternalUser &EU) {
6026 return EU.Scalar == V;
6036 if (Entry->isGather())
6038 if (Entry->State == TreeEntry::ScatterVectorize ||
6039 Entry->State == TreeEntry::StridedVectorize ||
6040 Entry->State == TreeEntry::CompressVectorize)
6041 return "color=blue";
6050 for (
auto *
I : DeletedInstructions) {
6051 if (!
I->getParent()) {
6056 I->insertBefore(F->getEntryBlock(),
6057 F->getEntryBlock().getFirstNonPHIIt());
6059 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6062 for (
Use &U :
I->operands()) {
6064 if (
Op && !DeletedInstructions.count(
Op) &&
Op->hasOneUser() &&
6068 I->dropAllReferences();
6070 for (
auto *
I : DeletedInstructions) {
6072 "trying to erase instruction with users.");
6073 I->eraseFromParent();
6079#ifdef EXPENSIVE_CHECKS
6090 assert(!Mask.empty() && Reuses.
size() == Mask.size() &&
6091 "Expected non-empty mask.");
6094 for (
unsigned I = 0,
E = Prev.
size();
I <
E; ++
I)
6096 Reuses[Mask[
I]] = Prev[
I];
6104 bool BottomOrder =
false) {
6105 assert(!Mask.empty() &&
"Expected non-empty mask.");
6106 unsigned Sz = Mask.size();
6109 if (Order.
empty()) {
6111 std::iota(PrevOrder.
begin(), PrevOrder.
end(), 0);
6113 PrevOrder.
swap(Order);
6116 for (
unsigned I = 0;
I < Sz; ++
I)
6118 Order[
I] = PrevOrder[Mask[
I]];
6120 return Data.value() == Sz ||
Data.index() ==
Data.value();
6129 if (Order.
empty()) {
6131 std::iota(MaskOrder.
begin(), MaskOrder.
end(), 0);
6141 for (
unsigned I = 0;
I < Sz; ++
I)
6143 Order[MaskOrder[
I]] =
I;
6147std::optional<BoUpSLP::OrdersType>
6149 bool TopToBottom,
bool IgnoreReorder) {
6150 assert(TE.isGather() &&
"Expected gather node only.");
6154 Type *ScalarTy = GatheredScalars.
front()->getType();
6155 size_t NumScalars = GatheredScalars.
size();
6157 return std::nullopt;
6164 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6166 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6169 if (GatherShuffles.
empty() && ExtractShuffles.
empty())
6170 return std::nullopt;
6171 OrdersType CurrentOrder(NumScalars, NumScalars);
6172 if (GatherShuffles.
size() == 1 &&
6174 Entries.
front().front()->isSame(TE.Scalars)) {
6178 return std::nullopt;
6180 if (Entries.
front().front()->UserTreeIndex.UserTE ==
6181 TE.UserTreeIndex.UserTE)
6182 return std::nullopt;
6185 if (!IgnoreReorder && Entries.
front().front()->Idx == 0)
6186 return std::nullopt;
6189 if (!Entries.
front().front()->ReuseShuffleIndices.empty() &&
6190 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6193 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6195 return std::nullopt;
6199 std::iota(CurrentOrder.
begin(), CurrentOrder.
end(), 0);
6200 return CurrentOrder;
6204 return all_of(Mask, [&](
int I) {
6211 if ((ExtractShuffles.
empty() && IsSplatMask(Mask) &&
6212 (Entries.
size() != 1 ||
6213 Entries.
front().front()->ReorderIndices.empty())) ||
6214 (GatherShuffles.
empty() && IsSplatMask(ExtractMask)))
6215 return std::nullopt;
6221 if (ShuffledSubMasks.
test(
I))
6223 const int VF = GetVF(
I);
6229 if (
any_of(Slice, [&](
unsigned I) {
return I != NumScalars; })) {
6231 ShuffledSubMasks.
set(
I);
6235 int FirstMin = INT_MAX;
6236 int SecondVecFound =
false;
6238 int Idx = Mask[
I * PartSz + K];
6240 Value *V = GatheredScalars[
I * PartSz + K];
6242 SecondVecFound =
true;
6251 SecondVecFound =
true;
6255 FirstMin = (FirstMin / PartSz) * PartSz;
6257 if (SecondVecFound) {
6259 ShuffledSubMasks.
set(
I);
6263 int Idx = Mask[
I * PartSz + K];
6267 if (Idx >= PartSz) {
6268 SecondVecFound =
true;
6271 if (CurrentOrder[
I * PartSz + Idx] >
6272 static_cast<unsigned>(
I * PartSz + K) &&
6273 CurrentOrder[
I * PartSz + Idx] !=
6274 static_cast<unsigned>(
I * PartSz + Idx))
6275 CurrentOrder[
I * PartSz + Idx] =
I * PartSz + K;
6278 if (SecondVecFound) {
6280 ShuffledSubMasks.
set(
I);
6286 if (!ExtractShuffles.
empty())
6287 TransformMaskToOrder(
6288 CurrentOrder, ExtractMask, PartSz, NumParts, [&](
unsigned I) {
6289 if (!ExtractShuffles[
I])
6292 unsigned Sz =
getNumElems(TE.getVectorFactor(), PartSz,
I);
6294 int K =
I * PartSz + Idx;
6297 if (!TE.ReuseShuffleIndices.empty())
6298 K = TE.ReuseShuffleIndices[K];
6301 if (!TE.ReorderIndices.empty())
6302 K = std::distance(TE.ReorderIndices.begin(),
6303 find(TE.ReorderIndices, K));
6309 .getKnownMinValue());
6314 if (GatherShuffles.
size() == 1 && NumParts != 1) {
6315 if (ShuffledSubMasks.
any())
6316 return std::nullopt;
6317 PartSz = NumScalars;
6320 if (!Entries.
empty())
6321 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](
unsigned I) {
6322 if (!GatherShuffles[
I])
6324 return std::max(Entries[
I].front()->getVectorFactor(),
6325 Entries[
I].back()->getVectorFactor());
6327 unsigned NumUndefs =
count(CurrentOrder, NumScalars);
6328 if (ShuffledSubMasks.
all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6329 return std::nullopt;
6330 return std::move(CurrentOrder);
6335 bool CompareOpcodes =
true) {
6341 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6342 (!GEP2 || GEP2->getNumOperands() == 2) &&
6343 (((!GEP1 ||
isConstant(GEP1->getOperand(1))) &&
6344 (!GEP2 ||
isConstant(GEP2->getOperand(1)))) ||
6347 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6351template <
typename T>
6356 return CommonAlignment;
6362 "Order is empty. Please check it before using isReverseOrder.");
6363 unsigned Sz = Order.
size();
6365 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6376 const SCEV *PtrSCEVLowest =
nullptr;
6377 const SCEV *PtrSCEVHighest =
nullptr;
6385 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6386 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6393 PtrSCEVLowest = PtrSCEV;
6400 PtrSCEVHighest = PtrSCEV;
6408 int Size =
DL.getTypeStoreSize(ElemTy);
6409 auto TryGetStride = [&](
const SCEV *Dist,
6410 const SCEV *Multiplier) ->
const SCEV * {
6412 if (M->getOperand(0) == Multiplier)
6413 return M->getOperand(1);
6414 if (M->getOperand(1) == Multiplier)
6415 return M->getOperand(0);
6418 if (Multiplier == Dist)
6423 const SCEV *Stride =
nullptr;
6424 if (
Size != 1 || SCEVs.
size() > 2) {
6426 Stride = TryGetStride(Dist, Sz);
6434 using DistOrdPair = std::pair<int64_t, int>;
6436 std::set<DistOrdPair,
decltype(Compare)> Offsets(Compare);
6438 bool IsConsecutive =
true;
6439 for (
const SCEV *PtrSCEV : SCEVs) {
6441 if (PtrSCEV != PtrSCEVLowest) {
6443 const SCEV *Coeff = TryGetStride(Diff, Stride);
6453 Dist = SC->getAPInt().getZExtValue();
6458 auto Res = Offsets.emplace(Dist, Cnt);
6462 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6465 if (Offsets.size() != SCEVs.
size())
6467 SortedIndices.
clear();
6468 if (!IsConsecutive) {
6472 for (
const std::pair<int64_t, int> &Pair : Offsets) {
6473 SortedIndices[Cnt] = Pair.second;
6480static std::pair<InstructionCost, InstructionCost>
6483 Type *ScalarTy, VectorType *VecTy);
6501 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6504 Mask, NumSrcElts, NumSubElts, Index)) {
6505 if (Index + NumSubElts > NumSrcElts &&
6506 Index + NumSrcElts <=
static_cast<int>(
Mask.size()))
6523 "ScalableVectorType is not supported.");
6526 "Incorrect usage.");
6531 unsigned ScalarTyNumElements = VecTy->getNumElements();
6534 if (!DemandedElts[
I])
6538 I * ScalarTyNumElements, VecTy);
6541 I * ScalarTyNumElements, VecTy);
6554 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6555 if (Opcode == Instruction::ExtractElement) {
6561 Index * VecTy->getNumElements(), VecTy);
6564 return TTI.getVectorInstrCost(Opcode, Val,
CostKind, Index, Scalar,
6577 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
6579 Index * ScalarTy->getNumElements(), SubTp) +
6583 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index,
CostKind);
6599 auto *Begin = std::next(
Mask.begin(), Index);
6600 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6604 std::iota(
Mask.begin(),
Mask.end(), 0);
6605 std::iota(std::next(
Mask.begin(), Index),
6606 std::next(
Mask.begin(), Index + SubVecVF), VecVF);
6608 return Generator(Vec, V, Mask);
6611 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6619 unsigned SubVecVF,
unsigned Index) {
6621 std::iota(Mask.begin(), Mask.end(), Index);
6622 return Builder.CreateShuffleVector(Vec, Mask);
6632 const unsigned Sz = PointerOps.
size();
6635 CompressMask[0] = 0;
6637 std::optional<unsigned> Stride = 0;
6641 std::optional<int64_t> OptPos =
6643 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6645 unsigned Pos =
static_cast<unsigned>(*OptPos);
6646 CompressMask[
I] = Pos;
6653 if (Pos != *Stride *
I)
6656 return Stride.has_value();
6669 InterleaveFactor = 0;
6671 const size_t Sz = VL.
size();
6679 if (AreAllUsersVectorized(V))
6682 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
CostKind,
6683 Mask.empty() ?
I : Mask[
I]);
6686 if (ExtractCost <= ScalarCost)
6691 if (Order.
empty()) {
6692 Ptr0 = PointerOps.
front();
6693 PtrN = PointerOps.
back();
6695 Ptr0 = PointerOps[Order.
front()];
6696 PtrN = PointerOps[Order.
back()];
6698 std::optional<int64_t> Diff =
6702 const size_t MaxRegSize =
6706 if (*Diff / Sz >= MaxRegSize / 8)
6710 Align CommonAlignment = LI->getAlign();
6712 Ptr0, LoadVecTy, CommonAlignment,
DL,
6715 if (IsMasked && !
TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
6716 LI->getPointerAddressSpace()))
6722 assert(CompressMask.
size() >= 2 &&
"At least two elements are required");
6726 auto [ScalarGEPCost, VectorGEPCost] =
6728 Instruction::GetElementPtr,
CostKind, ScalarTy, LoadVecTy);
6746 TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6747 LI->getPointerAddressSpace(),
CostKind);
6750 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6751 LI->getPointerAddressSpace(),
CostKind);
6753 if (IsStrided && !IsMasked && Order.
empty()) {
6760 AlignedLoadVecTy = LoadVecTy;
6761 if (
TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
6763 LI->getPointerAddressSpace())) {
6765 VectorGEPCost +
TTI.getInterleavedMemoryOpCost(
6766 Instruction::Load, AlignedLoadVecTy,
6767 CompressMask[1], {}, CommonAlignment,
6768 LI->getPointerAddressSpace(),
CostKind, IsMasked);
6769 if (InterleavedCost < GatherCost) {
6770 InterleaveFactor = CompressMask[1];
6771 LoadVecTy = AlignedLoadVecTy;
6778 if (!Order.
empty()) {
6781 NewMask[
I] = CompressMask[Mask[
I]];
6783 CompressMask.
swap(NewMask);
6785 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
6786 return TotalVecCost < GatherCost;
6799 unsigned InterleaveFactor;
6803 AreAllUsersVectorized, IsMasked, InterleaveFactor,
6804 CompressMask, LoadVecTy);
6825 StridedPtrInfo &SPtrInfo)
const {
6826 const size_t Sz = VL.
size();
6827 if (Diff % (Sz - 1) != 0)
6831 auto IsAnyPointerUsedOutGraph =
any_of(PointerOps, [&](
Value *V) {
6833 return !isVectorized(U) && !MustGather.contains(U);
6837 const uint64_t AbsoluteDiff = std::abs(Diff);
6840 if (IsAnyPointerUsedOutGraph ||
6841 (AbsoluteDiff > Sz &&
6844 AbsoluteDiff % Sz == 0 &&
has_single_bit(AbsoluteDiff / Sz)))) ||
6845 Diff == -(
static_cast<int64_t
>(Sz) - 1)) {
6846 int64_t Stride = Diff /
static_cast<int64_t
>(Sz - 1);
6847 if (Diff != Stride *
static_cast<int64_t
>(Sz - 1))
6852 if (!TTI.isLegalStridedLoadStore(VecTy, Alignment))
6856 if (Order.
empty()) {
6857 Ptr0 = PointerOps.
front();
6858 PtrN = PointerOps.
back();
6860 Ptr0 = PointerOps[Order.
front()];
6861 PtrN = PointerOps[Order.
back()];
6870 else if (
Ptr != Ptr0)
6874 if (((Dist / Stride) * Stride) != Dist || !Dists.
insert(Dist).second)
6877 if (Dists.
size() == Sz) {
6878 Type *StrideTy = DL.getIndexType(Ptr0->
getType());
6879 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
6890 unsigned *BestVF,
bool TryRecursiveCheck)
const {
6903 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
6909 const size_t Sz = VL.
size();
6911 auto *POIter = PointerOps.
begin();
6912 for (
Value *V : VL) {
6914 if (!L || !L->isSimple())
6916 *POIter = L->getPointerOperand();
6922 bool IsSorted =
sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
6928 if (
const SCEV *Stride =
6930 Stride && TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
6932 SPtrInfo.StrideSCEV = Stride;
6937 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
6938 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
6949 if (Order.
empty()) {
6950 Ptr0 = PointerOps.
front();
6951 PtrN = PointerOps.
back();
6953 Ptr0 = PointerOps[Order.
front()];
6954 PtrN = PointerOps[Order.
back()];
6956 std::optional<int64_t> Diff =
6959 if (
static_cast<uint64_t>(*Diff) == Sz - 1)
6962 *TLI, [&](
Value *V) {
6963 return areAllUsersVectorized(
6967 if (
isStridedLoad(VL, PointerOps, Order, *TTI, *DL, *SE, *Diff, SPtrInfo))
6970 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
6971 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
6976 auto CheckForShuffledLoads = [&, &TTI = *TTI](
Align CommonAlignment,
6978 bool ProfitableGatherPointers) {
6983 auto [ScalarGEPCost, VectorGEPCost] =
6985 Instruction::GetElementPtr,
CostKind, ScalarTy, VecTy);
6989 Type *PtrScalarTy = PointerOps.
front()->getType()->getScalarType();
6991 if (
static_cast<unsigned>(
count_if(
7010 return C + TTI.getInstructionCost(
7016 TTI.getGatherScatterOpCost(
7018 false, CommonAlignment,
CostKind) +
7019 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7027 constexpr unsigned ListLimit = 4;
7028 if (!TryRecursiveCheck || VL.
size() < ListLimit)
7037 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7047 for (
unsigned Cnt = 0, End = VL.
size(); Cnt + VF <= End; Cnt += VF) {
7052 PointerOps, SPtrInfo, BestVF,
7060 DemandedElts.
setBits(Cnt, Cnt + VF);
7076 if (!DemandedElts.
isZero()) {
7082 if (DemandedElts[Idx])
7093 LI0->getPointerOperand(),
7094 Instruction::GetElementPtr,
CostKind, ScalarTy,
7098 if (
static_cast<unsigned>(
7100 PointerOps.
size() - 1 ||
7119 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7120 LI0->getPointerAddressSpace(),
CostKind,
7125 VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
7126 LI0->getPointerOperand(),
7132 VecLdCost += TTI.getMaskedMemoryOpCost(
7133 Instruction::Load, SubVecTy, CommonAlignment,
7134 LI0->getPointerAddressSpace(),
CostKind) +
7140 VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
7141 LI0->getPointerOperand(),
7152 ShuffleMask[Idx] = Idx / VF ==
I ? VL.
size() + Idx % VF : Idx;
7161 if (MaskedGatherCost >= VecLdCost &&
7174 bool ProfitableGatherPointers =
7175 L && Sz > 2 &&
static_cast<unsigned>(
count_if(PointerOps, [L](
Value *V) {
7176 return L->isLoopInvariant(V);
7178 if (ProfitableGatherPointers ||
all_of(PointerOps, [](
Value *
P) {
7181 (
GEP &&
GEP->getNumOperands() == 2 &&
7189 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7190 ProfitableGatherPointers))
7202 all_of(VL, [](
const Value *V) {
return V->getType()->isPointerTy(); }) &&
7203 "Expected list of pointer operands.");
7208 std::pair<BasicBlock *, Value *>,
7212 .try_emplace(std::make_pair(
7216 SortedIndices.
clear();
7218 auto Key = std::make_pair(BBs[Cnt + 1],
7220 bool Found =
any_of(Bases.try_emplace(
Key).first->second,
7222 std::optional<int64_t> Diff =
7223 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7224 ElemTy, Ptr, DL, SE,
7229 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7235 if (Bases.size() > VL.
size() / 2 - 1)
7239 Bases.find(
Key)->second.emplace_back().emplace_back(
Ptr, 0, Cnt + 1);
7243 if (Bases.size() == VL.
size())
7246 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7247 Bases.front().second.size() == VL.
size()))
7252 auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
7261 FirstPointers.
insert(P1);
7262 SecondPointers.
insert(P2);
7268 "Unable to find matching root.");
7271 for (
auto &
Base : Bases) {
7272 for (
auto &Vec :
Base.second) {
7273 if (Vec.size() > 1) {
7275 int64_t InitialOffset = std::get<1>(Vec[0]);
7276 bool AnyConsecutive =
7278 return std::get<1>(
P.value()) ==
7279 int64_t(
P.index()) + InitialOffset;
7283 if (!AnyConsecutive)
7288 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7292 for (
auto &
T : Bases)
7293 for (
const auto &Vec :
T.second)
7294 for (
const auto &
P : Vec)
7298 "Expected SortedIndices to be the size of VL");
7302std::optional<BoUpSLP::OrdersType>
7304 assert(TE.isGather() &&
"Expected gather node only.");
7305 Type *ScalarTy = TE.Scalars[0]->getType();
7308 Ptrs.
reserve(TE.Scalars.size());
7310 BBs.
reserve(TE.Scalars.size());
7311 for (
Value *V : TE.Scalars) {
7313 if (!L || !L->isSimple())
7314 return std::nullopt;
7320 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7322 return std::move(Order);
7323 return std::nullopt;
7334 if (VU->
getType() != V->getType())
7337 if (!VU->
hasOneUse() && !V->hasOneUse())
7343 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7350 bool IsReusedIdx =
false;
7352 if (IE2 == VU && !IE1)
7354 if (IE1 == V && !IE2)
7355 return V->hasOneUse();
7356 if (IE1 && IE1 != V) {
7358 IsReusedIdx |= ReusedIdx.
test(Idx1);
7359 ReusedIdx.
set(Idx1);
7360 if ((IE1 != VU && !IE1->
hasOneUse()) || IsReusedIdx)
7365 if (IE2 && IE2 != VU) {
7367 IsReusedIdx |= ReusedIdx.
test(Idx2);
7368 ReusedIdx.
set(Idx2);
7369 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7374 }
while (!IsReusedIdx && (IE1 || IE2));
7382 const TargetLibraryInfo &TLI);
7384std::optional<BoUpSLP::OrdersType>
7386 bool IgnoreReorder) {
7389 if (!TE.ReuseShuffleIndices.empty()) {
7391 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7392 "Reshuffling scalars not yet supported for nodes with padding");
7395 return std::nullopt;
7403 unsigned Sz = TE.Scalars.size();
7404 if (TE.isGather()) {
7405 if (std::optional<OrdersType> CurrentOrder =
7410 ::addMask(Mask, TE.ReuseShuffleIndices);
7411 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7412 unsigned Sz = TE.Scalars.size();
7413 for (
int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7416 Res[Idx + K * Sz] =
I + K * Sz;
7418 return std::move(Res);
7421 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7423 2 * TE.getVectorFactor())) == 1)
7424 return std::nullopt;
7425 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7426 return std::nullopt;
7430 if (TE.ReorderIndices.empty())
7431 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
7434 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
7435 unsigned VF = ReorderMask.
size();
7439 for (
unsigned I = 0;
I < VF;
I += Sz) {
7441 unsigned UndefCnt = 0;
7442 unsigned Limit = std::min(Sz, VF -
I);
7451 Val >=
static_cast<int>(NumParts) || UsedVals.
test(Val) ||
7453 return std::nullopt;
7455 for (
unsigned K = 0; K < NumParts; ++K) {
7456 unsigned Idx = Val + Sz * K;
7457 if (Idx < VF &&
I + K < VF)
7458 ResOrder[Idx] =
I + K;
7461 return std::move(ResOrder);
7463 unsigned VF = TE.getVectorFactor();
7466 TE.ReuseShuffleIndices.end());
7467 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
7469 if (isa<PoisonValue>(V))
7471 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
7472 return Idx && *Idx < Sz;
7474 assert(!TE.isAltShuffle() &&
"Alternate instructions are only supported "
7475 "by BinaryOperator and CastInst.");
7477 if (TE.ReorderIndices.empty())
7478 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
7481 for (
unsigned I = 0;
I < VF; ++
I) {
7482 int &Idx = ReusedMask[
I];
7485 Value *V = TE.Scalars[ReorderMask[Idx]];
7487 Idx = std::distance(ReorderMask.
begin(),
find(ReorderMask, *EI));
7493 std::iota(ResOrder.
begin(), ResOrder.
end(), 0);
7494 auto *It = ResOrder.
begin();
7495 for (
unsigned K = 0; K < VF; K += Sz) {
7499 std::iota(SubMask.
begin(), SubMask.
end(), 0);
7501 transform(CurrentOrder, It, [K](
unsigned Pos) {
return Pos + K; });
7502 std::advance(It, Sz);
7505 return Data.index() ==
Data.value();
7507 return std::nullopt;
7508 return std::move(ResOrder);
7510 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
7511 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
7513 (TE.ReorderIndices.empty() ||
isReverseOrder(TE.ReorderIndices)))
7514 return std::nullopt;
7515 if (TE.State == TreeEntry::SplitVectorize ||
7516 ((TE.State == TreeEntry::Vectorize ||
7517 TE.State == TreeEntry::StridedVectorize ||
7518 TE.State == TreeEntry::CompressVectorize) &&
7521 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
7522 "Alternate instructions are only supported by "
7523 "BinaryOperator and CastInst.");
7524 return TE.ReorderIndices;
7526 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
7527 TE.isAltShuffle()) {
7528 assert(TE.ReuseShuffleIndices.empty() &&
7529 "ReuseShuffleIndices should be "
7530 "empty for alternate instructions.");
7532 TE.buildAltOpShuffleMask(
7534 assert(TE.getMatchingMainOpOrAltOp(
I) &&
7535 "Unexpected main/alternate opcode");
7539 const int VF = TE.getVectorFactor();
7544 ResOrder[Mask[
I] % VF] =
I;
7546 return std::move(ResOrder);
7548 if (!TE.ReorderIndices.empty())
7549 return TE.ReorderIndices;
7550 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
7551 if (!TE.ReorderIndices.empty())
7552 return TE.ReorderIndices;
7555 for (
auto [
I, V] :
zip(UserBVHead, TE.Scalars)) {
7563 while (
II &&
II->hasOneUse() &&
II->getParent() == BB) {
7571 assert(BB1 != BB2 &&
"Expected different basic blocks.");
7572 if (!DT->isReachableFromEntry(BB1))
7574 if (!DT->isReachableFromEntry(BB2))
7576 auto *NodeA = DT->getNode(BB1);
7577 auto *NodeB = DT->getNode(BB2);
7578 assert(NodeA &&
"Should only process reachable instructions");
7579 assert(NodeB &&
"Should only process reachable instructions");
7580 assert((NodeA == NodeB) ==
7581 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
7582 "Different nodes should have different DFS numbers");
7583 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
7585 auto PHICompare = [&](
unsigned I1,
unsigned I2) {
7586 Value *V1 = TE.Scalars[I1];
7587 Value *V2 = TE.Scalars[I2];
7600 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
7601 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
7602 FirstUserOfPhi2->getParent());
7612 if (UserBVHead[I1] && !UserBVHead[I2])
7614 if (!UserBVHead[I1])
7616 if (UserBVHead[I1] == UserBVHead[I2])
7619 return CompareByBasicBlocks(UserBVHead[I1]->
getParent(),
7621 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
7634 if (EE1->getOperand(0) == EE2->getOperand(0))
7636 if (!Inst1 && Inst2)
7638 if (Inst1 && Inst2) {
7646 "Expected either instructions or arguments vector operands.");
7647 return P1->getArgNo() < P2->getArgNo();
7652 std::iota(Phis.
begin(), Phis.
end(), 0);
7655 return std::nullopt;
7656 return std::move(Phis);
7658 if (TE.isGather() &&
7659 (!TE.hasState() || !TE.isAltShuffle() ||
7660 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
7664 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
7668 auto *EE = dyn_cast<ExtractElementInst>(V);
7669 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
7675 canReuseExtract(TE.Scalars, CurrentOrder,
true);
7676 if (Reuse || !CurrentOrder.
empty())
7677 return std::move(CurrentOrder);
7685 int Sz = TE.Scalars.size();
7689 if (It == TE.Scalars.begin())
7692 if (It != TE.Scalars.end()) {
7694 unsigned Idx = std::distance(TE.Scalars.begin(), It);
7709 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
7712 return std::move(Order);
7717 return std::nullopt;
7718 if (TE.Scalars.size() >= 3)
7723 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
7725 StridedPtrInfo SPtrInfo;
7728 CurrentOrder, PointerOps, SPtrInfo);
7731 return std::move(CurrentOrder);
7736 if (std::optional<OrdersType> CurrentOrder =
7738 return CurrentOrder;
7740 return std::nullopt;
7750 for (
unsigned I = Sz,
E = Mask.size();
I <
E;
I += Sz) {
7752 if (Cluster != FirstCluster)
7758void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask)
const {
7761 const unsigned Sz =
TE.Scalars.size();
7763 if (!
TE.isGather() ||
7768 SmallVector<int> NewMask;
7770 addMask(NewMask,
TE.ReuseShuffleIndices);
7772 TE.ReorderIndices.clear();
7774 ArrayRef<int> Slice =
ArrayRef(NewMask).slice(0, Sz);
7775 SmallVector<unsigned> NewOrder(Slice);
7779 for (
auto *It =
TE.ReuseShuffleIndices.begin(),
7780 *End =
TE.ReuseShuffleIndices.end();
7781 It != End; std::advance(It, Sz))
7782 std::iota(It, std::next(It, Sz), 0);
7788 "Expected same size of orders");
7789 size_t Sz = Order.
size();
7792 if (Order[Idx] != Sz)
7793 UsedIndices.
set(Order[Idx]);
7795 if (SecondaryOrder.
empty()) {
7797 if (Order[Idx] == Sz && !UsedIndices.
test(Idx))
7801 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
7802 !UsedIndices.
test(SecondaryOrder[Idx]))
7803 Order[Idx] = SecondaryOrder[Idx];
7811 constexpr unsigned TinyVF = 2;
7812 constexpr unsigned TinyTree = 10;
7813 constexpr unsigned PhiOpsLimit = 12;
7814 constexpr unsigned GatherLoadsLimit = 2;
7815 if (VectorizableTree.size() <= TinyTree)
7817 if (VectorizableTree.front()->hasState() &&
7818 !VectorizableTree.front()->isGather() &&
7819 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
7820 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
7821 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
7822 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
7823 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
7824 VectorizableTree.front()->ReorderIndices.empty()) {
7828 if (VectorizableTree.front()->hasState() &&
7829 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
7830 VectorizableTree.front()->Scalars.size() == TinyVF &&
7831 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
7834 if (VectorizableTree.front()->hasState() &&
7835 VectorizableTree.front()->getOpcode() == Instruction::Store &&
7836 VectorizableTree.front()->ReorderIndices.empty()) {
7837 const unsigned ReorderedSplitsCnt =
7838 count_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
7839 return TE->State == TreeEntry::SplitVectorize &&
7840 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
7841 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
7844 if (ReorderedSplitsCnt <= 1 &&
7846 VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
7847 return ((!TE->isGather() &&
7848 (TE->ReorderIndices.empty() ||
7849 (TE->UserTreeIndex.UserTE &&
7850 TE->UserTreeIndex.UserTE->State ==
7851 TreeEntry::Vectorize &&
7852 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
7854 (TE->isGather() && TE->ReorderIndices.empty() &&
7855 (!TE->hasState() || TE->isAltShuffle() ||
7856 TE->getOpcode() == Instruction::Load ||
7857 TE->getOpcode() == Instruction::ZExt ||
7858 TE->getOpcode() == Instruction::SExt))) &&
7859 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
7860 !TE->isGather() ||
none_of(TE->Scalars, [&](
Value *V) {
7861 return !isConstant(V) && isVectorized(V);
7863 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
7866 bool HasPhis =
false;
7867 bool HasLoad =
true;
7868 unsigned GatherLoads = 0;
7869 for (
const std::unique_ptr<TreeEntry> &TE :
7870 ArrayRef(VectorizableTree).drop_front()) {
7871 if (TE->State == TreeEntry::SplitVectorize)
7873 if (!TE->hasState()) {
7877 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7882 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
7883 if (!TE->isGather()) {
7890 if (GatherLoads >= GatherLoadsLimit)
7893 if (TE->getOpcode() == Instruction::GetElementPtr ||
7896 if (TE->getOpcode() != Instruction::PHI &&
7897 (!TE->hasCopyableElements() ||
7899 TE->Scalars.size() / 2))
7901 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7902 TE->getNumOperands() > PhiOpsLimit)
7911void BoUpSLP::TreeEntry::reorderSplitNode(
unsigned Idx,
ArrayRef<int> Mask,
7913 assert(State == TreeEntry::SplitVectorize &&
"Expected split user node.");
7916 std::iota(NewMask.
begin(), NewMask.
end(), 0);
7917 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
7920 copy(MaskOrder, NewMaskOrder.begin());
7922 assert(Idx == 1 &&
"Expected either 0 or 1 index.");
7923 unsigned Offset = CombinedEntriesWithIndices.
back().second;
7932 ReorderIndices.clear();
7951 ExternalUserReorderMap;
7955 for_each(VectorizableTree, [&, &TTIRef = *TTI](
7956 const std::unique_ptr<TreeEntry> &TE) {
7959 findExternalStoreUsersReorderIndices(TE.get());
7960 if (!ExternalUserReorderIndices.
empty()) {
7961 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
7963 std::move(ExternalUserReorderIndices));
7969 if (TE->hasState() && TE->isAltShuffle() &&
7970 TE->State != TreeEntry::SplitVectorize) {
7971 Type *ScalarTy = TE->Scalars[0]->getType();
7973 unsigned Opcode0 = TE->getOpcode();
7974 unsigned Opcode1 = TE->getAltOpcode();
7978 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
7979 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
7985 bool IgnoreReorder =
7986 !UserIgnoreList && VectorizableTree.front()->hasState() &&
7987 (VectorizableTree.front()->
getOpcode() == Instruction::InsertElement ||
7988 VectorizableTree.front()->getOpcode() == Instruction::Store);
7989 if (std::optional<OrdersType> CurrentOrder =
7999 const TreeEntry *UserTE = TE.get();
8001 if (!UserTE->UserTreeIndex)
8003 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8004 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8005 UserTE->UserTreeIndex.UserTE->Idx != 0)
8007 UserTE = UserTE->UserTreeIndex.UserTE;
8010 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
8011 if (!(TE->State == TreeEntry::Vectorize ||
8012 TE->State == TreeEntry::StridedVectorize ||
8013 TE->State == TreeEntry::SplitVectorize ||
8014 TE->State == TreeEntry::CompressVectorize) ||
8015 !TE->ReuseShuffleIndices.empty())
8016 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
8017 if (TE->State == TreeEntry::Vectorize &&
8018 TE->getOpcode() == Instruction::PHI)
8019 PhisToOrders.
try_emplace(TE.get(), *CurrentOrder);
8024 for (
unsigned VF = VectorizableTree.front()->getVectorFactor();
8025 !VFToOrderedEntries.
empty() && VF > 1; VF -= 2 - (VF & 1U)) {
8026 auto It = VFToOrderedEntries.
find(VF);
8027 if (It == VFToOrderedEntries.
end())
8041 for (
const TreeEntry *OpTE : OrderedEntries) {
8044 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE) &&
8045 OpTE->State != TreeEntry::SplitVectorize)
8048 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8050 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8051 auto It = GathersToOrders.find(OpTE);
8052 if (It != GathersToOrders.end())
8055 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8056 auto It = AltShufflesToOrders.find(OpTE);
8057 if (It != AltShufflesToOrders.end())
8060 if (OpTE->State == TreeEntry::Vectorize &&
8061 OpTE->getOpcode() == Instruction::PHI) {
8062 auto It = PhisToOrders.
find(OpTE);
8063 if (It != PhisToOrders.
end())
8066 return OpTE->ReorderIndices;
8069 auto It = ExternalUserReorderMap.
find(OpTE);
8070 if (It != ExternalUserReorderMap.
end()) {
8071 const auto &ExternalUserReorderIndices = It->second;
8075 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8076 OrdersUses.try_emplace(
OrdersType(), 0).first->second +=
8077 ExternalUserReorderIndices.size();
8079 for (
const OrdersType &ExtOrder : ExternalUserReorderIndices)
8080 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
8087 if (OpTE->State == TreeEntry::Vectorize &&
8088 OpTE->getOpcode() == Instruction::Store && !Order.
empty()) {
8089 assert(!OpTE->isAltShuffle() &&
8090 "Alternate instructions are only supported by BinaryOperator "
8094 unsigned E = Order.
size();
8097 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8100 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
8102 ++OrdersUses.try_emplace(Order, 0).first->second;
8105 if (OrdersUses.empty())
8108 unsigned IdentityCnt = 0;
8109 unsigned FilledIdentityCnt = 0;
8111 for (
auto &Pair : OrdersUses) {
8113 if (!Pair.first.empty())
8114 FilledIdentityCnt += Pair.second;
8115 IdentityCnt += Pair.second;
8120 unsigned Cnt = IdentityCnt;
8121 for (
auto &Pair : OrdersUses) {
8125 if (Cnt < Pair.second ||
8126 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
8127 Cnt == Pair.second && !BestOrder.
empty() &&
8130 BestOrder = Pair.first;
8143 unsigned E = BestOrder.
size();
8145 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8148 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8150 if (TE->Scalars.size() != VF) {
8151 if (TE->ReuseShuffleIndices.size() == VF) {
8152 assert(TE->State != TreeEntry::SplitVectorize &&
8153 "Split vectorized not expected.");
8158 (!TE->UserTreeIndex ||
8159 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8160 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8161 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8162 "All users must be of VF size.");
8169 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
8175 reorderNodeWithReuses(*TE, Mask);
8177 if (TE->UserTreeIndex &&
8178 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8179 TE->UserTreeIndex.UserTE->reorderSplitNode(
8180 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
8184 if ((TE->State == TreeEntry::SplitVectorize &&
8185 TE->ReuseShuffleIndices.empty()) ||
8186 ((TE->State == TreeEntry::Vectorize ||
8187 TE->State == TreeEntry::StridedVectorize ||
8188 TE->State == TreeEntry::CompressVectorize) &&
8193 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
8194 TE->ReuseShuffleIndices.empty())) &&
8195 "Alternate instructions are only supported by BinaryOperator "
8201 TE->reorderOperands(Mask);
8204 TE->reorderOperands(Mask);
8205 assert(TE->ReorderIndices.empty() &&
8206 "Expected empty reorder sequence.");
8209 if (!TE->ReuseShuffleIndices.empty()) {
8216 addMask(NewReuses, TE->ReuseShuffleIndices);
8217 TE->ReuseShuffleIndices.swap(NewReuses);
8218 }
else if (TE->UserTreeIndex &&
8219 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8221 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
void BoUpSLP::buildReorderableOperands(
    TreeEntry *UserTE,
    SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
    SmallVectorImpl<TreeEntry *> &GatherOps) {
  for (unsigned I : seq<unsigned>(UserTE->getNumOperands())) {
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 (OpData.second->State == TreeEntry::Vectorize ||
                  OpData.second->State == TreeEntry::StridedVectorize ||
                  OpData.second->State == TreeEntry::CompressVectorize ||
                  OpData.second->State == TreeEntry::SplitVectorize);
        }))
      continue;
    // Do not request the operands, which are not used for vectorization.
    if (UserTE->hasState()) {
      if (UserTE->getOpcode() == Instruction::ExtractElement ||
          UserTE->getOpcode() == Instruction::ExtractValue)
        continue;
      if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
        continue;
      if (UserTE->getOpcode() == Instruction::Store &&
          UserTE->State == TreeEntry::Vectorize && I == 1)
        continue;
      if (UserTE->getOpcode() == Instruction::Load &&
          (UserTE->State == TreeEntry::Vectorize ||
           UserTE->State == TreeEntry::StridedVectorize ||
           UserTE->State == TreeEntry::CompressVectorize))
        continue;
    }
    TreeEntry *TE = getOperandEntry(UserTE, I);
    assert(TE && "Expected operand entry.");
    if (!TE->isGather()) {
      // Add the node to the list of the ordered nodes with the identity
      // order.
      Edges.emplace_back(I, TE);
      // Add ScatterVectorize nodes to the list of operands, where just
      // reordering of the scalars is required.
      if (TE->State == TreeEntry::ScatterVectorize &&
          TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
        GatherOps.push_back(TE);
      continue;
    }
    if (ReorderableGathers.contains(TE))
      GatherOps.push_back(TE);
  }
}
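
// Walks the graph bottom-to-top: operand nodes are visited before their
// users (via a priority queue ordered by user/node index), every group of
// operands sharing the same user node votes for its preferred element order,
// and the winning order is propagated into the user node.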
void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
  struct TreeEntryCompare {
    bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
      if (LHS->UserTreeIndex && RHS->UserTreeIndex)
        return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
      return LHS->Idx < RHS->Idx;
    }
  };
  PriorityQueue<TreeEntry *, SmallVector<TreeEntry *>, TreeEntryCompare> Queue;
  SmallPtrSet<const TreeEntry *, 4> GathersToOrders;
  SmallPtrSet<const TreeEntry *, 4> NonVectorized;
  // Find all reorderable nodes with the given VF.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::StridedVectorize &&
        TE->State != TreeEntry::CompressVectorize &&
        TE->State != TreeEntry::SplitVectorize)
      NonVectorized.insert(TE.get());
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/false)) {
      Queue.push(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.insert(TE.get());
    }
  }
  while (!Queue.empty()) {
    // 1. Filter out only reordered nodes.
    std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>
        Users;
    TreeEntry *TE = Queue.top();
    const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
    SmallVector<TreeEntry *> OrderedOps;
    // Pop all entries sharing the same user node.
    while (!Queue.empty()) {
      TreeEntry *TE = Queue.top();
      if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
        break;
      Queue.pop();
      OrderedOps.push_back(TE);
    }
    for (TreeEntry *TE : OrderedOps) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize ||
            (TE->isGather() && GathersToOrders.contains(TE))) ||
          !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
          !Visited.insert(TE).second)
        continue;
      // Build a map between user nodes and their operands order to speedup
      // search.
      Users.first = TE->UserTreeIndex.UserTE;
      Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
    }
    if (!Users.first)
      continue;
    auto &Data = Users;
    // Split vectorize nodes do not require operand reordering, only the
    // adjustment of their internal part masks.
    if (Data.first->State == TreeEntry::SplitVectorize) {
      assert(Data.second.size() <= 2 &&
             "Expected not greater than 2 operands for split vectorize node.");
      if (any_of(Data.second,
                 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
        continue;
      assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
             "Expected exactly 2 entries.");
      for (const auto &P : Data.first->CombinedEntriesWithIndices) {
        TreeEntry &OpTE = *VectorizableTree[P.first];
        OrdersType Order = OpTE.ReorderIndices;
        if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
          if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
            continue;
          const auto BestOrder = /* ... */;
          // ...
        }
        const unsigned E = Order.size();
        SmallVector<int> MaskOrder(E, PoisonMaskElem);
        transform(Order, MaskOrder.begin(), [E](unsigned I) {
          return I < E ? static_cast<int>(I) : PoisonMaskElem;
        });
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
        // Clear ordering of the operand.
        if (!OpTE.ReorderIndices.empty()) {
          OpTE.ReorderIndices.clear();
        } else if (!OpTE.ReuseShuffleIndices.empty()) {
          // ...
        } else {
          assert(OpTE.isGather() && "Expected only gather/buildvector node.");
          // ...
        }
      }
      if (Data.first->ReuseShuffleIndices.empty() &&
          !Data.first->ReorderIndices.empty()) {
        // Insert user node to the list to try to sink reordering deeper in
        // the graph.
        Queue.push(Data.first);
      }
      continue;
    }
    // 2. Collect the operands of the user node, which may be reordered
    // together with it.
    buildReorderableOperands(Data.first, Data.second, NonVectorized,
                             GatherOps);
    // 3. Count the orders of all operands of the user node.
    // (Declarations of OrdersUses, VisitedOps, etc. are elided here.)
    for (const auto &Op : Data.second) {
      TreeEntry *OpTE = Op.second;
      if (!VisitedOps.insert(OpTE).second)
        continue;
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
        continue;
      const auto Order = [&]() -> const OrdersType {
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
          // ...
        }
        return OpTE->ReorderIndices;
      }();
      // The order is partially ordered, skip it in favor of fully ordered
      // orders.
      if (Order.size() == 1)
        continue;
      Value *Root =
          OpTE->hasState() ? OpTE->getMainOp() : OpTE->Scalars.front();
      auto GetSameNodesUsers = [&](Value *Root) {
        SmallSetVector<TreeEntry *, 4> Res;
        for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
          if (TE != OpTE && TE->UserTreeIndex &&
              TE->getVectorFactor() == OpTE->getVectorFactor() &&
              TE->Scalars.size() == OpTE->Scalars.size() &&
              ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
               (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
            Res.insert(TE->UserTreeIndex.UserTE);
        }
        for (const TreeEntry *TE : getTreeEntries(Root)) {
          if (TE != OpTE && TE->UserTreeIndex &&
              TE->getVectorFactor() == OpTE->getVectorFactor() &&
              TE->Scalars.size() == OpTE->Scalars.size() &&
              ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
               (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
            Res.insert(TE->UserTreeIndex.UserTE);
        }
        return Res.takeVector();
      };
      auto GetNumOperands = [](const TreeEntry *TE) {
        if (TE->State == TreeEntry::SplitVectorize)
          return TE->getNumOperands();
        if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()))
          return static_cast<unsigned>(CI->arg_size());
        return TE->getNumOperands();
      };
      auto NodeShouldBeReorderedWithOperands = [&, TTI =
                                                       TTI](const TreeEntry *TE) {
        // Checks that the operands of \p TE can be reordered together with it.
        return all_of(seq<unsigned>(GetNumOperands(TE)), [&](unsigned Idx) {
          const TreeEntry *Op = getOperandEntry(TE, Idx);
          if (Op->isGather() && Op->hasState()) {
            const TreeEntry *VecOp =
                getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
            // ...
          }
          if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty()) {
            // ...
          }
          return /* ... */ true;
        });
      };
      SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
      erase_if(Users, [&](TreeEntry *UTE) {
        if (!RevisitedOps.insert(UTE).second)
          return true;
        return UTE == Data.first || !UTE->ReorderIndices.empty() ||
               !UTE->ReuseShuffleIndices.empty() ||
               (UTE->UserTreeIndex &&
                UTE->UserTreeIndex.UserTE == Data.first) ||
               (Data.first->UserTreeIndex &&
                Data.first->UserTreeIndex.UserTE == UTE) ||
               (IgnoreReorder && UTE->UserTreeIndex &&
                UTE->UserTreeIndex.UserTE->Idx == 0) ||
               NodeShouldBeReorderedWithOperands(UTE);
      });
      for (TreeEntry *UTE : Users) {
        // Schedule the user node and its operands for (re)processing.
        for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
          const TreeEntry *Op = getOperandEntry(UTE, Idx);
          // ...
          Queue.push(const_cast<TreeEntry *>(Op));
        }
      }
      unsigned NumOps = count_if(
          Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
            return P.second == OpTE;
          });
      // Stores actually store the mask, not the order, need to invert.
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        unsigned E = Order.size();
        OrdersType CurrentOrder(E, E);
        transform(Order, CurrentOrder.begin(), [E](int Idx) {
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        });
        // ...
        OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
      } else {
        OrdersUses.try_emplace(Order, 0).first->second += NumOps;
      }
      auto Res = OrdersUses.try_emplace(OrdersType(), 0);
      const auto AllowsReordering = [&](const TreeEntry *TE) {
        if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
            (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
            (IgnoreReorder && TE->Idx == 0))
          return true;
        if (TE->isGather()) {
          // ...
        }
        return false;
      };
      if (OpTE->UserTreeIndex) {
        TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
        if (!VisitedUsers.insert(UserTE).second)
          continue;
        // May reorder user node if it requires reordering, has reused
        // scalars, is an alternate op vectorize node or its op nodes require
        // reordering.
        if (AllowsReordering(UserTE))
          continue;
        // Check if users allow reordering: if not more than half of the
        // operands allow it, vote for the natural order.
        if (static_cast<unsigned>(count_if(
                Ops, [UserTE, &AllowsReordering](
                         const std::pair<unsigned, TreeEntry *> &Op) {
                  return AllowsReordering(Op.second) &&
                         Op.second->UserTreeIndex.UserTE == UserTE;
                })) <= Ops.size() / 2)
          ++Res.first->second;
      }
    }
    if (OrdersUses.empty()) {
      // ...
      continue;
    }
    // Choose the most used order.
    unsigned IdentityCnt = 0;
    unsigned VF = Data.second.front().second->getVectorFactor();
    OrdersType IdentityOrder(VF, VF);
    for (auto &Pair : OrdersUses) {
      if (Pair.first.empty() || isIdentityOrder(Pair.first))
        IdentityCnt += Pair.second;
    }
    MutableArrayRef<unsigned> BestOrder = IdentityOrder;
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      // Prefer identity order. But, if filled identity found (non-empty
      // order) with same number of uses, as the best order, the best order is
      // the filled identity.
      if (Cnt < Pair.second) {
        BestOrder = Pair.first;
        Cnt = Pair.second;
      }
    }
    // Set order of the user node.
    if (isIdentityOrder(BestOrder))
      continue;
    SmallVector<int> Mask;
    inversePermutation(BestOrder, Mask);
    SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
    unsigned E = BestOrder.size();
    transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    });
    // Do an actual reordering, if profitable.
    for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
      TreeEntry *TE = Op.second;
      if (!VisitedOps.insert(TE).second)
        continue;
      if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
        reorderNodeWithReuses(*TE, Mask);
        continue;
      }
      // Gathers are processed separately.
      if (TE->State != TreeEntry::Vectorize &&
          TE->State != TreeEntry::StridedVectorize &&
          TE->State != TreeEntry::CompressVectorize &&
          TE->State != TreeEntry::SplitVectorize &&
          (TE->State != TreeEntry::ScatterVectorize ||
           TE->ReorderIndices.empty()))
        continue;
      assert((BestOrder.size() == TE->ReorderIndices.size() ||
              TE->ReorderIndices.empty()) &&
             "Non-matching sizes of user/operand entries.");
      // ...
      if (IgnoreReorder && TE == VectorizableTree.front().get())
        IgnoreReorder = false;
    }
    // For gathers just need to reorder its scalars.
    for (TreeEntry *Gather : GatherOps) {
      assert(Gather->ReorderIndices.empty() &&
             "Unexpected reordering of gathers.");
      if (!Gather->ReuseShuffleIndices.empty()) {
        // ...
        continue;
      }
      // ...
    }
    // Reorder operands of the user node and set the ordering for the user
    // node itself.
    auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
      return TE.isAltShuffle() &&
             (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
              TE.ReorderIndices.empty());
    };
    if (Data.first->State != TreeEntry::Vectorize ||
        !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
            Data.first->getMainOp()) ||
        IsNotProfitableAltCodeNode(*Data.first))
      Data.first->reorderOperands(Mask);
    if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
        IsNotProfitableAltCodeNode(*Data.first) ||
        Data.first->State == TreeEntry::StridedVectorize ||
        Data.first->State == TreeEntry::CompressVectorize) {
      // ...
      if (Data.first->ReuseShuffleIndices.empty() &&
          !Data.first->ReorderIndices.empty() &&
          !IsNotProfitableAltCodeNode(*Data.first)) {
        // Insert user node to the list to try to sink reordering deeper in
        // the graph.
        Queue.push(Data.first);
      }
    }
  }
  // If the reordering is unnecessary, just remove the reorder.
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
}
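
// Returns the instruction that acts as the root memory access of \p Entry:
// for reversed strided loads/stores this is the scalar at the front of the
// reorder sequence, otherwise the first scalar of the entry.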
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
  if (Entry.hasState() &&
      (Entry.getOpcode() == Instruction::Store ||
       Entry.getOpcode() == Instruction::Load) &&
      Entry.State == TreeEntry::StridedVectorize &&
      !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
    return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
  return dyn_cast<Instruction>(Entry.Scalars.front());
}
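
// Scans every vectorized scalar for uses that survive outside the tree
// (including the externally used values passed in by the caller) and records
// each such scalar together with its lane, so that extractelement
// instructions can be generated for it later.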
void BoUpSLP::buildExternalUses(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
  DenseMap<Value *, unsigned> ScalarToExtUses;
  const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
  // Collect the values that we need to extract from the tree.
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();
    // No need to handle users of gathered values.
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
      continue;
    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      // ...
      auto It = ScalarToExtUses.find(Scalar);
      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
        continue;
      if (Scalar->hasNUsesOrMore(NumVectScalars)) {
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
                          << " from " << *Scalar << "for many users.\n");
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
        ExternalUsesWithNonUsers.insert(Scalar);
        continue;
      }
      // Check if the scalar is externally used as an extra arg.
      const auto ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
        continue;
      }
      for (User *U : Scalar->users()) {
        auto *UserInst = dyn_cast<Instruction>(U);
        if (!UserInst || isDeleted(UserInst))
          continue;
        // Ignore users in the user ignore list.
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
          continue;
        // Skip in-tree scalars that become vectors.
        if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
            !UseEntries.empty()) {
          // Some in-tree scalars will remain as scalar in vectorized
          // instructions. If that is the case, the one in FoundLane will be
          // used.
          if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
                 /* ... */) ||
                all_of(UseEntries, [&](TreeEntry *UseEntry) {
                  return UseEntry->State == TreeEntry::ScatterVectorize ||
                         !doesInTreeUserNeedToExtract(
                             Scalar, getRootEntryInstruction(*UseEntry), TLI,
                             TTI);
                }))) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                              << ".\n");
            // ...
            if (none_of(UseEntries, [](TreeEntry *UseEntry) {
                  return UseEntry->isGather();
                }))
              continue;
          }
        }
        if (It != ScalarToExtUses.end()) {
          ExternalUses[It->second].User = nullptr;
          break;
        }
        if (U && Scalar->hasNUsesOrMore(UsesLimit))
          U = nullptr;
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
                          << " from lane " << FoundLane << " from " << *Scalar
                          << ".\n");
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
        ExternalUsesWithNonUsers.insert(Scalar);
        if (!U)
          break;
      }
    }
  }
}
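
// Groups the stores that consume scalars of \p TE by (basic block, stored
// type, underlying pointer). Each resulting group is a candidate for forming
// a vectorizable store sequence.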
SmallVector<SmallVector<StoreInst *>>
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
                SmallVector<StoreInst *>, 8>
      PtrToStoresMap;
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    // ...
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(U);
      // If this is not a simple store in the current function, or the stored
      // type is not valid for vectorization, skip it.
      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
          /* ... */)
        continue;
      Value *Ptr =
          getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
      auto &StoresVec = PtrToStoresMap[{SI->getParent(),
                                        SI->getValueOperand()->getType(),
                                        Ptr}];
      // For now, just add the store to the first store found per lane.
      if (StoresVec.size() > Lane)
        continue;
      if (!StoresVec.empty()) {
        std::optional<int64_t> Diff = getPointersDiff(
            SI->getValueOperand()->getType(), SI->getPointerOperand(),
            SI->getValueOperand()->getType(),
            StoresVec.front()->getPointerOperand(), *DL, *SE,
            /*StrictCheck=*/true);
        // We failed to compare the pointers so just abandon this store.
        if (!Diff)
          continue;
      }
      StoresVec.push_back(SI);
    }
  }
  SmallVector<SmallVector<StoreInst *>> Res;
  for (auto &P : PtrToStoresMap)
    Res.push_back(std::move(P.second));
  return Res;
}

bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
                            OrdersType &ReorderIndices) const {
  // To avoid calling getPointersDiff() while sorting, we create a vector of
  // pairs {offset from the first store, index} and sort this instead.
  SmallVector<std::pair<int64_t, unsigned>> StoreOffsetVec;
  StoreInst *S0 = StoresVec[0];
  StoreOffsetVec.emplace_back(0, 0);
  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
    StoreInst *SI = StoresVec[Idx];
    std::optional<int64_t> Diff =
        getPointersDiff(S0->getValueOperand()->getType(),
                        S0->getPointerOperand(),
                        SI->getValueOperand()->getType(),
                        SI->getPointerOperand(), *DL, *SE,
                        /*StrictCheck=*/true);
    if (Diff)
      StoreOffsetVec.emplace_back(*Diff, Idx);
  }
  // Check if the stores are consecutive by checking if their difference is 1.
  if (StoreOffsetVec.size() != StoresVec.size())
    return false;
  sort(StoreOffsetVec, llvm::less_first());
  unsigned Idx = 0;
  int64_t PrevDist = 0;
  for (const auto &P : StoreOffsetVec) {
    if (Idx > 0 && P.first != PrevDist + 1)
      return false;
    PrevDist = P.first;
    ++Idx;
  }
  // Calculate the shuffle indices according to the offsets of the sorted
  // stores.
  ReorderIndices.assign(StoresVec.size(), 0);
  bool IsIdentity = true;
  for (auto [I, P] : enumerate(StoreOffsetVec)) {
    ReorderIndices[P.second] = I;
    IsIdentity &= P.second == I;
  }
  // The identity order is modeled as an empty OrdersType elsewhere, so
  // follow the same convention here.
  if (IsIdentity)
    ReorderIndices.clear();
  return true;
}

#ifndef NDEBUG
LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
  for (unsigned Idx : Order)
    dbgs() << Idx << ", ";
  dbgs() << "\n";
}
#endif
SmallVector<BoUpSLP::OrdersType, 1>
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();
  SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
  SmallVector<OrdersType, 1> ExternalReorderIndices;
  for (ArrayRef<StoreInst *> StoresVec : Stores) {
    // If we have fewer than NumLanes stores, then we can't form a vector.
    if (StoresVec.size() != NumLanes)
      continue;
    // If the stores are not consecutive then abandon this StoresVec.
    OrdersType ReorderIndices;
    if (!canFormVector(StoresVec, ReorderIndices))
      continue;
    // The scalars in StoresVec can form a vector instruction, so record the
    // reorder indices.
    ExternalReorderIndices.push_back(ReorderIndices);
  }
  return ExternalReorderIndices;
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        const SmallDenseSet<Value *> &UserIgnoreLst) {
  deleteTree();
  UserIgnoreList = &UserIgnoreLst;
  if (!allSameType(Roots))
    return;
  buildTreeRec(Roots, 0, EdgeInfo());
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
  deleteTree();
  if (!allSameType(Roots))
    return;
  buildTreeRec(Roots, 0, EdgeInfo());
}
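
// Clusters scalar loads by (parent block, type, relative pointer distance)
// so that related loads can later be re-attempted as gathered-load nodes.
// NOTE: the signature below is reconstructed from the call sites visible in
// this excerpt; the original declaration is elided.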
static void gatherPossiblyVectorizableLoads(
    const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
    ScalarEvolution &SE, const TargetTransformInfo &TTI,
    SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>> &GatheredLoads,
    bool AddNew = true) {
  SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>> ClusteredLoads;
  SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
  for (Value *V : VL) {
    auto *LI = dyn_cast<LoadInst>(V);
    if (!LI)
      continue;
    if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
      continue;
    bool IsFound = false;
    for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
      assert(LI->getParent() == Data.front().first->getParent() &&
             LI->getType() == Data.front().first->getType() &&
             /* ... */
             "Expected loads with the same type, same parent and same "
             "underlying pointer.");
      std::optional<int64_t> Dist = getPointersDiff(
          LI->getType(), LI->getPointerOperand(),
          Data.front().first->getType(),
          Data.front().first->getPointerOperand(), DL, SE,
          /*StrictCheck=*/true);
      if (!Dist)
        continue;
      auto It = Map.find(*Dist);
      if (It != Map.end() && It->second != LI)
        continue;
      if (It == Map.end()) {
        Data.emplace_back(LI, *Dist);
        Map.try_emplace(*Dist, LI);
      }
      IsFound = true;
      break;
    }
    if (!IsFound) {
      ClusteredLoads.emplace_back().emplace_back(LI, 0);
      ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
    }
  }
  auto FindMatchingLoads =
      [&](ArrayRef<std::pair<LoadInst *, int64_t>> Loads,
          SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>>
              &GatheredLoads,
          SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
          int64_t &Offset, unsigned &Start) {
        if (Loads.empty())
          return GatheredLoads.end();
        LoadInst *LI = Loads.front().first;
        for (auto [Idx, Data] : enumerate(GatheredLoads)) {
          if (Idx < Start)
            continue;
          ToAdd.clear();
          if (LI->getParent() != Data.front().first->getParent() ||
              LI->getType() != Data.front().first->getType())
            continue;
          std::optional<int64_t> Dist =
              getPointersDiff(LI->getType(), LI->getPointerOperand(),
                              Data.front().first->getType(),
                              Data.front().first->getPointerOperand(), DL, SE,
                              /*StrictCheck=*/true);
          if (!Dist)
            continue;
          SmallSet<int64_t, 4> DataDists;
          SmallPtrSet<LoadInst *, 4> DataLoads;
          for (std::pair<LoadInst *, int64_t> P : Data) {
            DataDists.insert(P.second);
            DataLoads.insert(P.first);
          }
          // Found matching gathered loads - check if all loads are unique or
          // can be effectively vectorized.
          unsigned NumUniques = 0;
          for (auto [Cnt, Pair] : enumerate(Loads)) {
            bool Used = DataLoads.contains(Pair.first);
            if (!Used && !DataDists.contains(*Dist + Pair.second)) {
              ++NumUniques;
              ToAdd.insert(Cnt);
            } else if (Used) {
              Repeated.insert(Cnt);
            }
          }
          if (NumUniques > 0 &&
              (Loads.size() == NumUniques ||
               (Loads.size() - NumUniques >= 2 &&
                Loads.size() - NumUniques >= Loads.size() / 2 &&
                /* ... */))) {
            Offset = *Dist;
            Start = Idx + 1;
            return std::next(GatheredLoads.begin(), Idx);
          }
        }
        ToAdd.clear();
        return GatheredLoads.end();
      };
  for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
    unsigned Start = 0;
    SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
    int64_t Offset = 0;
    auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
                                 Offset, Start);
    while (It != GatheredLoads.end()) {
      assert(!LocalToAdd.empty() && "Expected some elements to add.");
      for (unsigned Idx : LocalToAdd)
        It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
      ToAdd.insert_range(LocalToAdd);
      It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
                             Start);
    }
    if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
          return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
        })) {
      auto AddNewLoads =
          [&](SmallVectorImpl<std::pair<LoadInst *, int64_t>> &Loads) {
            for (unsigned Idx : seq<unsigned>(Data.size())) {
              if (ToAdd.contains(Idx) || Repeated.contains(Idx))
                continue;
              Loads.push_back(Data[Idx]);
            }
          };
      if (!AddNew) {
        LoadInst *LI = Data.front().first;
        It = find_if(GatheredLoads,
                     [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
                       return PD.front().first->getParent() == LI->getParent() &&
                              PD.front().first->getType() == LI->getType();
                     });
        while (It != GatheredLoads.end()) {
          AddNewLoads(*It);
          It = std::find_if(
              std::next(It), GatheredLoads.end(),
              [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
                return PD.front().first->getParent() == LI->getParent() &&
                       PD.front().first->getType() == LI->getType();
              });
        }
        continue;
      }
      if (ToAdd.empty() && Repeated.empty()) {
        GatheredLoads.emplace_back().append(Data.begin(), Data.end());
        continue;
      }
      AddNewLoads(GatheredLoads.emplace_back());
    }
  }
}
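
// Attempts to build vectorizable load nodes out of the load clusters
// collected above: consecutive runs become regular/strided/compressed load
// entries, the rest may be retried as masked gathers, and any postponed load
// entries are rebuilt at the end.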
void BoUpSLP::tryToVectorizeGatheredLoads(
    const SmallMapVector<
        std::tuple<BasicBlock *, Value *, Type *>,
        SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
        &GatheredLoads) {
  GatheredLoadsEntriesFirst = VectorizableTree.size();

  SmallVector<SmallDenseSet<Value *>> LoadSetsToVectorize(
      LoadEntriesToVectorize.size());
  for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
    Set.insert_range(VectorizableTree[Idx]->Scalars);

  // Sort loads by distance.
  auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
                       const std::pair<LoadInst *, int64_t> &L2) {
    return L1.second > L2.second;
  };

  auto IsMaskedGatherSupported = [&](ArrayRef<LoadInst *> Loads) {
    // ... (common alignment of the loads computed here)
    auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
    return TTI->isLegalMaskedGather(Ty, Alignment) &&
           !TTI->forceScalarizeMaskedGather(Ty, Alignment);
  };

  auto GetVectorizedRanges =
      [this](ArrayRef<LoadInst *> Loads,
             SmallPtrSetImpl<LoadInst *> &VectorizedLoads,
             SmallVectorImpl<LoadInst *> &NonVectorized, bool Final,
             unsigned MaxVF) {
        SmallVector<std::pair<ArrayRef<Value *>, LoadsState>, 4> Results;
        unsigned StartIdx = 0;
        SmallVector<int> CandidateVFs;
        // Collect the candidate vector factors, from the largest full vector
        // down.
        for (int NumElts = getFloorFullVectorNumberOfElements(
                 *TTI, Loads.front()->getType(), MaxVF);
             NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
                              *TTI, Loads.front()->getType(), NumElts - 1)) {
          CandidateVFs.push_back(NumElts);
          // ...
        }
        if (Final && CandidateVFs.empty())
          return Results;
        unsigned BestVF = Final ? CandidateVFs.back() : 0;
        for (unsigned NumElts : CandidateVFs) {
          if (Final && NumElts > BestVF)
            continue;
          SmallVector<unsigned> MaskedGatherVectorized;
          for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E; ++Cnt) {
            ArrayRef<LoadInst *> Slice =
                ArrayRef(Loads).slice(Cnt, std::min<unsigned>(NumElts, E - Cnt));
            if (VectorizedLoads.count(Slice.front()) ||
                VectorizedLoads.count(Slice.back()) ||
                areKnownNonVectorizableLoads(Slice))
              continue;
            // Check if it is profitable to try vectorizing the gathered
            // loads.
            bool AllowToVectorize = false;
            if (NumElts == 2) {
              // 2-element slices: vectorize only if each load is used once,
              // or its uses are not a legal broadcast already.
              bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
                  Slice.front()->getType(), ElementCount::getFixed(NumElts));
              auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
                for (LoadInst *LI : Slice) {
                  // If single use/user - allow to vectorize.
                  if (LI->hasOneUse())
                    continue;
                  // 1. Check if the number of uses equals the number of
                  // users.
                  if (static_cast<unsigned int>(std::distance(
                          LI->user_begin(), LI->user_end())) !=
                      LI->getNumUses())
                    return false;
                  if (!IsLegalBroadcastLoad)
                    continue;
                  // 2. Do not vectorize if the load is a legal broadcast in
                  // one of its users already.
                  for (User *U : LI->users()) {
                    // ...
                    for (const TreeEntry *UTE : getTreeEntries(U)) {
                      for (int I : seq<int>(UTE->getNumOperands())) {
                        if (all_of(UTE->getOperand(I), [LI](Value *V) {
                              return V == LI || isa<PoisonValue>(V);
                            }))
                          return false;
                      }
                    }
                  }
                }
                return true;
              };
              AllowToVectorize = CheckIfAllowed(Slice);
            } else {
              AllowToVectorize =
                  (NumElts >= 3 ||
                   any_of(ValueToGatherNodes.at(Slice.front()),
                          [=](const TreeEntry *TE) {
                            return TE->Scalars.size() == 2 &&
                                   ((TE->Scalars.front() == Slice.front() &&
                                     TE->Scalars.back() == Slice.back()) ||
                                    (TE->Scalars.front() == Slice.back() &&
                                     TE->Scalars.back() == Slice.front()));
                          })) &&
                  /* ... */ true;
            }
            if (AllowToVectorize) {
              SmallVector<Value *> PointerOps;
              OrdersType CurrentOrder;
              // Try to build a vector load.
              ArrayRef<Value *> Values(
                  reinterpret_cast<Value *const *>(Slice.begin()),
                  Slice.size());
              StridedPtrInfo SPtrInfo;
              LoadsState LS = canVectorizeLoads(Values, Slice.front(),
                                                CurrentOrder, PointerOps,
                                                SPtrInfo, &BestVF);
              if (/* ... */
                  (BestVF > 1 &&
                   static_cast<unsigned>(NumElts) == 2 * BestVF)) {
                if (LS == LoadsState::ScatterVectorize) {
                  if (MaskedGatherVectorized.empty() ||
                      Cnt >= MaskedGatherVectorized.back() + NumElts)
                    MaskedGatherVectorized.push_back(Cnt);
                  continue;
                }
                Results.emplace_back(Values, LS);
                VectorizedLoads.insert_range(Slice);
                // If we vectorized the initial block, no need to try it
                // again.
                if (Cnt == StartIdx)
                  StartIdx += NumElts;
              }
              // Check if the whole array was vectorized already - exit.
              if (StartIdx >= Loads.size())
                break;
              // Erase the last masked gather candidate, if another candidate
              // within the range is found to be better.
              if (!MaskedGatherVectorized.empty() &&
                  Cnt < MaskedGatherVectorized.back() + NumElts)
                MaskedGatherVectorized.pop_back();
              Cnt += NumElts - 1;
              continue;
            }
            if (!AllowToVectorize || BestVF == 0)
              registerNonVectorizableLoads(Slice);
          }
          // Mark masked gather candidates as vectorized, if any.
          for (unsigned Cnt : MaskedGatherVectorized) {
            ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
                Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
            ArrayRef<Value *> Values(
                reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
            Results.emplace_back(Values, LoadsState::ScatterVectorize);
            VectorizedLoads.insert_range(Slice);
            if (Cnt == StartIdx)
              StartIdx += NumElts;
          }
        }
        for (LoadInst *LI : Loads) {
          if (!VectorizedLoads.contains(LI))
            NonVectorized.push_back(LI);
        }
        return Results;
      };
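
  // ProcessGatheredLoads: for every cluster of loads with known mutual
  // distances, sorts the cluster, carves out the longest consecutive runs and
  // tries to turn them into vectorizable (possibly masked-gather) ranges,
  // returning the loads that could not be vectorized.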
  auto ProcessGatheredLoads =
      [&, &TTI = *TTI](
          ArrayRef<SmallVector<std::pair<LoadInst *, int64_t>>> GatheredLoads,
          bool Final = false) {
        SmallVector<LoadInst *> NonVectorized;
        for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
             GatheredLoads) {
          if (LoadsDists.size() <= 1) {
            NonVectorized.push_back(LoadsDists.back().first);
            continue;
          }
          SmallVector<std::pair<LoadInst *, int64_t>> LocalLoadsDists(
              LoadsDists);
          SmallVector<LoadInst *> OriginalLoads(
              make_first_range(LocalLoadsDists));
          stable_sort(LocalLoadsDists, LoadSorter);
          SmallVector<LoadInst *> Loads;
          unsigned MaxConsecutiveDistance = 0;
          unsigned CurrentConsecutiveDist = 1;
          int64_t LastDist = LocalLoadsDists.front().second;
          bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
          for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
            if (isVectorized(L.first))
              continue;
            assert(LastDist >= L.second &&
                   "Expected first distance always not less than second");
            if (static_cast<uint64_t>(LastDist - L.second) ==
                CurrentConsecutiveDist) {
              ++CurrentConsecutiveDist;
              MaxConsecutiveDistance =
                  std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
              Loads.push_back(L.first);
              continue;
            }
            if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
                !Loads.empty())
              Loads.pop_back();
            CurrentConsecutiveDist = 1;
            LastDist = L.second;
            Loads.push_back(L.first);
          }
          if (Loads.size() <= 1)
            continue;
          if (AllowMaskedGather)
            MaxConsecutiveDistance = Loads.size();
          else if (MaxConsecutiveDistance < 2)
            continue;
          BoUpSLP::ValueSet VectorizedLoads;
          SmallVector<LoadInst *> SortedNonVectorized;
          SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
              GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
                                  Final, MaxConsecutiveDistance);
          if (!Results.empty() && !SortedNonVectorized.empty() &&
              OriginalLoads.size() == Loads.size() &&
              MaxConsecutiveDistance == Loads.size() &&
              /* ... */) {
            VectorizedLoads.clear();
            SmallVector<LoadInst *> UnsortedNonVectorized;
            SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
                UnsortedResults =
                    GetVectorizedRanges(OriginalLoads, VectorizedLoads,
                                        UnsortedNonVectorized, Final,
                                        OriginalLoads.size());
            if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
              SortedNonVectorized.swap(UnsortedNonVectorized);
              Results.swap(UnsortedResults);
            }
          }
          for (auto [Slice, _] : Results) {
            LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
                              << Slice.size() << ")\n");
            if (any_of(Slice, [&](Value *L) { return isVectorized(L); }))
              continue;
            // Select the maximum VF as the maximum of the user gathered nodes
            // and the distances between scalar loads in these nodes.
            unsigned MaxVF = Slice.size();
            unsigned UserMaxVF = 0;
            unsigned InterleaveFactor = 0;
            if (MaxVF == 2) {
              UserMaxVF = MaxVF;
            } else {
              // Found distance between segments of the interleaved loads.
              std::optional<unsigned> InterleavedLoadsDistance = 0;
              unsigned Order = 0;
              std::optional<unsigned> CommonVF = 0;
              DenseMap<const TreeEntry *, unsigned> EntryToPosition;
              SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
              for (auto [Idx, V] : enumerate(Slice)) {
                for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
                  UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
                  unsigned Pos =
                      EntryToPosition.try_emplace(E, Idx).first->second;
                  UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
                  if (CommonVF) {
                    if (*CommonVF == 0) {
                      CommonVF = E->Scalars.size();
                      continue;
                    }
                    if (*CommonVF != E->Scalars.size())
                      CommonVF.reset();
                  }
                  // Check if the load is the part of an interleaved load.
                  if (Pos != Idx && InterleavedLoadsDistance) {
                    if (!DeinterleavedNodes.contains(E) &&
                        any_of(E->Scalars, [&, Slice = Slice](Value *V) {
                          if (isa<Constant>(V))
                            return false;
                          if (isVectorized(V))
                            return true;
                          const auto &Nodes = ValueToGatherNodes.at(V);
                          return (Nodes.size() != 1 || !Nodes.contains(E)) &&
                                 !is_contained(Slice, V);
                        })) {
                      InterleavedLoadsDistance.reset();
                      continue;
                    }
                    DeinterleavedNodes.insert(E);
                    if (*InterleavedLoadsDistance == 0) {
                      InterleavedLoadsDistance = Idx - Pos;
                      continue;
                    }
                    if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
                        (Idx - Pos) / *InterleavedLoadsDistance < Order)
                      InterleavedLoadsDistance.reset();
                    Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
                  }
                }
              }
              DeinterleavedNodes.clear();
              // Check if the large load represents an interleaved load
              // operation.
              if (InterleavedLoadsDistance.value_or(0) > 1 &&
                  CommonVF.value_or(0) != 0) {
                InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
                unsigned VF = *CommonVF;
                OrdersType Order;
                SmallVector<Value *> PointerOps;
                StridedPtrInfo SPtrInfo;
                // Segmented load detected - vectorize at maximum vector
                // factor.
                if (InterleaveFactor <= Slice.size() &&
                    TTI.isLegalInterleavedAccessType(
                        getWidenedType(Slice.front()->getType(), VF),
                        InterleaveFactor,
                        cast<LoadInst>(Slice.front())->getAlign(),
                        cast<LoadInst>(Slice.front())
                            ->getPointerAddressSpace()) &&
                    canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
                                      SPtrInfo) == LoadsState::Vectorize) {
                  UserMaxVF = InterleaveFactor * VF;
                } else {
                  InterleaveFactor = 0;
                }
              }
              // Cannot represent the loads as consecutive vectorizable nodes -
              // just exit.
              unsigned ConsecutiveNodesSize = 0;
              if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
                  any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                         [&, Slice = Slice](const auto &P) {
                           const auto *It = find_if(Slice, [&](Value *V) {
                             return std::get<1>(P).contains(V);
                           });
                           if (It == Slice.end())
                             return false;
                           const TreeEntry &TE =
                               *VectorizableTree[std::get<0>(P)];
                           ArrayRef<Value *> VL = TE.Scalars;
                           OrdersType Order;
                           SmallVector<Value *> PointerOps;
                           StridedPtrInfo SPtrInfo;
                           // ...
                           canVectorizeLoads(VL, VL.front(), Order,
                                             PointerOps, SPtrInfo);
                           ConsecutiveNodesSize += VL.size();
                           size_t Start = std::distance(Slice.begin(), It);
                           size_t Sz = Slice.size() - Start;
                           return Sz < VL.size() ||
                                  Slice.slice(Start, VL.size()) != VL;
                         }))
                continue;
              // Try to build long masked gather loads.
              UserMaxVF = bit_ceil(UserMaxVF);
              if (InterleaveFactor == 0 &&
                  any_of(seq<unsigned>(Slice.size() / UserMaxVF),
                         [&, Slice = Slice](unsigned Idx) {
                           OrdersType Order;
                           SmallVector<Value *> PointerOps;
                           StridedPtrInfo SPtrInfo;
                           return canVectorizeLoads(
                                      Slice.slice(Idx * UserMaxVF, UserMaxVF),
                                      Slice[Idx * UserMaxVF], Order,
                                      PointerOps,
                                      SPtrInfo) == LoadsState::ScatterVectorize;
                         }))
                UserMaxVF = MaxVF;
              if (Slice.size() != ConsecutiveNodesSize)
                MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
            }
            for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
              bool IsVectorized = true;
              for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
                ArrayRef<Value *> SubSlice =
                    Slice.slice(I, std::min(VF, E - I));
                if (isVectorized(SubSlice.front()))
                  continue;
                // Check if the subslice belongs to a to-be-vectorized entry
                // that is not equal to it.
                if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                           [&](const auto &P) {
                             return !SubSlice.equals(
                                        VectorizableTree[std::get<0>(P)]
                                            ->Scalars) &&
                                    set_is_subset(SubSlice, std::get<1>(P));
                           }))
                  continue;
                unsigned Sz = VectorizableTree.size();
                buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
                if (Sz == VectorizableTree.size()) {
                  IsVectorized = false;
                  // Try non-interleaved vectorization with a smaller vector
                  // factor.
                  if (InterleaveFactor > 0) {
                    VF = 2 * (MaxVF / InterleaveFactor);
                    InterleaveFactor = 0;
                  }
                  continue;
                }
              }
              if (IsVectorized)
                break;
            }
          }
          NonVectorized.append(SortedNonVectorized);
        }
        return NonVectorized;
      };
  for (const auto &GLs : GatheredLoads) {
    const auto &Ref = GLs.second;
    SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
    if (!Ref.empty() && !NonVectorized.empty() &&
        std::accumulate(
            Ref.begin(), Ref.end(), 0u,
            [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
                -> unsigned { return S + LoadsDists.size(); }) !=
            NonVectorized.size() &&
        IsMaskedGatherSupported(NonVectorized)) {
      SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>
          FinalGatheredLoads;
      for (LoadInst *LI : NonVectorized) {
        // Reinsert non-vectorized loads into the list of loads with the same
        // base pointers.
        gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
                                        FinalGatheredLoads,
                                        /*AddNew=*/false);
      }
      // Final attempt to vectorize non-vectorized loads.
      (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
    }
  }
  // Try to vectorize postponed load entries, previously marked as gathered.
  for (unsigned Idx : LoadEntriesToVectorize) {
    const TreeEntry &E = *VectorizableTree[Idx];
    SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
    // Avoid reordering, if possible.
    if (!E.ReorderIndices.empty()) {
      // Build a mask out of the reorder indices and reorder the scalars per
      // this mask.
      SmallVector<int> ReorderMask;
      inversePermutation(E.ReorderIndices, ReorderMask);
      reorderScalars(GatheredScalars, ReorderMask);
    }
    buildTreeRec(GatheredScalars, 0, EdgeInfo());
  }
  // If no new entries were created, there are no gathered load entries to
  // handle.
  if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
      VectorizableTree.size())
    GatheredLoadsEntriesFirst.reset();
}
static std::pair<size_t, size_t>
generateKeySubkey(Value *V, const TargetLibraryInfo *TLI,
                  function_ref<hash_code(size_t, LoadInst *)>
                      LoadsSubkeyGenerator,
                  bool AllowAlternate) {
  // ... (hashing of loads, extracts and vector-like instructions elided)
  if (/* ... */ isValidForAlternation(I->getOpcode())) {
    // ...
  }
  std::pair<size_t, size_t> OpVals =
      /* ... */;
  if (CI->isCommutative()) {
    // ...
  }
  SubKey = hash_value(Gep->getPointerOperand());
  // ...
  return std::make_pair(Key, SubKey);
}

static bool isAlternateInstruction(const Instruction *I,
                                   const Instruction *MainOp,
                                   Instruction *AltOp,
                                   const TargetLibraryInfo &TLI);
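
// Estimates whether a main/alternate opcode bundle is worth vectorizing:
// profitable if the target natively supports the alternate pattern, or if the
// estimated number of vector plus shuffle instructions beats the buildvector
// sequence it would otherwise produce.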
bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
                                       ArrayRef<Value *> VL) const {
  Type *ScalarTy = S.getMainOp()->getType();
  unsigned Opcode0 = S.getOpcode();
  unsigned Opcode1 = S.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
  // If this pattern is supported by the target then consider it profitable.
  if (TTI->isLegalAltInstr(getWidenedType(ScalarTy, VL.size()), Opcode0,
                           Opcode1, OpcodeMask))
    return true;
  SmallVector<ValueList> Operands;
  for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
    Operands.emplace_back();
    // Prepare the operand vector.
    for (Value *V : VL) {
      // ...
    }
  }
  // ...
  switch (Res.value_or(0)) {
    // ...
  }
  DenseSet<unsigned> UniqueOpcodes;
  constexpr unsigned NumAltInsts = 3;
  unsigned NonInstCnt = 0;
  // Estimate the number of instructions required for the vectorized node and
  // for the buildvector node.
  unsigned UndefCnt = 0;
  // Count the number of extra shuffles required for the vector nodes.
  unsigned ExtraShuffleInsts = 0;
  // ...
  if (any_of(Operands.back(), [&](Value *V) {
        return is_contained(Operands.back(), V);
      }))
    ++ExtraShuffleInsts;
  // ...
  const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
  // ...
  DenseMap<Value *, unsigned> Uniques;
  for (Value *V : /* ... */ Operands.back()) {
    // ...
    auto Res = Uniques.try_emplace(V, 0);
    if (!Res.second && Res.first->second == 1)
      ++ExtraShuffleInsts;
    ++Res.first->getSecond();
    if (auto *I = dyn_cast<Instruction>(V))
      UniqueOpcodes.insert(I->getOpcode());
    else if (Res.second)
      ++NonInstCnt;
  }
  return none_of(Uniques, [&](const auto &P) {
           return P.first->hasNUsesOrMore(P.second + 1) &&
                  none_of(P.first->users(), [&](User *U) {
                    return isVectorized(U) || Uniques.contains(U);
                  });
         }) ||
         (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
           NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
}
static SmallVector<Type *>
buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
                       const unsigned VF, unsigned MinBW,
                       const TargetTransformInfo *TTI) {
  // ... (builds the argument type list for the vectorized intrinsic call)
}

static std::pair<InstructionCost, InstructionCost>
getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
                   TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
                   SmallVectorImpl<Type *> &ArgTys) {
  // Calculate the cost of the scalar and vector calls.
  FastMathFlags FMF;
  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
    FMF = FPCI->getFastMathFlags();
  // ... (intrinsic, library and scalar-limit costs computed here)
  return std::make_pair(IntrinsicCost,
                        LibCost.isValid() ? LibCost : ScalarLimit);
}
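
// Performs the per-opcode legality checks for a bundle and decides how it
// will be represented in the graph: a vectorizable entry of the matching
// kind (regular, strided, compressed, scatter) or a gather node.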
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
    const InstructionsState &S, ArrayRef<Value *> VL,
    bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
  assert(S.getMainOp() &&
         "Expected instructions with same/alternate opcodes only.");
  Instruction *VL0 = S.getMainOp();
  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Too many operands - gather, most probably won't be vectorized.
    if (VL0->getNumOperands() > MaxPHINumOperands)
      return TreeEntry::NeedToGather;
    // Check for terminator values (e.g. invoke).
    for (Value *V : VL) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;
      for (Value *Incoming : PHI->incoming_values()) {
        Instruction *Term = dyn_cast<Instruction>(Incoming);
        if (Term && Term->isTerminator()) {
          LLVM_DEBUG(dbgs()
                     << "SLP: Need to swizzle PHINodes (terminator use).\n");
          return TreeEntry::NeedToGather;
        }
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ExtractElement:
    // ...
    return TreeEntry::NeedToGather;
  case Instruction::ExtractValue: {
    bool Reuse = canReuseExtract(VL, CurrentOrder);
    // ...
    if (/* ... */)
      return TreeEntry::NeedToGather;
    if (Reuse || !CurrentOrder.empty())
      return TreeEntry::Vectorize;
    LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::InsertElement: {
    // Check that we have a buildvector and not a shuffle of 2 or more
    // different vectors.
    ValueSet SourceVectors;
    for (Value *V : VL) {
      if (isa<PoisonValue>(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
        return TreeEntry::NeedToGather;
      }
      SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
      assert(getElementIndex(V) != std::nullopt &&
             "Non-constant or undef index?");
    }
    if (count_if(VL, [&SourceVectors](Value *V) {
          return !SourceVectors.contains(V);
        }) >= 2) {
      // Found 2nd source vector - cancel.
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "different source vectors.\n");
      return TreeEntry::NeedToGather;
    }
    if (any_of(VL, [&SourceVectors](Value *V) {
          // The last InsertElement can have multiple uses.
          return SourceVectors.contains(V) && !V->hasOneUse();
        })) {
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "multiple uses.\n");
      return TreeEntry::NeedToGather;
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Load: {
    // Check that a vectorized load would load the same memory as a scalar
    // load.
    auto IsGatheredNode = [&]() {
      if (!GatheredLoadsEntriesFirst)
        return false;
      return any_of(VL, [&](Value *V) {
        return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
          return TE->Idx >= *GatheredLoadsEntriesFirst;
        });
      });
    };
    switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
    case LoadsState::Vectorize:
      return TreeEntry::Vectorize;
    case LoadsState::CompressVectorize:
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::CompressVectorize;
    case LoadsState::ScatterVectorize:
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::ScatterVectorize;
    case LoadsState::StridedVectorize:
      if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::StridedVectorize;
    case LoadsState::Gather:
#ifndef NDEBUG
      Type *ScalarTy = VL0->getType();
      if (DL->getTypeSizeInBits(ScalarTy) !=
          DL->getTypeAllocSizeInBits(ScalarTy))
        LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
      else if (any_of(VL, [](Value *V) {
                 auto *LI = dyn_cast<LoadInst>(V);
                 return !LI || !LI->isSimple();
               }))
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
      else
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
#endif // NDEBUG
      return TreeEntry::NeedToGather;
    }
    llvm_unreachable("Unexpected state of loads");
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    Type *SrcTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      if (isa<PoisonValue>(V))
        continue;
      Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
      if (Ty != SrcTy || !isValidElementType(Ty)) {
        LLVM_DEBUG(
            dbgs() << "SLP: Gathering casts with different src types.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
    Type *ComparedTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      if (isa<PoisonValue>(V))
        continue;
      auto *Cmp = cast<CmpInst>(V);
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze:
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && I->isBinaryOp() && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::GetElementPtr: {
    // We don't combine GEPs with complicated (nested) indexing.
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I)
        continue;
      if (I->getNumOperands() != 2) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }
    // We can't combine several GEPs into one vector if they operate on
    // different types.
    Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
    for (Value *V : VL) {
      auto *GEP = dyn_cast<GEPOperator>(V);
      if (!GEP)
        continue;
      Type *CurTy = GEP->getSourceElementType();
      if (Ty0 != CurTy) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
        return TreeEntry::NeedToGather;
      }
    }
    // We don't combine GEPs with non-constant indexes.
    Type *Ty1 = VL0->getOperand(1)->getType();
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I)
        continue;
      auto *Op = I->getOperand(1);
      if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
          (Op->getType() != Ty1 &&
           ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
            Op->getType()->getScalarSizeInBits() >
                DL->getIndexSizeInBits(
                    V->getType()->getPointerAddressSpace())))) {
        LLVM_DEBUG(
            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Store: {
    // Check if the stores are consecutive or if we need to swizzle them.
    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
    // Avoid types that are padded when being allocated as scalars, while
    // being packed together in a vector (such as i1).
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    }
    // Make sure all stores in the bundle are simple - we can't vectorize
    // atomic or volatile stores.
    for (Value *V : VL) {
      auto *SI = cast<StoreInst>(V);
      if (!SI->isSimple()) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
        return TreeEntry::NeedToGather;
      }
      PointerOps.push_back(SI->getPointerOperand());
    }
    // Check the order of pointer operands.
    if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
      Value *Ptr0;
      Value *PtrN;
      if (CurrentOrder.empty()) {
        Ptr0 = PointerOps.front();
        PtrN = PointerOps.back();
      } else {
        Ptr0 = PointerOps[CurrentOrder.front()];
        PtrN = PointerOps[CurrentOrder.back()];
      }
      std::optional<int64_t> Dist =
          getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
      // Check that the sorted pointer operands are consecutive.
      if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
        return TreeEntry::Vectorize;
    }
    LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::Call: {
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    CallInst *CI = cast<CallInst>(VL0);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    VFShape Shape = VFShape::get(
        CI->getFunctionType(),
        ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
        false /*HasGlobalPred*/);
    Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
    if (!VecFunc && !isTriviallyVectorizable(ID)) {
      LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
      return TreeEntry::NeedToGather;
    }
    Function *F = CI->getCalledFunction();
    unsigned NumArgs = CI->arg_size();
    SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
    for (unsigned J = 0; J != NumArgs; ++J)
      if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI))
        ScalarArgs[J] = CI->getArgOperand(J);
    for (Value *V : VL) {
      auto *CI2 = dyn_cast<CallInst>(V);
      if (!CI2 || CI2->getCalledFunction() != F ||
          getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
          (VecFunc &&
           VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
          !CI->hasIdenticalOperandBundleSchema(*CI2)) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
                          << "\n");
        return TreeEntry::NeedToGather;
      }
      // Some intrinsics have scalar arguments and these need to be the same
      // for all calls for vectorization to work.
      for (unsigned J = 0; J != NumArgs; ++J) {
        if (ScalarArgs[J]) {
          Value *A1J = CI2->getArgOperand(J);
          if (ScalarArgs[J] != A1J) {
            LLVM_DEBUG(dbgs()
                       << "SLP: mismatched arguments in call:" << *CI
                       << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
            return TreeEntry::NeedToGather;
          }
        }
      }
      // Verify that the bundle operands are identical between the two calls.
      if (CI->hasOperandBundles() &&
          !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
                      CI->op_begin() + CI->getBundleOperandsEndIndex(),
                      CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
                          << "!=" << *V << '\n');
        return TreeEntry::NeedToGather;
      }
    }
    SmallVector<Type *> ArgTys =
        buildIntrinsicArgTypes(CI, ID, VL.size(), 0, TTI);
    auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
    if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  }
  case Instruction::ShuffleVector: {
    if (!S.isAltShuffle()) {
      // If this is not an alternate sequence of opcodes like add-sub, then do
      // not vectorize this instruction.
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
    }
    if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
      LLVM_DEBUG(
          dbgs()
          << "SLP: ShuffleVector not vectorized, operands are buildvector and "
             "the whole alt sequence is not profitable.\n");
      return TreeEntry::NeedToGather;
    }
    return TreeEntry::Vectorize;
  }
  default:
    LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
    return TreeEntry::NeedToGather;
  }
}
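
// Helper that builds the per-incoming-block operand lists for a bundle of
// PHI nodes. For few incoming values it matches blocks directly; for many it
// groups the incoming values by block first to avoid quadratic lookups.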
class PHIHandler {
  DominatorTree &DT;
  PHINode *Main = nullptr;
  SmallVector<Value *> Phis;
  SmallVector<SmallVector<Value *>> Operands;

public:
  PHIHandler() = delete;
  PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
      : DT(DT), Main(Main), Phis(Phis),
        Operands(Main->getNumIncomingValues(),
                 SmallVector<Value *>(Phis.size(), nullptr)) {}
  void buildOperands() {
    constexpr unsigned FastLimit = 4;
    if (Main->getNumIncomingValues() <= FastLimit) {
      for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
        BasicBlock *InBB = Main->getIncomingBlock(I);
        // ...
        // Prepare the operand vector.
        for (auto [Idx, V] : enumerate(Phis)) {
          auto *P = dyn_cast<PHINode>(V);
          if (!P) {
            assert(isa<PoisonValue>(V) &&
                   "Expected isa instruction or poison value.");
            Operands[I][Idx] = V;
            continue;
          }
          if (P->getIncomingBlock(I) == InBB)
            Operands[I][Idx] = P->getIncomingValue(I);
          else
            Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
        }
      }
      return;
    }
    SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
    for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
      BasicBlock *InBB = Main->getIncomingBlock(I);
      // ...
      Blocks.try_emplace(InBB).first->second.push_back(I);
    }
    for (auto [Idx, V] : enumerate(Phis)) {
      auto *P = dyn_cast<PHINode>(V);
      if (!P) {
        // ...
        continue;
      }
      for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
        BasicBlock *InBB = P->getIncomingBlock(I);
        // ...
        auto *It = Blocks.find(InBB);
        if (It == Blocks.end())
          continue;
        Operands[It->second.front()][Idx] = P->getIncomingValue(I);
      }
    }
    for (const auto &P : Blocks) {
      ArrayRef<unsigned> IncomingValues = P.second;
      if (IncomingValues.size() <= 1)
        continue;
      unsigned BasicI = IncomingValues.front();
      for (unsigned I : IncomingValues.drop_front()) {
        assert(all_of(enumerate(Operands[I]),
                      [&](const auto &Data) {
                        return !Data.value() ||
                               Data.value() == Operands[BasicI][Data.index()];
                      }) &&
               "Expected empty operands list.");
        Operands[I] = Operands[BasicI];
      }
    }
  }
  ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
};
static std::pair<Instruction *, Instruction *>
getMainAltOpsNoStateVL(ArrayRef<Value *> VL) {
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;
  for (Value *V : VL) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      continue;
    if (!MainOp) {
      MainOp = I;
      continue;
    }
    if (MainOp->getOpcode() == I->getOpcode()) {
      // ...
      continue;
    }
    // ...
  }
  assert(MainOp && AltOp && MainOp != AltOp &&
         "Expected different main and alt instructions.");
  return std::make_pair(MainOp, AltOp);
}
static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
                                SmallVectorImpl<int> &ReuseShuffleIndices,
                                const TargetTransformInfo &TTI,
                                const TargetLibraryInfo &TLI,
                                const InstructionsState &S,
                                const BoUpSLP::EdgeInfo &UserTreeIdx,
                                bool TryPad = false) {
  // Check that every instruction appears once in this bundle.
  SmallDenseMap<Value *, unsigned, 16> UniquePositions;
  SmallVector<Value *> UniqueValues;
  for (Value *V : VL) {
    // ...
    auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
    ReuseShuffleIndices.emplace_back(Res.first->second);
    if (Res.second)
      UniqueValues.emplace_back(V);
  }
  size_t NumUniqueScalarValues = UniqueValues.size();
  // ...
  if (NumUniqueScalarValues == VL.size() &&
      (VectorizeNonPowerOf2 || IsFullVectors)) {
    ReuseShuffleIndices.clear();
  } else {
    // FIXME: Reshuffling scalars is not supported yet for nodes with padding.
    if ((UserTreeIdx.UserTE &&
         UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
        /* ... */) {
      LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                           "for nodes with padding.\n");
      ReuseShuffleIndices.clear();
      return false;
    }
    if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
        /* ... */) {
      if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
          S.getMainOp()->isSafeToRemove() &&
          (S.areInstructionsWithCopyableElements() ||
           /* ... */)) {
        // Find the number of elements that forms full vectors.
        unsigned PWSz = getFullVectorNumberOfElements(
            TTI, UniqueValues.front()->getType(), UniqueValues.size());
        PWSz = std::min<unsigned>(PWSz, VL.size());
        if (PWSz == VL.size()) {
          // We ended up with the same size after removing duplicates.
          ReuseShuffleIndices.clear();
        } else {
          // Pad the unique values with poison up to the full vector size.
          SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
                                                  UniqueValues.end());
          PaddedUniqueValues.append(
              PWSz - UniqueValues.size(),
              PoisonValue::get(UniqueValues.front()->getType()));
          // Check that the extended-with-poisons operations are still valid
          // for vectorization (div/rem are not valid).
          if (!S.areInstructionsWithCopyableElements() &&
              /* ... */) {
            LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
            ReuseShuffleIndices.clear();
            return false;
          }
          VL = std::move(PaddedUniqueValues);
        }
        return true;
      }
      LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
      ReuseShuffleIndices.clear();
      return false;
    }
    VL = std::move(UniqueValues);
  }
  return true;
}
bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
                                const InstructionsState &LocalState,
                                SmallVectorImpl<Value *> &Op1,
                                SmallVectorImpl<Value *> &Op2,
                                OrdersType &ReorderIndices) const {
  constexpr unsigned SmallNodeSize = 4;
  if (VL.size() <= SmallNodeSize ||
      TTI->preferAlternateOpcodeVectorization() ||
      !SplitAlternateInstructions)
    return false;

  // Check if this is a duplicate of another split entry.
  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
                    << ".\n");
  for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
    if (E->isSame(VL)) {
      LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
                        << *LocalState.getMainOp() << ".\n");
      return false;
    }
  }

  ReorderIndices.assign(VL.size(), VL.size());
  SmallBitVector Op1Indices(VL.size());
  for (auto [Idx, V] : enumerate(VL)) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I) {
      Op1.push_back(V);
      Op1Indices.set(Idx);
      continue;
    }
    if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
         I->getOpcode() == LocalState.getOpcode()) ||
        (LocalState.getAltOpcode() == LocalState.getOpcode() &&
         !isAlternateInstruction(I, LocalState.getMainOp(),
                                 LocalState.getAltOp(), *TLI))) {
      Op1.push_back(V);
      Op1Indices.set(Idx);
      continue;
    }
    Op2.push_back(V);
  }
  Type *ScalarTy = getValueType(VL.front());
  VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
  unsigned Opcode0 = LocalState.getOpcode();
  unsigned Opcode1 = LocalState.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
  // Enable split node only if the halves are not trivially small and the
  // whole pattern is not a legal alternate instruction (like X86 addsub).
  if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
      TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
      /* ... */)
    return false;
  // Compute the reorder indices placing all Op1 elements before Op2 elements.
  unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
  for (unsigned Idx : seq<unsigned>(VL.size())) {
    if (Op1Indices.test(Idx)) {
      ReorderIndices[Op1Cnt] = Idx;
      ++Op1Cnt;
    } else {
      ReorderIndices[Op2Cnt] = Idx;
      ++Op2Cnt;
    }
  }
  if (isIdentityOrder(ReorderIndices))
    ReorderIndices.clear();
  SmallVector<int> Mask;
  if (!ReorderIndices.empty())
    inversePermutation(ReorderIndices, Mask);
  unsigned NumParts = TTI->getNumberOfParts(VecTy);
  // ...
  if (NumParts >= VL.size())
    return false;
  // ...
  FixedVectorType *SubVecTy =
      getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
  // ...
  if (!LocalState.isCmpOp() && NumParts <= 1 &&
      (Mask.empty() || InsertCost >= NewShuffleCost))
    return false;
  if ((LocalState.getMainOp()->isBinaryOp() &&
       LocalState.getAltOp()->isBinaryOp() &&
       (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
        LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
      (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
      (LocalState.getMainOp()->isUnaryOp() &&
       LocalState.getAltOp()->isUnaryOp())) {
    // Compare the cost of the original alternate pattern with the split
    // variant.
    InstructionCost OriginalVecOpsCost =
        TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
        TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
    SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
    for (unsigned Idx : seq<unsigned>(VL.size())) {
      if (isa<PoisonValue>(VL[Idx]))
        continue;
      OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
    }
    InstructionCost OriginalCost =
        OriginalVecOpsCost + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
                                              VecTy, OriginalMask, Kind);
    InstructionCost NewVecOpsCost =
        TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
        TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
    InstructionCost NewCost =
        NewVecOpsCost + InsertCost +
        (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
         VectorizableTree.front()->getOpcode() == Instruction::Store
             ? NewShuffleCost
             : 0);
    // If not profitable to split - exit.
    if (NewCost >= OriginalCost)
      return false;
  }
  return true;
}
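
// Accepts an incoming list of values and builds the instructions state and
// the operand lists for the new nodes, including support for "copyable"
// elements, where a missing operation is modeled as an idempotent instance
// of the main opcode (e.g. add x, 0).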
class InstructionsCompatibilityAnalysis {
  DominatorTree &DT;
  const DataLayout &DL;
  const TargetTransformInfo &TTI;
  const TargetLibraryInfo &TLI;
  unsigned MainOpcode = 0;
  Instruction *MainOp = nullptr;

  /// Checks if the opcode is supported as the main opcode for copyable
  /// elements.
  static bool isSupportedOpcode(const unsigned Opcode) {
    return Opcode == Instruction::Add || Opcode == Instruction::LShr;
  }

  /// Identifies the best candidate value, which represents the main opcode
  /// operation.
  void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
    BasicBlock *Parent = nullptr;
    auto IsSupportedInstruction = [&](Instruction *I) {
      return I && isSupportedOpcode(I->getOpcode()) &&
             /* ... */ !R.isVectorized(I);
    };
    SmallDenseSet<Value *, 8> Operands;
    SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
    for (Value *V : VL) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (Candidates.empty()) {
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Parent = I->getParent();
        Operands.insert(I->op_begin(), I->op_end());
        continue;
      }
      if (Parent == I->getParent()) {
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Operands.insert(I->op_begin(), I->op_end());
        continue;
      }
      auto *NodeA = DT.getNode(Parent);
      auto *NodeB = DT.getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
        Candidates.clear();
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Parent = I->getParent();
        Operands.clear();
      }
      Operands.insert(I->op_begin(), I->op_end());
    }
    unsigned BestOpcodeNum = 0;
    for (const auto &P : Candidates) {
      if (P.second.size() < BestOpcodeNum)
        continue;
      for (Instruction *I : P.second) {
        if (IsSupportedInstruction(I) && !Operands.contains(I)) {
          MainOp = I;
          MainOpcode = I->getOpcode();
          BestOpcodeNum = P.second.size();
          break;
        }
      }
    }
    if (!MainOp)
      return;
    // Additional legality check on the chosen main instruction (partially
    // elided in this excerpt).
    if (any_of(VL, [&](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && I->getParent() == MainOp->getParent() &&
                 /* ... */;
        })) {
      // ...
    }
  }

  /// Selects the value used in place of the missing operand of an idempotent
  /// main operation.
  Value *selectBestIdempotentValue() const {
    assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
    // Zero is neutral for both supported opcodes (x + 0 == x, x lshr 0 == x).
    return Constant::getNullValue(MainOp->getType());
  }

  /// Returns the operands to use for element \p V: for a copyable element the
  /// value itself plus the idempotent operand, otherwise the original
  /// operands.
  // NOTE: the member name and signature are reconstructed; the original
  // declaration is elided in this excerpt.
  SmallVector<Value *> getOperands(const InstructionsState &S,
                                   Value *V) const {
    if (!S.isCopyableElement(V)) {
      // ...
    }
    assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
    return {V, selectBestIdempotentValue()};
  }
  /// Builds the operand lists for the original (non-copyable) instructions.
  void
  buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
                        SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
    unsigned ShuffleOrOp =
        S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
    Instruction *VL0 = S.getMainOp();
    switch (ShuffleOrOp) {
    case Instruction::PHI: {
      auto *PH = cast<PHINode>(VL0);
      PHIHandler Handler(DT, PH, VL);
      Handler.buildOperands();
      Operands.assign(PH->getNumOperands(), {});
      for (unsigned I : seq<unsigned>(PH->getNumOperands()))
        Operands[I].assign(Handler.getOperands(I).begin(),
                           Handler.getOperands(I).end());
      return;
    }
    case Instruction::ExtractValue:
    case Instruction::ExtractElement:
      // ...
      return;
    case Instruction::InsertElement:
      // ...
      return;
    case Instruction::Load:
      // The only operand that matters is the pointer.
      for (auto [V, Op] : zip(VL, Operands.back())) {
        auto *LI = dyn_cast<LoadInst>(V);
        if (!LI)
          continue;
        Op = LI->getPointerOperand();
      }
      return;
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast:
    case Instruction::ICmp:
    case Instruction::FCmp:
    case Instruction::Select:
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor:
    case Instruction::Freeze:
    case Instruction::Store:
    case Instruction::ShuffleVector:
      // Operands are collected uniformly, converting each instruction to the
      // main opcode form if needed.
      for (auto [Idx, V] : enumerate(VL)) {
        auto *I = dyn_cast<Instruction>(V);
        if (!I) {
          // ...
          continue;
        }
        auto [Op, ConvertedOps] = convertTo(I, S);
        for (auto [OpIdx, Ops] : enumerate(Operands))
          Ops[Idx] = ConvertedOps[OpIdx];
      }
      return;
    case Instruction::GetElementPtr: {
      Operands.assign(2, {VL.size(), nullptr});
      // Need to cast all indices to the same type before vectorization to
      // avoid crash.
      const unsigned IndexIdx = 1;
      Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
      Type *Ty = all_of(VL,
                        [VL0Ty](Value *V) {
                          auto *GEP = dyn_cast<GetElementPtrInst>(V);
                          return !GEP ||
                                 VL0Ty == GEP->getOperand(IndexIdx)->getType();
                        })
                     ? VL0Ty
                     : DL.getIndexType(cast<GetElementPtrInst>(VL0)
                                           ->getPointerOperandType()
                                           ->getScalarType());
      for (auto [Idx, V] : enumerate(VL)) {
        auto *GEP = dyn_cast<GetElementPtrInst>(V);
        if (!GEP) {
          Operands[0][Idx] = V;
          Operands[1][Idx] = ConstantInt::getNullValue(Ty);
          continue;
        }
        Operands[0][Idx] = GEP->getPointerOperand();
        auto *Op = GEP->getOperand(IndexIdx);
        auto *CI = dyn_cast<ConstantInt>(Op);
        Operands[1][Idx] = CI ? ConstantFoldIntegerCast(
                                    CI, Ty, CI->getValue().isSignBitSet(), DL)
                              : Op;
      }
      return;
    }
    case Instruction::Call: {
      auto *CI = cast<CallInst>(VL0);
      for (unsigned Idx : seq<unsigned>(CI->arg_size())) {
        // ...
        auto &Ops = Operands.emplace_back();
        for (Value *V : VL) {
          auto *I = dyn_cast<Instruction>(V);
          Ops.push_back(I ? I->getOperand(Idx)
                          : PoisonValue::get(
                                VL0->getOperand(Idx)->getType()));
        }
      }
      return;
    }
    default:
      break;
    }
    llvm_unreachable("Unexpected vectorization of the instructions.");
  }
public:
  InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
                                    const TargetTransformInfo &TTI,
                                    const TargetLibraryInfo &TLI)
      : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}

  InstructionsState
  buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
                         bool TryCopyableElementsVectorization,
                         bool WithProfitabilityCheck = false,
                         bool SkipSameCodeCheck = false) {
    InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
                              ? InstructionsState::invalid()
                              : getSameOpcode(VL, TLI);
    if (S)
      return S;
    if (!TryCopyableElementsVectorization)
      return S;
    findAndSetMainInstruction(VL, R);
    if (!MainOp)
      return InstructionsState::invalid();
    S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
    if (!WithProfitabilityCheck)
      return S;
    // Check if it is profitable to vectorize the instruction bundle.
    SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
    auto BuildCandidates =
        [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
           Value *V2) {
          // ...
          auto *I1 = dyn_cast<Instruction>(V1);
          auto *I2 = dyn_cast<Instruction>(V2);
          if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
              I1->getParent() != I2->getParent())
            return;
          Candidates.emplace_back(V1, V2);
        };
    if (VL.size() == 2) {
      // Check if the operands allow better vectorization.
      SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
      BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
      BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
      bool Res = !Candidates1.empty() && !Candidates2.empty() &&
                 R.findBestRootPair(Candidates1) &&
                 R.findBestRootPair(Candidates2);
      if (!Res) {
        // Check the commuted variant.
        Candidates1.clear();
        Candidates2.clear();
        BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
        BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
        Res = !Candidates1.empty() && !Candidates2.empty() &&
              R.findBestRootPair(Candidates1) &&
              R.findBestRootPair(Candidates2);
      }
      if (!Res)
        return InstructionsState::invalid();
    }
    // Compare the vector cost of the copyable node against the scalar cost.
    FixedVectorType *VecTy =
        getWidenedType(MainOp->getType(), VL.size());
    switch (MainOpcode) {
    case Instruction::Add:
    case Instruction::LShr:
      // ...
      break;
    default:
      break;
    }
    if (VectorCost > ScalarCost)
      return InstructionsState::invalid();
    assert(Operands.size() == 2 && "Unexpected number of operands!");
    unsigned CopyableNum =
        count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
    if (CopyableNum < VL.size() / 2)
      return S;
    // Too many copyable elements - exit.
    const unsigned Limit = VL.size() / 24;
    if ((CopyableNum >= VL.size() - Limit ||
         (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
         /* ... */))
      return InstructionsState::invalid();
    // ...
    if (/* ... */)
      return InstructionsState::invalid();
    // Check the operand bundle for profitability.
    constexpr unsigned Limit = 4;
    if (Operands.front().size() >= Limit) {
      SmallDenseMap<const Value *, unsigned> Counters;
      // ...
      if (all_of(Counters, [](const auto &C) {
            return C.second == 1;
          })) {
        // ...
      }
    }
    auto CheckOperand = [&](ArrayRef<Value *> Ops) {
      InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
      InstructionsState OpS = Analysis.buildInstructionsState(
          Ops, R, /*TryCopyableElementsVectorization=*/true);
      if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
        return false;
      unsigned CopyableNum =
          count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
      return CopyableNum <= VL.size() / 2;
    };
    if (!CheckOperand(Operands.front()))
      return InstructionsState::invalid();
    return S;
  }

  SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
                                                ArrayRef<Value *> VL) {
    assert(S && "Invalid state!");
    SmallVector<BoUpSLP::ValueList> Operands;
    if (S.areInstructionsWithCopyableElements()) {
      MainOp = S.getMainOp();
      MainOpcode = S.getOpcode();
      Operands.assign(MainOp->getNumOperands(),
                      BoUpSLP::ValueList(VL.size(), nullptr));
      for (auto [Idx, V] : enumerate(VL)) {
        SmallVector<Value *> OperandsForValue = getOperands(S, V);
        for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
          Operands[OperandIdx][Idx] = Operand;
      }
    } else {
      buildOriginalOperands(S, VL, Operands);
    }
    return Operands;
  }
};
BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
    ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
    bool TryCopyableElementsVectorization) const {
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(
      VL, *this, TryCopyableElementsVectorization,
      /*WithProfitabilityCheck=*/true, TryCopyableElementsVectorization);

  if (/* ... */)
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
                                        /*TryToFindDuplicates=*/false);

  // Check if this is a duplicate of another entry.
  if (S) {
    LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp()
                      << ".\n");
    for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
      if (E->isSame(VL)) {
        LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
                          << *S.getMainOp() << ".\n");
        return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
      }
    }
  }

  // Avoid PHI nodes from loop headers feeding back into themselves (the
  // exact check is partially elided in this excerpt).
  if (S && any_of(VL, [&](Value *V) {
        return (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
                LI->getLoopFor(S.getMainOp()->getParent()) &&
                /* ... */);
      }))
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false);

  // If the recursion depth limit is reached, gather, unless this is a long
  // uniform chain that may still be vectorized profitably.
  if (Depth >= RecursionMaxDepth &&
      !(S && !S.isAltShuffle() && VL.size() >= 4 &&
        /* ... */)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
  }

  // Don't handle scalable vectors.
  if (S && S.getOpcode() == Instruction::ExtractElement &&
      isa<ScalableVectorType>(
          cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
  }

  // ...
  if (/* ... */)
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
                                        /*TryToFindDuplicates=*/false);

  // If all operands are constants/identical, or the bundle is a small
  // alternate-op node with gathered operands - do not vectorize.
  auto NotProfitableForVectorization = [&S, this, Depth](ArrayRef<Value *> VL) {
    if (!S || !S.isAltShuffle() || VL.size() > 2)
      return false;
    // ...
    // Check if all operands are extracts, part of vector nodes or can build
    // regular vectorize nodes.
    SmallVector<unsigned, 8> InstsCount;
    for (Value *V : VL) {
      auto *I = cast<Instruction>(V);
      InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
        return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
      }));
    }
    bool IsCommutative =
        isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
    if ((IsCommutative &&
         std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
        (!IsCommutative &&
         all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
      return true;
    assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    auto *I1 = cast<Instruction>(VL.front());
    auto *I2 = cast<Instruction>(VL.back());
    for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    if (static_cast<unsigned>(count_if(
            Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
              return findBestRootPair(Cand);
            })) >= S.getMainOp()->getNumOperands() / 2)
      return false;
    if (S.getMainOp()->getNumOperands() > 2)
      return true;
    if (IsCommutative) {
      // Check permuted operands.
      Candidates.clear();
      for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
        Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                               I2->getOperand((Op + 1) % E));
      if (any_of(Candidates,
                 [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
                   return findBestRootPair(Cand);
                 }))
        return false;
    }
    return true;
  };
  SmallVector<unsigned> SortedIndices;
  BasicBlock *BB = nullptr;
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  bool AreAllSameBlock = S.valid();
  bool AreScatterAllGEPSameBlock =
      (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
       /* ... */
       sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL,
                       *SE, SortedIndices));
  bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
  if (!AreAllSameInsts || /* ... */
      NotProfitableForVectorization(VL)) {
    if (!S) {
      LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
                           "C,S,B,O, small shuffle. \n");
      return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
                                          /*TryToFindDuplicates=*/true,
                                          /*TrySplitVectorize=*/true);
    }
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
  }

  // Don't vectorize ephemeral values.
  if (S && !EphValues.empty()) {
    for (Value *V : VL) {
      if (EphValues.count(V)) {
        LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                          << ") is ephemeral.\n");
        return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
                                            /*TryToFindDuplicates=*/false);
      }
    }
  }

  // Do not vectorize the tree of alternate operations if all instructions
  // are already in the tree and extracting the values is unprofitable.
  if (S && S.isAltShuffle()) {
    auto GetNumVectorizedExtracted = [&]() {
      APInt Vectorized = APInt::getAllOnes(VL.size());
      APInt Extracted = APInt::getZero(VL.size());
      for (auto [Idx, V] : enumerate(VL)) {
        auto *I = dyn_cast<Instruction>(V);
        if (!I ||
            all_of(I->operands(), [&](const Use &U) {
              return isa<ExtractElementInst>(U.get());
            }))
          continue;
        if (isVectorized(I))
          Extracted.setBit(Idx);
        else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
          Vectorized.clearBit(Idx);
      }
      return std::make_pair(Vectorized, Extracted);
    };
    auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
    constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
    bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
    if (!Vectorized.isAllOnes() && !PreferScalarize) {
      // Rough cost estimation: check if extracting the vectorized values is
      // cheaper than rebuilding them as scalars.
      Type *ScalarTy = VL.front()->getType();
      auto *VecTy = getWidenedType(ScalarTy, VL.size());
      InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
          *TTI, ScalarTy, VecTy, Extracted,
          /*Insert=*/false, /*Extract=*/true, Kind);
      InstructionCost VectorizeCostEstimate = ::getScalarizationOverhead(
          *TTI, ScalarTy, VecTy, Vectorized,
          /*Insert=*/true, /*Extract=*/false, Kind, /*ForPoisonSrc=*/false);
      PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
    }
    if (PreferScalarize) {
      LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
                           "node is not profitable.\n");
      return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
    }
  }

  // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
  if (UserIgnoreList && !UserIgnoreList->empty()) {
    for (Value *V : VL) {
      if (UserIgnoreList->contains(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
        return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
      }
    }
  }

  // Special processing for sorted pointers for ScatterVectorize node with
  // constant indices only.
  if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
    assert(VL.front()->getType()->isPointerTy() &&
           count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
           "Expected pointers only.");
    // Reset S to make it GetElementPtr kind of node.
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    assert(It != VL.end() && "Expected at least one GEP.");
    // ...
  }

  // Don't go into unreachable blocks - they may contain instructions with
  // dependency cycles which confuse the final scheduling.
  if (/* ... */ ||
      !DT->isReachableFromEntry(BB)) {
    LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
  }
  return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
}
void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
                           const EdgeInfo &UserTreeIdx,
                           unsigned InterleaveFactor) {
  assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!");

  SmallVector<int> ReuseShuffleIndices;
  SmallVector<Value *> VL(VLRef.begin(), VLRef.end());

  // Tries to build a split node.
  auto TrySplitNode = [&](const InstructionsState &LocalState) {
    SmallVector<Value *> Op1, Op2;
    OrdersType ReorderIndices;
    if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
      return false;
    // Build the split node.
    auto Invalid = ScheduleBundle::invalid();
    auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
                            UserTreeIdx, {}, ReorderIndices);
    LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
    auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
      InstructionsState S = getSameOpcode(Op, *TLI);
      if (S && (isa<LoadInst>(S.getMainOp()) ||
                getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
        // Build gather node for loads, they will be gathered later.
        TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
                                                    Idx == 0 ? 0 : Op1.size());
        (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
      } else {
        TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
                                                    Idx == 0 ? 0 : Op1.size());
        buildTreeRec(Op, Depth, {TE, Idx});
      }
    };
    AddNode(Op1, 0);
    AddNode(Op2, 1);
    return true;
  };

  // If all of the values are constants or PHIs, there is a simple solution.
  auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
    bool AreConsts = false;
    for (Value *V : VL) {
      if (isa<PoisonValue>(V))
        continue;
      if (isa<Constant>(V)) {
        AreConsts = true;
        continue;
      }
      if (!isa<PHINode>(V))
        return false;
    }
    return AreConsts;
  };
  if (AreOnlyConstsWithPHIs(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
    newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
    return;
  }

  ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
      VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
  InstructionsState S = Legality.getInstructionsState();
  if (!Legality.isLegal()) {
    if (Legality.trySplitVectorize()) {
      auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
      // Last chance to try to vectorize alternate node.
      if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
        return;
    }
    if (!S)
      Legality = getScalarsVectorizationLegality(
          VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
    if (!Legality.isLegal()) {
      if (Legality.tryToFindDuplicates())
        tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
                            UserTreeIdx);
      newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
      return;
    }
    S = Legality.getInstructionsState();
  }

  // FIXME: investigate if profitable to split alternate nodes first.
  if (S.isAltShuffle() && TrySplitNode(S))
    return;
  // Check that every instruction appears once in this bundle.
  if (!tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx,
                           /*TryPad=*/true)) {
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    return;
  }

  // Perform specific checks for each particular instruction kind.
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  OrdersType CurrentOrder;
  SmallVector<Value *> PointerOps;
  StridedPtrInfo SPtrInfo;
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
  if (State == TreeEntry::NeedToGather) {
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    return;
  }

  Instruction *VL0 = S.getMainOp();
  BasicBlock *BB = VL0->getParent();
  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef)
    BSRef = std::make_unique<BlockScheduling>(BB);

  BlockScheduling &BS = *BSRef;

  SetVector<Value *> UniqueValues(VL.begin(), VL.end());
  std::optional<ScheduleBundle *> BundlePtr =
      BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
#ifdef EXPENSIVE_CHECKS
  // Make sure we didn't break any internal invariants.
  BS.verify();
#endif
  if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    // Last chance to try to vectorize alternate node.
    if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
      return;
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    NonScheduledFirst.insert(VL.front());
    if (S.getOpcode() == Instruction::Load &&
        BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
      registerNonVectorizableLoads(VL);
    return;
  }
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
  ScheduleBundle Empty;
  ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
    // Postpone PHI nodes creation.
    SmallVector<unsigned> PHIOps;
    for (unsigned I : seq<unsigned>(Operands.size())) {
      ArrayRef<Value *> Op = Operands[I];
      if (Op.empty())
        continue;
      InstructionsState S = getSameOpcode(Op, *TLI);
      if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
        buildTreeRec(Op, Depth + 1, {TE, I});
      else
        PHIOps.push_back(I);
    }
    for (unsigned I : PHIOps)
      buildTreeRec(Operands[I], Depth + 1, {TE, I});
  };
11474 switch (ShuffleOrOp) {
11475 case Instruction::PHI: {
11477 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
11485 case Instruction::ExtractValue:
11486 case Instruction::ExtractElement: {
11487 if (CurrentOrder.empty()) {
11488 LLVM_DEBUG(
dbgs() <<
"SLP: Reusing or shuffling extract sequence.\n");
11491 dbgs() <<
"SLP: Reusing or shuffling of reordered extract sequence "
11493 for (
unsigned Idx : CurrentOrder)
11494 dbgs() <<
" " << Idx;
11501 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
11502 ReuseShuffleIndices, CurrentOrder);
11504 "(ExtractValueInst/ExtractElementInst).\n";
  case Instruction::InsertElement: {
    assert(ReuseShuffleIndices.empty() && "All inserts should be unique");

    auto OrdCompare = [](const std::pair<int, int> &P1,
                         const std::pair<int, int> &P2) {
      return P1.first > P2.first;
    };
    PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
                  decltype(OrdCompare)>
        Indices(OrdCompare);
    for (int I = 0, E = VL.size(); I < E; ++I) {
      unsigned Idx = *getElementIndex(VL[I]);
      Indices.emplace(Idx, I);
    }
    OrdersType CurrentOrder(VL.size(), VL.size());
    bool IsIdentity = true;
    for (int I = 0, E = VL.size(); I < E; ++I) {
      CurrentOrder[Indices.top().second] = I;
      IsIdentity &= Indices.top().second == I;
      Indices.pop();
    }
    if (IsIdentity)
      CurrentOrder.clear();
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 /*ReuseShuffleIndices=*/{}, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
               TE->dump());
    // ...
    buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
    return;
  }
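  // Illustration of the ordering computed above: for a buildvector chain
  //   %v0 = insertelement <4 x float> poison, float %a, i32 2
  //   %v1 = insertelement <4 x float> %v0,    float %b, i32 0
  //   %v2 = insertelement <4 x float> %v1,    float %c, i32 3
  //   %v3 = insertelement <4 x float> %v2,    float %d, i32 1
  // the priority queue sorts the lanes by insert index and produces
  // CurrentOrder = {2, 0, 3, 1}; had the inserts targeted lanes 0..3 in
  // order, IsIdentity would hold and CurrentOrder would be dropped.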
  case Instruction::Load: {
    // Check that a vectorized load would be consecutive, jumbled, strided,
    // masked + compressed, or must be scattered/gathered.
    TreeEntry *TE = nullptr;
    fixupOrderingIndices(CurrentOrder);
    switch (State) {
    case TreeEntry::Vectorize:
      TE = newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices,
                        CurrentOrder, InterleaveFactor);
      if (CurrentOrder.empty())
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
                   TE->dump());
      else
        LLVM_DEBUG(dbgs()
                       << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
                   TE->dump());
      break;
    case TreeEntry::CompressVectorize:
      // Vectorizing non-consecutive loads with a masked load + compress.
      TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      LLVM_DEBUG(
          dbgs()
              << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
          TE->dump());
      break;
    case TreeEntry::StridedVectorize:
      // Vectorizing non-consecutive loads with a strided load.
      TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
                 TE->dump());
      break;
    case TreeEntry::ScatterVectorize:
      // Vectorizing non-consecutive loads with a masked gather.
      TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices);
      LLVM_DEBUG(
          dbgs()
              << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
          TE->dump());
      break;
    case TreeEntry::CombinedVectorize:
    case TreeEntry::SplitVectorize:
    case TreeEntry::NeedToGather:
      llvm_unreachable("Unexpected loads state.");
    }
    if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
      assert(Operands.size() == 1 && "Expected a single operand only");
      SmallVector<int> Mask;
      inversePermutation(CurrentOrder, Mask);
      // ...
    }
    if (State == TreeEntry::ScatterVectorize)
      buildTreeRec(PointerOps, Depth + 1, {TE, 0});
    return;
  }
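  // Illustration of the load-entry states above, for a bundle of four i32
  // loads from base pointer %p:
  //   offsets 0,1,2,3    -> Vectorize        (consecutive, CurrentOrder empty)
  //   offsets 2,3,0,1    -> Vectorize        (jumbled, CurrentOrder = {2,3,0,1})
  //   offsets 0,4,8,12   -> StridedVectorize (constant stride kept in SPtrInfo)
  //   unrelated pointers -> ScatterVectorize (PointerOps become operand 0)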
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
        std::make_pair(std::numeric_limits<unsigned>::min(),
                       std::numeric_limits<unsigned>::max()));
    if (ShuffleOrOp == Instruction::ZExt ||
        ShuffleOrOp == Instruction::SExt) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             PrevMaxBW),
          std::min<unsigned>(
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
              PrevMinBW));
    } else if (ShuffleOrOp == Instruction::Trunc) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
              PrevMaxBW),
          std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             PrevMinBW));
    }
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
               TE->dump());

    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    if (ShuffleOrOp == Instruction::Trunc) {
      ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    } else if (ShuffleOrOp == Instruction::SIToFP ||
               ShuffleOrOp == Instruction::UIToFP) {
      unsigned NumSignBits =
          ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
      if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
        APInt Mask = DB->getDemandedBits(OpI);
        NumSignBits = std::max(NumSignBits, Mask.countl_zero());
      }
      if (NumSignBits * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    }
    return;
  }
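  // Illustration: for a bundle of `%t = trunc i64 %x to i16` instructions the
  // pair becomes {max(64, PrevMaxBW), min(16, PrevMinBW)}; for
  // `zext i16 %x to i64` the roles of source and destination types flip.
  // These bounds later gate the minimal-bitwidth analysis for cast chains.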
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    // ...
    if (cast<CmpInst>(VL0)->isCommutative()) {
      // Commutative predicate - collect + sort operands so that each side is
      // more likely to have the same opcode.
      assert(P0 == CmpInst::getSwappedPredicate(P0) &&
             "Commutative Predicate mismatch");
      // ...
    } else {
      // Collect operands - commute if a compare uses the swapped predicate.
      for (Value *V : VL) {
        // ...
        auto *Cmp = cast<CmpInst>(V);
        Value *LHS = Cmp->getOperand(0);
        Value *RHS = Cmp->getOperand(1);
        if (Cmp->getPredicate() != P0)
          std::swap(LHS, RHS);
        // ...
      }
    }
    // ...
    if (ShuffleOrOp == Instruction::ICmp) {
      unsigned NumSignBits0 =
          ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
      if (NumSignBits0 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      unsigned NumSignBits1 =
          ComputeNumSignBits(VL0->getOperand(1), *DL, AC, nullptr, DT);
      if (NumSignBits1 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
    }
    return;
  }
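  // Illustration: if both icmp operands are i32 values known to have at
  // least 17 sign bits (e.g. i16 values sign-extended to i32), then
  // NumSignBits * 2 >= 32 holds and both operand nodes are queued in
  // ExtraBitWidthNodes as candidates for narrowing to i16.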
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(
        dbgs() << "SLP: added a new TreeEntry "
                  "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
        TE->dump());
    // ...
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  case Instruction::GetElementPtr: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
               TE->dump());
    // ...
    return;
  }
  case Instruction::Store: {
    bool Consecutive = CurrentOrder.empty();
    if (!Consecutive)
      fixupOrderingIndices(CurrentOrder);
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    if (Consecutive)
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
                 TE->dump());
    else
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
                 TE->dump());
    buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
    return;
  }
  case Instruction::Call: {
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    // ...
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
               TE->dump());
    // ...
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  case Instruction::ShuffleVector: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    if (S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
                 TE->dump());
    } else {
      LLVM_DEBUG(dbgs()
                     << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
                 TE->dump());
    }
    // Reorder operands if reordering would enable vectorization. For
    // alternate compares the main and alternate predicates must differ
    // ("Expected different main/alternate predicates.").
    // ...
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  default:
    break;
  }
  llvm_unreachable("Unexpected vectorization of the instructions.");
}
unsigned BoUpSLP::canMapToVector(Type *T) const {
  unsigned N = 1;
  Type *EltTy = T;
  while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
    if (auto *ST = dyn_cast<StructType>(EltTy)) {
      // Check that struct is homogeneous.
      for (const auto *Ty : ST->elements())
        if (Ty != *ST->element_begin())
          return 0;
      N *= ST->getNumElements();
      EltTy = *ST->element_begin();
    } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
      N *= AT->getNumElements();
      EltTy = AT->getElementType();
    } else {
      auto *VT = cast<FixedVectorType>(EltTy);
      N *= VT->getNumElements();
      EltTy = VT->getElementType();
    }
  }
  if (!isValidElementType(EltTy))
    return 0;
  size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
    return 0;
  return N;
}
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
                              SmallVectorImpl<unsigned> &CurrentOrder,
                              bool ResizeAllowed) const {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
  assert(It != VL.end() && "Expected at least one extract instruction.");
  auto *E0 = cast<Instruction>(*It);
  // Check if all of the extracts come from the same vector and from the
  // correct offset.
  Value *Vec = E0->getOperand(0);

  CurrentOrder.clear();

  // We have to extract from a vector/aggregate with the same number of
  // elements.
  unsigned NElts;
  if (E0->getOpcode() == Instruction::ExtractValue) {
    NElts = canMapToVector(Vec->getType());
    if (!NElts)
      return false;
    // ...
  } else {
    NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
  }

  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
    return false;
  SmallVector<int> Indices(E, PoisonMaskElem);
  unsigned MinIdx = NElts, MaxIdx = 0;
  for (auto [I, V] : enumerate(VL)) {
    auto *Inst = dyn_cast<Instruction>(V);
    if (!Inst)
      continue;
    if (Inst->getOperand(0) != Vec)
      return false;
    // ...
    std::optional<unsigned> Idx = getExtractIndex(Inst);
    if (!Idx)
      return false;
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
      continue;
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
      MinIdx = ExtIdx;
    if (MaxIdx < ExtIdx)
      MaxIdx = ExtIdx;
  }
  if (MaxIdx - MinIdx + 1 > E)
    return false;
  if (MaxIdx + 1 <= E)
    MinIdx = 0;

  // Check that all of the indices extract from the correct offset.
  bool ShouldKeepOrder = true;
  // Assign to all items the initial value E, since there is no order.
  CurrentOrder.assign(E, E);
  for (unsigned I = 0; I < E; ++I) {
    if (Indices[I] == PoisonMaskElem)
      continue;
    const unsigned ExtIdx = Indices[I] - MinIdx;
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
      return false;
    }
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  }
  if (ShouldKeepOrder)
    CurrentOrder.clear();

  return ShouldKeepOrder;
}
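// Illustration: for VL = {extract %v,1; extract %v,0; extract %v,3;
// extract %v,2} from a single <4 x i32> %v, all indices are in range, so the
// function records CurrentOrder = {1, 0, 3, 2} and returns false (a reorder
// is needed); for extracts of lanes 0..3 in order it returns true with an
// empty CurrentOrder, since the identity needs no shuffle.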
bool BoUpSLP::areAllUsersVectorized(
    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
  return (I->hasOneUse() &&
          (!VectorizedVals || VectorizedVals->contains(I))) ||
         all_of(I->users(), [this](User *U) {
           return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
         });
}
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {
  unsigned Sz = Scalars.size();
  Mask.assign(Sz, PoisonMaskElem);
  SmallVector<int> OrderMask;
  if (!ReorderIndices.empty())
    inversePermutation(ReorderIndices, OrderMask);
  for (unsigned I = 0; I < Sz; ++I) {
    unsigned Idx = I;
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    if (isa<PoisonValue>(Scalars[Idx]))
      continue;
    auto *OpInst = cast<Instruction>(Scalars[Idx]);
    if (IsAltOp(OpInst)) {
      Mask[I] = Sz + Idx;
      if (AltScalars)
        AltScalars->push_back(OpInst);
    } else {
      Mask[I] = Idx;
      if (OpScalars)
        OpScalars->push_back(OpInst);
    }
  }
  if (!ReuseShuffleIndices.empty()) {
    SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
    transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    });
    Mask.swap(NewMask);
  }
}
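// Illustration: for an alternate node with Scalars = {add, sub, add, sub}
// and IsAltOp selecting the subs, the mask picks lane I either from the
// vector add (indices 0..Sz-1) or from the vector sub (indices Sz..2*Sz-1):
//   Mask = {0, 5, 2, 7}   // Sz = 4
// i.e. the blend that a single shufflevector of the two halves performs.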
  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) ==
         MainOp;
}

static bool isAlternateInstruction(const Instruction *I,
                                   const Instruction *MainOp,
                                   const Instruction *AltOp,
                                   const TargetLibraryInfo &TLI) {
  if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
    auto *AltCI = cast<CmpInst>(AltOp);
    CmpInst::Predicate MainP = MainCI->getPredicate();
    [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
    assert(MainP != AltP && "Expected different main/alternate predicates.");
    auto *CI = cast<CmpInst>(I);
    // ...
    CmpInst::Predicate P = CI->getPredicate();
    CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);

    assert((MainP == P || AltP == P || MainP == SwappedP ||
            AltP == SwappedP) &&
           "CmpInst expected to match either main or alternate predicate or "
           "their swap.");
    return MainP != P && MainP != SwappedP;
  }
  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
}

TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
  assert(!Ops.empty());
  const auto *Op0 = Ops.front();

  const bool IsConstant = all_of(Ops, [](Value *V) {
    return isConstant(V) && !isa<UndefValue>(V);
  });
  const bool IsUniform = all_of(Ops, [=](Value *V) { return V == Op0; });
  const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isPowerOf2();
    return false;
  });
  const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isNegatedPowerOf2();
    return false;
  });

  TTI::OperandValueKind VK = TTI::OK_AnyValue;
  if (IsConstant && IsUniform)
    VK = TTI::OK_UniformConstantValue;
  else if (IsConstant)
    VK = TTI::OK_NonUniformConstantValue;
  else if (IsUniform)
    VK = TTI::OK_UniformValue;

  TTI::OperandValueProperties VP = TTI::OP_None;
  VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
  VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;

  return {VK, VP};
}
/// Base class for shuffle instruction emission and shuffle cost estimation.
class BaseShuffleAnalysis {
protected:
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}

  /// V is expected to be a vectorized value. When REVEC is disabled, there is
  /// no difference between VF and VNumElements; when REVEC is enabled, VF is
  /// VNumElements / ScalarTyNumElements.
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    assert(isa<FixedVectorType>(V->getType()) &&
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    unsigned VNumElements =
        cast<FixedVectorType>(V->getType())->getNumElements();
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  }
  /// Checks if the mask is an identity mask.
  /// \param IsStrict if true, the function returns false if the mask size does
  /// not match the vector size.
  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
                             bool IsStrict) {
    int Limit = Mask.size();
    int VF = VecTy->getNumElements();
    int Index = -1;
    if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
      return true;
    if (!IsStrict) {
      // Consider extract subvector starting from index 0.
      if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
          Index == 0)
        return true;
      // All VF-size submasks are identity (e.g.
      // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> for VF 4).
      if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
            ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
            return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
                   ShuffleVectorInst::isIdentityMask(Slice, VF);
          }))
        return true;
    }
    return false;
  }

  /// Tries to combine 2 different masks into a single one.
  /// \param LocalVF Vector length of the permuted input vector.
  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
                           ArrayRef<int> ExtMask) {
    unsigned VF = Mask.size();
    SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      if (ExtMask[I] == PoisonMaskElem)
        continue;
      int MaskedIdx = Mask[ExtMask[I] % VF];
      NewMask[I] =
          MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
    }
    Mask.swap(NewMask);
  }
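  // Illustration: with Mask = {3, 2, 1, 0} (VF = 4) applied first and
  // ExtMask = {0, 0, 2, 2} applied on top, the combined permutation is
  //   NewMask[I] = Mask[ExtMask[I] % VF] = {3, 3, 1, 1};
  // LocalVF only wraps the result when the first input vector is shorter
  // than the mask.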
  /// Looks through shuffles trying to reduce the final number of shuffles in
  /// the code.
  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
                                  bool SinglePermute) {
    Value *Op = V;
    ShuffleVectorInst *IdentityOp = nullptr;
    SmallVector<int> IdentityMask;
    while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
      // ...
      auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
      if (!SVTy)
        break;
      // Remember the identity mask, if it is not a resizing shuffle.
      if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
             !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
                                                    IdentityMask.size()))) {
          IdentityOp = SV;
          IdentityMask.assign(Mask);
        }
      }
      // Remember the broadcast mask.
      if (SV->isZeroEltSplat()) {
        IdentityOp = SV;
        IdentityMask.assign(Mask);
      }
      int LocalVF = Mask.size();
      if (auto *SVOpTy =
              dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
        LocalVF = SVOpTy->getNumElements();
      SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
      for (auto [Idx, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem ||
            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
          continue;
        ExtMask[Idx] = SV->getMaskValue(I);
      }
      // ...
      if (!IsOp1Undef && !IsOp2Undef) {
        // Update mask and mark undef elements.
        for (int &I : Mask) {
          if (I == PoisonMaskElem)
            continue;
          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
              PoisonMaskElem)
            I = PoisonMaskElem;
        }
        break;
      }
      SmallVector<int> ShuffleMask(SV->getShuffleMask());
      combineMasks(LocalVF, ShuffleMask, Mask);
      Mask.swap(ShuffleMask);
      if (IsOp2Undef)
        Op = SV->getOperand(0);
      else
        Op = SV->getOperand(1);
    }
    if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
        !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
        ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
      if (IdentityOp) {
        V = IdentityOp;
        assert(Mask.size() == IdentityMask.size() &&
               "Expected masks of same sizes.");
        Mask.swap(IdentityMask);
        auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
        return SinglePermute &&
               (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
                               /*IsStrict=*/false) ||
                (Shuffle &&
                 Mask.size() == Shuffle->getShuffleMask().size() &&
                 Shuffle->isZeroEltSplat() &&
                 all_of(enumerate(Mask), [&](const auto &P) {
                   return P.value() == PoisonMaskElem ||
                          Shuffle->getShuffleMask()[P.index()] == 0;
                 })));
      }
      V = Op;
      return false;
    }
    V = Op;
    return true;
  }
  /// Smart shuffle instruction emission, walks through shuffle trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder, Type *ScalarTy) {
    assert(V1 && "Expected at least one vector value.");
    SmallVector<int> NewMask(Mask);
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    if (ScalarTyNumElements != 1) {
      // ...
      Mask = NewMask;
    }
    if (V2)
      Builder.resizeToMatch(V1, V2);
    int VF = Mask.size();
    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
      VF = FTy->getNumElements();
    if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
                   V2, buildUseMask(VF, Mask, UseMask::SecondArg))
                   .all()) {
      // Peek through shuffles.
      Value *Op1 = V1;
      Value *Op2 = V2;
      SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
      SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (Mask[I] < VF)
          CombinedMask1[I] = Mask[I];
        else
          CombinedMask2[I] = Mask[I] - VF;
      }
      Value *PrevOp1;
      Value *PrevOp2;
      do {
        PrevOp1 = Op1;
        PrevOp2 = Op2;
        (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
        (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
        // Check if we have 2 resizing shuffles - need to peek through operands
        // again.
        if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
          if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
            SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(CombinedMask1)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask1[Idx] = SV1->getMaskValue(I);
            }
            SmallBitVector UseMask1 = buildUseMask(
                cast<FixedVectorType>(SV1->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask1, UseMask::SecondArg);
            SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(CombinedMask2)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask2[Idx] = SV2->getMaskValue(I);
            }
            SmallBitVector UseMask2 = buildUseMask(
                cast<FixedVectorType>(SV2->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask2, UseMask::SecondArg);
            if (SV1->getOperand(0)->getType() ==
                    SV2->getOperand(0)->getType() &&
                SV1->getOperand(0)->getType() != SV1->getType() &&
                isUndefVector(SV1->getOperand(1), UseMask1).all() &&
                isUndefVector(SV2->getOperand(1), UseMask2).all()) {
              Op1 = SV1->getOperand(0);
              Op2 = SV2->getOperand(0);
              SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
              int LocalVF = ShuffleMask1.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
              CombinedMask1.swap(ShuffleMask1);
              SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
              LocalVF = ShuffleMask2.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
              CombinedMask2.swap(ShuffleMask2);
            }
          }
      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
      Builder.resizeToMatch(Op1, Op2);
      VF = std::max(cast<VectorType>(Op1->getType())
                        ->getElementCount()
                        .getKnownMinValue(),
                    cast<VectorType>(Op2->getType())
                        ->getElementCount()
                        .getKnownMinValue());
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (CombinedMask2[I] != PoisonMaskElem) {
          assert(CombinedMask1[I] == PoisonMaskElem &&
                 "Expected undefined mask element");
          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
        }
      }
      // ...
      if (Op1 == Op2 && isIdentityMask(CombinedMask1,
                                       cast<FixedVectorType>(Op1->getType()),
                                       /*IsStrict=*/true))
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
          Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
          CombinedMask1);
    }
    if (isa<PoisonValue>(V1))
      return Builder.createPoison(
          cast<VectorType>(V1->getType())->getElementType(), Mask.size());
    bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
    assert(V1 && "Expected non-null value after looking through shuffles.");

    if (!IsIdentity)
      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
  }

  /// Transforms mask \p CommonMask per given \p Mask to make proper set after
  /// shuffle is inserted after shuffle.
  static void transformMaskAfterShuffle(SmallVectorImpl<int> &CommonMask,
                                        ArrayRef<int> Mask) {
/// Calculate the scalar and the vector costs from vectorizing a set of GEPs.
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy) {
  InstructionCost ScalarCost = 0;
  InstructionCost VecCost = 0;
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // The pointer operands of wide loads/stores stay scalar: the scalar cost
    // is a unit-stride pointer chain, the vector side keeps only the pointers
    // that remain live outside the vectorized access.
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
        CostKind);

    SmallVector<const Value *> PtrsRetainedInVecCode;
    for (Value *V : Ptrs) {
      if (V == BasePtr) {
        PtrsRetainedInVecCode.push_back(V);
        continue;
      }
      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
      // For simplicity, assume Ptr stays in vectorized code if it is not a
      // single-use GEP instruction.
      if (!Ptr || !Ptr->hasOneUse())
        PtrsRetainedInVecCode.push_back(V);
    }

    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // If all pointers stay in vectorized code, there are no savings.
      return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
    }
    VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
                                       TTI::PointersChainInfo::getKnownStride(),
                                       VecTy, CostKind);
  } else {
    // The GEPs themselves are being vectorized into a single GEP with vector
    // operands.
    TTI::PointersChainInfo PtrsInfo =
        all_of(Ptrs,
               [](const Value *V) {
                 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
                 return Ptr && !Ptr->hasAllConstantIndices();
               })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();

    ScalarCost =
        TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
    auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
    if (!BaseGEP) {
      auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
      if (It != Ptrs.end())
        BaseGEP = cast<GEPOperator>(*It);
    }
    if (BaseGEP) {
      SmallVector<const Value *> Indices(BaseGEP->indices());
      VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                               BaseGEP->getPointerOperand(), Indices, VecTy,
                               CostKind);
    }
  }

  return std::make_pair(ScalarCost, VecCost);
}
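// Illustration: for four loads from %base+0..3, the scalar side models four
// unit-stride GEPs while the vector side typically keeps only the base
// pointer (the offsets fold into the wide load), so VecCost is roughly the
// cost of a single GEP and the difference favors vectorization.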
void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
  assert(TE.isGather() && TE.ReorderIndices.empty() &&
         "Expected gather node without reordering.");
  MapVector<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
  SmallSet<size_t, 2> LoadKeyUsed;

  // Do not reorder nodes if the node is small (just 2 elements), all-constant
  // or all instructions have the same opcode already.
  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
      all_of(TE.Scalars, isConstant))
    return;

  if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
        return VectorizableTree[Idx]->isSame(TE.Scalars);
      }))
    return;

  auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
    Key = hash_combine(hash_value(LI->getParent()), Key);
    Value *Ptr =
        getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
    if (LoadKeyUsed.contains(Key)) {
      auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
      if (LIt != LoadsMap.end()) {
        for (LoadInst *RLI : LIt->second) {
          if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                              LI->getType(), LI->getPointerOperand(), *DL, *SE,
                              /*StrictCheck=*/true))
            return hash_value(RLI->getPointerOperand());
        }
        for (LoadInst *RLI : LIt->second) {
          if (arePointersCompatible(RLI->getPointerOperand(),
                                    LI->getPointerOperand(), *TLI))
            return hash_value(RLI->getPointerOperand());
        }
        if (LIt->second.size() > 2)
          return hash_value(LIt->second.back()->getPointerOperand());
      }
    }
    LoadKeyUsed.insert(Key);
    LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
    return hash_value(LI->getPointerOperand());
  };
  MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
  SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
  bool IsOrdered = true;
  unsigned NumInstructions = 0;
  // Try to "cluster" scalar instructions, to be able to build extra
  // vectorized nodes.
  for (auto [I, V] : enumerate(TE.Scalars)) {
    size_t Key = 1, Idx = 1;
    if (auto *Inst = dyn_cast<Instruction>(V); Inst && !isDeleted(Inst)) {
      std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
                                             /*AllowAlternate=*/false);
      ++NumInstructions;
    }
    auto &Container = SortedValues[Key];
    if (IsOrdered && !KeyToIndex.contains(V) &&
        !(isa<Constant, ExtractElementInst>(V) ||
          isVectorLikeInstWithConstOps(V)) &&
        ((Container.contains(Idx) &&
          KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
         (!Container.empty() && !Container.contains(Idx) &&
          KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
      IsOrdered = false;
    auto &KTI = KeyToIndex[V];
    if (KTI.empty())
      Container[Idx].push_back(V);
    KTI.push_back(I);
  }
  SmallVector<std::pair<unsigned, unsigned>> SubVectors;
  APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
  if (!IsOrdered && NumInstructions > 1) {
    unsigned Cnt = 0;
    TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
    for (const auto &D : SortedValues) {
      for (const auto &P : D.second) {
        unsigned Sz = 0;
        for (Value *V : P.second) {
          ArrayRef<unsigned> Indices = KeyToIndex.at(V);
          for (auto [K, Idx] : enumerate(Indices)) {
            TE.ReorderIndices[Cnt + K] = Idx;
            TE.Scalars[Cnt + K] = V;
          }
          Sz += Indices.size();
          Cnt += Indices.size();
        }
        if (Sz > 1 && isa<Instruction>(P.second.front())) {
          const unsigned SubVF = getFloorFullVectorNumberOfElements(
              *TTI, TE.Scalars.front()->getType(), Sz);
          SubVectors.emplace_back(Cnt - Sz, SubVF);
          for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
            DemandedElts.clearBit(I);
        } else if (!P.second.empty() && isConstant(P.second.front())) {
          for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
            DemandedElts.clearBit(I);
        }
      }
    }
  }
  // Reuses always require shuffles, so consider them as profitable.
  if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
    return;
  // Do simple cost estimation.
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost = 0;
  auto *ScalarTy = TE.Scalars.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
  for (auto [Idx, Sz] : SubVectors) {
    Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {},
                             CostKind, Idx, getWidenedType(ScalarTy, Sz));
  }
  int Sz = TE.Scalars.size();
  SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                               TE.ReorderIndices.end());
  for (unsigned I : seq<unsigned>(Sz)) {
    Value *V = TE.getOrdered(I);
    if (isa<PoisonValue>(V))
      ReorderMask[I] = PoisonMaskElem;
    else if (isConstant(V) || DemandedElts[I])
      ReorderMask[I] = I + TE.ReorderIndices.size();
  }
  Cost += ::getShuffleCost(*TTI,
                           any_of(ReorderMask, [&](int I) { return I >= Sz; })
                               ? TTI::SK_PermuteTwoSrc
                               : TTI::SK_PermuteSingleSrc,
                           VecTy, ReorderMask);
  DemandedElts = APInt::getAllOnes(TE.Scalars.size());
  ReorderMask.assign(Sz, PoisonMaskElem);
  for (unsigned I : seq<unsigned>(Sz)) {
    Value *V = TE.getOrdered(I);
    if (isConstant(V)) {
      DemandedElts.clearBit(I);
      if (!isa<PoisonValue>(V))
        ReorderMask[I] = I;
    } else {
      ReorderMask[I] = I + Sz;
    }
  }
  InstructionCost BVCost = TTI->getScalarizationOverhead(
      VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
  if (!DemandedElts.isAllOnes())
    BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
  if (Cost >= BVCost) {
    SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
    reorderScalars(TE.Scalars, Mask);
    TE.ReorderIndices.clear();
  }
}
/// Checks if the \p VL can be transformed to FMA/FMULADD operations.
static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
                                       const InstructionsState &S,
                                       DominatorTree &DT, const DataLayout &DL,
                                       TargetTransformInfo &TTI,
                                       const TargetLibraryInfo &TLI) {
  assert(all_of(VL,
                [](Value *V) {
                  return V->getType()->getScalarType()->isFloatingPointTy();
                }) &&
         "Can only convert to FMA for floating point types");
  assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");

  auto CheckForContractable = [&](ArrayRef<Value *> VL) {
    FastMathFlags FMF;
    FMF.set();
    for (Value *V : VL) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (S.isCopyableElement(I))
        continue;
      Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
      if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
        continue;
      if (auto *FPCI = dyn_cast<FPMathOperator>(I))
        FMF &= FPCI->getFastMathFlags();
    }
    return FMF.allowContract();
  };
  if (!CheckForContractable(VL))
    return InstructionCost::getInvalid();
  // The fmul operand must be contractable as well.
  InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
  SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);
  InstructionsState OpS = getSameOpcode(Operands.front(), TLI);
  if (!OpS || OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
    return InstructionCost::getInvalid();
  if (!CheckForContractable(Operands.front()))
    return InstructionCost::getInvalid();
  // Compare the costs.
  InstructionCost FMulPlusFAddCost = 0;
  InstructionCost FMACost = 0;
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  FastMathFlags FMF;
  FMF.set();
  for (Value *V : VL) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      continue;
    if (!S.isCopyableElement(I))
      if (auto *FPCI = dyn_cast<FPMathOperator>(I))
        FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
  }
  for (auto [V, Op] : zip(VL, Operands.front())) {
    if (S.isCopyableElement(V))
      continue;
    auto *I = dyn_cast<Instruction>(Op);
    if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
      if (auto *OpI = dyn_cast<Instruction>(Op))
        FMACost += TTI.getInstructionCost(OpI, CostKind);
      continue;
    }
    if (auto *FPCI = dyn_cast<FPMathOperator>(I))
      FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
  }
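// Illustration: the contraction modeled here rewrites
//   %m = fmul contract float %a, %b
//   %s = fadd contract float %m, %c
// into a single `call float @llvm.fmuladd.f32(float %a, float %b, float %c)`
// when the fused cost (FMACost) beats FMulPlusFAddCost; fast-math flags are
// intersected across all participating instructions.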
  BaseGraphSize = VectorizableTree.size();
  // Turn graph transforming mode on and off, when done.
  class GraphTransformModeRAAI {
    bool &SavedIsGraphTransformMode;

  public:
    GraphTransformModeRAAI(bool &IsGraphTransformMode)
        : SavedIsGraphTransformMode(IsGraphTransformMode) {
      IsGraphTransformMode = true;
    }
    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
  } TransformContext(IsGraphTransformMode);
  // Operands are profitable if they are: at least one constant, splats, or
  // results of the same instructions repeated several times.
  auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
                                           const InstructionsState &S) {
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    return all_of(
        Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
          return all_of(Cand,
                        [](const std::pair<Value *, Value *> &P) {
                          return isa<Constant>(P.first) ||
                                 isa<Constant>(P.second) ||
                                 P.first == P.second;
                        }) ||
                 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
        });
  };

  // Try to reorder gather nodes for better vectorization opportunities.
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather())
      reorderGatherNode(E);
  }

  // Better to use full gathered loads analysis, if there are only 2 small
  // gathered load nodes.
  constexpr unsigned VFLimit = 16;
  bool ForceLoadGather =
      count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && TE->hasState() &&
               TE->getOpcode() == Instruction::Load &&
               TE->getVectorFactor() < VFLimit;
      }) == 2;

  // Checks if the scalars are used in other node.
  auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
                              function_ref<bool(Value *)> CheckContainer) {
    return TE->isSame(VL) || all_of(VL, [&](Value *V) {
             // ...
             return CheckContainer(V);
           });
  };
  auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
    if (E.hasState()) {
      if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);
              });
            });
          }))
        return true;
      if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);
              });
            });
          }))
        return true;
    } else {
      // Check if the gather node is a full copy of a split node.
      auto *It = find_if(E.Scalars, IsaPred<Instruction>);
      if (It != E.Scalars.end()) {
        if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
            !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
              return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
                ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
                return !VTEs.empty() &&
                       any_of(VTEs, [&](const TreeEntry *TE) {
                         return is_contained(TEs, TE);
                       });
              });
            }))
          return true;
      }
    }
    return false;
  };
  // The tree may grow here, so iterate only up to the initial size.
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather()) {
      ArrayRef<Value *> VL = E.Scalars;
      const unsigned Sz = getVectorElementSize(VL.front());
      unsigned MinVF = getMinVF(2 * Sz);
      // Do not try partial vectorization for small nodes (<= 2), nodes with
      // the same opcode and same parent block, or all constants.
      if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
          !(!E.hasState() || E.getOpcode() == Instruction::Load ||
            E.isAltShuffle() || !allSameBlock(VL)) ||
          allConstant(VL) || isSplat(VL))
        continue;
      if (ForceLoadGather && E.hasState() &&
          E.getOpcode() == Instruction::Load)
        continue;
      // Check if the node is a copy of other vector nodes.
      if (CheckForSameVectorNodes(E))
        continue;
      // Try to find vectorizable sequences and transform them into a series
      // of insertvector instructions.
      unsigned StartIdx = 0;
      unsigned End = VL.size();
      for (unsigned VF = getFloorFullVectorNumberOfElements(
               *TTI, VL.front()->getType(), VL.size() - 1);
           VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
                            *TTI, VL.front()->getType(), VF - 1)) {
        if (StartIdx + VF > End)
          continue;
        SmallVector<std::pair<unsigned, unsigned>> Slices;
        bool AllStrided = true;
        for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
          // If any instruction is vectorized already - do not try again,
          // unless an existing node fully matches the slice.
          if (isVectorized(Slice.front()) &&
              !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
            continue;
          // Constants are handled effectively already - skip. Also skip small
          // splats (less than a vector register with a single non-undef
          // element).
          bool IsSplat = isSplat(Slice);
          bool IsTwoRegisterSplat = true;
          if (IsSplat && VF == 2) {
            unsigned NumRegs2VF = ::getNumberOfParts(
                *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
            IsTwoRegisterSplat = NumRegs2VF == 2;
          }
          if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
              count(Slice, Slice.front()) == 1) {
            if (IsSplat)
              continue;
            InstructionsState S = getSameOpcode(Slice, *TLI);
            if (!S || S.isAltShuffle() || !allSameBlock(Slice) ||
                (S.getOpcode() == Instruction::Load &&
                 areKnownNonVectorizableLoads(Slice)) ||
                (S.getOpcode() != Instruction::Load &&
                 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
                                           VF)))
              continue;
            if (VF == 2) {
              // Try to vectorize reduced values or if all users are
              // vectorized; for expensive instructions extra extracts might
              // still be profitable.
              if ((!UserIgnoreList || E.Idx != 0) &&
                  TTI->getInstructionCost(S.getMainOp(), CostKind) <
                      TTI::TCC_Expensive &&
                  !all_of(Slice, [&](Value *V) {
                    return areAllUsersVectorized(cast<Instruction>(V),
                                                 UserIgnoreList);
                  }))
                continue;
              if (S.getOpcode() == Instruction::Load) {
                OrdersType Order;
                SmallVector<Value *> PointerOps;
                StridedPtrInfo SPtrInfo;
                LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order,
                                                   PointerOps, SPtrInfo);
                // Do not vectorize gathers and scatters here.
                // ...
              } else if (S.getOpcode() == Instruction::ExtractElement ||
                         (TTI->getInstructionCost(S.getMainOp(), CostKind) <
                              TTI::TCC_Expensive &&
                          !CheckOperandsProfitability(
                              S.getMainOp(),
                              cast<Instruction>(*find_if(
                                  reverse(Slice), IsaPred<Instruction>)),
                              S))) {
                // Extractelements are handled effectively already; skip
                // non-profitable instructions.
                continue;
              }
            }
          }
          Slices.emplace_back(Cnt, Slice.size());
        }
        if (VF == 2 && AllStrided && Slices.size() > 2)
          continue;
        auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
          E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
          if (StartIdx == Cnt)
            StartIdx = Cnt + Sz;
          if (End == Cnt + Sz)
            End = Cnt;
        };
        for (auto [Cnt, Sz] : Slices) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
          const TreeEntry *SameTE = nullptr;
          if (const auto *It = find_if(Slice, IsaPred<Instruction>);
              It != Slice.end()) {
            // Reuse a matching node, if vectorized already.
            SameTE = getSameValuesTreeEntry(*It, Slice);
          }
          unsigned PrevSize = VectorizableTree.size();
          [[maybe_unused]] unsigned PrevEntriesSize =
              LoadEntriesToVectorize.size();
          buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
          if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
              VectorizableTree[PrevSize]->isGather() &&
              VectorizableTree[PrevSize]->hasState() &&
              VectorizableTree[PrevSize]->getOpcode() !=
                  Instruction::ExtractElement &&
              !isSplat(Slice)) {
            if (UserIgnoreList && E.Idx == 0 && VF == 2)
              analyzedReductionVals(Slice);
            VectorizableTree.pop_back();
            assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
                   "LoadEntriesToVectorize expected to remain the same");
            continue;
          }
          AddCombinedNode(PrevSize, Cnt, Sz);
        }
      }
      // Restore ordering, if no extra vectorization happened.
      if (E.CombinedEntriesWithIndices.empty() &&
          !E.ReorderIndices.empty()) {
        SmallVector<int> Mask(E.ReorderIndices.begin(),
                              E.ReorderIndices.end());
        reorderScalars(E.Scalars, Mask);
        E.ReorderIndices.clear();
      }
      continue;
    }
    switch (E.getOpcode()) {
    case Instruction::Load: {
      // No need to handle masked gather entries; other load entries are
      // handled below.
      if (E.State != TreeEntry::Vectorize)
        break;
      Type *ScalarTy = E.getMainOp()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
      // Check if profitable to represent consecutive load + reverse as
      // strided load.
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseLI = cast<LoadInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
                                 BaseLI->getPointerAddressSpace(), CostKind,
                                 TTI::OperandValueInfo()) +
            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
        if (StridedCost < OriginalVecCost || ForceStridedLoads) {
          // Strided load is more profitable than consecutive load + reverse -
          // transform the node to strided load.
          Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
                                                ->getPointerOperand()
                                                ->getType());
          StridedPtrInfo SPtrInfo;
          SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
          SPtrInfo.Ty = VecTy;
          TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
          E.State = TreeEntry::StridedVectorize;
        }
      }
      break;
    }
    case Instruction::Store: {
      Type *ScalarTy =
          cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
      // Check if profitable to represent consecutive store + reverse as
      // strided store.
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseSI = cast<StoreInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
                                 BaseSI->getPointerAddressSpace(), CostKind,
                                 TTI::OperandValueInfo()) +
            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
        if (StridedCost < OriginalVecCost)
          // Strided store is more profitable than reverse + consecutive
          // store - transform the node to strided store.
          E.State = TreeEntry::StridedVectorize;
      } else if (!E.ReorderIndices.empty()) {
        // Check for interleaved stores.
        auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
          auto *BaseSI = cast<StoreInst>(E.Scalars.front());
          assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
          if (Mask.size() < 4)
            return 0u;
          for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
            if (ShuffleVectorInst::isInterleaveMask(
                    Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
                TTI.isLegalInterleavedAccessType(
                    VecTy, Factor, BaseSI->getAlign(),
                    BaseSI->getPointerAddressSpace()))
              return Factor;
          }
          return 0u;
        };
        SmallVector<int> Mask(E.ReorderIndices.begin(),
                              E.ReorderIndices.end());
        unsigned InterleaveFactor = IsInterleaveMask(Mask);
        if (InterleaveFactor != 0)
          E.setInterleave(InterleaveFactor);
      }
      break;
    }
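      // Illustration: a reordered store of 8 elements with
      //   Mask = {0, 4, 1, 5, 2, 6, 3, 7}
      // is an interleave of two contiguous groups (Factor = 2); when the
      // target reports legal interleaved access for this type, the node is
      // emitted as an interleaved store instead of shuffle + store.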
    case Instruction::Select: {
      if (E.State != TreeEntry::Vectorize)
        break;
      auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
      if (MinMaxID == Intrinsic::not_intrinsic)
        break;
      // This node is a minmax node.
      E.CombinedOp = TreeEntry::MinMax;
      TreeEntry *CondEntry = getOperandEntry(&E, 0);
      if (SelectOnly && CondEntry->UserTreeIndex &&
          CondEntry->State == TreeEntry::Vectorize) {
        // The condition node is part of the combined minmax node.
        CondEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    case Instruction::FSub:
    case Instruction::FAdd: {
      // Check if possible to convert (a*b)+c to fma.
      if (E.State != TreeEntry::Vectorize ||
          !E.getOperations().isAddSubLikeOp())
        break;
      if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
               .isValid())
        break;
      // This node is a fmuladd node.
      E.CombinedOp = TreeEntry::FMulAdd;
      TreeEntry *FMulEntry = getOperandEntry(&E, 0);
      if (FMulEntry->UserTreeIndex &&
          FMulEntry->State == TreeEntry::Vectorize) {
        // The FMul node is part of the combined fmuladd node.
        FMulEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    default:
      break;
    }
  }
  if (LoadEntriesToVectorize.empty()) {
    // Single load node - exit.
    if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Load)
      return;
    // Small graph with small VF - exit.
    constexpr unsigned SmallTree = 3;
    constexpr unsigned SmallVF = 2;
    if ((VectorizableTree.size() <= SmallTree &&
         VectorizableTree.front()->Scalars.size() == SmallVF) ||
        (VectorizableTree.size() <= 2 && UserIgnoreList))
      return;

    if (VectorizableTree.front()->isNonPowOf2Vec() &&
        getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
        getCanonicalGraphSize() <= SmallTree &&
        count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
                 [](const std::unique_ptr<TreeEntry> &TE) {
                   return TE->isGather() && TE->hasState() &&
                          TE->getOpcode() == Instruction::Load &&
                          !allSameBlock(TE->Scalars);
                 }) == 1)
      return;
  }

  // A list of loads to be gathered during the vectorization process. We can
  // try to vectorize them at the end, if profitable.
  SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
      GatheredLoads;

  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    if (E.isGather() &&
        ((E.hasState() && E.getOpcode() == Instruction::Load) ||
         (!E.hasState() && any_of(E.Scalars,
                                  [&](Value *V) {
                                    return isa<LoadInst>(V) &&
                                           !isVectorized(V) &&
                                           !isDeleted(cast<Instruction>(V));
                                  }))) &&
        !isSplat(E.Scalars)) {
      for (Value *V : E.Scalars) {
        auto *LI = dyn_cast<LoadInst>(V);
        if (!LI || isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
          continue;
        gatherPossiblyVectorizableLoads(
            *this, V, *DL, *SE, *TTI,
            GatheredLoads[std::make_tuple(
                LI->getParent(),
                getUnderlyingObject(LI->getPointerOperand(),
                                    RecursionMaxDepth),
                LI->getType())]);
      }
    }
  }
  // Try to vectorize gathered loads if this is not just a gather of loads.
  if (!GatheredLoads.empty())
    tryToVectorizeGatheredLoads(GatheredLoads);
}
/// Merges shuffle masks and emits the final shuffle instruction, if required.
/// Implements lazy shuffle emission to estimate the cost of long shuffle
/// chains as a whole.
class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  SmallVector<int> CommonMask;
  SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
  const TargetTransformInfo &TTI;
  InstructionCost Cost = 0;
  SmallDenseSet<Value *> VectorizedVals;
  BoUpSLP &R;
  SmallPtrSetImpl<Value *> &CheckedExtracts;
  constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  /// While set, we are still trying to estimate the cost for the same nodes
  /// and can delay actual cost estimation (virtual shuffle instruction
  /// emission).
  bool SameNodesEstimated = true;

  static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
    if (Ty->getScalarType()->isPointerTy()) {
      Constant *Res = ConstantExpr::getIntToPtr(
          ConstantInt::getAllOnesValue(IntegerType::get(
              Ty->getContext(),
              DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
          Ty->getScalarType());
      // ...
      return Res;
    }
    return Constant::getAllOnesValue(Ty);
  }

  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
    auto *VecTy = getWidenedType(ScalarTy, VL.size());
    SmallVector<Value *> Gathers(VL);
    if (!Root && isSplat(VL)) {
      // Found the broadcasting of the single scalar, calculate the cost as
      // the broadcast.
      const auto *It = find_if_not(VL, IsaPred<UndefValue>);
      assert(It != VL.end() && "Expected at least one non-undef value.");
      // Add broadcast for non-identity shuffle only.
      bool NeedShuffle =
          count(VL, *It) > 1 &&
          (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
      if (!NeedShuffle) {
        // ...
        return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                      CostKind, std::distance(VL.begin(), It),
                                      PoisonValue::get(VecTy), *It);
      }
      SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
      transform(VL, ShuffleMask.begin(), [](Value *V) {
        return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
      });
      InstructionCost InsertCost =
          TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
                                 0, PoisonValue::get(VecTy), *It);
      return InsertCost + ::getShuffleCost(TTI, TTI::SK_Broadcast, VecTy,
                                           ShuffleMask, CostKind);
    }
    // ...
    return GatherCost +
           (all_of(Gathers, IsaPred<UndefValue>)
                ? TTI::TCC_Free
                : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
                                  ScalarTy));
  }
  /// Compute the cost of creating a vector containing the extracted values
  /// from \p VL.
  InstructionCost
  computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
                     ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                     unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
    unsigned NumElts =
        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          if (!EE)
            return Sz;
          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
          if (!VecTy)
            return Sz;
          return std::max(Sz, VecTy->getNumElements());
        });
    unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
    // Checks whether the shuffle for one part can be done per register; if
    // not, std::nullopt is returned and the part is costed as a full-length
    // shuffle. Otherwise the mask indices are rewritten to be register-local.
    auto CheckPerRegistersShuffle =
        [&](MutableArrayRef<int> Mask, SmallVectorImpl<unsigned> &Indices,
            SmallVectorImpl<unsigned> &SubVecSizes)
        -> std::optional<TTI::ShuffleKind> {
      if (NumElts <= EltsPerVector)
        return std::nullopt;
      int OffsetReg0 =
          alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
                                    [](int S, int I) {
                                      if (I == PoisonMaskElem)
                                        return S;
                                      return std::min(S, I);
                                    }),
                    EltsPerVector);
      int OffsetReg1 = OffsetReg0;
      // Check that if trying to permute the same single/2 input vectors.
      TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
      int FirstRegId = -1;
      Indices.assign(1, OffsetReg0);
      SmallSet<int, 4> RegIndices;
      for (auto [Pos, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem)
          continue;
        int Idx = I - OffsetReg0;
        int RegId =
            (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
        if (FirstRegId < 0)
          FirstRegId = RegId;
        RegIndices.insert(RegId);
        if (RegIndices.size() > 2)
          return std::nullopt;
        if (RegIndices.size() == 2) {
          ShuffleKind = TTI::SK_PermuteTwoSrc;
          if (Indices.size() == 1) {
            OffsetReg1 = alignDown(
                std::accumulate(
                    std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
                    [&](int S, int I) {
                      if (I == PoisonMaskElem)
                        return S;
                      int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
                                  ((I - OffsetReg0) % NumElts) / EltsPerVector;
                      if (RegId == FirstRegId)
                        return S;
                      return std::min(S, I);
                    }),
                EltsPerVector);
            unsigned Index = OffsetReg1 % NumElts;
            Indices.push_back(Index);
            SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
          }
          Idx = I - OffsetReg1;
        }
        I = (Idx % NumElts) % EltsPerVector +
            (RegId == FirstRegId ? 0 : EltsPerVector);
      }
      return ShuffleKind;
    };
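    // Illustration: with 16 elements split into 2 registers of 8
    // (EltsPerVector = 8), a submask touching only source registers 0 and 2
    // is rewritten into register-local indices {0..7} for the first source
    // register and {8..15} for the second; a submask touching three or more
    // registers yields std::nullopt and is costed as one full-length shuffle.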
    InstructionCost Cost = 0;
    for (unsigned Part : seq<unsigned>(NumParts)) {
      if (!ShuffleKinds[Part])
        continue;
      ArrayRef<int> MaskSlice = Mask.slice(
          Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
      SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
      copy(MaskSlice, SubMask.begin());
      SmallVector<unsigned, 2> Indices;
      SmallVector<unsigned, 2> SubVecSizes;
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
      if (!RegShuffleKind) {
        if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
            !ShuffleVectorInst::isIdentityMask(
                MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
          Cost +=
              ::getShuffleCost(TTI, *ShuffleKinds[Part],
                               getWidenedType(ScalarTy, NumElts), MaskSlice);
        continue;
      }
      // ...
      const unsigned BaseVF = getFullVectorNumberOfElements(
          *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
      for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
        assert((Idx + SubVecSize) <= BaseVF &&
               "SK_ExtractSubvector index out of range");
        // ...
      }
      // Second attempt: check if a plain permute is estimated cheaper than
      // the subvector extracts.
      InstructionCost OriginalCost =
          ::getShuffleCost(TTI, *ShuffleKinds[Part],
                           getWidenedType(ScalarTy, NumElts), SubMask);
      if (OriginalCost < Cost)
        Cost = OriginalCost;
    }
    return Cost;
  }
  /// Estimates the cost of permuting the scalars of node \p E1 (and,
  /// optionally, \p E2) per the given \p Mask, delaying emission while the
  /// same nodes are being reshuffled.
  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
                                ArrayRef<int> Mask, unsigned Part,
                                unsigned SliceSize) {
    if (SameNodesEstimated) {
      // Delay the cost estimation if the same nodes are reshuffling: include
      // this sub-mask into CommonMask to estimate it later and avoid double
      // cost estimation.
      if ((InVectors.size() == 2 &&
           cast<const TreeEntry *>(InVectors.front()) == &E1 &&
           cast<const TreeEntry *>(InVectors.back()) == E2) ||
          (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
        unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
        assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
                      [](int Idx) { return Idx == PoisonMaskElem; }) &&
               "Expected all poisoned elements.");
        ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
        copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
        return;
      }
      // Found non-matching nodes - need to estimate the cost for the matched
      // ones and transform the mask.
      Cost += createShuffle(InVectors.front(),
                            InVectors.size() == 1 ? nullptr : InVectors.back(),
                            CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      unsigned VF = E1.getVectorFactor();
      if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
        VF = std::max(VF, getVF(V1));
      } else {
        const auto *E = cast<const TreeEntry *>(InVectors.front());
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(InVectors.front(), &E1, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else {
      auto P = InVectors.front();
      Cost += createShuffle(&E1, E2, Mask);
      unsigned VF = Mask.size();
      if (Value *V1 = P.dyn_cast<Value *>()) {
        VF = std::max(VF, getVF(V1));
      } else {
        const auto *E = cast<const TreeEntry *>(P);
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
      Cost += createShuffle(P, InVectors.front(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
  }
  class ShuffleCostBuilder {
    const TargetTransformInfo &TTI;

    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
      int Index = -1;
      return Mask.empty() ||
             (VF == Mask.size() &&
              ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
             (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
              Index == 0);
    }

  public:
    ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
    ~ShuffleCostBuilder() = default;
    InstructionCost createShuffleVector(Value *V1, Value *,
                                        ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      unsigned VF =
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      // ...
    }
    InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      unsigned VF =
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      // ...
    }
    InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
    void resizeToMatch(Value *&, Value *&) const {}
  };
  /// Cost-model counterpart of BaseShuffleAnalysis::createShuffle(): merges
  /// the inputs and the mask and returns the cost of the resulting shuffle.
  InstructionCost
  createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
                const PointerUnion<Value *, const TreeEntry *> &P2,
                ArrayRef<int> Mask) {
    ShuffleCostBuilder Builder(TTI);
    SmallVector<int> CommonMask(Mask);
    Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
    unsigned CommonVF = Mask.size();
    InstructionCost ExtraCost = 0;
    auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
                                        unsigned VF) -> InstructionCost {
      Type *EScalarTy = E.Scalars.front()->getType();
      bool IsSigned = true;
      if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
        EScalarTy = IntegerType::get(EScalarTy->getContext(),
                                     It->second.first);
        IsSigned = It->second.second;
      }
      if (EScalarTy != ScalarTy) {
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
                                    getWidenedType(EScalarTy, VF),
                                    TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
      if (isa<Constant>(V))
        return TTI::TCC_Free;
      auto *VecTy = cast<VectorType>(V->getType());
      Type *EScalarTy = VecTy->getElementType();
      if (EScalarTy != ScalarTy) {
        bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(
            CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
            VecTy, TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    if (!V1 && !V2 && !P2.isNull()) {
      // Shuffle 2 entry nodes.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E->Scalars.size() == E2->Scalars.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        SmallVector<int> E2Mask = E2->getCommonMask();
        if (!EMask.empty() || !E2Mask.empty()) {
          for (int &Idx : CommonMask) {
            if (Idx == PoisonMaskElem)
              continue;
            if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
              Idx = EMask[Idx];
            else if (Idx >= static_cast<int>(CommonVF))
              Idx = (E2Mask.empty() ? Idx - CommonVF
                                    : E2Mask[Idx - CommonVF]) +
                    E->Scalars.size();
          }
        }
        CommonVF = E->Scalars.size();
        ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
                     GetNodeMinBWAffectedCost(*E2, CommonVF);
      } else {
        ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
                     GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
      }
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && P2.isNull()) {
      // Shuffle single entry node.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      CommonVF = VF;
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        assert(!EMask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx != PoisonMaskElem)
            Idx = EMask[Idx];
        }
        CommonVF = E->Scalars.size();
      } else if (unsigned Factor = E->getInterleaveFactor();
                 Factor > 0 && E->Scalars.size() != Mask.size() &&
                 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
                                                               Factor)) {
        // Deinterleaved nodes are free.
        std::iota(CommonMask.begin(), CommonMask.end(), 0);
      }
      ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      // If the mask is a non-identity reorder of the node's own order, fold
      // the node's reorder into the common mask.
      if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
          CommonVF == CommonMask.size() &&
          any_of(enumerate(CommonMask),
                 [](const auto &&P) {
                   return P.value() != PoisonMaskElem &&
                          static_cast<unsigned>(P.value()) != P.index();
                 })) {
        SmallVector<int> ReorderMask;
        inversePermutation(E->ReorderIndices, ReorderMask);
        ::addMask(CommonMask, ReorderMask);
      }
    } else if (V1 && P2.isNull()) {
      // Shuffle single vector.
      ExtraCost += GetValueMinBWAffectedCost(V1);
      CommonVF = getVF(V1);
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
    } else if (V1 && !V2) {
      // Shuffle vector and tree node.
      unsigned VF = getVF(V1);
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E2->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E2Mask = E2->getCommonMask();
        assert(!E2Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E2Mask[Idx - CommonVF] + VF;
        }
        CommonVF = VF;
      }
      ExtraCost += GetValueMinBWAffectedCost(V1);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetNodeMinBWAffectedCost(
          *E2, std::min(CommonVF, E2->getVectorFactor()));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && V2) {
      // Shuffle tree node and vector.
      unsigned VF = getVF(V2);
      const TreeEntry *E1 = cast<const TreeEntry *>(P1);
      CommonVF = std::max(VF, E1->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E1->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E1Mask = E1->getCommonMask();
        assert(!E1Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E1Mask[Idx - CommonVF] + VF;
          else
            Idx = E1Mask[Idx];
        }
        CommonVF = VF;
      }
      ExtraCost += GetNodeMinBWAffectedCost(
          *E1, std::min(CommonVF, E1->getVectorFactor()));
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetValueMinBWAffectedCost(V2);
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else {
      assert(V1 && V2 && "Expected both vectors.");
      unsigned VF = getVF(V1);
      CommonVF = std::max(VF, getVF(V2));
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      ExtraCost +=
          GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
      if (V1->getType() != V2->getType()) {
        V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      } else {
        if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
          V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
          V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      }
    }
    InVectors.front() =
        Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    if (InVectors.size() == 2)
      InVectors.pop_back();
    return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
                           V1, V2, CommonMask, Builder, ScalarTy);
  }
public:
  ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
                       ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
                       SmallPtrSetImpl<Value *> &CheckedExtracts)
      : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
        CheckedExtracts(CheckedExtracts) {}
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    if (Mask.empty())
      return nullptr;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    // Check if it can be considered reused if the same extractelements were
    // vectorized already.
    bool PrevNodeFound = any_of(
        ArrayRef(R.VectorizableTree).take_front(E->Idx),
        [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((TE->hasState() && !TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  TE->isGather()) &&
                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
                 });
        });
    SmallPtrSet<Value *, 4> UniqueBases;
    unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
    SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
    for (unsigned Part : seq<unsigned>(NumParts)) {
      unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
      ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      for (auto [I, V] :
           enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
        // Ignore non-extractelement scalars.
        if (isa<UndefValue>(V) ||
            (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
          continue;
        // If all users of the instruction are going to be vectorized and this
        // instruction itself is not, consider this extractelement as dead and
        // remove its cost from the final cost of the vectorized tree.
        auto *EE = cast<ExtractElementInst>(V);
        VecBase = EE->getVectorOperand();
        UniqueBases.insert(VecBase);
        if (!CheckedExtracts.insert(V).second ||
            !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
            any_of(EE->users(), [&](User *U) {
              return isa<GetElementPtrInst>(U) &&
                     !R.areAllUsersVectorized(cast<Instruction>(U),
                                              &VectorizedVals);
            }))
          continue;
        std::optional<unsigned> EEIdx = getExtractIndex(EE);
        if (!EEIdx)
          continue;
        unsigned Idx = *EEIdx;
        // Take credit for the instruction that will become dead.
        if (EE->hasOneUse() || !PrevNodeFound) {
          Instruction *Ext = EE->user_back();
          if (isa<SExtInst, ZExtInst>(Ext) &&
              all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
            // Use getExtractWithExtendCost() to calculate the cost of
            // extractelement/ext pair.
            Cost -= TTI.getExtractWithExtendCost(
                Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
                Idx, CostKind);
            // Add back the cost of s|zext, which is subtracted separately.
            Cost += TTI.getCastInstrCost(
                Ext->getOpcode(), Ext->getType(), EE->getType(),
                TTI::getCastContextHint(Ext), CostKind, Ext);
            continue;
          }
        }
        APInt &DemandedElts =
            VectorOpsToExtracts
                .try_emplace(VecBase,
                             APInt::getZero(getNumElements(VecBase->getType())))
                .first->getSecond();
        DemandedElts.setBit(Idx);
      }
    }
    for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
      Cost -= TTI.getScalarizationOverhead(cast<VectorType>(Vec->getType()),
                                           DemandedElts, /*Insert=*/false,
                                           /*Extract=*/true, CostKind);
    // Check that the gather of extractelements can be represented as just a
    // shuffle of one/two input vectors the scalars are extracted from.
    if (!PrevNodeFound)
      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
    // ...
    transformMaskAfterShuffle(CommonMask, CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      UseVecBaseAsInput = true;
      VecBase =
          Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    }
    return VecBase;
  }
  /// Checks if the specified entry needs to be delayed because of its
  /// dependency nodes.
  std::optional<InstructionCost>
  needToDelay(const TreeEntry *,
              ArrayRef<SmallVector<const TreeEntry *>>) const {
    // No need to delay the cost estimation during analysis.
    return std::nullopt;
  }
  void resetForSameNode() {
    IsFinalized = false;
    CommonMask.clear();
    InVectors.clear();
    Cost = 0;
    VectorizedVals.clear();
    SameNodesEstimated = true;
  }
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    if (&E1 == &E2) {
      assert(all_of(Mask,
                    [&](int Idx) {
                      return Idx < static_cast<int>(E1.getVectorFactor());
                    }) &&
             "Expected single vector shuffle mask.");
      add(E1, Mask);
      return;
    }
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign({&E1, &E2});
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
    unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy);
    unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
    const auto *It =
        find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
  }
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, &E1);
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
    unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy);
    unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
    const auto *It =
        find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
    if (!SameNodesEstimated && InVectors.size() == 1)
      InVectors.emplace_back(&E1);
  }
  /// Adds 2 input vectors (in the form of tree entries).
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    // May come only for shuffling of 2 vectors with extractelements, already
    // handled in adjustExtracts.
    assert(InVectors.size() == 1 &&
           all_of(enumerate(CommonMask),
                  [&](auto P) {
                    if (P.value() == PoisonMaskElem)
                      return Mask[P.index()] == PoisonMaskElem;
                    auto *EI = cast<ExtractElementInst>(
                        cast<const TreeEntry *>(InVectors.front())
                            ->getOrdered(P.index()));
                    return EI->getVectorOperand() == V1 ||
                           EI->getVectorOperand() == V2;
                  }) &&
           "Expected extractelement vectors.");
  }
  /// Adds another one input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
    if (InVectors.empty()) {
      assert(CommonMask.empty() && !ForExtracts &&
             "Expected empty input mask/vectors.");
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, V1);
      return;
    }
    if (ForExtracts) {
      // No need to add vectors here, already handled in adjustExtracts.
      assert(InVectors.size() == 1 &&
             isa<const TreeEntry *>(InVectors.front()) &&
             !CommonMask.empty() &&
             all_of(enumerate(CommonMask),
                    [&](auto P) {
                      Value *Scalar =
                          cast<const TreeEntry *>(InVectors.front())
                              ->getOrdered(P.index());
                      if (P.value() == PoisonMaskElem)
                        return P.value() == Mask[P.index()] ||
                               isa<UndefValue>(Scalar);
                      auto *EI = cast<ExtractElementInst>(Scalar);
                      return EI->getVectorOperand() == V1;
                    }) &&
             "Expected only tree entry for extractelement vectors.");
      return;
    }
    assert(!InVectors.empty() && !CommonMask.empty() &&
           "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = getVF(V1);
    if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      VF = std::max<unsigned>(VF, CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(VF, InTE->getVectorFactor());
    } else {
      VF = std::max(VF, cast<FixedVectorType>(
                            cast<Value *>(InVectors.front())->getType())
                            ->getNumElements());
    }
    InVectors.push_back(V1);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + VF;
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    Cost += getBuildVectorCost(VL, Root);
    if (!Root) {
      // FIXME: Need to find a way to avoid use of getNullValue here.
      SmallVector<Constant *> Vals;
      unsigned VF = VL.size();
      if (MaskVF != 0)
        VF = std::min(VF, MaskVF);
      Type *VLScalarTy = VL.front()->getType();
      for (Value *V : VL.take_front(VF)) {
        if (isa<UndefValue>(V)) {
          Vals.push_back(cast<Constant>(V));
          continue;
        }
        Vals.push_back(Constant::getNullValue(V->getType()));
      }
      // ...
      return ConstantVector::get(Vals);
    }
    return ConstantVector::getSplat(
        ElementCount::getFixed(
            cast<FixedVectorType>(Root->getType())->getNumElements()),
        getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
  }
  InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
  /// Finalize emission of the shuffles.
  InstructionCost
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    if (Action) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      Value *V = cast<Value *>(Vec);
      Action(V, CommonMask);
      InVectors.front() = V;
    }
    if (!SubVectors.empty()) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      // Add subvectors permutation cost.
      if (!SubVectorsMask.empty()) {
        assert(SubVectorsMask.size() <= CommonMask.size() &&
               "Expected same size of masks for subvectors and common mask.");
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I2 != PoisonMaskElem) {
            assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
            I1 = I2 + CommonMask.size();
          }
        }
        Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
                                 getWidenedType(ScalarTy, CommonMask.size()),
                                 SVMask, CostKind);
      }
      for (auto [E, Idx] : SubVectors) {
        Type *EScalarTy = E->Scalars.front()->getType();
        bool IsSigned = true;
        if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
          EScalarTy =
              IntegerType::get(EScalarTy->getContext(), It->second.first);
          IsSigned = It->second.second;
        }
        if (ScalarTy != EScalarTy) {
          unsigned CastOpcode = Instruction::Trunc;
          unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
          if (DstSz > SrcSz)
            CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          Cost += TTI.getCastInstrCost(
              CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
              getWidenedType(EScalarTy, E->getVectorFactor()),
              TTI::CastContextHint::Normal, CostKind);
        }
        Cost += ::getShuffleCost(
            TTI, TTI::SK_InsertSubvector,
            getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
            getWidenedType(ScalarTy, E->getVectorFactor()));
        if (!CommonMask.empty()) {
          std::iota(std::next(CommonMask.begin(), Idx),
                    std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
                    Idx);
        }
      }
    }

    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return Cost;
    }
    return Cost +
           createShuffle(InVectors.front(),
                         InVectors.size() == 2 ? InVectors.back() : nullptr,
                         CommonMask);
  }

  ~ShuffleCostEstimator() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
  assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
  return Op;
}

TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::CompressVectorize)
    return TTI::CastContextHint::Masked;
  if (TE.State == TreeEntry::Vectorize &&
      TE.getOpcode() == Instruction::Load && !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    return TTI::CastContextHint::Reversed;
  }
  return TTI::CastContextHint::None;
}
InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                      SmallPtrSetImpl<Value *> &CheckedExtracts) {
  ArrayRef<Value *> VL = E->Scalars;

  Type *ScalarTy = getValueType(VL[0]);
  if (!isValidElementType(ScalarTy))
    return InstructionCost::getInvalid();
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // If we have computed a smaller type for the expression, update VecTy so
  // that the costs will be accurate.
  auto It = MinBWs.find(E);
  Type *OrigScalarTy = ScalarTy;
  if (It != MinBWs.end()) {
    auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
    if (VecTy)
      ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
  }
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  unsigned EntryVF = E->getVectorFactor();
  auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);

  if (E->isGather()) {
    // ...
    ScalarTy = VL.front()->getType();
    return processBuildVector<ShuffleCostEstimator, InstructionCost>(
        E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
  }
  if (E->State == TreeEntry::SplitVectorize) {
    assert(E->CombinedEntriesWithIndices.size() == 2 &&
           "Expected exactly 2 combined entries.");
    assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
    InstructionCost VectorCost = 0;
    if (E->ReorderIndices.empty()) {
      VectorCost = ::getShuffleCost(
          *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
          E->CombinedEntriesWithIndices.back().second,
          getWidenedType(
              ScalarTy,
              VectorizableTree[E->CombinedEntriesWithIndices.back().first]
                  ->getVectorFactor()));
    } else {
      unsigned CommonVF =
          std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
                       ->getVectorFactor(),
                   VectorizableTree[E->CombinedEntriesWithIndices.back().first]
                       ->getVectorFactor());
      VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
                                    getWidenedType(ScalarTy, CommonVF),
                                    E->getSplitMask(), CostKind);
    }
    LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0,
                             "Calculated costs for Tree"));
    return VectorCost;
  }
  InstructionCost CommonCost = 0;
  SmallVector<int> Mask;
  if (!E->ReorderIndices.empty() &&
      E->State != TreeEntry::CompressVectorize &&
      (E->State != TreeEntry::StridedVectorize ||
       !isReverseOrder(E->ReorderIndices))) {
    SmallVector<int> NewMask;
    if (E->getOpcode() == Instruction::Store) {
      // For stores the order is actually a mask.
      NewMask.resize(E->ReorderIndices.size());
      copy(E->ReorderIndices, NewMask.begin());
    } else {
      inversePermutation(E->ReorderIndices, NewMask);
    }
    ::addMask(Mask, NewMask);
  }
  if (!E->ReuseShuffleIndices.empty())
    ::addMask(Mask, E->ReuseShuffleIndices);
  if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
    CommonCost =
        ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize ||
          E->State == TreeEntry::StridedVectorize ||
          E->State == TreeEntry::CompressVectorize) &&
         "Unhandled state");
  assert(E->getOpcode() &&
         ((allSameType(VL) && allSameBlock(VL)) ||
          (E->getOpcode() == Instruction::GetElementPtr &&
           E->getMainOp()->getType()->isPointerTy()) ||
          E->hasCopyableElements()) &&
         "Invalid VL");
  Instruction *VL0 = E->getMainOp();
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  if (E->CombinedOp != TreeEntry::NotCombinedOp)
    ShuffleOrOp = E->CombinedOp;
  SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
  const unsigned Sz = UniqueValues.size();
  SmallBitVector UsedScalars(Sz, false);
  for (unsigned I = 0; I < Sz; ++I) {
    if (isa<Instruction>(UniqueValues[I]) &&
        !E->isCopyableElement(UniqueValues[I]) &&
        getTreeEntries(UniqueValues[I]).front() == E)
      continue;
    UsedScalars.set(I);
  }
  auto GetCastContextHint = [&](Value *V) {
    if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
      return getCastContextHint(*OpTEs.front());
    InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
    if (SrcState && SrcState.getOpcode() == Instruction::Load &&
        !SrcState.isAltShuffle())
      return TTI::CastContextHint::GatherScatter;
    return TTI::CastContextHint::None;
  };
  auto GetCostDiff =
      [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
          function_ref<InstructionCost(InstructionCost)> VectorCost) {
        // Calculate the cost of this instruction.
        InstructionCost ScalarCost = 0;
        if (isa<CastInst, CallInst>(VL0)) {
          // For some instructions there is no need to calculate the cost for
          // each particular instruction: use the cost of a single instruction
          // x the total number of scalar instructions.
          ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
        } else {
          for (unsigned I = 0; I < Sz; ++I) {
            if (UsedScalars.test(I))
              continue;
            ScalarCost += ScalarEltCost(I);
          }
        }

        InstructionCost VecCost = VectorCost(CommonCost);
        // Check if the current node must be resized when the parent node is
        // not resized.
        if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
            E->Idx != 0 &&
            (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
          const EdgeInfo &EI = E->UserTreeIndex;
          if (!EI.UserTE->hasState() ||
              EI.UserTE->getOpcode() != Instruction::Select ||
              EI.EdgeIdx != 0) {
            auto UserBWIt = MinBWs.find(EI.UserTE);
            Type *UserScalarTy =
                (EI.UserTE->isGather() ||
                 EI.UserTE->State == TreeEntry::SplitVectorize)
                    ? EI.UserTE->Scalars.front()->getType()
                    : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
            if (UserBWIt != MinBWs.end())
              UserScalarTy = IntegerType::get(ScalarTy->getContext(),
                                              UserBWIt->second.first);
            if (ScalarTy != UserScalarTy) {
              unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
              unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
              unsigned VecOpcode;
              auto *UserVecTy =
                  getWidenedType(UserScalarTy, E->Scalars.size());
              if (BWSz > SrcBWSz)
                VecOpcode = Instruction::Trunc;
              else
                VecOpcode =
                    It->second.second ? Instruction::SExt : Instruction::ZExt;
              TTI::CastContextHint CCH = GetCastContextHint(VL0);
              VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy,
                                               CCH, CostKind);
            }
          }
        }
        LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
                                 ScalarCost, "Calculated costs for Tree"));
        return VecCost - ScalarCost;
      };
  // Calculate the cost difference from vectorizing a set of GEPs.
  // Negative value means vectorizing is profitable.
  auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
    assert((E->State == TreeEntry::Vectorize ||
            E->State == TreeEntry::StridedVectorize ||
            E->State == TreeEntry::CompressVectorize) &&
           "Entry state expected to be Vectorize, StridedVectorize or "
           "MaskedLoadCompressVectorize here.");
    InstructionCost ScalarCost = 0;
    InstructionCost VecCost = 0;
    std::tie(ScalarCost, VecCost) = getGEPCosts(
        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
    LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
                             "Calculated GEPs cost for Tree"));
    return VecCost - ScalarCost;
  };
  auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
    auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
    if (MinMaxID == Intrinsic::not_intrinsic)
      return InstructionCost::getInvalid();
    Type *CanonicalType = Ty;
    if (CanonicalType->isPtrOrPtrVectorTy())
      CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
          CanonicalType->getContext(),
          DL->getTypeSizeInBits(CanonicalType->getScalarType())));

    IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
                                      {CanonicalType, CanonicalType});
    InstructionCost IntrinsicCost =
        TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
    // If the selects are the only uses of the compares, they will be dead,
    // so we can adjust the cost by removing their cost.
    if (VI && SelectOnly) {
      assert((!Ty->isVectorTy() || SLPReVec) &&
             "Expected only for scalar type.");
      auto *CI = cast<CmpInst>(VI->getOperand(0));
      IntrinsicCost -= TTI->getCmpSelInstrCost(
          CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
          CostKind, {TTI::OK_AnyValue, TTI::OP_None},
          {TTI::OK_AnyValue, TTI::OP_None}, CI);
    }
    return IntrinsicCost;
  };
  auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
                                         Instruction *I) {
    // ...
    return canConvertToFMA(I, S, *DT, *DL, TTI, *TLI);
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Count reused scalars.
    InstructionCost ScalarCost = 0;
    SmallPtrSet<const TreeEntry *, 4> CountedOps;
    for (Value *V : UniqueValues) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;

      ValueList Operands(PHI->getNumIncomingValues(), nullptr);
      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I)
        Operands[I] = PHI->getIncomingValue(I);
      if (const TreeEntry *OpTE =
              getSameValuesTreeEntry(Operands.front(), Operands))
        if (CountedOps.insert(OpTE).second &&
            !OpTE->ReuseShuffleIndices.empty())
          ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
                                          OpTE->Scalars.size());
    }

    return CommonCost - ScalarCost;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    APInt DemandedElts;
    VectorType *SrcVecTy = nullptr;
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);

      auto *I = cast<Instruction>(UniqueValues[Idx]);
      if (ShuffleOrOp == Instruction::ExtractElement) {
        auto *EE = cast<ExtractElementInst>(I);
        SrcVecTy = EE->getVectorOperandType();
      } else {
        auto *EV = cast<ExtractValueInst>(I);
        Type *AggregateTy = EV->getAggregateOperand()->getType();
        unsigned NumElts;
        if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
          NumElts = ATy->getNumElements();
        else
          NumElts = AggregateTy->getStructNumElements();
        SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
      }
      if (I->hasOneUse()) {
        // Take credit for an extract + extend pair that will become dead.
        Instruction *Ext = I->user_back();
        if (isa<SExtInst, ZExtInst>(Ext) &&
            all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
          InstructionCost Cost = TTI->getExtractWithExtendCost(
              Ext->getOpcode(), Ext->getType(), SrcVecTy,
              *getExtractIndex(I), CostKind);
          // Subtract the cost of s|zext, which is subtracted separately.
          Cost -= TTI->getCastInstrCost(
              Ext->getOpcode(), Ext->getType(), I->getType(),
              TTI::getCastContextHint(Ext), CostKind, Ext);
          return Cost;
        }
      }
      if (DemandedElts.isZero())
        DemandedElts = APInt::getZero(getNumElements(SrcVecTy));
      DemandedElts.setBit(*getExtractIndex(I));
      return InstructionCost(TTI::TCC_Free);
    };
    auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
      return CommonCost - (DemandedElts.isZero()
                               ? TTI::TCC_Free
                               : TTI.getScalarizationOverhead(
                                     SrcVecTy, DemandedElts, /*Insert=*/false,
                                     /*Extract=*/true, CostKind));
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
    auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
    unsigned const NumElts = SrcVecTy->getNumElements();
    unsigned const NumScalars = VL.size();

    unsigned NumOfParts = ::getNumberOfParts(*TTI, SrcVecTy);

    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    unsigned OffsetBeg = *getElementIndex(VL.front());
    unsigned OffsetEnd = OffsetBeg;
    InsertMask[OffsetBeg] = 0;
    for (auto [I, V] : enumerate(VL.drop_front())) {
      unsigned Idx = *getElementIndex(V);
      if (OffsetBeg > Idx)
        OffsetBeg = Idx;
      else if (OffsetEnd < Idx)
        OffsetEnd = Idx;
      InsertMask[Idx] = I + 1;
    }
    unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
    if (NumOfParts > 0 && NumOfParts < NumElts)
      VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
                     VecScalarsSz;
    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
    unsigned InsertVecSz = std::min<unsigned>(
        PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
    bool IsWholeSubvector =
        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
    // Check if we can safely insert a subvector. If not, generate a
    // whole-sized vector and shuffle the source vector with the new subvector.
    if (OffsetBeg + InsertVecSz > VecSz) {
      // Align OffsetBeg to generate a correct mask.
      OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
      InsertVecSz = VecSz;
    }

    APInt DemandedElts = APInt::getZero(NumElts);
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
      Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
    } else {
      Mask.assign(VecSz, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
      DemandedElts.setBit(InsertIdx);
      IsIdentity &= InsertIdx - OffsetBeg == I;
      Mask[InsertIdx - OffsetBeg] = I;
    }
    assert(Offset < NumElts && "Failed to find vector index offset");

    InstructionCost Cost = 0;
    Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
                                          /*Insert=*/true, /*Extract=*/false,
                                          CostKind);

    // First cost - resize to the actual vector size if not an identity
    // shuffle or we need to shift the vector.
    auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
    if (!IsIdentity)
      Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, InsertVecTy,
                               Mask);
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    // Second cost - permutation with subvector, if some elements come from
    // the initial vector or we are inserting a subvector.
    SmallBitVector InMask =
        isUndefVector(FirstInsert->getOperand(0),
                      buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
      if (InsertVecSz != VecSz) {
        auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
        Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy,
                                 {}, CostKind, OffsetBeg - Offset,
                                 InsertVecTy);
      } else {
        for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
          Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
        for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
             I <= End; ++I)
          if (Mask[I] != PoisonMaskElem)
            Mask[I] = I + VecSz;
        for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
          Mask[I] =
              ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
        Cost +=
            ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
      }
    }
    return Cost;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    Type *SrcScalarTy = VL0->getOperand(0)->getType();
    auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
    unsigned Opcode = ShuffleOrOp;
    unsigned VecOpcode = Opcode;
    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
        SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
        SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
      }
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
      assert(Idx == 0 && "Expected 0 index only");
      return TTI->getCastInstrCost(Opcode, VL0->getType(),
                                   VL0->getOperand(0)->getType(),
                                   TTI::getCastContextHint(VL0), CostKind,
                                   VL0);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // Do not count the cost if minimum bitwidth is in effect and it is just
      // a bitcast (here it is just a noop).
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
        return CommonCost;
      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
      TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));

      bool IsArithmeticExtendedReduction =
          E->Idx == 0 && UserIgnoreList &&
          all_of(*UserIgnoreList, [](Value *V) {
            auto *I = cast<Instruction>(V);
            return is_contained({Instruction::Add, Instruction::FAdd,
                                 Instruction::Mul, Instruction::FMul,
                                 Instruction::And, Instruction::Or,
                                 Instruction::Xor},
                                I->getOpcode());
          });
      if (IsArithmeticExtendedReduction &&
          (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
        return CommonCost;
      return CommonCost +
             TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
                                   VecOpcode == Opcode ? VI : nullptr);
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
    CmpPredicate VecPred, SwappedVecPred;
    auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
    if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
        match(VL0, MatchCmp))
      SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
    else
      SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
                                     ? CmpInst::BAD_FCMP_PREDICATE
                                     : CmpInst::BAD_ICMP_PREDICATE;
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
                                     ? CmpInst::BAD_FCMP_PREDICATE
                                     : CmpInst::BAD_ICMP_PREDICATE;
      auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
      if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
           !match(VI, MatchCmp)) ||
          (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
           CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
        VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
                                       ? CmpInst::BAD_FCMP_PREDICATE
                                       : CmpInst::BAD_ICMP_PREDICATE;

      InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
          E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
          CostKind, getOperandInfo(VI->getOperand(0)),
          getOperandInfo(VI->getOperand(1)), VI);
      // ...
      return ScalarCost;
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());

      InstructionCost VecCost =
          TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
                                  CostKind, getOperandInfo(E->getOperand(0)),
                                  getOperandInfo(E->getOperand(1)), VL0);
      if (auto *SI = dyn_cast<SelectInst>(VL0)) {
        auto *CondType =
            getWidenedType(SI->getCondition()->getType(), VL.size());
        unsigned CondNumElements = CondType->getNumElements();
        unsigned VecTyNumElements = getNumElements(VecTy);
        assert(VecTyNumElements >= CondNumElements &&
               VecTyNumElements % CondNumElements == 0 &&
               "Cannot vectorize Instruction::Select");
        if (CondNumElements != VecTyNumElements) {
          // When the return type is i1 but the source is a fixed vector type,
          // the condition value needs to be replicated.
          // ...
        }
      }
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::MinMax: {
    auto GetScalarCost = [&](unsigned Idx) {
      return GetMinMaxCost(OrigScalarTy);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecCost = GetMinMaxCost(VecTy);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::FMulAdd: {
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      return GetFMulAddCost(E->getOperations(),
                            cast<Instruction>(UniqueValues[Idx]));
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      FastMathFlags FMF;
      FMF.set();
      for (Value *V : E->Scalars) {
        if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
          FMF &= FPCI->getFastMathFlags();
          if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
            FMF &= FPCIOp->getFastMathFlags();
        }
      }
      IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
                                  {VecTy, VecTy, VecTy}, FMF);
      InstructionCost VecCost = TTI->getIntrinsicInstrCost(ICA, CostKind);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      // ...
      Value *Op1 = E->getOperand(0)[Idx];
      Value *Op2;
      SmallVector<const Value *, 2> Operands(1, Op1);
      if (ShuffleOrOp == Instruction::FNeg) {
        Op2 = Op1;
      } else {
        Op2 = E->getOperand(1)[Idx];
        Operands.push_back(Op2);
      }
      InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
          ShuffleOrOp, OrigScalarTy, CostKind, TTI::getOperandInfo(Op1),
          TTI::getOperandInfo(Op2), Operands);
      if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
          I && (ShuffleOrOp == Instruction::FAdd ||
                ShuffleOrOp == Instruction::FSub)) {
        InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
        if (IntrinsicCost.isValid())
          ScalarCost = IntrinsicCost;
      }
      return ScalarCost;
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
      TTI::OperandValueInfo Op2Info;
      if (ShuffleOrOp != Instruction::FNeg)
        Op2Info = getOperandInfo(E->getOperand(1));
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
        // Masking by a constant that keeps all demoted bits is free on the
        // narrowed type.
        if (all_of(E->getOperand(1), [&](Value *V) {
              auto *CI = dyn_cast<ConstantInt>(V);
              return CI && CI->getValue().countr_one() >= It->second.first;
            }))
          return CommonCost;
      }
      return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
                                         Op2Info, {}, nullptr, TLI) +
             CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::GetElementPtr: {
    return CommonCost + GetGEPCostDiff(VL, VL0);
  }
  case Instruction::Load: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<LoadInst>(UniqueValues[Idx]);
      return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, TTI::OperandValueInfo(), VI);
    };
    auto *LI0 = cast<LoadInst>(VL0);
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecLdCost;
      switch (E->State) {
      case TreeEntry::Vectorize:
        if (unsigned Factor = E->getInterleaveFactor()) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind);
        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, VecTy, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind,
              TTI::OperandValueInfo());
        }
        break;
      case TreeEntry::StridedVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        break;
      }
      case TreeEntry::CompressVectorize: {
        bool IsMasked;
        unsigned InterleaveFactor;
        SmallVector<int> CompressMask;
        VectorType *LoadVecTy;
        SmallVector<Value *> Scalars(E->Scalars.begin(), E->Scalars.end());
        if (!E->ReorderIndices.empty()) {
          SmallVector<int> Mask(E->ReorderIndices.begin(),
                                E->ReorderIndices.end());
          reorderScalars(Scalars, Mask);
        }
        // ... (collect pointer operands; elided)
        [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
            Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
            *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
            CompressMask, LoadVecTy);
        assert(IsVectorized && "Failed to vectorize load");
        CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
                                        InterleaveFactor, IsMasked);
        Align CommonAlignment = LI0->getAlign();
        if (InterleaveFactor) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, LoadVecTy, InterleaveFactor, {},
              CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
        } else if (IsMasked) {
          VecLdCost = TTI->getMaskedMemoryOpCost(
              Instruction::Load, LoadVecTy, CommonAlignment,
              LI0->getPointerAddressSpace(), CostKind);
          VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                                        LoadVecTy, CompressMask, CostKind);
        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, LoadVecTy, CommonAlignment,
              LI0->getPointerAddressSpace(), CostKind);
          VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                                        LoadVecTy, CompressMask, CostKind);
        }
        break;
      }
      case TreeEntry::ScatterVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getGatherScatterOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        break;
      }
      case TreeEntry::CombinedVectorize:
      case TreeEntry::SplitVectorize:
      case TreeEntry::NeedToGather:
        llvm_unreachable("Unexpected vectorization state.");
      }
      return VecLdCost + CommonCost;
    };

    InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
    // If this node generates masked gather load then it is not a terminal
    // node. Hence address operand cost is estimated separately.
    if (E->State == TreeEntry::ScatterVectorize)
      return Cost;

    // Estimate cost of GEPs since this tree node is a terminator.
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(VL))
      PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
    return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
  }
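  // Illustrative note (editorial sketch, not verbatim from this pass): the
  // switch above prices the same bundle of scalar loads differently per
  // strategy, e.g.
  //   Vectorize:        %v = load <4 x i32>, ptr %p
  //   ScatterVectorize: %v = call <4 x i32> @llvm.masked.gather.v4i32(...)
  // while CompressVectorize loads a wider contiguous block (possibly masked)
  // and shuffles the requested lanes out of it with CompressMask.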
  case Instruction::Store: {
    bool IsReorder = !E->ReorderIndices.empty();
    auto GetScalarCost = [=](unsigned Idx) {
      auto *VI = cast<StoreInst>(VL[Idx]);
      // ...
      return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, OpInfo, VI);
    };
    auto *BaseSI =
        cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // We know that we can merge the stores. Calculate the cost.
      InstructionCost VecStCost;
      if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment =
            computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
        VecStCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
      } else {
        assert(E->State == TreeEntry::Vectorize &&
               "Expected either strided or consecutive stores.");
        if (unsigned Factor = E->getInterleaveFactor()) {
          assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
                 "No reused shuffles expected");
          VecStCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind);
        } else {
          // ...
          VecStCost = TTI->getMemoryOpCost(
              Instruction::Store, VecTy, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
        }
      }
      return VecStCost + CommonCost;
    };
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(VL)) {
      unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
      PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
    }
    return GetCostDiff(GetScalarCost, GetVectorCost) +
           GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
  }
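  // Illustrative note (editorial sketch, not verbatim from this pass):
  // GetGEPCostDiff compares the scalar address computations (one GEP per
  // lane) against the single GEP feeding the vector access, so a consecutive
  // bundle like
  //   store i32 %x0, ptr %p          ; %p + 0
  //   store i32 %x1, ptr %p1         ; %p + 1
  // is credited for the per-lane GEPs that vectorization removes.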
  case Instruction::Call: {
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      if (ID != Intrinsic::not_intrinsic) {
        IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
        return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
      }
      // ... (ordinary call cost; elided)
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // ...
      SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
          CI, ID, VecTy->getNumElements(),
          It != MinBWs.end() ? It->second.first : 0, TTI);
      auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
      return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
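  // Illustrative note (editorial sketch, not verbatim from this pass):
  // getVectorCallCosts returns the pair {vector intrinsic cost, vector
  // library call cost} for the widened call, e.g. @llvm.sin.v4f32 versus a
  // vectorized libm routine; the vector side is charged the cheaper of the
  // two.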
  case Instruction::ShuffleVector: {
    assert(E->isAltShuffle() &&
           ((Instruction::isBinaryOp(E->getOpcode()) &&
             Instruction::isBinaryOp(E->getAltOpcode())) ||
            (Instruction::isCast(E->getOpcode()) &&
             Instruction::isCast(E->getAltOpcode())) ||
            (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
           "Invalid Shuffle Vector Operand");
    // Try to find the previous shuffle node with the same operands and same
    // main/alternate ops.
    auto TryFindNodeWithEqualOperands = [=]() {
      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
        if (TE.get() == E)
          break;
        if (TE->hasState() && TE->isAltShuffle() &&
            ((TE->getOpcode() == E->getOpcode() &&
              TE->getAltOpcode() == E->getAltOpcode()) ||
             (TE->getOpcode() == E->getAltOpcode() &&
              TE->getAltOpcode() == E->getOpcode())) &&
            TE->hasEqualOperands(*E))
          return true;
      }
      return false;
    };
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      assert(E->getMatchingMainOpOrAltOp(VI) &&
             "Unexpected main/alternate opcode");
      return TTI->getInstructionCost(VI, CostKind);
    };
    auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost CommonCost) {
      InstructionCost VecCost = 0;
      if (TryFindNodeWithEqualOperands()) {
        LLVM_DEBUG({
          dbgs() << "SLP: diamond match for alternate node found.\n";
          E->dump();
        });
        // No need to add new vector costs here: the same main/alternate
        // vector ops are reused, only the shuffling differs.
      } else if (Instruction::isBinaryOp(E->getOpcode())) {
        VecCost =
            TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
        VecCost +=
            TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        // ...
        VecCost = TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            VL0);
        VecCost += TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy,
            cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            E->getAltOp());
      } else {
        Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
        auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
        if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
          auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcBWSz =
              DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
          if (SrcIt != MinBWs.end()) {
            SrcBWSz = SrcIt->second.first;
            SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
            SrcTy = getWidenedType(SrcSclTy, VL.size());
          }
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
              VecCost =
                  TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
                                          TTI::CastContextHint::None,
                                          CostKind);
            LLVM_DEBUG({
              dbgs()
                  << "SLP: alternate extension, which should be truncated.\n";
              E->dump();
            });
            return VecCost;
          }
        }
        VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
                                          TTI::CastContextHint::None,
                                          CostKind);
        VecCost +=
            TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
                                    TTI::CastContextHint::None, CostKind);
      }
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [&](Instruction *I) {
            assert(E->getMatchingMainOpOrAltOp(I) &&
                   "Unexpected main/alternate opcode");
            return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                          *TLI);
          },
          Mask);
      // ... (cost of the final blend shuffle; elided)
      // Patterns like [fadd,fsub] can be combined into a single instruction
      // on x86. Reordering them into [fsub,fadd] blocks this pattern, so the
      // order is taken into account when the target supports it.
      unsigned Opcode0 = E->getOpcode();
      unsigned Opcode1 = E->getAltOpcode();
      SmallBitVector OpcodeMask(
          getAltInstrMask(E->Scalars, Opcode0, Opcode1));
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        InstructionCost AltVecCost = TTIRef.getAltInstrCost(
            VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
        return AltVecCost < VecCost ? AltVecCost : VecCost;
      }
      return VecCost;
    };
    if (SLPReVec && !E->isAltShuffle())
      return GetCostDiff(
          GetScalarCost, [&](InstructionCost) -> InstructionCost {
            // If a group uses the mask in order, the shufflevector can be
            // eliminated by instcombine, and then the cost is 0.
            assert(isa<ShuffleVectorInst>(VL.front()) &&
                   "Not supported shufflevector usage.");
            auto *SV = cast<ShuffleVectorInst>(VL.front());
            unsigned SVNumElements =
                cast<FixedVectorType>(SV->getOperand(0)->getType())
                    ->getNumElements();
            unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
            for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
              int NextIndex = 0;
              if (!all_of(VL.slice(I, GroupSize), [&](Value *V) {
                    assert(isa<ShuffleVectorInst>(V) &&
                           "Not supported shufflevector usage.");
                    auto *SV = cast<ShuffleVectorInst>(V);
                    int Index;
                    [[maybe_unused]] bool IsExtractSubvectorMask =
                        SV->isExtractSubvectorMask(Index);
                    assert(IsExtractSubvectorMask &&
                           "Not supported shufflevector usage.");
                    if (NextIndex != Index)
                      return false;
                    NextIndex += SV->getShuffleMask().size();
                    return true;
                  }))
                return ::getShuffleCost(
                    *TTI, TTI::SK_PermuteSingleSrc, VecTy,
                    calculateShufflevectorMask(E->Scalars));
            }
            return TTI::TCC_Free;
          });
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::Freeze:
    return CommonCost;
  default:
    llvm_unreachable("Unknown instruction");
  }
}
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size() << " is fully vectorizable .\n");

  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
    SmallVector<int> Mask;
    return TE->isGather() &&
           !any_of(TE->Scalars,
                   [this](Value *V) { return EphValues.contains(V); }) &&
           (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
            TE->Scalars.size() < Limit ||
            (((TE->hasState() &&
               TE->getOpcode() == Instruction::ExtractElement) ||
              all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
             isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
            (TE->hasState() && TE->getOpcode() == Instruction::Load &&
             !TE->isAltShuffle()) ||
            all_of(TE->Scalars, IsaPred<LoadInst, InsertElementInst>));
  };

  // We only handle trees of heights 1 and 2.
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
       (ForReduction &&
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
    return true;

  if (VectorizableTree.size() != 2)
    return false;

  // Handle splat and all-constants stores. Also try to vectorize tiny trees
  // with the second gather node if it is not a load bundle.
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
    return true;

  // Gathering cost would be too much for tiny trees.
  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
       VectorizableTree[0]->State != TreeEntry::CompressVectorize))
    return false;

  return true;
}
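// Illustrative note (editorial sketch, not verbatim from this pass): a
// "fully vectorizable tiny tree" is e.g. a height-2 tree whose root is a
// vectorizable store bundle and whose only operand node is a splat gather:
//   store float %x, ptr %p0
//   store float %x, ptr %p1
// Such trees are accepted despite their size because the gathers involved
// (constants, splats, extractelements, or plain loads) are cheap.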
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
                                       TargetTransformInfo *TTI,
                                       bool MustMatchOrInst) {
  // Look past the root to find a source value. Arbitrarily follow the path
  // through operand 0 of any 'or'. Also, peek through optional
  // shift-left-by-multiple-of-8-bits.
  Value *ZextLoad = Root;
  const APInt *ShAmtC;
  bool FoundOr = false;
  while (!isa<ConstantExpr>(ZextLoad) &&
         (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
          (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
           ShAmtC->urem(8) == 0))) {
    auto *BinOp = cast<BinaryOperator>(ZextLoad);
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)
      FoundOr = true;
  }
  // Check if the input is an extended load of the required or/shift
  // expression.
  Value *Load;
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
      !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
    return false;

  // Require that the total load bit width is a legal integer type.
  // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
  // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
  Type *SrcTy = Load->getType();
  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
  if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
    return false;

  // Everything matched - assume that we can fold the whole sequence using
  // load combining.
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    << *(cast<Instruction>(Root)) << "\n");

  return true;
}

bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
  if (RdxKind != RecurKind::Or)
    return false;

  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
                                    /*MustMatchOrInst=*/false);
}

bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
  // Peek through a final sequence of stores and check if all operations are
  // likely to be load-combined.
  unsigned NumElts = Stores.size();
  for (Value *Scalar : Stores) {
    Value *X;
    if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
        !isLoadCombineCandidateImpl(X, NumElts, TTI, /*MustMatchOrInst=*/true))
      return false;
  }
  return true;
}
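// Illustrative example (editorial sketch, not verbatim from this pass): the
// pattern recognized above is a byte-merging idiom such as
//   %z0 = zext i8 %b0 to i32
//   %s1 = shl i32 %z1, 8
//   %o1 = or i32 %z0, %s1          ; ...continuing for 4 bytes
// which the backend can fold into a single i32 load, so SLP deliberately
// leaves it scalar instead of vectorizing the bytes.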
bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");
    return true;
  }

  // A two-node tree whose second node is a small gather (vector factor <= 2)
  // that is neither a splat nor similarly cheap is not profitable.
  if (VectorizableTree.size() == 2 &&
      // ...
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) /* ...or other cheap
         gather forms; elided... */)))
    return true;

  // If the graph includes only PHI nodes and gathers, it is definitely not
  // profitable for the vectorization; the cost of vectorized PHI nodes is
  // almost always 0 plus the cost of gathers/buildvectors.
  constexpr int Limit = 4;
  if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
      !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                (!TE->hasState() ||
                 TE->getOpcode() != Instruction::ExtractElement) &&
                /* ...extract count limit; elided... */ true) ||
               (TE->hasState() && TE->getOpcode() == Instruction::PHI);
      }))
    return true;

  // Small trees of insertelements/phis feeding mostly gathered values are
  // unlikely to beat the scalar code.
  if (VectorizableTree.size() <= Limit &&
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return (TE->isGather() &&
                       (!TE->hasState() ||
                        TE->getOpcode() != Instruction::ExtractElement)) ||
                      (TE->hasState() &&
                       (TE->getOpcode() == Instruction::InsertElement ||
                        (TE->getOpcode() == Instruction::PHI &&
                         all_of(TE->Scalars, [&](Value *V) {
                           return isa<PoisonValue>(V) ||
                                  MustGather.contains(V);
                         }))));
             }) &&
      any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->State == TreeEntry::Vectorize &&
               TE->getOpcode() == Instruction::PHI;
      }))
    return true;

  // Check the tree shape against the number of store/load nodes.
  unsigned NumGathers = 0;
  constexpr int LimitTreeSize = 36;
  SmallVector<const TreeEntry *> StoreLoadNodes;
  if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               if (!TE->isGather() && TE->hasState() &&
                   (TE->getOpcode() == Instruction::Load ||
                    TE->getOpcode() == Instruction::Store)) {
                 StoreLoadNodes.push_back(TE.get());
                 return true;
               }
               if (TE->isGather())
                 ++NumGathers;
               return TE->State == TreeEntry::SplitVectorize ||
                      (TE->Idx == 0 && TE->Scalars.size() == 2 &&
                       TE->hasState() &&
                       TE->getOpcode() == Instruction::ICmp &&
                       VectorizableTree.size() > LimitTreeSize) ||
                      (TE->hasState() &&
                       (TE->getOpcode() == Instruction::PHI ||
                        (TE->hasCopyableElements() &&
                         /* ...phi/constant ratio check; elided... */
                         TE->Scalars.size() / 2 > 0) ||
                        ((!TE->ReuseShuffleIndices.empty() ||
                          !TE->ReorderIndices.empty() ||
                          TE->isAltShuffle()) &&
                         TE->Scalars.size() == 2)));
             }) &&
      (StoreLoadNodes.empty() ||
       (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
        (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
           return TE->getOpcode() == Instruction::Store ||
                  all_of(TE->Scalars, [&](Value *V) {
                    return !isa<LoadInst>(V) ||
                           areAllUsersVectorized(cast<Instruction>(V));
                  });
         })))))
    return true;

  // A tree consisting only of a root split node and its direct operands is
  // not profitable either.
  if (VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
      VectorizableTree.size() >= Limit &&
      all_of(ArrayRef(VectorizableTree).drop_front(),
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return !TE->isGather() && TE->UserTreeIndex.UserTE &&
                      TE->UserTreeIndex.UserTE->Idx == 0;
             }))
    return true;

  // An insertelement root over a vectorized PHI whose remaining nodes are all
  // gathers is also skipped.
  if (VectorizableTree.size() > 2 &&
      VectorizableTree.front()->State == TreeEntry::Vectorize &&
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
      VectorizableTree[1]->State == TreeEntry::Vectorize &&
      VectorizableTree[1]->getOpcode() == Instruction::PHI &&
      all_of(ArrayRef(VectorizableTree).drop_front(2),
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return TE->isGather();
             }))
    return true;

  // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
  // can vectorize it if we can prove it fully vectorizable.
  if (isFullyVectorizableTinyTree(ForReduction))
    return false;

  // Check if any of the gather nodes forms an insertelement buildvector
  // somewhere.
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
       allSameBlock(VectorizableTree.front()->Scalars));
  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
                 return isa<ExtractElementInst, Constant>(V) ||
                        (IsAllowedSingleBVNode &&
                         !V->hasNUsesOrMore(UsesLimit) &&
                         any_of(V->users(), IsaPred<InsertElementInst>));
               });
      }))
    return false;

  if (VectorizableTree.back()->isGather() &&
      VectorizableTree.back()->hasState() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      allSameBlock(VectorizableTree.back()->Scalars) &&
      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
      TTI->getScalarizationOverhead(
          getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
                         VectorizableTree.back()->getVectorFactor()),
          /* ...demanded elements and comparison threshold; elided... */))
    return false;

  // The graph with a small number of non-vectorized loads as the root can be
  // better handled by the non-power-of-2 path.
  constexpr unsigned SmallTree = 3;
  if (VectorizableTree.front()->isNonPowOf2Vec() &&
      getCanonicalGraphSize() <= SmallTree &&
      count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return TE->isGather() && TE->hasState() &&
                        TE->getOpcode() == Instruction::Load &&
                        /* ...same-block check; elided... */ true;
               }) /* ...== expected count; elided... */)
    return true;

  // Inspect the remaining gather nodes: anything other than load gathers
  // disqualifies the tree.
  for (unsigned Idx : seq<unsigned>(VectorizableTree.size())) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.State == TreeEntry::SplitVectorize)
      continue;
    if (!E.isGather())
      continue;
    if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
        /* ...remaining load-specific profitability checks; elided... */ false)
      return false;
  }
  // ... (tail of the function elided in this excerpt)
}
InstructionCost BoUpSLP::getSpillCost() {
  // Walk the tree and estimate the number of vector registers that are live
  // across non-vectorized calls: such values must be spilled/reloaded.
  const TreeEntry *Root = VectorizableTree.front().get();
  if (Root->isGather())
    return 0;

  InstructionCost Cost = 0;
  SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
      EntriesToOperands;
  SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
  SmallPtrSet<const Instruction *, 8> LastInstructions;
  for (const auto &TEPtr : VectorizableTree) {
    if (!TEPtr->isGather()) {
      Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
      EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
      LastInstructions.insert(LastInst);
    }
    if (TEPtr->UserTreeIndex)
      EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
  }

  auto NoCallIntrinsic = [this](const Instruction *I) {
    const auto *II = dyn_cast<IntrinsicInst>(I);
    if (!II)
      return false;
    if (II->isAssumeLikeIntrinsic())
      return true;
    // ... (compare the intrinsic cost against a plain call; elided)
    return IntrCost < CallCost;
  };

  // Maps the last instruction of an entry to the last checked instruction in
  // its block and the result of the check: 1 - no calls in between, 0 - a
  // call was found.
  SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
      CheckedInstructions;
  unsigned Budget = 0;
  const unsigned BudgetLimit =
      ScheduleRegionSizeBudget / VectorizableTree.size();
  auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
                                            const Instruction *Last) {
    assert(First->getParent() == Last->getParent() &&
           "Expected instructions in same block.");
    if (auto It = CheckedInstructions.find(Last);
        It != CheckedInstructions.end()) {
      const Instruction *Checked = It->second.getPointer();
      if (Checked == First || Checked->comesBefore(First))
        return It->second.getInt() != 0;
      Last = Checked;
    }
    SmallVector<const Instruction *> LastInstsInRange;
    auto InstIt = ++First->getIterator().getReverse(),
         PrevInstIt = Last->getIterator().getReverse();
    while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
      // Vectorized calls, represented as vector intrinsics, do not impact the
      // spill cost.
      if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
          CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
        for (const Instruction *LastInst : LastInstsInRange)
          CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
        return false;
      }
      if (LastInstructions.contains(&*PrevInstIt))
        LastInstsInRange.push_back(&*PrevInstIt);
      ++PrevInstIt;
      ++Budget;
    }
    for (const Instruction *LastInst : LastInstsInRange)
      CheckedInstructions.try_emplace(
          LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
          Budget <= BudgetLimit ? 1 : 0);
    return Budget <= BudgetLimit;
  };
  auto AddCosts = [&](const TreeEntry *Op) {
    Type *ScalarTy = Op->Scalars.front()->getType();
    auto It = MinBWs.find(Op);
    if (It != MinBWs.end())
      ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
    auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
    Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
    if (Op->getVectorFactor() != Op->Scalars.size() || Op->isGather())
      return;
    Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
  };
  // Memoize, per (block, operand-block) pair, whether a non-vectorized call
  // occurs on any path between them.
  SmallDenseMap<std::pair<const BasicBlock *, const BasicBlock *>, bool>
      ParentOpParentToPreds;
  auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
                               BasicBlock *OpParent) {
    auto Key = std::make_pair(Root, OpParent);
    if (auto It = ParentOpParentToPreds.find(Key);
        It != ParentOpParentToPreds.end())
      return It->second;
    SmallVector<BasicBlock *> Worklist;
    // ... (seed the worklist with Pred or all predecessors of Root; elided)
    SmallPtrSet<const BasicBlock *, 16> Visited;
    SmallDenseSet<std::pair<const BasicBlock *, const BasicBlock *>>
        ParentsPairsToAdd;
    bool Res = false;
    // ... (on exit, cache the result for all visited pairs)
    for (const auto &KeyPair : ParentsPairsToAdd) {
      assert(!ParentOpParentToPreds.contains(KeyPair) &&
             "Should not have been added before.");
      ParentOpParentToPreds.try_emplace(KeyPair, Res);
    }
    while (!Worklist.empty()) {
      BasicBlock *BB = Worklist.pop_back_val();
      if (BB == OpParent || !Visited.insert(BB).second)
        continue;
      auto Pair = std::make_pair(BB, OpParent);
      if (auto It = ParentOpParentToPreds.find(Pair);
          It != ParentOpParentToPreds.end()) {
        Res = It->second;
        return Res;
      }
      ParentsPairsToAdd.insert(Pair);
      if (Budget > BudgetLimit)
        return Res;
      // ... (scan the block for non-vectorized calls and append its
      //      predecessors; elided)
    }
    Res = true;
    return Res;
  };
  SmallVector<const TreeEntry *> LiveEntries(1, Root);
  while (!LiveEntries.empty()) {
    const TreeEntry *Entry = LiveEntries.pop_back_val();
    SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
    if (Operands.empty())
      continue;
    Instruction *LastInst = EntriesToLastInstruction.at(Entry);
    BasicBlock *Parent = LastInst->getParent();
    for (const TreeEntry *Op : Operands) {
      if (!Op->isGather())
        LiveEntries.push_back(Op);
      if (Entry->State == TreeEntry::SplitVectorize ||
          (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
          /* ...all-constant gathers are free; elided... */ false)
        continue;
      BasicBlock *Pred = nullptr;
      if (auto *Phi = dyn_cast_or_null<PHINode>(Entry->getMainOp()))
        Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
      BasicBlock *OpParent;
      Instruction *OpLastInst;
      if (Op->isGather()) {
        assert(Entry->getOpcode() == Instruction::PHI &&
               "Expected phi node only.");
        OpParent = cast<PHINode>(Entry->getMainOp())
                       ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
        OpLastInst = OpParent->getTerminator();
        for (Value *V : Op->Scalars) {
          // ... (prefer the position of a vectorized scalar, if any; elided)
        }
      } else {
        OpLastInst = EntriesToLastInstruction.at(Op);
        OpParent = OpLastInst->getParent();
      }
      // Check for calls within the same basic block.
      if (OpParent == Parent) {
        if (Entry->getOpcode() == Instruction::PHI) {
          if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
            AddCosts(Op);
          continue;
        }
        if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
          AddCosts(Op);
        continue;
      }
      // Check for calls between the blocks.
      // 1. From the start of the entry's block up to the last instruction.
      if (Entry->getOpcode() != Instruction::PHI &&
          !CheckForNonVecCallsInSameBlock(
              &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
              LastInst)) {
        AddCosts(Op);
        continue;
      }
      // 2. From the operand's last instruction to the end of its block.
      if (!CheckForNonVecCallsInSameBlock(OpLastInst,
                                          OpParent->getTerminator())) {
        AddCosts(Op);
        continue;
      }
      // 3. Along the predecessors between the two blocks.
      if (!CheckPredecessors(Parent, Pred, OpParent)) {
        AddCosts(Op);
        continue;
      }
    }
  }

  return Cost;
}

/// Checks if the \p IE1 instruction is followed by \p IE2 instruction in the
/// buildvector sequence.
static bool isFirstInsertElement(const InsertElementInst *IE1,
                                 const InsertElementInst *IE2) {
  if (IE1 == IE2)
    return false;
  const auto *I1 = IE1;
  const auto *I2 = IE2;
  const InsertElementInst *PrevI1;
  const InsertElementInst *PrevI2;
  unsigned Idx1 = *getElementIndex(IE1);
  unsigned Idx2 = *getElementIndex(IE2);
  do {
    if (I2 == IE1)
      return true;
    if (I1 == IE2)
      return false;
    PrevI1 = I1;
    PrevI2 = I2;
    if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
        getElementIndex(I1).value_or(Idx2) != Idx2)
      I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
    if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
        getElementIndex(I2).value_or(Idx1) != Idx1)
      I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
  llvm_unreachable("Two different buildvectors not expected.");
}
namespace {
/// Returns the incoming Value *, if the requested type is Value * too, or a
/// default-constructed value otherwise.
struct ValueSelect {
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
    return V;
  }
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
    return U();
  }
};
} // namespace
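// Illustrative usage (editorial sketch, not verbatim from this pass): inside
// the templated shuffle-action driver below, ValueSelect::get<T *>(Base)
// yields `Base` itself when T is Value (IR emission mode) and a
// default-constructed T otherwise (cost-estimation mode), letting one
// implementation serve both phases.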
/// Does the analysis of the provided shuffle masks and performs the requested
/// actions on the vectors with the given shuffle masks, step by step:
/// shuffle a non-undef base with the first mask, then combine each remaining
/// mask/vector pair into the accumulated result.
template <typename T>
static T *performExtractsShuffleAction(
    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
    function_ref<unsigned(T *)> GetVF,
    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
    function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  SmallVector<int> Mask(ShuffleMask.begin()->second);
  auto VMIt = std::next(ShuffleMask.begin());
  T *Prev = nullptr;
  SmallBitVector UseMask =
      buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
  SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
  if (!IsBaseUndef.all()) {
    // Base is not undef, need to combine it with the next subvectors.
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
      if (Mask[Idx] == PoisonMaskElem)
        Mask[Idx] = Idx;
      else
        Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    }
    [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    // Base is undef and only 1 vector is shuffled - perform the action only
    // for a single vector, if the mask is not the identity mask.
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
                                            /*ForSingleMask=*/true);
    if (Res.second)
      Prev = Res.first;
    else
      Prev = Action(Mask, {ShuffleMask.begin()->first});
  } else {
    // Base is undef and at least 2 input vectors are shuffled - perform
    // 2-vector shuffles step by step, combining between the steps.
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      // No need to resize the input vectors since they are of the same size
      // and the mask was prepared for that - just combine it.
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (SecMask[I] != PoisonMaskElem) {
          assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
          Mask[I] = SecMask[I] + Vec1VF;
        }
      }
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
    } else {
      // Vectors of different sizes - resize and reshuffle.
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first,
                                               Mask, /*ForSingleMask=*/false);
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (Mask[I] != PoisonMaskElem) {
          Mask[I] = (Res1.second ? I : Mask[I]);
        } else if (SecMask[I] != PoisonMaskElem) {
          Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
        }
      }
      Prev = Action(Mask, {Res1.first, Res2.first});
    }
    VMIt = std::next(VMIt);
  }
  [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
  // Perform the requested actions for the remaining masks/vectors.
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    // Shuffle other input vectors, if any.
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
    ArrayRef<int> SecMask = VMIt->second;
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
      if (SecMask[I] != PoisonMaskElem) {
        assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
               "Multiple uses of scalars.");
        Mask[I] = (Res.second ? I : SecMask[I]) + VF;
      } else if (Mask[I] != PoisonMaskElem) {
        Mask[I] = I;
      }
    }
    Prev = Action(Mask, {Prev, Res.first});
  }
  return Prev;
}

namespace {
/// Data type for handling buildvector sequences with the reused scalars from
/// other tree entries.
template <typename T> struct ShuffledInsertData {
  /// List of insertelements to be replaced by shuffles.
  SmallVector<InsertElementInst *> InsertElements;
  /// The parent vectors and shuffle mask for the given list of inserts.
  MapVector<T, SmallVector<int>> ValueMasks;
};
} // namespace
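// Illustrative note (editorial sketch, not verbatim from this pass): for a
// buildvector that can be rewritten as shuffles, e.g.
//   %v0 = insertelement <4 x float> poison, float %a, i32 0
//   %v1 = insertelement <4 x float> %v0,    float %b, i32 1
// InsertElements records %v0/%v1 while ValueMasks maps each source tree
// entry to the lanes it supplies in the final vector.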
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
  InstructionCost Cost = 0;
  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                    << VectorizableTree.size() << ".\n");

  SmallPtrSet<Value *, 4> CheckedExtracts;
  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
    TreeEntry &TE = *VectorizableTree[I];
    // No need to count the cost for combined entries: they are combined and
    // just skipped.
    if (TE.State == TreeEntry::CombinedVectorize) {
      LLVM_DEBUG(
          dbgs() << "SLP: Skipping cost for combined node that starts with "
                 << *TE.Scalars[0] << ".\n";
          TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
      continue;
    }
    if (TE.hasState() &&
        (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
      if (const TreeEntry *E =
              getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
          E && E->getVectorFactor() == TE.getVectorFactor()) {
        // Some gather nodes may be absolutely the same as some vectorizable
        // nodes after reordering; skip their cost.
        LLVM_DEBUG(dbgs() << "SLP: Current total cost = " << Cost << "\n");
        continue;
      }
    }

    assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
           "Expected gather nodes with users only.");

    InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
    Cost += C;
    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle.\n"
                      << "SLP: Current total cost = " << Cost << "\n");
  }

  if (Cost >= -SLPCostThreshold &&
      none_of(ExternalUses, [](const ExternalUser &EU) {
        return isa_and_nonnull<InsertElementInst>(EU.User);
      }))
    return Cost;

  SmallPtrSet<Value *, 16> ExtractCostCalculated;
  InstructionCost ExtractCost = 0;
  SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
  SmallVector<APInt> DemandedElts;
  SmallDenseSet<Value *, 4> UsedInserts;
  DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
  DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
  SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
  // Keep track of {Scalar, User, Index} tuples: on some targets (e.g.
  // AArch64) this helps fuse the mov associated with an extractelement with
  // its user, making the extract effectively free.
  SmallVector<std::tuple<Value *, User *, int>> ScalarUserAndIdx;
  for (ExternalUser &EU : ExternalUses)
    ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
  SmallDenseSet<std::pair<Value *, User *>, 8> CheckedScalarUser;
  for (ExternalUser &EU : ExternalUses) {
    LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
                      << EU.E.Idx << " in lane " << EU.Lane << "\n");
    LLVM_DEBUG(if (EU.User) dbgs() << "  User: " << *EU.User << "\n";
               else dbgs() << "  User: nullptr\n");
    LLVM_DEBUG(dbgs() << "  Use: " << EU.Scalar->getNameOrAsOperand() << "\n");

    // Uses by ephemeral values are free (the ephemeral value will be removed
    // prior to code generation, and so will the extraction).
    if (EphValues.count(EU.User))
      continue;

    // Check if the scalar for this user was already counted.
    if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
        (EU.User &&
         CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
      continue;

    // No extract cost for users in unreachable blocks or EH pads, or for
    // vector "scalars".
    if (auto *UserInst = dyn_cast_or_null<Instruction>(EU.User);
        UserInst &&
        (!DT->isReachableFromEntry(UserInst->getParent()) ||
         UserInst->getParent()->isEHPad()))
      continue;
    if (isa<FixedVectorType>(EU.Scalar->getType()) ||
        !ExtractCostCalculated.insert(EU.Scalar).second)
      continue;

    // If the user is an insertelement, try to detect it as part of a final
    // shuffled/identity match instead of charging an extract.
    if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
        VU && VU->getOperand(1) == EU.Scalar) {
      if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
        if (!UsedInserts.insert(VU).second)
          continue;
        std::optional<unsigned> InsertIdx = getElementIndex(VU);
        if (InsertIdx) {
          const TreeEntry *ScalarTE = &EU.E;
          auto *It = find_if(
              ShuffledInserts,
              [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
                // Checks if 2 insertelements are from the same buildvector.
                InsertElementInst *VecInsert = Data.InsertElements.front();
                return areTwoInsertFromSameBuildVector(
                    VU, VecInsert, [this](InsertElementInst *II) -> Value * {
                      Value *Op0 = II->getOperand(0);
                      if (isVectorized(II) && !isVectorized(Op0))
                        return nullptr;
                      return Op0;
                    });
              });
          int VecId = -1;
          if (It == ShuffledInserts.end()) {
            auto &Data = ShuffledInserts.emplace_back();
            Data.InsertElements.emplace_back(VU);
            DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
            VecId = ShuffledInserts.size() - 1;
            auto It = MinBWs.find(ScalarTE);
            if (It != MinBWs.end() &&
                VectorCasts
                    .insert(std::make_pair(ScalarTE, FTy->getElementType()))
                    .second) {
              unsigned BWSz = It->second.first;
              unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
              unsigned VecOpcode;
              if (DstBWSz < BWSz)
                VecOpcode = Instruction::Trunc;
              else
                VecOpcode =
                    It->second.second ? Instruction::SExt : Instruction::ZExt;
              InstructionCost C = TTI->getCastInstrCost(
                  VecOpcode, FTy,
                  getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
                                 FTy->getNumElements()),
                  TTI::CastContextHint::None, CostKind);
              LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                                << " for extending externally used vector with "
                                   "non-equal minimum bitwidth.\n");
              Cost += C;
            }
          } else {
            if (isFirstInsertElement(VU, It->InsertElements.front()))
              It->InsertElements.front() = VU;
            VecId = std::distance(ShuffledInserts.begin(), It);
          }
          int InIdx = *InsertIdx;
          SmallVectorImpl<int> &Mask =
              ShuffledInserts[VecId].ValueMasks[ScalarTE];
          if (Mask.empty())
            Mask.assign(FTy->getNumElements(), PoisonMaskElem);
          Mask[InIdx] = EU.Lane;
          DemandedElts[VecId].setBit(InIdx);
          continue;
        }
      }
    }

    // If we plan to rewrite the tree in a smaller type, we will need to
    // sign-extend the extracted value back to the original type. Account for
    // the extract and the added cost of the extend, if needed.
    InstructionCost ExtraCost = TTI::TCC_Free;
    auto *ScalarTy = EU.Scalar->getType();
    const unsigned BundleWidth = EU.E.getVectorFactor();
    assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
    const TreeEntry *Entry = &EU.E;
    auto It = MinBWs.find(Entry);
    if (It != MinBWs.end()) {
      unsigned Extend = It->second.second ? Instruction::SExt
                                          : Instruction::ZExt;
      // ... (extract-with-extend cost; elided)
      LLVM_DEBUG(dbgs() << "  ExtractExtend cost: " << ExtraCost << "\n");
    } else {
      ExtraCost =
          TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                  CostKind, EU.Lane, EU.Scalar,
                                  ScalarUserAndIdx);
      LLVM_DEBUG(dbgs() << "  ExtractElement cost for " << *ScalarTy << " from "
                        << *VecTy << ": " << ExtraCost << "\n");
    }
    // Leave the scalar instructions as is if they are cheaper than extracts.
    if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
        Entry->getOpcode() == Instruction::Load) {
      // Checks if the user of the external scalar is a phi in a loop body.
      auto IsPhiInLoop = [&](const ExternalUser &U) {
        if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
          auto *I = cast<Instruction>(U.Scalar);
          const Loop *L = LI->getLoopFor(Phi->getParent());
          return L && (Phi->getParent() == I->getParent() ||
                       L == LI->getLoopFor(I->getParent()));
        }
        return false;
      };
      if (!ValueToExtUses) {
        ValueToExtUses.emplace();
        for (const auto &P : enumerate(ExternalUses)) {
          // Ignore phis in loops.
          if (IsPhiInLoop(P.value()))
            continue;
          ValueToExtUses->try_emplace(P.value().Scalar, P.index());
        }
      }
      // The original instruction can be kept, if no operands are vectorized
      // or they are already marked as externally used.
      auto *Inst = cast<Instruction>(EU.Scalar);
      InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
      auto OperandIsScalar = [&](Value *V) {
        if (!isVectorized(V)) {
          // Some extractelements might not be vectorized, but transformed
          // into a shuffle and removed from the function; consider it here.
          if (auto *EE = dyn_cast<ExtractElementInst>(V))
            return !EE->hasOneUse() || !MustGather.contains(EE);
          return true;
        }
        return ValueToExtUses->contains(V);
      };
      bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
      bool CanBeUsedAsScalarCast = false;
      if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
        if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
            Op && all_of(Op->operands(), OperandIsScalar)) {
          InstructionCost OpCost =
              isVectorized(Op) ? TTI->getInstructionCost(Op, CostKind) : 0;
          if (ScalarCost + OpCost <= ExtraCost) {
            CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
            ScalarCost += OpCost;
          }
        }
      }
      if (CanBeUsedAsScalar) {
        bool KeepScalar = ScalarCost <= ExtraCost;
        // Try to keep the original scalar if its user is a phi from the same
        // block as the root phis, currently being vectorized: this preserves
        // better ordering info for the PHIs.
        bool IsProfitablePHIUser =
            (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
                            VectorizableTree.front()->Scalars.size() > 2)) &&
            VectorizableTree.front()->hasState() &&
            VectorizableTree.front()->getOpcode() == Instruction::PHI &&
            !Inst->hasNUsesOrMore(UsesLimit) &&
            none_of(Inst->users(),
                    [&](User *U) {
                      auto *PHIUser = dyn_cast<PHINode>(U);
                      return (!PHIUser ||
                              PHIUser->getParent() !=
                                  cast<Instruction>(
                                      VectorizableTree.front()->getMainOp())
                                      ->getParent()) &&
                             !isVectorized(U);
                    }) &&
            count_if(Entry->Scalars, [&](Value *V) {
              return ValueToExtUses->contains(V);
            }) <= 2;
        if (IsProfitablePHIUser) {
          KeepScalar = true;
        } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
                   ExtraCost - ScalarCost <= TTI::TCC_Basic &&
                   (!GatheredLoadsEntriesFirst.has_value() ||
                    Entry->Idx < *GatheredLoadsEntriesFirst)) {
          unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
            return ValueToExtUses->contains(V);
          });
          auto It = ExtractsCount.find(Entry);
          if (It != ExtractsCount.end()) {
            assert(ScalarUsesCount >= It->getSecond().size() &&
                   "Expected total number of external uses not less than "
                   "number of scalar uses.");
            ScalarUsesCount -= It->getSecond().size();
          }
          // Keep the original scalar if the number of externally used
          // instructions in the same entry is not a power of 2: it may help
          // to do some extra vectorization.
          KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
        }
        if (KeepScalar) {
          ExternalUsesAsOriginalScalar.insert(EU.Scalar);
          for (Value *V : Inst->operands()) {
            auto It = ValueToExtUses->find(V);
            if (It != ValueToExtUses->end()) {
              // Replace all uses to avoid a compiler crash.
              ExternalUses[It->second].User = nullptr;
            }
          }
          ExtraCost = ScalarCost;
          if (!IsPhiInLoop(EU))
            ExtractsCount[Entry].insert(Inst);
          if (CanBeUsedAsScalarCast) {
            ScalarOpsFromCasts.insert(Inst->getOperand(0));
            // Update the users of the operands of the cast operand too.
            if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
              for (Value *V : IOp->operands()) {
                auto It = ValueToExtUses->find(V);
                if (It != ValueToExtUses->end()) {
                  // Replace all uses to avoid a compiler crash.
                  ExternalUses[It->second].User = nullptr;
                }
              }
            }
          }
        }
      }
    }

    ExtractCost += ExtraCost;
  }
  // Insert externals for the operands of casts to be emitted as scalars
  // instead of extractelement.
  for (Value *V : ScalarOpsFromCasts) {
    ExternalUsesAsOriginalScalar.insert(V);
    if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty())
      ExternalUses.emplace_back(V, nullptr, *TEs.front(),
                                TEs.front()->findLaneForValue(V));
  }
  // Add the reduced value cost, if resized.
  if (!VectorizedVals.empty()) {
    const TreeEntry &Root = *VectorizableTree.front();
    auto BWIt = MinBWs.find(&Root);
    if (BWIt != MinBWs.end()) {
      Type *DstTy = Root.Scalars.front()->getType();
      unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
      unsigned SrcSz =
          ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
      if (OriginalSz != SrcSz) {
        unsigned Opcode = Instruction::Trunc;
        if (OriginalSz > SrcSz)
          Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
        Type *SrcTy = Builder.getIntNTy(SrcSz);
        // ... (vector-typed roots; elided)
        Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
                                      TTI::CastContextHint::None, CostKind);
      }
    }
  }

  Cost += ExtractCost;
  auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
                                    bool ForSingleMask) {
    InstructionCost C = 0;
    unsigned VF = Mask.size();
    unsigned VecVF = TE->getVectorFactor();
    bool HasLargeIndex =
        any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
    if ((VF != VecVF && HasLargeIndex) ||
        !ShuffleVectorInst::isIdentityMask(Mask, VF)) {
      if (HasLargeIndex) {
        SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
        std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
                  OrigMask.begin());
        C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                             getWidenedType(TE->getMainOp()->getType(), VecVF),
                             OrigMask);
        LLVM_DEBUG(
            dbgs() << "SLP: Adding cost " << C
                   << " for final shuffle of insertelement external users.\n";
            TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
        Cost += C;
        return std::make_pair(TE, true);
      }
      if (!ForSingleMask) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        for (unsigned I = 0; I < VF; ++I) {
          if (Mask[I] != PoisonMaskElem)
            ResizeMask[Mask[I]] = Mask[I];
        }
        if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF))
          C = ::getShuffleCost(
              *TTI, TTI::SK_PermuteSingleSrc,
              getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask);
        LLVM_DEBUG(
            dbgs() << "SLP: Adding cost " << C
                   << " for final shuffle of insertelement external users.\n";
            TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
        Cost += C;
      }
    }
    return std::make_pair(TE, false);
  };
  // Calculate the cost of the reshuffled vectors, if any.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    unsigned VF = 0;
    auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
                                    ArrayRef<const TreeEntry *> TEs) {
      assert((TEs.size() == 1 || TEs.size() == 2) &&
             "Expected exactly 1 or 2 tree entries.");
      if (TEs.size() == 1) {
        if (VF == 0)
          VF = TEs.front()->getVectorFactor();
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
            !all_of(enumerate(Mask), [=](const auto &Data) {
              return Data.value() == PoisonMaskElem ||
                     (Data.index() < VF &&
                      static_cast<int>(Data.index()) == Data.value());
            })) {
          InstructionCost C =
              ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
          LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                            << " for final shuffle of insertelement "
                               "external users.\n";
                     TEs.front()->dump();
                     dbgs() << "SLP: Current total cost = " << Cost << "\n");
          Cost += C;
        }
      } else {
        if (VF == 0) {
          if (TEs.front() &&
              TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
            VF = TEs.front()->getVectorFactor();
          else
            VF = Mask.size();
        }
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        InstructionCost C =
            ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                          << " for final shuffle of vector node and external "
                             "insertelement users.\n";
                   if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
        Cost += C;
      }
      VF = Mask.size();
      return TEs.back();
    };
    (void)performExtractsShuffleAction<const TreeEntry>(
        MutableArrayRef(Vector.data(), Vector.size()), Base,
        [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
        EstimateShufflesCost);
    InstructionCost InsertCost = TTI->getScalarizationOverhead(
        cast<FixedVectorType>(
            ShuffledInserts[I].InsertElements.front()->getType()),
        DemandedElts[I], /*Insert=*/true, /*Extract=*/false, CostKind);
    Cost -= InsertCost;
  }

  // Add the cost for the reduced value resize (if required).
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize) {
        bool IsArithmeticExtendedReduction =
            all_of(*UserIgnoreList, [](Value *V) {
              auto *I = cast<Instruction>(V);
              return is_contained({Instruction::Add, Instruction::FAdd,
                                   Instruction::Mul, Instruction::FMul,
                                   Instruction::And, Instruction::Or,
                                   Instruction::Xor},
                                  I->getOpcode());
            });
        if (IsArithmeticExtendedReduction)
          Opcode = Instruction::BitCast; // Handled via the extended cost.
        else
          Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      }
      if (Opcode != Instruction::BitCast) {
        auto *SrcVecTy =
            getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
        auto *DstVecTy =
            getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
        TTI::CastContextHint CCH = TTI::CastContextHint::None;
        InstructionCost CastCost;
        switch (E.getOpcode()) {
        case Instruction::SExt:
        case Instruction::ZExt:
        case Instruction::Trunc: {
          const TreeEntry *OpTE = getOperandEntry(&E, 0);
          CCH = getCastContextHint(*OpTE);
          break;
        }
        default:
          break;
        }
        CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
                                          CostKind);
        Cost += CastCost;
        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
                          << " for final resize for reduction from " << SrcVecTy
                          << " to " << DstVecTy << "\n";
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
      }
    }
  }

  std::optional<InstructionCost> SpillCost;
  // ... (the spill cost is computed lazily, only when it can still change
  //      the final decision; elided)
  if (SpillCost)
    Cost += *SpillCost;

#ifndef NDEBUG
  SmallString<256> Str;
  {
    raw_svector_ostream OS(Str);
    OS << "SLP: Spill Cost = ";
    if (SpillCost)
      OS << *SpillCost;
    else
      OS << "<skipped>";
    OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
       << "SLP: Total Cost = " << Cost << ".\n";
  }
  LLVM_DEBUG(dbgs() << Str);
  if (ViewSLPTree)
    ViewGraph(this, "SLP" + F->getName(), false, Str);
#endif

  return Cost;
}
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
    MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
  // Scan the list of gathered scalars for extractelements that can be
  // represented as shuffles.
  MapVector<Value *, SmallVector<int>> VectorOpToIdx;
  SmallVector<int> UndefVectorExtracts;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI) {
      if (isa<UndefValue>(VL[I]))
        UndefVectorExtracts.push_back(I);
      continue;
    }
    auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
    if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
      continue;
    std::optional<unsigned> Idx = getExtractIndex(EI);
    // Undefined index.
    if (!Idx) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    if (Idx >= VecTy->getNumElements()) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    SmallBitVector ExtractMask(VecTy->getNumElements(), true);
    ExtractMask.reset(*Idx);
    if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  }
  // Sort the vector operands by the maximum number of uses in
  // extractelements.
  SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
      VectorOpToIdx.takeVector();
  stable_sort(Vectors, [](const auto &P1, const auto &P2) {
    return P1.second.size() > P2.second.size();
  });
  // Find the best pair of vectors, or a single vector.
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  unsigned PairMax = 0;
  if (!Vectors.empty()) {
    SingleMax = Vectors.front().second.size() + UndefSz;
    if (Vectors.size() > 1) {
      auto *ItNext = std::next(Vectors.begin());
      PairMax = SingleMax + ItNext->second.size();
    }
  }
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  // Check if it is better to perform a shuffle of 2 vectors or just of a
  // single vector.
  SmallVector<Value *> SavedVL(VL.begin(), VL.end());
  SmallVector<Value *> GatheredExtracts(
      VL.size(), PoisonValue::get(VL.front()->getType()));
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : Vectors.front().second)
      std::swap(GatheredExtracts[Idx], VL[Idx]);
  } else if (!Vectors.empty()) {
    for (unsigned Idx : {0, 1})
      for (int Idx : Vectors[Idx].second)
        std::swap(GatheredExtracts[Idx], VL[Idx]);
  }
  // Add extracts from undefs too.
  for (int Idx : UndefVectorExtracts)
    std::swap(GatheredExtracts[Idx], VL[Idx]);
  // Check that the gather of extractelements can be represented as just a
  // shuffle of a single/two vectors the scalars are extracted from.
  std::optional<TTI::ShuffleKind> Res =
      isFixedVectorShuffle(GatheredExtracts, Mask, AC);
  if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
    // Restore the original VL if the attempt was not successful.
    copy(SavedVL, VL.begin());
    return std::nullopt;
  }
  // Restore unused scalars from the mask, if some of the extractelements
  // were not selected for the shuffle.
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    // ... (per-lane restore logic; elided)
  }
  return Res;
}
SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                                    SmallVectorImpl<int> &Mask,
                                    unsigned NumParts) const {
  assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
  SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
  Mask.assign(VL.size(), PoisonMaskElem);
  unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
  for (unsigned Part : seq<unsigned>(NumParts)) {
    // Scan the list of gathered scalars for extractelements that can be
    // represented as shuffles, one register slice at a time.
    MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
        Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
    SmallVector<int> SubMask;
    std::optional<TTI::ShuffleKind> Res =
        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
    ShufflesRes[Part] = Res;
    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
  }
  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
        return Res.has_value();
      }))
    ShufflesRes.clear();
  return ShufflesRes;
}
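// Illustrative example (editorial sketch, not verbatim from this pass): with
// 8 gathered scalars and NumParts == 2, lanes [0..3] and [4..7] are analyzed
// independently, so a gather whose first half extracts from <4 x float> %a
// and whose second half extracts from <4 x float> %b becomes two
// single-register shuffles instead of one cross-register permute.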
std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
    bool ForOrder) {
  Entries.clear();
  auto GetUserEntry = [&](const TreeEntry *TE) {
    while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
      TE = TE->UserTreeIndex.UserTE;
    if (TE == VectorizableTree.front().get())
      return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
    return TE->UserTreeIndex;
  };
  auto HasGatherUser = [&](const TreeEntry *TE) {
    while (TE->Idx != 0 && TE->UserTreeIndex) {
      if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
        return true;
      TE = TE->UserTreeIndex.UserTE;
    }
    return false;
  };
  const EdgeInfo TEUseEI = GetUserEntry(TE);
  if (!TEUseEI)
    return std::nullopt;
  const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
  const BasicBlock *TEInsertBlock = nullptr;
  // The main node of PHI entries keeps the correct order of operands and
  // incoming blocks.
  if (auto *PHI = dyn_cast_or_null<PHINode>(
          TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
      PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
    TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
    TEInsertPt = TEInsertBlock->getTerminator();
  } else {
    TEInsertBlock = TEInsertPt->getParent();
  }
  if (!DT->isReachableFromEntry(TEInsertBlock))
    return std::nullopt;
  auto *NodeUI = DT->getNode(TEInsertBlock);
  assert(NodeUI && "Should only process reachable instructions");
  SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
  auto CheckOrdering = [&](const Instruction *InsertPt) {
    // Returns true if the insertion point for this entry's vector code
    // strictly dominates the insertion point of the candidate entry.
    const BasicBlock *InsertBlock = InsertPt->getParent();
    auto *NodeEUI = DT->getNode(InsertBlock);
    if (!NodeEUI)
      return false;
    assert((NodeUI == NodeEUI) ==
               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    // Check the order of the gather nodes' users.
    if (TEInsertPt->getParent() != InsertBlock &&
        (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
      return false;
    if (TEInsertPt->getParent() == InsertBlock &&
        TEInsertPt->comesBefore(InsertPt))
      return false;
    return true;
  };
  // Find all tree entries used by the gathered values. If no common entries
  // are found - this is not a shuffle.
  SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
  SmallDenseMap<Value *, int> UsedValuesEntry;
  SmallPtrSet<const Value *, 16> VisitedValue;
  auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
    // The node is reused - exit.
    if ((TEPtr->getVectorFactor() != VL.size() &&
         TEPtr->Scalars.size() != VL.size()) ||
        (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
      return false;
    Entries.clear();
    Entries.push_back(TEPtr);
    for (Value *V : VL) {
      if (isConstant(V))
        continue;
      UsedValuesEntry.try_emplace(V, 0);
    }
    return true;
  };
  auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
                              unsigned EdgeIdx) {
    const TreeEntry *Ptr1 = User1;
    const TreeEntry *Ptr2 = User2;
    SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
    while (Ptr2) {
      PtrToIdx.try_emplace(Ptr2, EdgeIdx);
      EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
      Ptr2 = Ptr2->UserTreeIndex.UserTE;
    }
    while (Ptr1) {
      unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
      Ptr1 = Ptr1->UserTreeIndex.UserTE;
      if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
        return Idx < It->second;
    }
    return false;
  };
  for (Value *V : VL) {
    if (isConstant(V) || !VisitedValue.insert(V).second)
      continue;
    // Build a list of tree entries where V is used.
    SmallPtrSet<const TreeEntry *, 4> VToTEs;
    for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
      if (TEPtr == TE || TEPtr->Idx == 0)
        continue;
      assert(any_of(TEPtr->Scalars,
                    [&](Value *V) { return GatheredScalars.contains(V); }) &&
             "Must contain at least single gathered value.");
      assert(TEPtr->UserTreeIndex &&
             "Expected only single user of a gather node.");
      const EdgeInfo &UseEI = TEPtr->UserTreeIndex;

      PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
                          UseEI.UserTE->hasState())
                             ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
                             : nullptr;
      const Instruction *InsertPt =
          UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
                  : &getLastInstructionInBundle(UseEI.UserTE);
      if (TEInsertPt == InsertPt) {
        // If the schedulable insertion point is used in multiple entries -
        // just exit, no known ordering at this point, available only after
        // real scheduling.
        if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
            (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
             TEUseEI.UserTE->isAltShuffle()) &&
            all_of(TEUseEI.UserTE->Scalars, isUsedOutsideBlock)) {
          if (UseEI.UserTE->State != TreeEntry::Vectorize ||
              (UseEI.UserTE->hasState() &&
               UseEI.UserTE->getOpcode() == Instruction::PHI &&
               !UseEI.UserTE->isAltShuffle()) ||
              !all_of(UseEI.UserTE->Scalars, isUsedOutsideBlock))
            continue;
        }
        // If the user instruction is used for some reason in different
        // vectorized nodes - make it depend on index.
        if (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx)
          continue;
      }
      // Check if the user node of the TE comes after the user node of TEPtr,
      // otherwise TEPtr depends on TE.
      if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
          TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
          UseEI.UserTE->State == TreeEntry::Vectorize &&
          UseEI.UserTE->getOpcode() == Instruction::PHI &&
          TEUseEI.UserTE != UseEI.UserTE)
        continue;
      if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
        continue;
      if (TEUseEI.UserTE != UseEI.UserTE &&
          (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
           HasGatherUser(TEUseEI.UserTE)))
        continue;
      if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
        continue;
      if (!TEUseEI.UserTE->isGather() && !UserPHI &&
          TEUseEI.UserTE->doesNotNeedToSchedule() !=
              UseEI.UserTE->doesNotNeedToSchedule() &&
          /* ...scheduling compatibility check; elided... */ true)
        continue;
      if ((TEInsertBlock != InsertPt->getParent() ||
           TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
          !CheckOrdering(InsertPt))
        continue;
      // The node is reused - exit.
      if (CheckAndUseSameNode(TEPtr))
        break;
      VToTEs.insert(TEPtr);
    }
    if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
      const TreeEntry *VTE = VTEs.front();
      if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
          VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
        VTEs = VTEs.drop_front();
        // Iterate through all vectorized nodes.
        const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
          return MTE->State == TreeEntry::Vectorize;
        });
        if (MIt == VTEs.end())
          continue;
        VTE = *MIt;
      }
      if (none_of(TE->CombinedEntriesWithIndices,
                  [&](const auto &P) { return P.first == VTE->Idx; })) {
        Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
        if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
          continue;
      }
      // The node is reused - exit.
      if (CheckAndUseSameNode(VTE))
        break;
      VToTEs.insert(VTE);
    }
    if (VToTEs.empty())
      continue;
    if (UsedTEs.empty()) {
      // The first iteration, just insert the list of nodes to the vector.
      UsedTEs.push_back(VToTEs);
      UsedValuesEntry.try_emplace(V, 0);
    } else {
      // Need to check if there are any previously used tree nodes which use
      // V. If there are no such nodes, consider that we have another input
      // vector.
      SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
      unsigned Idx = 0;
      for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
        // Do we have a non-empty intersection of previously listed tree
        // entries and tree entries using current V?
        set_intersect(VToTEs, Set);
        if (!VToTEs.empty()) {
          // Yes, write the new subset and continue with the next scalar.
          Set.swap(VToTEs);
          break;
        }
        VToTEs = SavedVToTEs;
        ++Idx;
      }
      // No non-empty intersection found - need a second set of possible
      // source vectors.
      if (Idx == UsedTEs.size()) {
        // If the number of input vectors is greater than 2 - not a
        // permutation, fall back to the regular gather.
        if (UsedTEs.size() == 2)
          continue;
        UsedTEs.push_back(SavedVToTEs);
        Idx = UsedTEs.size() - 1;
      }
      UsedValuesEntry.try_emplace(V, Idx);
    }
  }
  if (UsedTEs.empty()) {
    Entries.clear();
    return std::nullopt;
  }

  unsigned VF = 0;
  if (UsedTEs.size() == 1) {
    // Keep the order to avoid non-determinism.
    SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
                                                UsedTEs.front().end());
    sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    // Try to find the perfect match in another gather node at first.
    auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
      return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
    });
    if (It != FirstEntries.end() &&
        ((*It)->getVectorFactor() == VL.size() ||
         ((*It)->getVectorFactor() == TE->Scalars.size() &&
          TE->ReuseShuffleIndices.size() == VL.size() &&
          (*It)->isSame(TE->Scalars)))) {
      Entries.push_back(*It);
      if ((*It)->getVectorFactor() == VL.size()) {
        std::iota(std::next(Mask.begin(), Part * VL.size()),
                  std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
      } else {
        SmallVector<int> CommonMask = TE->getCommonMask();
        copy(CommonMask, Mask.begin());
      }
      // ... (clear undef lanes; elided)
      return TargetTransformInfo::SK_PermuteSingleSrc;
    }
    // No perfect match, just shuffle, so choose the first tree node from the
    // tree.
    Entries.push_back(FirstEntries.front());
    for (auto &P : UsedValuesEntry)
      P.second = 0;
    VF = FirstEntries.front()->getVectorFactor();
  } else {
    // Try to find nodes with the same vector factor.
    assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
    DenseMap<int, const TreeEntry *> VFToTE;
    for (const TreeEntry *TE : UsedTEs.front()) {
      unsigned VF = TE->getVectorFactor();
      auto It = VFToTE.find(VF);
      if (It != VFToTE.end()) {
        if (It->second->Idx > TE->Idx)
          It->getSecond() = TE;
        continue;
      }
      VFToTE.try_emplace(VF, TE);
    }
    // Keep the order of tree nodes to avoid non-determinism.
    SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
                                                 UsedTEs.back().end());
    sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    for (const TreeEntry *TE : SecondEntries) {
      auto It = VFToTE.find(TE->getVectorFactor());
      if (It != VFToTE.end()) {
        VF = It->first;
        Entries.push_back(It->second);
        Entries.push_back(TE);
        break;
      }
    }
    // No 2 source vectors with the same vector factor - just choose 2 with
    // the max index.
    if (Entries.empty()) {
      Entries.push_back(*llvm::max_element(
          UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
            return TE1->Idx < TE2->Idx;
          }));
      Entries.push_back(SecondEntries.front());
      VF = std::max(Entries.front()->getVectorFactor(),
                    Entries.back()->getVectorFactor());
    } else {
      VF = Entries.front()->getVectorFactor();
    }
    // Re-map every value to one of the chosen entries; bail out on conflicts.
    SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries(Entries.size());
    for (const TreeEntry *E : Entries)
      // ... (collect the scalars of each chosen entry; elided)
    for (auto &P : UsedValuesEntry) {
      unsigned Idx = 0;
      if (ValuesToEntries[Idx].contains(P.first)) {
        // ... (remap or bail out; elided)
      }
    }
  }

  bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
  // Checks if the 2 PHIs are compatible in terms of high possibility to be
  // vectorized.
  auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
    auto *PHI = cast<PHINode>(V);
    auto *PHI1 = cast<PHINode>(V1);
    // Check that all incoming values are compatible/from the same parent (if
    // they are instructions).
    for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
      Value *In = PHI->getIncomingValue(I);
      Value *In1 = PHI1->getIncomingValue(I);
      if (isConstant(In) && isConstant(In1))
        continue;
      // ... (opcode/parent compatibility; elided)
    }
    return true;
  };
  // Check if the value can be ignored during analysis for shuffled gathers.
  auto MightBeIgnored = [=](Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    return I && !IsSplatOrUndefs && !isVectorized(I) &&
           !isVectorLikeInstWithConstOps(I) &&
           !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
  };
  // Check that the neighbor instruction may form a full vector node with the
  // current instruction V.
  auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
    Value *V1 = VL[Idx];
    bool UsedInSameVTE = false;
    auto It = UsedValuesEntry.find(V1);
    if (It != UsedValuesEntry.end())
      UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
    return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
           getSameOpcode({V, V1}, *TLI).valid() &&
           cast<Instruction>(V)->getParent() ==
               cast<Instruction>(V1)->getParent() &&
           (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
  };
  // Build a shuffle mask for better cost estimation and vector emission.
  SmallBitVector UsedIdxs(Entries.size());
  SmallVector<std::pair<unsigned, int>> EntryLanes;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    auto It = UsedValuesEntry.find(V);
    if (It == UsedValuesEntry.end())
      continue;
    // Do not try to shuffle scalars if they are constants, or instructions
    // that can be vectorized as part of the following buildvector.
    if (isConstant(V) || (MightBeIgnored(V) &&
                          ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
                           (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
      continue;
    unsigned Idx = It->second;
    EntryLanes.emplace_back(Idx, I);
    UsedIdxs.set(Idx);
  }
  // Iterate through all shuffled scalars and select entries that can be used
  // for the final shuffle.
  SmallVector<const TreeEntry *> TempEntries;
  for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
    if (!UsedIdxs.test(I))
      continue;
    // Fix the entry number for the given scalar: first entry gets index 0,
    // the second gets 1 (at most 2 nodes are selected).
    for (std::pair<unsigned, int> &Pair : EntryLanes)
      if (Pair.first == I)
        Pair.first = TempEntries.size();
    TempEntries.push_back(Entries[I]);
  }
  Entries.swap(TempEntries);
  if (EntryLanes.size() == Entries.size() &&
      !VL.equals(ArrayRef(TE->Scalars)
                     .slice(Part * VL.size(),
                            std::min<int>(VL.size(), TE->Scalars.size())))) {
    // If the number of scalars equals the number of entries, the analysis is
    // not profitable: VL differs from TE->Scalars, so shuffles already exist
    // before this point. Cut off the unprofitable case.
    Entries.clear();
    return std::nullopt;
  }
  // Build the final mask, and check for the identity shuffle, if possible.
  bool IsIdentity = Entries.size() == 1;
  // Pair.first is the offset to the vector, while Pair.second is the index
  // of the scalar in the list.
  for (const std::pair<unsigned, int> &Pair : EntryLanes) {
    unsigned Idx = Part * VL.size() + Pair.second;
    Mask[Idx] =
        Pair.first * VF +
        (ForOrder ? std::distance(
                        Entries[Pair.first]->Scalars.begin(),
                        find(Entries[Pair.first]->Scalars, VL[Pair.second]))
                  : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
    IsIdentity &= Mask[Idx] == Pair.second;
  }
  if (ForOrder || IsIdentity || Entries.empty()) {
    switch (Entries.size()) {
    case 1:
      if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
        return TargetTransformInfo::SK_PermuteSingleSrc;
      break;
    case 2:
      if (EntryLanes.size() > 2 || VL.size() <= 2)
        return TargetTransformInfo::SK_PermuteTwoSrc;
      break;
    default:
      break;
    }
  } else if (!isa<VectorType>(VL.front()->getType()) &&
             (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
    // Do the cost estimation if the shuffle is beneficial vs. a buildvector.
    SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
                             std::next(Mask.begin(), (Part + 1) * VL.size()));
    int MinElement = SubMask.front(), MaxElement = SubMask.front();
    for (int Idx : SubMask) {
      if (Idx == PoisonMaskElem)
        continue;
      if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
        MinElement = Idx;
      if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
        MaxElement = Idx;
    }
    assert(MaxElement >= 0 && MinElement >= 0 &&
           MaxElement % VF >= MinElement % VF &&
           "Expected at least single element.");
    unsigned NewVF = std::max<unsigned>(
        VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
                                                 (MaxElement % VF) -
                                                     (MinElement % VF) + 1));
    if (NewVF < VF) {
      for (int &Idx : SubMask) {
        if (Idx == PoisonMaskElem)
          continue;
        Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
              (Idx >= static_cast<int>(VF) ? NewVF : 0);
      }
    } else {
      NewVF = VF;
    }

    auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
    auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
    auto GetShuffleCost = [&, &TTI = *TTI](
                              ArrayRef<int> Mask,
                              ArrayRef<const TreeEntry *> Entries,
                              VectorType *VecTy) -> InstructionCost {
      if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
          ShuffleVectorInst::isDeInterleaveMaskOfFactor(
              Mask, Entries.front()->getInterleaveFactor()))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI,
                              Entries.size() == 1 ? TTI::SK_PermuteSingleSrc
                                                  : TTI::SK_PermuteTwoSrc,
                              VecTy, Mask, CostKind);
    };
    InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
    InstructionCost FirstShuffleCost = 0;
    SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
    if (Entries.size() == 1 || !Entries[0]->isGather()) {
      FirstShuffleCost = ShuffleCost;
    } else {
      // Transform the mask to include only the first entry.
      APInt DemandedElts = APInt::getAllOnes(SubMask.size());
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(FirstMask)) {
        if (Idx >= static_cast<int>(NewVF)) {
          Idx = PoisonMaskElem;
        } else {
          DemandedElts.clearBit(I);
          if (Idx != PoisonMaskElem)
            IsIdentity &= static_cast<int>(I) == Idx;
        }
      }
      if (!IsIdentity)
        FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
      FirstShuffleCost += getScalarizationOverhead(
          *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
          /*Insert=*/true, /*Extract=*/false, CostKind);
    }
    InstructionCost SecondShuffleCost = 0;
    SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
    if (Entries.size() == 1 || !Entries[1]->isGather()) {
      SecondShuffleCost = ShuffleCost;
    } else {
      // Transform the mask to include only the second entry.
      APInt DemandedElts = APInt::getAllOnes(SubMask.size());
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(SecondMask)) {
        if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
          Idx = PoisonMaskElem;
        } else {
          DemandedElts.clearBit(I);
          if (Idx != PoisonMaskElem) {
            Idx -= NewVF;
            IsIdentity &= static_cast<int>(I) == Idx;
          }
        }
      }
      if (!IsIdentity)
        SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
      SecondShuffleCost += getScalarizationOverhead(
          *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
          /*Insert=*/true, /*Extract=*/false, CostKind);
    }
    APInt DemandedElts = APInt::getAllOnes(SubMask.size());
    for (auto [I, Idx] : enumerate(SubMask))
      if (Idx == PoisonMaskElem)
        DemandedElts.clearBit(I);
    InstructionCost BuildVectorCost = getScalarizationOverhead(
        *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
        /*Insert=*/true, /*Extract=*/false, CostKind);
    const TreeEntry *BestEntry = nullptr;
    if (FirstShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx >= static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                    });
      BestEntry = Entries.front();
      ShuffleCost = FirstShuffleCost;
    }
    if (SecondShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx < static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                      else
                        Idx -= VF;
                    });
      BestEntry = Entries[1];
      ShuffleCost = SecondShuffleCost;
    }
    if (BuildVectorCost >= ShuffleCost) {
      if (BestEntry) {
        Entries.clear();
        Entries.push_back(BestEntry);
      }
      return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
                                : TargetTransformInfo::SK_PermuteSingleSrc;
    }
  }
  Entries.clear();
  // Clear the corresponding mask elements.
  std::fill(std::next(Mask.begin(), Part * VL.size()),
            std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
  return std::nullopt;
}
SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::isGatherShuffledEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
    unsigned NumParts, bool ForOrder) {
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  Entries.clear();
  // No need to check for the topmost gather node.
  if (TE == VectorizableTree.front().get() &&
      (!GatheredLoadsEntriesFirst.has_value() ||
       none_of(ArrayRef(VectorizableTree).drop_front(),
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather();
               })))
    return {};
  // FIXME: gathering for non-power-of-2 (non-whole-register) nodes is not
  // implemented yet.
  if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
    return {};
  Mask.assign(VL.size(), PoisonMaskElem);
  assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
         "Expected only single user of the gather node.");
  assert(VL.size() % NumParts == 0 &&
         "Number of scalars must be divisible by NumParts.");
  if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
      TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
      (TE->Idx == 0 ||
       (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
       isSplat(TE->Scalars) ||
       getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars)))
    return {};
  unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
  SmallVector<std::optional<TTI::ShuffleKind>> Res;
  for (unsigned Part : seq<unsigned>(NumParts)) {
    ArrayRef<Value *> SubVL =
        VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
    SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
                                            ForOrder);
    if (!SubRes)
      SubEntries.clear();
    Res.push_back(SubRes);
    if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
        SubEntries.front()->getVectorFactor() == VL.size() &&
        (SubEntries.front()->isSame(TE->Scalars) ||
         SubEntries.front()->isSame(VL))) {
      SmallVector<const TreeEntry *> LocalSubEntries;
      LocalSubEntries.swap(SubEntries);
      Entries.clear();
      Res.clear();
      std::iota(Mask.begin(), Mask.end(), 0);
      // Clear undef scalars.
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I]))
          Mask[I] = PoisonMaskElem;
      Entries.emplace_back(1, LocalSubEntries.front());
      Res.push_back(TTI::SK_PermuteSingleSrc);
      return Res;
    }
  }
  if (all_of(Res,
             [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
    Entries.clear();
    return {};
  }
  return Res;
}
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                       Type *ScalarTy) const {
  const unsigned VF = VL.size();
  auto *VecTy = getWidenedType(ScalarTy, VF);
  // Find the cost of inserting/extracting values from the vector.
  APInt DemandedElements = APInt::getZero(VF);
  InstructionCost Cost;
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    if (V->getType() != ScalarTy)
      Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
                                    TTI::CastContextHint::None, CostKind);
    DemandedElements.setBit(I);
  };
  SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
  std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
  for (auto [I, V] : enumerate(VL)) {
    // Constant lanes come from a constant vector; everything else is a real
    // insertelement.
    if (isConstant(V) && !isa<UndefValue>(V)) {
      ConstantShuffleMask[I] = I + VF;
      continue;
    }
    EstimateInsertCost(I, V);
  }
  bool IsAnyNonUndefConst = any_of(
      ConstantShuffleMask, [&](int Idx) { return Idx >= static_cast<int>(VF); });
  if (!ForPoisonSrc && IsAnyNonUndefConst) {
    Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteTwoSrc,
                             VecTy, ConstantShuffleMask);
    // ...
  }
  if (!DemandedElements.isZero())
    Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
                                     /*Insert=*/true, /*Extract=*/false,
                                     CostKind,
                                     ForPoisonSrc && !IsAnyNonUndefConst, VL);
  return Cost;
}
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto It = EntryToLastInstruction.find(E);
  if (It != EntryToLastInstruction.end())
    return *It->second;

  // Get the basic block this bundle is in. All instructions in the bundle
  // should be in this block (except for extractelement-like instructions with
  // constant indices, gathered loads, and copyable elements).
  Instruction *Front = nullptr;
  unsigned Opcode = 0;
  if (E->hasState()) {
    Front = E->getMainOp();
    Opcode = E->getOpcode();
  } else {
    Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>));
  }
  BasicBlock *BB = Front->getParent();
  assert(((GatheredLoadsEntriesFirst.has_value() &&
           Opcode == Instruction::Load && E->isGather() &&
           E->Idx < *GatheredLoadsEntriesFirst) ||
          E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
          all_of(E->Scalars,
                 [=](Value *V) -> bool {
                   if (Opcode == Instruction::GetElementPtr &&
                       !isa<GetElementPtrInst>(V))
                     return true;
                   auto *I = dyn_cast<Instruction>(V);
                   return !I || !E->getMatchingMainOpOrAltOp(I) ||
                          I->getParent() == BB ||
                          isVectorLikeInstWithConstOps(I);
                 })) &&
         "Expected gathered loads or GEPs or instructions from same basic "
         "block.");

  auto FindLastInst = [&]() {
    Instruction *LastInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (E->isCopyableElement(I))
        continue;
      if (LastInst->getParent() == I->getParent()) {
        if (LastInst->comesBefore(I))
          LastInst = I;
        continue;
      }
      assert(((Opcode == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              E->State == TreeEntry::SplitVectorize ||
              (isVectorLikeInstWithConstOps(LastInst) &&
               isVectorLikeInstWithConstOps(I)) ||
              (GatheredLoadsEntriesFirst.has_value() &&
               Opcode == Instruction::Load && E->isGather() &&
               E->Idx < *GatheredLoadsEntriesFirst)) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(LastInst->getParent())) {
        LastInst = I;
        continue;
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        continue;
      auto *NodeA = DT->getNode(LastInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
        LastInst = I;
    }
    return LastInst;
  };

  auto FindFirstInst = [&]() {
    Instruction *FirstInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (E->isCopyableElement(I))
        continue;
      if (FirstInst->getParent() == I->getParent()) {
        if (I->comesBefore(FirstInst))
          FirstInst = I;
        continue;
      }
      assert(((Opcode == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              (isVectorLikeInstWithConstOps(FirstInst) &&
               isVectorLikeInstWithConstOps(I))) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(FirstInst->getParent())) {
        FirstInst = I;
        continue;
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        continue;
      auto *NodeA = DT->getNode(FirstInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
        FirstInst = I;
    }
    return FirstInst;
  };

  Instruction *Res = nullptr;
  if (E->State == TreeEntry::SplitVectorize) {
    Res = FindLastInst();
    // The last instruction of a split operand may come later; take the split
    // entries into account too.
    // ... (iterate the combined entries and pick the dominance-last
    //      instruction; elided)
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }

  // Set the insert point for gathered loads to the very first load.
  if (GatheredLoadsEntriesFirst.has_value() &&
      E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
      Opcode == Instruction::Load) {
    Res = FindFirstInst();
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }

  auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
    if (E->isGather())
      return nullptr;
    // Find the last instruction for the scheduled bundle in the scheduling
    // data.
    const auto *It = BlocksSchedules.find(BB);
    if (It == BlocksSchedules.end())
      return nullptr;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
      if (Bundles.empty())
        continue;
      const auto *It = find_if(
          Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
      if (It != Bundles.end())
        return *It;
    }
    return nullptr;
  };
  const ScheduleBundle *Bundle = FindScheduleBundle(E);
  if (!E->isGather() && !Bundle) {
    if ((Opcode == Instruction::GetElementPtr &&
         any_of(E->Scalars,
                [](Value *V) {
                  return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
                })) ||
        all_of(E->Scalars, [&](Value *V) {
          return isa<PoisonValue>(V) ||
                 (E->Idx == 0 && isa<InsertElementInst>(V)) ||
                 E->isCopyableElement(V) ||
                 (!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V));
        }))
      Res = FindLastInst();
    else
      Res = FindFirstInst();
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }

  // Find the last instruction. The common case should be that BB has been
  // scheduled, and the last instruction is the back of the bundle.
  if (Bundle) {
    assert(!E->isGather() && "Gathered instructions should not be scheduled");
    Res = Bundle->getBundle().back()->getInst();
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }

  // ... (fallback for bundles that do not require scheduling; elided)
  Res = FindLastInst();
  assert(Res && "Failed to find last instruction in bundle");
  EntryToLastInstruction.try_emplace(E, Res);
  return *Res;
}
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  BasicBlock::iterator LastInstIt = LastInst->getIterator();
  // If the instruction is a PHI, set the insert point after all the PHIs.
  bool IsPHI = isa<PHINode>(LastInst);
  if (IsPHI) {
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
    if (LastInstIt != LastInst->getParent()->end() &&
        LastInstIt->getParent()->isLandingPad())
      LastInstIt = std::next(LastInstIt);
  }
  if (IsPHI ||
      (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
       E->doesNotNeedToSchedule()) ||
      (GatheredLoadsEntriesFirst.has_value() &&
       E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
       E->getOpcode() == Instruction::Load)) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  } else {
    // Set the insertion point after the last instruction in the bundle. Set
    // the debug location to Front.
    Builder.SetInsertPoint(
        LastInst->getParent(),
        LastInst->getNextNonDebugInstruction()->getIterator());
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
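// Illustrative note (editorial sketch, not verbatim from this pass): for a
// bundle
//   %a = load float, ptr %p0
//   %b = load float, ptr %p1        ; last instruction in the bundle
// the builder is positioned right after %b, so the emitted
//   %vec = load <2 x float>, ptr %p0
// dominates every later use of the bundle's scalars.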
Value *BoUpSLP::gather(
    ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  // Instructions/lanes from the current block and/or blocks that are part of
  // the current loop are inserted at the end, to make it possible to optimize
  // loops and hoist invariant instructions.
  SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           isVectorized(Inst) ||
           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(I).second)
        PostponedInsts.emplace_back(Inst, I);
  }

  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      Type *Ty) {
    Value *Scalar = V;
    if (Scalar->getType() != Ty) {
      // ... (decide the signedness of the cast; elided)
      Scalar = Builder.CreateIntCast(
          Scalar, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
    }

    Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
    auto *InsElt = dyn_cast<InsertElementInst>(Vec);
    if (!InsElt)
      return Vec;
    GatherShuffleExtractSeq.insert(InsElt);
    CSEBlocks.insert(InsElt->getParent());
    // Add to our 'need-to-extract' list.
    if (isa<Instruction>(V)) {
      if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V);
          !Entries.empty()) {
        // Find which lane we need to extract.
        User *UserOp = nullptr;
        if (Scalar != V) {
          if (auto *SI = dyn_cast<Instruction>(Scalar))
            UserOp = SI;
        } else if (V->getType()->isVectorTy()) {
          if (auto *SV = dyn_cast<ShuffleVectorInst>(InsElt);
              SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
            // Find the shufflevector that was caused by a resize.
            auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
              if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
                if (SV->getOperand(0) == V)
                  return SV;
                if (SV->getOperand(1) == V)
                  return SV;
              }
              return nullptr;
            };
            if (Instruction *User = FindOperand(SV->getOperand(0), V))
              UserOp = User;
            else if (Instruction *User = FindOperand(SV->getOperand(1), V))
              UserOp = User;
            assert(UserOp &&
                   "Failed to find shufflevector, caused by resize.");
          }
        }
        if (!UserOp)
          UserOp = InsElt;
        unsigned FoundLane = Entries.front()->findLaneForValue(V);
        ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
      }
    }
    return Vec;
  };
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  Value *Vec = PoisonValue::get(VecTy);
  SmallVector<int> NonConsts;
  SmallVector<int> Mask(VL.size());
  std::iota(Mask.begin(), Mask.end(), 0);
  Value *OriginalRoot = Root;
  if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
      SV && isa<PoisonValue>(SV->getOperand(1)) &&
      SV->getOperand(0)->getType() == VecTy) {
    Root = SV->getOperand(0);
    Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
  }
  // Insert constant values first.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (PostponedIndices.contains(I))
      continue;
    if (!isConstant(VL[I])) {
      NonConsts.push_back(I);
      continue;
    }
    if (isa<PoisonValue>(VL[I]))
      continue;
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
    Mask[I] = I + E;
  }
  if (Root) {
    if (isa<PoisonValue>(Vec)) {
      Vec = OriginalRoot;
    } else {
      Vec = CreateShuffle(Root, Vec, Mask);
      if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
          OI && OI->use_empty() &&
          none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->VectorizedValue == OI;
          }))
        eraseInstruction(OI);
    }
  }
  // Insert non-constant values.
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append instructions which are/may be part of the loop at the end, to
  // make it possible to hoist non-loop-based instructions.
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);

  return Vec;
}
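// Illustrative note (editorial sketch, not verbatim from this pass): values
// defined inside a loop but gathered at an insertion point their definition
// does not dominate are postponed above, e.g.
//   %vec0 = insertelement <2 x float> poison, float %inv, i32 0   ; now
//   %vec1 = insertelement <2 x float> %vec0, float %loopdef, i32 1 ; postponed
// so the second insert is emitted only where %loopdef is available.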
  /// Set to true once finalize() has been called.
  bool IsFinalized = false;

  /// IR builder callbacks used by BaseShuffleAnalysis::createShuffle to emit
  /// shufflevector instructions and record them for later CSE.
  class ShuffleIRBuilder {
    // ...
    ShuffleIRBuilder(/* ... */)
        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
          CSEBlocks(CSEBlocks), DL(DL) {}
    ~ShuffleIRBuilder() = default;
    /// Creates a shufflevector of the two operands, int-casting the narrower
    /// one if the element bit widths differ.
    Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
      assert(V1->getType()->isIntOrIntVectorTy() &&
             V2->getType()->isIntOrIntVectorTy() &&
             "Expected integer vector types only.");
      // ...
      if (cast<VectorType>(V2->getType())
              ->getElementType()
              ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
                                           ->getElementType()
                                           ->getIntegerBitWidth())
        V2 = Builder.CreateIntCast(/* ... */);
      else
        V1 = Builder.CreateIntCast(/* ... */);
      Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
      // ...
      unsigned VF = Mask.size();
      // ...
      Value *Vec = Builder.CreateShuffleVector(V1, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    Value *createIdentity(Value *V) { return V; }
    Value *createPoison(Type *Ty, unsigned VF) {
      // ...
    }
    /// Resizes the smaller of the two vectors with an identity mask so both
    /// have the same vector factor.
    void resizeToMatch(Value *&V1, Value *&V2) {
      // ...
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      SmallVector<int> IdentityMask(VF, PoisonMaskElem);
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
                0);
      Value *&Op = MinVF == V1VF ? V1 : V2;
      Op = Builder.CreateShuffleVector(Op, IdentityMask);
      if (auto *I = dyn_cast<Instruction>(Op)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }
  };

  /// Smart shuffle instruction emission, walks through shuffle trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
                                                       ShuffleBuilder,
                                                       ScalarTy);
  }

  /// Casts V to a vector with the same element count but ScalarTy elements.
  Value *castToScalarTyElem(Value *V,
                            std::optional<bool> IsSigned = std::nullopt) {
    auto *VecTy = cast<VectorType>(V->getType());
    if (VecTy->getElementType() == ScalarTy->getScalarType())
      return V;
    return Builder.CreateIntCast(
        V,
        VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
        IsSigned.value_or(/* ... */));
  }

  Value *getVectorizedValue(const TreeEntry &E) {
    Value *Vec = E.VectorizedValue;
    // ...
    return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
                                return !isa<PoisonValue>(V) &&
                                       !isKnownNonNegative(
                                           V, SimplifyQuery(*R.DL));
                              }));
  }

public:
  ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
      : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
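
  /// Rewrites a gather of extractelements as shuffles of the source vectors:
  /// it collects the unique vector bases, erases extracts that become dead,
  /// and reports via \p UseVecBaseAsInput whether the adjusted mask now
  /// indexes the vectorized base directly.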
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    SmallPtrSet<Value *, 4> UniqueBases;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      // ...
      VecBase = EI->getVectorOperand();
      if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
        VecBase = TEs.front()->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(VecBase);
      // The extract can only be erased if it has a single vectorized use.
      if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
          (NumParts != 1 && count(VL, EI) > 1) ||
          any_of(EI->users(), [&](User *U) {
            ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
            return UTEs.empty() || UTEs.size() > 1 ||
                   (isa<GetElementPtrInst>(U) &&
                    !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                   count_if(R.VectorizableTree,
                            [&](const std::unique_ptr<TreeEntry> &TE) {
                              return TE->UserTreeIndex.UserTE ==
                                         /* ... */ &&
                                     is_contained(VL, EI);
                            }) /* ... */;
          }))
        continue;
      R.eraseInstruction(EI);
    }
    if (NumParts == 1 || UniqueBases.size() == 1) {
      assert(VecBase && "Expected vectorized value.");
      return castToScalarTyElem(VecBase);
    }
    UseVecBaseAsInput = true;
    // ...
    Value *Vec = nullptr;
    // Perform multi-register shuffle of the vector bases, at most two bases
    // per part.
    // ...
    constexpr int MaxBases = 2;
    SmallVector<Value *, MaxBases> Bases(MaxBases);
    auto VLMask = zip(SubVL, SubMask);
    const unsigned VF = std::accumulate(
        VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
          if (std::get<1>(D) == PoisonMaskElem)
            return S;
          Value *VecOp =
              cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
          if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
              !TEs.empty())
            VecOp = TEs.front()->VectorizedValue;
          assert(VecOp && "Expected vectorized value.");
          const unsigned Size =
              cast<FixedVectorType>(VecOp->getType())->getNumElements();
          return std::max(S, Size);
        });
    for (const auto [V, I] : VLMask) {
      // ...
      VecOp = TEs.front()->VectorizedValue;
      assert(VecOp && "Expected vectorized value.");
      VecOp = castToScalarTyElem(VecOp);
      Bases[I / VF] = VecOp;
    }
    if (!Bases.front())
      continue;
    Value *SubVec;
    if (Bases.back()) {
      SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
      TransformToIdentity(SubMask);
    } else {
      SubVec = Bases.front();
    }
    if (!Vec) {
      Vec = SubVec;
      // ...
      ArrayRef<int> SubMask =
          Mask.slice(P * SliceSize, /* ... */);
      assert(all_of(SubMask, [](int Idx) { /* ... */ }) &&
             "Expected first part or all previous parts masked.");
      copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
    } else {
      // ...
      unsigned SubVecVF =
          cast<FixedVectorType>(SubVec->getType())->getNumElements();
      NewVF = std::max(NewVF, SubVecVF);
      // Adjust SubMask.
      for (int &Idx : SubMask)
        if (Idx != PoisonMaskElem)
          Idx += /* ... */;
      copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
      Vec = createShuffle(Vec, SubVec, VecMask);
      TransformToIdentity(VecMask);
    }
    // ...
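
  /// If some of the source entries have not been vectorized yet, emission of
  /// this gather must be delayed; a placeholder load is returned and patched
  /// up once all dependencies are materialized.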
  std::optional<Value *>
  needToDelay(const TreeEntry *E,
              ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
    // No need to delay emission if all deps are ready.
    if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
          return all_of(
              TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
        }))
      return std::nullopt;
    // Postpone gather emission, will be emitted after the end of the
    // process to keep correct order.
    auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
    return Builder.CreateAlignedLoad(
        ResVecTy, /* ... */, MaybeAlign());
  }
  /// Reset the builder to handle a new set of vectors.
  void resetForSameNode() {
    IsFinalized = false;
    CommonMask.clear();
    // ...
  }
  /// Adds two input vectors (in form of tree entries) and the mask for their
  /// shuffling.
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    Value *V1 = getVectorizedValue(E1);
    Value *V2 = getVectorizedValue(E2);
    add(V1, V2, Mask);
  }
  /// Adds a single input vector (in form of tree entry) and the mask for its
  /// shuffling.
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    Value *V1 = getVectorizedValue(E1);
    add(V1, Mask);
  }
  /// Adds two input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
    assert(isa<FixedVectorType>(V1->getType()) &&
           isa<FixedVectorType>(V2->getType()) &&
           "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    V2 = castToScalarTyElem(V2);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      InVectors.push_back(V2);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (/* ... */) {
      Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    V1 = createShuffle(V1, V2, Mask);
    unsigned VF = std::max(getVF(V1), getVF(Vec));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx + VF;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(V1);
  }
  /// Adds another input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, /* ... */) {
    assert(isa<FixedVectorType>(V1->getType()) &&
           "castToScalarTyElem expects V1 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    const auto *It = find(InVectors, V1);
    if (It == InVectors.end()) {
      if (InVectors.size() == 2 ||
          InVectors.front()->getType() != V1->getType()) {
        Value *V = InVectors.front();
        if (InVectors.size() == 2) {
          V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
                   CommonMask.size()) {
          V = createShuffle(InVectors.front(), nullptr, CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        }
        unsigned VF = std::max(CommonMask.size(), Mask.size());
        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
          if (/* ... */)
            CommonMask[Idx] = V->getType() != V1->getType()
                                  ? /* ... */
                                  : Mask[Idx] + getVF(V1);
        if (V->getType() != V1->getType())
          V1 = createShuffle(V1, nullptr, Mask);
        InVectors.front() = V;
        if (InVectors.size() == 2)
          InVectors.back() = V1;
        else
          InVectors.push_back(V1);
        return;
      }
      // Check if the second vector is required, if the used elements are
      // already used from the first one.
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (/* ... */) {
          InVectors.push_back(V1);
          break;
        }
    }
    unsigned VF = 0;
    for (Value *V : InVectors)
      VF = std::max(VF, getVF(V));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (/* ... */)
        CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    return R.gather(VL, Root, ScalarTy,
                    [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
                      return createShuffle(V1, V2, Mask);
                    });
  }
  // ...
  /// Finalize emission of the shuffles.
  Value *
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           /* ... */) {
    IsFinalized = true;
    if (Action) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        // ...
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      // ...
      SmallVector<int> ResizeMask(VF, PoisonMaskElem);
      std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
      Vec = createShuffle(Vec, nullptr, ResizeMask);
      // ...
      Action(Vec, CommonMask, [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
        return createShuffle(V1, V2, Mask);
      });
      InVectors.front() = Vec;
    }
    if (!SubVectors.empty()) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        // ...
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      auto CreateSubVectors = [&](Value *Vec,
                                  SmallVectorImpl<int> &CommonMask) {
        for (auto [E, Idx] : SubVectors) {
          Value *V = getVectorizedValue(*E);
          // ...
          Type *OrigScalarTy = ScalarTy;
          // ...
          Vec = createInsertVector(
              Builder, Vec, V, InsertionIndex,
              std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1,
                        _2, _3));
          ScalarTy = OrigScalarTy;
          if (!CommonMask.empty()) {
            std::iota(std::next(CommonMask.begin(), Idx),
                      std::next(CommonMask.begin(),
                                Idx + E->getVectorFactor()),
                      Idx);
          }
        }
        return Vec;
      };
      if (SubVectorsMask.empty()) {
        Vec = CreateSubVectors(Vec, CommonMask);
      } else {
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I2 != PoisonMaskElem)
            I1 = I2 + CommonMask.size();
        }
        // ...
        Vec = createShuffle(InsertVec, Vec, SVMask);
        transformMaskAfterShuffle(CommonMask, SVMask);
      }
      InVectors.front() = Vec;
    }
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }

  ~ShuffleInstructionBuilder() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
  // ...
}
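
/// Emits the code for a gather (build-vector) node. The same templated logic
/// drives both cost estimation and IR emission: \p BVTy is the
/// shuffle-builder type and \p ResTy the result it produces.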
template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {
  assert(E->isGather() && "Expected gather node.");
  unsigned VF = E->getVectorFactor();
  bool NeedFreeze = false;
  // ...
  // Clear values, to be replaced by insertvector instructions.
  for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
    for_each(MutableArrayRef(GatheredScalars)
                 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
             /* ... */);
  SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
      E->CombinedEntriesWithIndices.size());
  transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
            [&](const auto &P) {
              return std::make_pair(VectorizableTree[P.first].get(), P.second);
            });
  // Build a mask out of the reorder indices and reorder scalars per this
  // mask.
  SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                               E->ReorderIndices.end());
  if (!ReorderMask.empty())
    reorderScalars(GatheredScalars, ReorderMask);
  // ...
  if (!SubVectors.empty() && !SubVectorsMask.empty()) {
    for (unsigned I : seq<unsigned>(GatheredScalars.size()))
      if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
        SubVectorsMask[I] = PoisonMaskElem;
  } else {
    SubVectorsMask.clear();
  }
  // Checks whether a splat gather node can reuse an already emitted vector
  // of its user.
  auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
                             unsigned I, unsigned SliceSize,
                             bool IsNotPoisonous) {
    if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
          return isa<UndefValue>(V) && !isa<PoisonValue>(V);
        }))
      return false;
    TreeEntry *UserTE = E->UserTreeIndex.UserTE;
    unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
    if (UserTE->getNumOperands() != 2)
      return false;
    if (!IsNotPoisonous) {
      auto *It =
          find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
                  [=](const std::unique_ptr<TreeEntry> &TE) {
                    return TE->UserTreeIndex.UserTE == UserTE &&
                           TE->UserTreeIndex.EdgeIdx != EdgeIdx;
                  });
      if (It == VectorizableTree.end())
        return false;
      // ...
      if (!(*It)->ReorderIndices.empty()) {
        // ...
      }
      if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
            Value *V0 = std::get<0>(P);
            Value *V1 = std::get<1>(P);
            // ...
          }))
        return false;
    }
    // ...
    if ((Mask.size() < InputVF &&
         /* ... */) ||
        (Mask.size() == InputVF &&
         /* ... */)) {
      std::iota(std::next(Mask.begin(), I * SliceSize),
                std::next(Mask.begin(), /* ... */), 0);
    } else {
      // ...
      std::fill(std::next(Mask.begin(), I * SliceSize),
                std::next(Mask.begin(), /* ... */), /* ... */);
    }
    return true;
  };
  BVTy ShuffleBuilder(ScalarTy, Params...);
  ResTy Res = ResTy();
  SmallVector<int> Mask;
  SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
  // ...
  Value *ExtractVecBase = nullptr;
  bool UseVecBaseAsInput = false;
  // ...
  Type *OrigScalarTy = GatheredScalars.front()->getType();
  // ...
  bool Resized = false;
  // Check for gathered extracts.
  ExtractShuffles =
      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  if (!ExtractShuffles.empty()) {
    SmallVector<const TreeEntry *> ExtractEntries;
    for (auto [Idx, I] : enumerate(ExtractMask)) {
      // ...
      ExtractEntries.append(TEs.begin(), TEs.end());
    }
    if (std::optional<ResTy> Delayed =
            ShuffleBuilder.needToDelay(E, ExtractEntries)) {
      // Delay emission of gathers which are not ready yet.
      PostponedGathers.insert(E);
      return *Delayed;
    }
    if (Value *VecBase = ShuffleBuilder.adjustExtracts(
            E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
      ExtractVecBase = VecBase;
      // ...
      if (VF == VecBaseTy->getNumElements() &&
          GatheredScalars.size() != VF) {
        // ...
        GatheredScalars.append(VF - GatheredScalars.size(),
                               PoisonValue::get(OrigScalarTy));
      }
    }
  }
  // Gather extracts after we check for full matched gathers only.
  if (!ExtractShuffles.empty() || !E->hasState() ||
      E->getOpcode() != Instruction::Load ||
      (((E->hasState() && E->getOpcode() == Instruction::Load) ||
        /* ... */) &&
       any_of(E->Scalars, [this](Value *V) {
         return isa<LoadInst>(V) && isVectorized(V);
       })) ||
      (E->hasState() && E->isAltShuffle()) ||
      all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
      /* ... */ ||
      (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
    GatherShuffles =
        isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
  }
  if (!GatherShuffles.empty()) {
    if (std::optional<ResTy> Delayed =
            ShuffleBuilder.needToDelay(E, Entries)) {
      // Delay emission of gathers which are not ready yet.
      PostponedGathers.insert(E);
      return *Delayed;
    }
    if (GatherShuffles.size() == 1 &&
        /* ... */ &&
        Entries.front().front()->isSame(E->Scalars)) {
      // Perfect match in the graph, will reuse the previously vectorized
      // node. Cost is 0.
      LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
                        << /* ... */);
      // Restore the mask for previous partially matched values.
      Mask.resize(E->Scalars.size());
      const TreeEntry *FrontTE = Entries.front().front();
      if (FrontTE->ReorderIndices.empty() &&
          ((FrontTE->ReuseShuffleIndices.empty() &&
            E->Scalars.size() == FrontTE->Scalars.size()) ||
           (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
        std::iota(Mask.begin(), Mask.end(), 0);
      } else {
        // ...
        Mask[I] = FrontTE->findLaneForValue(V);
      }
      // Reset the builder to correctly handle perfect diamond matched nodes.
      ShuffleBuilder.resetForSameNode();
      ShuffleBuilder.add(*FrontTE, Mask);
      Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
      return Res;
    }
    if (!Resized) {
      if (GatheredScalars.size() != VF &&
          any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
            return any_of(TEs, [&](const TreeEntry *TE) {
              return TE->getVectorFactor() == VF;
            });
          }))
        GatheredScalars.append(VF - GatheredScalars.size(),
                               PoisonValue::get(OrigScalarTy));
    }
    // Remove shuffled elements from the list of gathers.
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      if (Mask[I] != PoisonMaskElem)
        GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
    }
  }
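  // TryPackScalars deduplicates the remaining scalars: a splat is mapped to a
  // single broadcast lane through ReuseMask, repeats are mapped onto their
  // first occurrence, and undefs are either redirected to a known
  // non-poisonous scalar or left as poison (requiring a final freeze).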
  auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
                            SmallVectorImpl<int> &ReuseMask,
                            bool IsRootPoison) {
    // For splats we can emit broadcasts instead of gathers, so try to find
    // such sequences.
    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
                   /* ... */;
    SmallVector<int> UndefPos;
    DenseMap<Value *, unsigned> UniquePositions;
    // Gather unique non-const values and all constant values.
    // For repeated values, just shuffle them.
    int NumNonConsts = 0;
    // ...
    if (IsSplat) {
      Scalars.front() = OrigV;
      // ...
    } else {
      const auto Res = UniquePositions.try_emplace(OrigV, I);
      Scalars[Res.first->second] = OrigV;
      ReuseMask[I] = Res.first->second;
    }
    // ...
    if (NumNonConsts == 1) {
      // Restore single insert element.
      // ...
      if (!UndefPos.empty() && UndefPos.front() == 0)
        /* ... */;
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
      // For undef values, try to replace them with a simple broadcast of a
      // scalar that is known to be non-poisonous, or freeze the result.
      auto *It = find_if(Scalars, [this, E](Value *V) {
        return /* ... */ ||
               (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
                  // Check if the value is already used in the same operation
                  // in one of the nodes already.
                  return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
                         is_contained(E->UserTreeIndex.UserTE->Scalars,
                                      U.getUser());
                }));
      });
      if (It != Scalars.end()) {
        // Replace undefs by the non-poisoned scalars and emit broadcast.
        int Pos = std::distance(Scalars.begin(), It);
        for (int I : UndefPos) {
          // Set the undef position to the non-poisoned scalar.
          ReuseMask[I] = Pos;
          // ...
        }
      } else {
        // Replace undefs by poisons; emit broadcast and then freeze.
        for (int I : UndefPos) {
          // ...
        }
        NeedFreeze = true;
      }
    }
  };
  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = true;
    Value *Vec1 = nullptr;
    if (!ExtractShuffles.empty()) {
      // A gather of extractelements can be represented as just a shuffle of
      // one or two vectors the scalars are extracted from.
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
        // ...
      }
      if (UseVecBaseAsInput) {
        Vec1 = ExtractVecBase;
      } else {
        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
          // ...
          Value *VecOp = EI->getVectorOperand();
          if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
              !TEs.empty() && TEs.front()->VectorizedValue)
            VecOp = TEs.front()->VectorizedValue;
          if (!Vec1) {
            Vec1 = VecOp;
          } else if (Vec1 != VecOp) {
            assert((!Vec2 || Vec2 == VecOp) &&
                   "Expected only 1 or 2 vectors shuffle.");
            Vec2 = VecOp;
          }
        }
      }
      if (Vec2) {
        IsUsedInExpr = false;
        // ...
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
      } else if (Vec1) {
        // ...
        IsUsedInExpr &= FindReusedSplat(
            ExtractMask, /* ... */,
            ExtractMask.size(), IsNotPoisonedVec);
        ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
        IsNonPoisoned &= IsNotPoisonedVec;
      } else {
        IsUsedInExpr = false;
        // ...
      }
    }
    if (!GatherShuffles.empty()) {
      unsigned SliceSize =
          getPartNumElems(E->Scalars.size(), NumParts);
      // ...
      for (const auto [I, TEs] : enumerate(Entries)) {
        if (TEs.empty()) {
          assert(/* ... */ &&
                 "No shuffles with empty entries list expected.");
          continue;
        }
        assert((TEs.size() == 1 || TEs.size() == 2) &&
               "Expected shuffle of 1 or 2 entries.");
        // ...
        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
        if (TEs.size() == 1) {
          bool IsNotPoisonedVec =
              TEs.front()->VectorizedValue
                  ? /* ... */
                  : true;
          IsUsedInExpr &=
              FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
                              SliceSize, IsNotPoisonedVec);
          ShuffleBuilder.add(*TEs.front(), VecMask);
          IsNonPoisoned &= IsNotPoisonedVec;
        } else {
          IsUsedInExpr = false;
          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
            IsNonPoisoned &= /* ... */;
        }
      }
    }
    // Try to figure out the best way to combine values: build a shuffle and
    // insert elements, or just build several shuffles.
    // ...
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    // Try to build a constant vector and shuffle with it only if currently
    // we have a single permutation and more than one scalar constant.
    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
    bool IsIdentityShuffle =
        ((UseVecBaseAsInput ||
          all_of(ExtractShuffles,
                 [](const std::optional<TTI::ShuffleKind> &SK) {
                   return /* ... */;
                 })) &&
         none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         /* ... */) ||
        (!GatherShuffles.empty() &&
         all_of(GatherShuffles,
                [](const std::optional<TTI::ShuffleKind> &SK) {
                  return /* ... */;
                }) &&
         none_of(Mask, [&](int I) { return I >= MSz; }) &&
         /* ... */);
    bool EnoughConstsForShuffle =
        IsSingleShuffle &&
        // ...
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 &&
          /* ... */));
    // ...
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
        /* ... */;
    }
    // Generate constants for the final shuffle and build a mask for them.
    if (/* ... */) {
      SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
      TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
      Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
      ShuffleBuilder.add(BV, BVMask);
    }
    if (all_of(NonConstants, [=](Value *V) {
          return isa<PoisonValue>(V) ||
                 (IsSingleShuffle && ((IsIdentityShuffle && IsNonPoisoned) ||
                                      IsUsedInExpr) &&
                  isa<UndefValue>(V));
        }))
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                    SubVectorsMask);
    else
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, SubVectors, SubVectorsMask,
          E->Scalars.size(),
          [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
            bool IsSplat = isSplat(NonConstants);
            SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
            TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
            // Estimate whether a broadcast-plus-blend of the splat value is
            // cheaper than a plain buildvector of the non-constant scalars.
            auto CheckIfSplatIsProfitable = [&]() {
              // ...
              constexpr TTI::TargetCostKind CostKind =
                  TTI::TCK_RecipThroughput;
              Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
              if (isa<ExtractElementInst>(V) || isVectorized(V))
                return false;
              InstructionCost SplatCost = TTI->getVectorInstrCost(
                  Instruction::InsertElement, VecTy, CostKind, /*Index=*/0,
                  PoisonValue::get(VecTy), V);
              SmallVector<int> NewMask(Mask.begin(), Mask.end());
              for (auto [Idx, I] : enumerate(BVMask))
                if (I != PoisonMaskElem)
                  NewMask[Idx] = Mask.size();
              SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
                                            VecTy, NewMask, CostKind);
              InstructionCost BVCost = TTI->getVectorInstrCost(
                  Instruction::InsertElement, VecTy, CostKind,
                  *find_if(Mask, [](int I) { return I != PoisonMaskElem; }),
                  /* ... */);
              // Shuffle required?
              if (count(BVMask, PoisonMaskElem) <
                  static_cast<int>(BVMask.size() - 1)) {
                SmallVector<int> NewMask(Mask.begin(), Mask.end());
                for (auto [Idx, I] : enumerate(BVMask))
                  if (I != PoisonMaskElem)
                    NewMask[Idx] = /* ... */;
                BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                                           VecTy, NewMask, CostKind);
              }
              return SplatCost <= BVCost;
            };
            if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
              // ...
              Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
            } else {
              // ...
              Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
              SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
              transform(BVMask, SplatMask.begin(), [](int I) {
                return I == PoisonMaskElem ? PoisonMaskElem : 0;
              });
              // ...
              BV = CreateShuffle(BV, nullptr, SplatMask);
              // ...
              Mask[Idx] = BVMask.size() + Idx;
              Vec = CreateShuffle(Vec, BV, Mask);
              // ...
            }
          });
  } else if (!allConstant(GatheredScalars)) {
    // Gather unique scalars and all constants.
    SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
    TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  } else {
    // Gather all constants.
    // ...
    for (auto [I, V] : enumerate(GatheredScalars)) {
      if (!isa<PoisonValue>(V))
        Mask[I] = I;
    }
    Value *BV = ShuffleBuilder.gather(GatheredScalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  }

  if (NeedFreeze)
    Res = ShuffleBuilder.createFreeze(Res);
  return Res;
}
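
/// Entry point for gather-node emission: materializes any combined sub-nodes
/// first, then runs processBuildVector with the IR-emitting shuffle builder.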
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
  for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
    (void)vectorizeTree(VectorizableTree[EIdx].get());
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
                                                                Builder,
                                                                *this);
}
// ...
//   for (Value *V : VL)
// ...
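
/// Emits the vector value for the given tree entry, dispatching on its state
/// (gather, split vectorize, or one of the vectorized states) and on the
/// instruction opcode.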
Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
  IRBuilderBase::InsertPointGuard Guard(Builder);

  Value *V = E->Scalars.front();
  Type *ScalarTy = V->getType();
  // ...
  auto It = MinBWs.find(E);
  if (It != MinBWs.end()) {
    // ...
  }
  // ...
  if (E->VectorizedValue)
    return E->VectorizedValue;

  if (E->isGather()) {
    // ...
    if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
      setInsertPointAfterBundle(E);
    Value *Vec = createBuildVector(E, ScalarTy);
    E->VectorizedValue = Vec;
    return Vec;
  }
  if (E->State == TreeEntry::SplitVectorize) {
    assert(E->CombinedEntriesWithIndices.size() == 2 &&
           "Expected exactly 2 combined entries.");
    setInsertPointAfterBundle(E);
    TreeEntry &OpTE1 =
        *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
    assert(OpTE1.isSame(
               ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
           "Expected same first part of scalars.");
    // ...
    TreeEntry &OpTE2 =
        *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
    assert(OpTE2.isSame(
               ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
           "Expected same second part of scalars.");
    // ...
    auto GetOperandSignedness = [&](const TreeEntry *OpE) {
      bool IsSigned = false;
      auto It = MinBWs.find(OpE);
      if (It != MinBWs.end())
        IsSigned = It->second.second;
      else
        IsSigned = any_of(OpE->Scalars, [&](Value *V) {
          if (isa<PoisonValue>(V))
            return false;
          return !isKnownNonNegative(R, SimplifyQuery(*DL));
        });
      return IsSigned;
    };
    // ...
    Op1 = Builder.CreateIntCast(Op1, /* ... */,
                                GetOperandSignedness(&OpTE1));
    // ...
    Op2 = Builder.CreateIntCast(Op2, /* ... */,
                                GetOperandSignedness(&OpTE2));
    // ...
    if (E->ReorderIndices.empty()) {
      // ...
      std::iota(
          Mask.begin(),
          std::next(Mask.begin(),
                    E->CombinedEntriesWithIndices.back().second),
          0);
      // ...
      if (ScalarTyNumElements != 1) {
        // ...
      }
      Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
      Vec = createInsertVector(Builder, Vec, Op2,
                               E->CombinedEntriesWithIndices.back().second *
                                   ScalarTyNumElements);
      E->VectorizedValue = Vec;
      return Vec;
    }
    unsigned CommonVF =
        std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
    if (OpTE1.getVectorFactor() != CommonVF) {
      // ...
      std::iota(Mask.begin(),
                std::next(Mask.begin(), OpTE1.getVectorFactor()), 0);
      Op1 = Builder.CreateShuffleVector(Op1, Mask);
    }
    if (OpTE2.getVectorFactor() != CommonVF) {
      // ...
      std::iota(Mask.begin(),
                std::next(Mask.begin(), OpTE2.getVectorFactor()), 0);
      Op2 = Builder.CreateShuffleVector(Op2, Mask);
    }
    Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
    E->VectorizedValue = Vec;
    return Vec;
  }
  bool IsReverseOrder =
      !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
  auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
    ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
    if (E->getOpcode() == Instruction::Store &&
        E->State == TreeEntry::Vectorize) {
      ArrayRef<int> Mask =
          ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                   E->ReorderIndices.size());
      ShuffleBuilder.add(V, Mask);
    } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
               E->State == TreeEntry::CompressVectorize) {
      ShuffleBuilder.addOrdered(V, {});
    } else {
      ShuffleBuilder.addOrdered(V, E->ReorderIndices);
    }
    SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
        E->CombinedEntriesWithIndices.size());
    transform(
        E->CombinedEntriesWithIndices, SubVectors.begin(),
        [&](const auto &P) {
          return std::make_pair(VectorizableTree[P.first].get(), P.second);
        });
    assert(
        (E->CombinedEntriesWithIndices.empty() ||
         E->ReorderIndices.empty()) &&
        "Expected either combined subnodes or reordering");
    return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
  };

  assert(!E->isGather() && "Unhandled state");
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector
                        : E->getOpcode();
  // ...
  auto GetOperandSignedness = [&](unsigned Idx) {
    const TreeEntry *OpE = getOperandEntry(E, Idx);
    bool IsSigned = false;
    auto It = MinBWs.find(OpE);
    if (It != MinBWs.end())
      IsSigned = It->second.second;
    else
      IsSigned = any_of(OpE->Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return false;
        return !isKnownNonNegative(V, SimplifyQuery(*DL));
      });
    return IsSigned;
  };
  switch (ShuffleOrOp) {
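  // Each case below vectorizes one opcode: it materializes the operands
  // (recursively, via vectorizeOperand), emits the widened instruction, then
  // applies FinalShuffle to honor reorder and reuse masks.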
  case Instruction::PHI: {
    assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
            E != VectorizableTree.front().get() || E->UserTreeIndex) &&
           "PHI reordering is free.");
    auto *PH = cast<PHINode>(VL0);
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstNonPHIIt());
    // ...
    PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
    // ...
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstInsertionPt());
    // ...
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    // ...

    // PHINodes may have multiple entries from the same block. We want to
    // visit every block once.
    SmallPtrSet<BasicBlock *, 4> VisitedBBs;

    for (/* ... */) {
      // ...
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      // ...
      if (!VisitedBBs.insert(IBB).second) {
        // ...
        TreeEntry *OpTE = getOperandEntry(E, I);
        assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
        OpTE->VectorizedValue = VecOp;
        continue;
      }
      // ...
      Value *Vec = vectorizeOperand(E, I);
      if (VecTy != Vec->getType()) {
        assert((/* ... */ ||
                MinBWs.contains(getOperandEntry(E, I))) &&
               "Expected item in MinBWs.");
        Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
      }
      // ...
    }

    assert(/* ... */ && "Invalid number of incoming values");
    assert(E->VectorizedValue && "Expected vectorized value.");
    return E->VectorizedValue;
  }

  case Instruction::ExtractElement: {
    Value *V = E->getSingleOperand(0);
    setInsertPointAfterBundle(E);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ExtractValue: {
    auto *LI = cast<LoadInst>(E->getSingleOperand(0));
    Builder.SetInsertPoint(LI);
    Value *Ptr = LI->getPointerOperand();
    LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
    // ...
    NewV = FinalShuffle(NewV, E);
    E->VectorizedValue = NewV;
    return NewV;
  }
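  // Inserts rebuild a buildvector sequence: the vectorized source is blended
  // into the first insert's base vector, reusing undef lanes of the base
  // where possible.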
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
    if (const TreeEntry *OpE = getOperandEntry(E, 1);
        OpE && !OpE->isGather() && OpE->hasState() &&
        !OpE->hasCopyableElements())
      setInsertPointAfterBundle(OpE);
    else
      setInsertPointAfterBundle(E);
    Value *V = vectorizeOperand(E, 1);
    // ...
    Type *ScalarTy = Op.front()->getType();
    if (/* ... */) {
      std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
      assert(Res.first > 0 && "Expected item in MinBWs.");
      V = Builder.CreateIntCast(V, /* ... */, Res.second);
    }

    // Find the first insert of the buildvector sequence.
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars,
                           cast<Instruction>(V)->getOperand(0));
    }));
    const unsigned NumElts =
        cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
    const unsigned NumScalars = E->Scalars.size();
    // ...
    assert(Offset < NumElts && "Failed to find vector index offset");

    // Create a shuffle to resize the vector.
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      // ...
    } else {
      std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      // ...
      IsIdentity &= InsertIdx - Offset == I;
      // ...
    }
    if (!IsIdentity || NumElts != NumScalars) {
      Value *V2 = nullptr;
      bool IsVNonPoisonous = /* ... */;
      SmallVector<int> InsertMask(Mask);
      if (NumElts != NumScalars && Offset == 0) {
        // Follow up the insertelement chain of the buildvector.
        // ...
        InsertMask[*InsertIdx] = *InsertIdx;
        if (!Ins->hasOneUse())
          break;
        Ins = dyn_cast_or_null<InsertElementInst>(
            Ins->getUniqueUndroppableUser());
        // ...
        SmallBitVector UseMask =
            buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        SmallBitVector IsFirstUndef =
            isUndefVector(FirstInsert->getOperand(0), UseMask);
        if (!IsFirstPoison.all()) {
          // ...
          for (unsigned I = 0; I < NumElts; I++) {
            if (/* ... */ &&
                IsFirstUndef.test(I)) {
              if (IsVNonPoisonous) {
                InsertMask[I] = I < NumScalars ? I : 0;
                continue;
              }
              // ...
              if (Idx >= NumScalars)
                Idx = NumScalars - 1;
              InsertMask[I] = NumScalars + Idx;
              // ...
            }
          }
        }
        // ...
        V = Builder.CreateShuffleVector(V, V2, InsertMask);
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }

      SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
      for (unsigned I = 0; I < NumElts; I++) {
        // ...
      }
      SmallBitVector UseMask =
          buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
      SmallBitVector IsFirstUndef =
          isUndefVector(FirstInsert->getOperand(0), UseMask);
      if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
          NumElts != NumScalars) {
        if (IsFirstUndef.all()) {
          // ...
          SmallBitVector IsFirstPoison =
              isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
          if (!IsFirstPoison.all()) {
            for (unsigned I = 0; I < NumElts; I++) {
              // ...
              InsertMask[I] = I + NumElts;
            }
          }
          V = Builder.CreateShuffleVector(/* ... */);
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        } else {
          SmallBitVector IsFirstPoison =
              isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
          for (unsigned I = 0; I < NumElts; I++) {
            // ...
            InsertMask[I] += NumElts;
          }
          V = Builder.CreateShuffleVector(
              FirstInsert->getOperand(0), V, InsertMask,
              /* ... */);
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        }
      }
    }
    ++NumVectorInstructions;
    E->VectorizedValue = V;
    return V;
  }
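  // Casts: when minimum-bitwidth analysis shrank either side, the vector
  // opcode may change (a cast between equally narrowed types becomes a
  // bitcast or a trunc, and sitofp becomes uitofp for known-non-negative
  // narrowed sources).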
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    setInsertPointAfterBundle(E);

    Value *InVec = vectorizeOperand(E, 0);
    // ...
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    if (/* ... */
        (SrcIt != MinBWs.end() || It != MinBWs.end() ||
         /* ... */)) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
      if (SrcIt != MinBWs.end())
        SrcBWSz = SrcIt->second.first;
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
                   ? InVec
                   : Builder.CreateCast(VecOpcode, InVec, VecTy);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FCmp:
  case Instruction::ICmp: {
    setInsertPointAfterBundle(E);

    Value *L = vectorizeOperand(E, 0);
    Value *R = vectorizeOperand(E, 1);
    if (L->getType() != R->getType()) {
      assert((/* ... */ ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (cast<VectorType>(L->getType())
              ->getElementType()
              ->getIntegerBitWidth() < cast<VectorType>(R->getType())
                                           ->getElementType()
                                           ->getIntegerBitWidth()) {
        Type *CastTy = R->getType();
        L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
      } else {
        Type *CastTy = L->getType();
        R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
      }
    }
    // ...
    Value *V = Builder.CreateCmp(P0, L, R);
    // ...
    if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
      ICmp->setSameSign(/*B=*/false);
    // ...
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Select: {
    setInsertPointAfterBundle(E);

    Value *Cond = vectorizeOperand(E, 0);
    Value *True = vectorizeOperand(E, 1);
    Value *False = vectorizeOperand(E, 2);
    if (True->getType() != VecTy || False->getType() != VecTy) {
      assert((/* ... */ ||
              MinBWs.contains(getOperandEntry(E, 1)) ||
              MinBWs.contains(getOperandEntry(E, 2))) &&
             "Expected item in MinBWs.");
      if (True->getType() != VecTy)
        True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
      if (False->getType() != VecTy)
        False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
    }

    // ...
    assert(TrueNumElements >= CondNumElements &&
           TrueNumElements % CondNumElements == 0 &&
           "Cannot vectorize Instruction::Select");
    assert(/* ... */ && "Cannot vectorize Instruction::Select");
    if (CondNumElements != TrueNumElements) {
      // When the return type is an i1 vector narrower than the operands,
      // replicate the condition lanes.
      Cond = Builder.CreateShuffleVector(Cond, /* ... */);
    }
    assert(/* ... */ && "Cannot vectorize Instruction::Select");
    Value *V = Builder.CreateSelect(Cond, True, False);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FNeg: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0);

    Value *V = Builder.CreateUnOp(
        static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
    // ...
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Freeze: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0);

    if (Op->getType() != VecTy) {
      assert((/* ... */ ||
              MinBWs.contains(getOperandEntry(E, 0))) &&
             "Expected item in MinBWs.");
      Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
    }
    Value *V = Builder.CreateFreeze(Op);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    setInsertPointAfterBundle(E);

    // ...
    if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
      // An 'and' with a constant whose low bits cover the demanded width is
      // a no-op after demotion; forward the other operand.
      // ...
      if (all_of(Ops, [&](Value *Op) {
            auto *CI = dyn_cast<ConstantInt>(Op);
            return CI && CI->getValue().countr_one() >= It->second.first;
          })) {
        V = FinalShuffle(I == 0 ? RHS : LHS, E);
        E->VectorizedValue = V;
        ++NumVectorInstructions;
        return V;
      }
    }
    if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
      assert((/* ... */ ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (LHS->getType() != VecTy)
        LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
      if (RHS->getType() != VecTy)
        RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
    }

    Value *V = Builder.CreateBinOp(
        static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
    // ...
    if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
        any_of(E->Scalars, [](Value *V) {
          return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
        }))
      I->setHasNoUnsignedWrap(/*b=*/false);

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
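  // Loads come in four flavors: a plain consecutive load (Vectorize), a
  // masked/compressed load (CompressVectorize), a strided load lowered to
  // llvm.experimental.vp.strided.load (StridedVectorize), and a masked
  // gather for arbitrary pointers (ScatterVectorize).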
  case Instruction::Load: {
    // Loads are inserted at the head of the tree because we don't want to
    // sink them all the way down past store instructions.
    setInsertPointAfterBundle(E);

    LoadInst *LI = cast<LoadInst>(VL0);
    Instruction *NewLI;
    FixedVectorType *StridedLoadTy = nullptr;
    Value *PO = LI->getPointerOperand();
    if (E->State == TreeEntry::Vectorize) {
      NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
    } else if (E->State == TreeEntry::CompressVectorize) {
      auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
          CompressEntryToData.at(E);
      Align CommonAlignment = LI->getAlign();
      if (IsMasked) {
        // ...
        for (int I : CompressMask)
          /* ... */;
        // ...
        MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
        // ...
        NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
                                         /* ... */);
      } else {
        NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
      }
      // ...
    } else if (E->State == TreeEntry::StridedVectorize) {
      // ...
      PO = IsReverseOrder ? PtrN : Ptr0;
      Type *StrideTy = DL->getIndexType(PO->getType());
      // ...
      const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
      StridedLoadTy = SPtrInfo.Ty;
      assert(StridedLoadTy && "Missing StridedPoinerInfo for tree entry.");
      unsigned StridedLoadEC =
          /* ... */;
      Value *Stride = SPtrInfo.StrideVal;
      if (!Stride) {
        const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
        assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
        SCEVExpander Expander(*SE, *DL, "strided-load-vec");
        Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
                                        &*Builder.GetInsertPoint());
      }
      Value *NewStride =
          Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
      StrideVal = Builder.CreateMul(
          NewStride,
          ConstantInt::get(StrideTy,
                           (IsReverseOrder ? -1 : 1) *
                               static_cast<int>(
                                   DL->getTypeAllocSize(ScalarTy))));
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_load,
          {StridedLoadTy, PO->getType(), StrideTy},
          {PO, StrideVal, /* ... */,
           Builder.getInt32(StridedLoadEC)});
      Inst->addParamAttr(/* ... */);
      NewLI = Inst;
    } else {
      assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
      Value *VecPtr = vectorizeOperand(E, 0);
      // ...
      if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
        // ...
        unsigned ScalarTyNumElements =
            /* ... */;
        unsigned VecTyNumElements =
            /* ... */;
        assert(VecTyNumElements % ScalarTyNumElements == 0 &&
               "Cannot expand getelementptr.");
        unsigned VF = VecTyNumElements / ScalarTyNumElements;
        // ...
        transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
          return Builder.getInt64(I % ScalarTyNumElements);
        });
        VecPtr = Builder.CreateGEP(
            VecTy->getElementType(),
            Builder.CreateShuffleVector(/* ... */),
            /* ... */);
      }
      // ...
      NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
    }
    Value *V = E->State == TreeEntry::CompressVectorize
                   ? NewLI
                   : /* ... */;
    // ...
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
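  // Stores mirror the load handling: consecutive stores become one aligned
  // vector store; reversed strided stores use
  // llvm.experimental.vp.strided.store with a negative stride.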
  case Instruction::Store: {
    auto *SI = cast<StoreInst>(VL0);

    setInsertPointAfterBundle(E);

    Value *VecValue = vectorizeOperand(E, 0);
    if (VecValue->getType() != VecTy)
      VecValue =
          Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
    VecValue = FinalShuffle(VecValue, E);

    Value *Ptr = SI->getPointerOperand();
    Instruction *ST;
    if (E->State == TreeEntry::Vectorize) {
      ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
    } else {
      assert(E->State == TreeEntry::StridedVectorize &&
             "Expected either strided or consecutive stores.");
      if (!E->ReorderIndices.empty()) {
        SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
        Ptr = SI->getPointerOperand();
      }
      // ...
      Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_store,
          {VecTy, Ptr->getType(), StrideTy},
          {VecValue, Ptr,
           ConstantInt::get(
               StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
           Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      Inst->addParamAttr(/* ... */);
      ST = Inst;
    }
    // ...
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::GetElementPtr: {
    auto *GEP0 = cast<GetElementPtrInst>(VL0);
    setInsertPointAfterBundle(E);

    Value *Op0 = vectorizeOperand(E, 0);
    SmallVector<Value *> OpVecs;
    for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
      Value *OpVec = vectorizeOperand(E, J);
      OpVecs.push_back(OpVec);
    }

    Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
    if (auto *I = dyn_cast<GetElementPtrInst>(V)) {
      SmallVector<Value *> GEPs;
      for (Value *V : E->Scalars) {
        if (isa<GetElementPtrInst>(V))
          GEPs.push_back(V);
      }
      V = ::propagateMetadata(I, GEPs);
    }

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(VL0);
    setInsertPointAfterBundle(E);
    // ...
    SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
        CI, ID, VecTy->getNumElements(),
        It != MinBWs.end() ? It->second.first : 0, TTI);
    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
    bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
                        VecCallCosts.first <= VecCallCosts.second;

    Value *ScalarArg = nullptr;
    SmallVector<Value *> OpVecs;
    // ...
    for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
      // Some intrinsics have scalar arguments. Such arguments should not be
      // vectorized.
      if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
        ScalarArg = CEI->getArgOperand(I);
        // ...
        if (ID == Intrinsic::abs && It != MinBWs.end() &&
            It->second.first < DL->getTypeSizeInBits(CEI->getType()))
          ScalarArg = Builder.getFalse();
        OpVecs.push_back(ScalarArg);
        continue;
      }

      Value *OpVec = vectorizeOperand(E, I);
      ScalarArg = CEI->getArgOperand(I);
      if (/* ... */ &&
          It == MinBWs.end()) {
        // ...
        OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
      } else if (It != MinBWs.end()) {
        OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
      }
      OpVecs.push_back(OpVec);
    }

    Function *CF;
    if (!UseIntrinsic) {
      VFShape Shape = VFShape::get(/* ... */);
      CF = VFDatabase(*CI).getVectorizedFunction(Shape);
    } else {
      // ...
    }
    // ...
    Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
    // ...
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
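  // Alternate-opcode nodes emit both opcodes over the full width and blend
  // the two results with a shuffle built by buildAltOpShuffleMask.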
  case Instruction::ShuffleVector: {
    Value *V;
    if (/* ... */) {
      setInsertPointAfterBundle(E);
      Value *Src = vectorizeOperand(E, 0);
      // ...
      if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
        // Compose the source shuffle's mask with this node's mask.
        SmallVector<int> NewMask(ThisMask.size());
        transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
          return SVSrc->getShuffleMask()[Mask];
        });
        V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
                                        SVSrc->getOperand(1), NewMask);
      } else {
        V = Builder.CreateShuffleVector(Src, ThisMask);
      }
      // ...
      V = FinalShuffle(V, E);
    } else {
      assert(E->isAltShuffle() &&
             /* ... */ &&
             "Invalid Shuffle Vector Operand");
      // ...
      if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0);
        RHS = vectorizeOperand(E, 1);
      } else {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0);
      }
      if (/* ... */) {
        assert((It != MinBWs.end() ||
                getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
                getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
                MinBWs.contains(getOperandEntry(E, 0)) ||
                MinBWs.contains(getOperandEntry(E, 1))) &&
               "Expected item in MinBWs.");
        Type *CastTy = VecTy;
        if (/* ... */
            cast<VectorType>(LHS->getType())
                    ->getElementType()
                    ->getIntegerBitWidth() < /* ... */)
          CastTy = /* ... */;
        if (LHS->getType() != CastTy)
          LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
        if (RHS->getType() != CastTy)
          RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
      }

      Value *V0, *V1;
      if (Instruction::isBinaryOp(E->getOpcode())) {
        V0 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
        V1 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
        // ...
        V1 = Builder.CreateCmp(AltPred, LHS, RHS);
      } else {
        if (LHS->getType()->isIntOrIntVectorTy() && /* ... */) {
          unsigned SrcBWSz = DL->getTypeSizeInBits(
              cast<VectorType>(LHS->getType())->getElementType());
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
              LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
            assert(/* ... */ && "Expected same type as operand.");
            // ...
            E->VectorizedValue = LHS;
            ++NumVectorInstructions;
            return LHS;
          }
        }
        V0 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
        V1 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
      }
      // Add V0 and V1 to later analysis to try to find and remove matching
      // instructions, if any.
      for (Value *V : {V0, V1}) {
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }

      // Create a shuffle to take the alternate operations from the vector.
      // Also, gather up main and alt scalar ops to propagate IR flags to
      // each vector operation.
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [E, this](Instruction *I) {
            assert(E->getMatchingMainOpOrAltOp(I) &&
                   "Unexpected main/alternate opcode");
            return /* ... */;
          },
          Mask, &OpScalars, &AltScalars);
      // ...
      auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
        if (auto *I = dyn_cast<Instruction>(Vec);
            I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
            any_of(E->Scalars, [](Value *V) {
              if (isa<PoisonValue>(V))
                return false;
              auto *IV = cast<Instruction>(V);
              return IV->getOpcode() == Instruction::Sub &&
                     isCommutative(IV);
            }))
          I->setHasNoUnsignedWrap(/*b=*/false);
      };
      DropNuwFlag(V0, E->getOpcode());
      DropNuwFlag(V1, E->getAltOpcode());
      // ...
      V = Builder.CreateShuffleVector(V0, V1, Mask);
      if (auto *I = dyn_cast<Instruction>(V)) {
        // ...
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
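
/// Top-level emission driver: schedules all blocks, emits every tree entry,
/// patches postponed gathers, extracts externally used lanes, and finally
/// erases the now-dead scalar instructions.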
Value *BoUpSLP::vectorizeTree(
    /* ... */,
    ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
  // ...
  EntryToLastInstruction.clear();
  // ...
  for (auto &BSIter : BlocksSchedules)
    scheduleBlock(*this, BSIter.second.get());
  // Cache last instructions for the nodes to avoid side effects, which may
  // appear during vectorization, like extra uses, etc.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->isGather())
      continue;
    (void)getLastInstructionInBundle(TE.get());
  }

  if (ReductionRoot)
    Builder.SetInsertPoint(ReductionRoot->getParent(), /* ... */);
  else
    Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());

  // ...
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
        TE->UserTreeIndex.UserTE->hasState() &&
        TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
        (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
         TE->UserTreeIndex.UserTE->isAltShuffle()) &&
        !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
        all_of(TE->UserTreeIndex.UserTE->Scalars,
               [](Value *V) { return isUsedOutsideBlock(V); })) {
      // ...
      getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
      // ...
    }
  }
  for (auto &Entry : GatherEntries) {
    // ...
    Builder.SetInsertPoint(Entry.second);
    Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
    // ...
  }
  // Emit gathered loads first to emit better code for the users of those
  // gathered loads.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (GatheredLoadsEntriesFirst.has_value() &&
        TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
        (!TE->isGather() || TE->UserTreeIndex)) {
      assert((TE->UserTreeIndex ||
              (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
             "Expected gathered load node.");
      // ...
    }
  }
  // Vectorize gather operands of the postponed nodes, now that the rest of
  // the tree is emitted.
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    // ...
    TE->VectorizedValue = nullptr;
    // ...
    if (UI->comesBefore(InsertPt))
      /* ... */;
    Builder.SetInsertPoint(InsertPt);
    // ...
    Builder.SetInsertPoint(PrevVec);
    // ...
    Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
    Value *Vec = vectorizeTree(TE);
    if (auto *VecI = dyn_cast<Instruction>(Vec);
        VecI && VecI->getParent() == Builder.GetInsertBlock() &&
        Builder.GetInsertPoint()->comesBefore(VecI))
      VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
                                 Builder.GetInsertPoint());
    if (Vec->getType() != PrevVec->getType()) {
      assert(Vec->getType()->isIntOrIntVectorTy() &&
             PrevVec->getType()->isIntOrIntVectorTy() &&
             "Expected integer vector types only.");
      std::optional<bool> IsSigned;
      for (Value *V : TE->Scalars) {
        // Try the vectorized nodes first.
        for (const TreeEntry *MNTE : getTreeEntries(V)) {
          auto It = MinBWs.find(MNTE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            // ...
          }
        }
        if (IsSigned.value_or(false))
          break;
        // Scan through the gather nodes.
        for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
          auto It = MinBWs.find(BVE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            // ...
          }
        }
        if (IsSigned.value_or(false))
          break;
        // ...
        IsSigned =
            IsSigned.value_or(false) ||
            /* ... */;
        if (IsSigned.value_or(false))
          break;
      }
      if (IsSigned.value_or(false)) {
        // Final attempt - check the user node.
        auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
        if (It != MinBWs.end())
          IsSigned = It->second.second;
      }
      assert(/* ... */ &&
             "Expected user node or perfect diamond match in MinBWs.");
      Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
    }
    PrevVec->replaceAllUsesWith(Vec);
    PostponedValues.try_emplace(Vec).first->second.push_back(TE);
    // Replace the stub vector node, if it was used before.
    auto It = PostponedValues.find(PrevVec);
    if (It != PostponedValues.end()) {
      for (TreeEntry *VTE : It->getSecond())
        VTE->VectorizedValue = Vec;
    }
    // ...
  }
  // Extract all of the elements with external uses.
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    // ...
    const TreeEntry *E = &ExternalUse.E;
    assert(E && "Invalid scalar");
    assert(!E->isGather() && "Extracting from a gather list");
    // Non-instruction pointers are not deleted, just skip them.
    if (E->getOpcode() == Instruction::GetElementPtr &&
        !isa<GetElementPtrInst>(Scalar))
      continue;

    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");

    Value *Lane = Builder.getInt32(ExternalUse.Lane);
    auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
      if (Scalar->getType() != Vec->getType()) {
        Value *Ex = nullptr;
        Value *ExV = nullptr;
        // ...
        bool ReplaceInst =
            Inst && ExternalUsesAsOriginalScalar.contains(Inst);
        auto It = ScalarToEEs.find(Scalar);
        if (It != ScalarToEEs.end()) {
          // No need to emit many extracts, just move the single extract in
          // the current block.
          auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
                                                  : Builder.GetInsertBlock());
          if (EEIt != It->second.end()) {
            Value *PrevV = EEIt->second.first;
            if (auto *I = dyn_cast<Instruction>(PrevV);
                I && !ReplaceInst &&
                Builder.GetInsertPoint() !=
                    Builder.GetInsertBlock()->end() &&
                Builder.GetInsertPoint()->comesBefore(I)) {
              I->moveBefore(*Builder.GetInsertPoint()->getParent(),
                            Builder.GetInsertPoint());
              // ...
            }
            // ...
            ExV = EEIt->second.second ? EEIt->second.second : Ex;
          }
        }
        if (!Ex) {
          // ...
          if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
            IgnoredExtracts.insert(EE);
            // ...
          } else {
            auto *CloneInst = Inst->clone();
            CloneInst->insertBefore(Inst->getIterator());
            if (Inst->hasName())
              CloneInst->takeName(Inst);
            // ...
          }
          // ...
          Value *V = ES->getVectorOperand();
          // ...
          V = ETEs.front()->VectorizedValue;
          if (auto *IV = dyn_cast<Instruction>(V);
              !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
              IV->comesBefore(IVec))
            Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
          else
            Ex = Builder.CreateExtractElement(Vec, Lane);
          // ...
          // For a scalar that is itself a small vector, extract the whole
          // subvector.
          unsigned VecTyNumElements = VecTy->getNumElements();
          // ...
          Ex = createExtractVector(/* ... */,
                                   ExternalUse.Lane * VecTyNumElements);
          // ...
          Ex = Builder.CreateExtractElement(Vec, Lane);
          // ...
          // If necessary, sign-extend or zero-extend the extract to the
          // original scalar type.
          if (Scalar->getType() != Ex->getType())
            ExV = Builder.CreateIntCast(/* ... */);
          // ...
          ScalarToEEs[Scalar].try_emplace(/* ... */,
                                          std::make_pair(Ex, ExV));
        }
        // ...
        GatherShuffleExtractSeq.insert(ExI);
        CSEBlocks.insert(ExI->getParent());
        return ExV;
      }
      assert(/* ... */ &&
             "In-tree scalar of vector type is not insertelement?");
      // ...
    };
    // If User == nullptr, the Scalar remains a regular scalar in vectorized
    // instructions or is used as an extra argument.
    if (!User) {
      if (!ScalarsWithNullptrUser.insert(Scalar).second)
        continue;
      assert(
          (ExternallyUsedValues.count(Scalar) ||
           ExternalUsesWithNonUsers.count(Scalar) ||
           ExternalUsesAsOriginalScalar.contains(Scalar) ||
           any_of(Scalar->users(),
                  [&](llvm::User *U) {
                    if (ExternalUsesAsOriginalScalar.contains(U))
                      return true;
                    ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
                    return !UseEntries.empty() &&
                           (E->State == TreeEntry::Vectorize ||
                            E->State == TreeEntry::StridedVectorize ||
                            E->State == TreeEntry::CompressVectorize) &&
                           any_of(UseEntries,
                                  [&, TTI = TTI](TreeEntry *UseEntry) {
                                    return (UseEntry->State ==
                                                TreeEntry::Vectorize ||
                                            UseEntry->State ==
                                                TreeEntry::StridedVectorize ||
                                            UseEntry->State ==
                                                TreeEntry::CompressVectorize) &&
                                           doesInTreeUserNeedToExtract(
                                               Scalar,
                                               getRootEntryInstruction(
                                                   *UseEntry),
                                               /* ... */);
                                  });
                  })) &&
          "Scalar with nullptr User must be registered in "
          "ExternallyUsedValues map or remain as scalar in vectorized "
          "instructions");
      if (auto *VecI = dyn_cast<Instruction>(Vec)) {
        if (auto *PHI = dyn_cast<PHINode>(VecI)) {
          if (PHI->getParent()->isLandingPad())
            Builder.SetInsertPoint(
                PHI->getParent(),
                std::next(
                    PHI->getParent()->getLandingPadInst()->getIterator()));
          else
            Builder.SetInsertPoint(PHI->getParent(),
                                   PHI->getParent()->getFirstNonPHIIt());
        } else {
          Builder.SetInsertPoint(VecI->getParent(),
                                 std::next(VecI->getIterator()));
        }
      } else {
        Builder.SetInsertPoint(&F->getEntryBlock(),
                               F->getEntryBlock().begin());
      }
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      // Required to update internally referenced instructions.
      if (Scalar != NewInst) {
        assert(/* ... */ && "Extractelements should not be replaced.");
        Scalar->replaceAllUsesWith(NewInst);
      }
      continue;
    }

    if (auto *VU = dyn_cast<InsertElementInst>(User); VU /* ... */) {
      // ...
      if (!UsedInserts.insert(VU).second)
        continue;
      // Need to use the original vector, if the root is truncated.
      auto BWIt = MinBWs.find(E);
      if (BWIt != MinBWs.end() /* ... */) {
        auto *ScalarTy = FTy->getElementType();
        auto Key = std::make_pair(Vec, ScalarTy);
        auto VecIt = VectorCasts.find(Key);
        if (VecIt == VectorCasts.end()) {
          // ...
          if (auto *IVec = dyn_cast<PHINode>(Vec)) {
            if (IVec->getParent()->isLandingPad())
              Builder.SetInsertPoint(IVec->getParent(),
                                     std::next(IVec->getParent()
                                                   ->getLandingPadInst()
                                                   ->getIterator()));
            else
              Builder.SetInsertPoint(
                  IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
          } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
            Builder.SetInsertPoint(IVec->getNextNode());
          }
          Vec = Builder.CreateIntCast(Vec, /* ... */,
                                      BWIt->second.second);
          // ...
        } else {
          Vec = VecIt->second;
        }
      }

      // ...
      auto *It = find_if(
          ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
            // Checks if two insertelements are from the same buildvector.
            // ...
          });
      unsigned Idx = *InsertIdx;
      if (It == ShuffledInserts.end()) {
        // ...
        It = std::next(ShuffledInserts.begin(), ShuffledInserts.size() - 1);
        // ...
      }
      // ...
      Mask[Idx] = ExternalUse.Lane;
      // ...
      continue;
    }

    // Generate extracts for out-of-tree users.
    // Find the insertion point for the extractelement lane.
    if (auto *VecI = dyn_cast<Instruction>(Vec)) {
      if (auto *PH = dyn_cast<PHINode>(User)) {
        // Find which incoming values we are extracting.
        for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
          if (PH->getIncomingValue(I) == Scalar) {
            Instruction *IncomingTerminator =
                PH->getIncomingBlock(I)->getTerminator();
            if (isa<CatchSwitchInst>(IncomingTerminator)) {
              Builder.SetInsertPoint(VecI->getParent(),
                                     std::next(VecI->getIterator()));
            } else {
              Builder.SetInsertPoint(
                  PH->getIncomingBlock(I)->getTerminator());
            }
            Value *NewInst = ExtractAndExtendIfNeeded(Vec);
            PH->setOperand(I, NewInst);
          }
        }
      } else {
        Builder.SetInsertPoint(cast<Instruction>(User));
        Value *NewInst = ExtractAndExtendIfNeeded(Vec);
        User->replaceUsesOfWith(Scalar, NewInst);
      }
    } else {
      Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      // ...
    }
  auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
    SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
    SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
    int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
    for (int I = 0, E = Mask.size(); I < E; ++I) {
      if (Mask[I] < VF)
        CombinedMask1[I] = Mask[I];
      else
        CombinedMask2[I] = Mask[I] - VF;
    }
    ShuffleInstructionBuilder ShuffleBuilder(
        cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
    ShuffleBuilder.add(V1, CombinedMask1);
    if (V2)
      ShuffleBuilder.add(V2, CombinedMask2);
    return ShuffleBuilder.finalize({}, {}, {});
  };

  auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
                                       bool ForSingleMask) {
    unsigned VF = Mask.size();
    // ...
    if (any_of(Mask,
               [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
      Vec = CreateShuffle(Vec, nullptr, Mask);
      return std::make_pair(Vec, true);
    }
    if (!ForSingleMask) {
      SmallVector<int> ResizeMask(VF, PoisonMaskElem);
      for (unsigned I = 0; I < VF; ++I) {
        // ...
      }
      Vec = CreateShuffle(Vec, nullptr, ResizeMask);
    }
    // ...
    return std::make_pair(Vec, false);
  };

  // Perform shuffling of the vectorized tree entries for better handling of
  // external extracts.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    // Find the first and the last instructions in the list of inserts.
    InsertElementInst *FirstInsert =
        ShuffledInserts[I].InsertElements.front();
    InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
    Builder.SetInsertPoint(LastInsert);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    // ...
    //   return cast<VectorType>(Vec->getType())
    //       ->getElementCount()
    //       .getKnownMinValue();
    // ...
    [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
                                  ArrayRef<Value *> Vals) {
      assert((Vals.size() == 1 || Vals.size() == 2) &&
             "Expected exactly 1 or 2 input values.");
      if (Vals.size() == 1) {
        // Do not create a shuffle if the mask is a simple identity
        // non-resizing mask.
        if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
                               ->getNumElements() ||
            !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
          return CreateShuffle(Vals.front(), nullptr, Mask);
        return Vals.front();
      }
      return CreateShuffle(Vals.front() ? Vals.front()
                                        : FirstInsert->getOperand(0),
                           Vals.back(), Mask);
    });
    // ...
    auto It = ShuffledInserts[I].InsertElements.rbegin();
    // Rebuild the buildvector chain.
    InsertElementInst *II = nullptr;
    if (It != ShuffledInserts[I].InsertElements.rend())
      II = *It;
    // ...
    while (It != ShuffledInserts[I].InsertElements.rend()) {
      assert(II && "Must be an insertelement instruction.");
      // ...
    }
    for (Instruction *II : reverse(Inserts)) {
      II->replaceUsesOfWith(II->getOperand(0), NewInst);
      // ...
      if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
        II->moveAfter(NewI);
      // ...
    }
    for (InsertElementInst *IE :
         reverse(ShuffledInserts[I].InsertElements)) {
      IE->replaceUsesOfWith(IE->getOperand(0), /* ... */);
      IE->replaceUsesOfWith(IE->getOperand(1), /* ... */);
      // ...
    }
    CSEBlocks.insert(LastInsert->getParent());
  }
  // For each vectorized value:
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values and split vectorize nodes.
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
      continue;

    assert(Entry->VectorizedValue && "Can't find vectorizable value");

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      // ...
      if (Entry->getOpcode() == Instruction::GetElementPtr &&
          !isa<GetElementPtrInst>(Scalar))
        continue;
      if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
          EE && IgnoredExtracts.contains(EE))
        continue;
      // ...
      for (User *U : Scalar->users()) {
        // It is legal to delete users in the ignorelist.
        assert((isVectorized(U) ||
                (UserIgnoreList && UserIgnoreList->contains(U)) ||
                /* ... */) &&
               "Deleting out-of-tree value");
      }
      // ...
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      // ...
    }
  }
  // ...
  V->mergeDIAssignID(RemovedInsts);

  // Clear up reduction references, if any.
  if (UserIgnoreList) {
    for (Instruction *I : RemovedInsts) {
      const TreeEntry *IE = getTreeEntries(I).front();
      if (IE->Idx != 0 &&
          !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
            (ValueToGatherNodes.lookup(I).contains(
                 VectorizableTree.front().get()) ||
             (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
              IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
          !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
            IE->UserTreeIndex &&
            /* ... */) &&
          !(GatheredLoadsEntriesFirst.has_value() &&
            IE->Idx >= *GatheredLoadsEntriesFirst &&
            VectorizableTree.front()->isGather() &&
            /* ... */) &&
          !(!VectorizableTree.front()->isGather() &&
            VectorizableTree.front()->isCopyableElement(I)))
        continue;
      SmallVector<SelectInst *> LogicalOpSelects;
      I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
        // Do not replace the condition of a poison-propagating logical op.
        bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
                                    (match(U.getUser(), m_LogicalAnd()) ||
                                     match(U.getUser(), m_LogicalOr())) &&
                                    U.getOperandNo() == 0;
        if (IsPoisoningLogicalOp) {
          LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
          return false;
        }
        return UserIgnoreList->contains(U.getUser());
      });
      // Replace conditions of the poisoning logical ops with a non-poison
      // constant value.
      for (SelectInst *SI : LogicalOpSelects)
        SI->setCondition(
            Constant::getNullValue(SI->getCondition()->getType()));
    }
  }
  // ...
  Builder.ClearInsertionPoint();
  InstrElementSize.clear();
  const TreeEntry &RootTE = *VectorizableTree.front();
  Value *Vec = RootTE.VectorizedValue;
  if (auto It = MinBWs.find(&RootTE);
      ReductionBitWidth != 0 && It != MinBWs.end() &&
      ReductionBitWidth != It->second.first) {
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
    Vec = Builder.CreateIntCast(
        Vec, /* ... */,
        It->second.second);
  }
  return Vec;
}

void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequences instructions.\n");
20634 Loop *L = LI->getLoopFor(
I->getParent());
20639 BasicBlock *PreHeader = L->getLoopPreheader();
20647 auto *OpI = dyn_cast<Instruction>(V);
20648 return OpI && L->contains(OpI);
20654 CSEBlocks.insert(PreHeader);
20659 CSEWorkList.
reserve(CSEBlocks.size());
20662 assert(DT->isReachableFromEntry(
N));
20669 assert((
A ==
B) == (
A->getDFSNumIn() ==
B->getDFSNumIn()) &&
20670 "Different nodes should have different DFS numbers");
20671 return A->getDFSNumIn() <
B->getDFSNumIn();
20679 auto &&IsIdenticalOrLessDefined = [TTI = TTI](
Instruction *I1,
20682 if (I1->getType() != I2->getType())
20687 return I1->isIdenticalTo(I2);
20688 if (SI1->isIdenticalTo(SI2))
20690 for (
int I = 0, E = SI1->getNumOperands();
I < E; ++
I)
20691 if (SI1->getOperand(
I) != SI2->getOperand(
I))
20694 NewMask.
assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
20698 unsigned LastUndefsCnt = 0;
20699 for (
int I = 0, E = NewMask.
size();
I < E; ++
I) {
20705 NewMask[
I] != SM1[
I])
20708 NewMask[
I] = SM1[
I];
20712 return SM1.
size() - LastUndefsCnt > 1 &&
20716 SM1.
size() - LastUndefsCnt));
20722 for (
auto I = CSEWorkList.
begin(), E = CSEWorkList.
end();
I != E; ++
I) {
20724 (
I == CSEWorkList.
begin() || !DT->dominates(*
I, *std::prev(
I))) &&
20725 "Worklist not sorted properly!");
20732 !GatherShuffleExtractSeq.contains(&In))
20737 bool Replaced =
false;
20740 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
20741 DT->dominates(V->getParent(), In.getParent())) {
20742 In.replaceAllUsesWith(V);
20745 if (!NewMask.
empty())
20746 SI->setShuffleMask(NewMask);
20751 GatherShuffleExtractSeq.contains(V) &&
20752 IsIdenticalOrLessDefined(V, &In, NewMask) &&
20753 DT->dominates(In.getParent(), V->getParent())) {
20755 V->replaceAllUsesWith(&In);
20758 if (!NewMask.
empty())
20759 SI->setShuffleMask(NewMask);
20767 Visited.push_back(&In);
20772 GatherShuffleExtractSeq.clear();
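// Illustrative sketch (not part of the pass): the CSE above visits blocks in
// dominator-tree DFS order so that an identical instruction seen earlier can
// absorb a later duplicate's uses. A minimal model with integers standing in
// for instructions and an integer key standing in for isIdenticalTo
// equivalence; the names below are hypothetical. Unlike the real code, which
// still performs explicit DT->dominates checks for sibling blocks, this
// sketch assumes a straight-line dominance chain. Assumes <algorithm>, <map>,
// and <vector>.
namespace slp_sketch {
struct MockInst {
  unsigned DFSIn;     // DFS-in number of the defining block.
  int Key;            // Equivalence class (stands in for isIdenticalTo).
  bool Erased = false;
};
// Returns how many redundant instructions fold into an earlier copy.
inline unsigned cseByDominanceOrder(std::vector<MockInst> &Insts) {
  std::sort(Insts.begin(), Insts.end(),
            [](const MockInst &A, const MockInst &B) {
              return A.DFSIn < B.DFSIn; // dominating blocks come first
            });
  std::map<int, unsigned> FirstSeen; // Key -> index of the surviving copy.
  unsigned NumErased = 0;
  for (unsigned I = 0, E = Insts.size(); I != E; ++I) {
    auto [It, Inserted] = FirstSeen.try_emplace(Insts[I].Key, I);
    if (!Inserted) { // duplicate: would RAUW with Insts[It->second] and erase
      Insts[I].Erased = true;
      ++NumErased;
    }
  }
  return NumErased;
}
} // namespace slp_sketch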
BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
    ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
  auto &BundlePtr =
      ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    auto *I = cast<Instruction>(V);
    if (S.isCopyableElement(V)) {
      // A copyable element gets its own schedule data instead of the shared
      // per-instruction ScheduleData.
      ScheduleCopyableData &SD =
          addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
      BundlePtr->add(&SD);
      continue;
    }
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member (maybe not in same basic block)");
    BundlePtr->add(BundleMember);
    ScheduledBundles.try_emplace(I).first->getSecond().push_back(
        BundlePtr.get());
  }
  assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
  return *BundlePtr;
}
// Groups the instructions to a bundle (which is then a single scheduling
// entity) and schedules instructions until the bundle gets ready.
std::optional<BoUpSLP::ScheduleBundle *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S,
                                            const EdgeInfo &EI) {
  // No need to schedule PHIs, insertelement/extractelement-like instructions
  // or fully non-schedulable bundles.
  bool HasCopyables = S.areInstructionsWithCopyableElements();
  if (/* ... */ ||
      (HasCopyables &&
       all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) {
    // If copyable data replaced the dependencies of an operand, its direct
    // dependencies must be recalculated.
    SmallVector<ScheduleData *> ControlDependentMembers;
    for (Value *V : VL) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I || (HasCopyables && S.isCopyableElement(V)))
        continue;
      SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
      for (const Use &U : I->operands()) {
        unsigned &NumOps =
            UserOpToNumOps.try_emplace(std::make_pair(I, U.get()), 0)
                .first->getSecond();
        ++NumOps;
        if (auto *Op = dyn_cast<Instruction>(U.get());
            Op && areAllOperandsReplacedByCopyableData(I, Op, *SLP, NumOps)) {
          if (ScheduleData *OpSD = getScheduleData(Op);
              OpSD && OpSD->hasValidDependencies()) {
            OpSD->clearDirectDependencies();
            if (RegionHasStackSave ||
                !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
              ControlDependentMembers.push_back(OpSD);
          }
        }
      }
    }
    if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, /*InsertInReadyList=*/true, SLP,
                            ControlDependentMembers);
    }
    return nullptr;
  }

  // ...
  LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");

  auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
    // The scheduling region got new instructions at the lower end (or it is a
    // new region for the first bundle). This makes it necessary to
    // recalculate all dependencies.
    SmallVector<ScheduleData *> ControlDependentMembers;
    auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
      SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
      for (ScheduleEntity *SE : Bundle.getBundle()) {
        // ...
        if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
            BundleMember && BundleMember->hasValidDependencies()) {
          BundleMember->clearDirectDependencies();
          if (RegionHasStackSave ||
              !isGuaranteedToTransferExecutionToSuccessor(
                  BundleMember->getInst()))
            ControlDependentMembers.push_back(BundleMember);
        }
        // ...
        if (SD->hasValidDependencies() &&
            (!S.areInstructionsWithCopyableElements() ||
             !S.isCopyableElement(SD->getInst())) &&
            !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
            EI.UserTE->hasState() &&
            (!EI.UserTE->hasCopyableElements() ||
             !EI.UserTE->isCopyableElement(SD->getInst())))
          SD->clearDirectDependencies();
        for (const Use &U : SD->getInst()->operands()) {
          unsigned &NumOps =
              UserOpToNumOps
                  .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
                  .first->getSecond();
          ++NumOps;
          if (auto *Op = dyn_cast<Instruction>(U.get());
              Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
                                                         *SLP, NumOps)) {
            if (ScheduleData *OpSD = getScheduleData(Op);
                OpSD && OpSD->hasValidDependencies()) {
              OpSD->clearDirectDependencies();
              if (RegionHasStackSave ||
                  !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
                ControlDependentMembers.push_back(OpSD);
            }
          }
        }
      }
    };
    if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
      for_each(ScheduleDataMap, [&](auto &P) {
        if (BB != P.first->getParent())
          return;
        ScheduleData *SD = P.second;
        if (isInSchedulingRegion(*SD))
          SD->clearDependencies();
      });
      for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
        for_each(P.second, [&](ScheduleCopyableData *SD) {
          if (isInSchedulingRegion(*SD))
            SD->clearDependencies();
        });
      });
      ReSchedule = true;
    }
    if (Bundle && !Bundle.getBundle().empty()) {
      if (S.areInstructionsWithCopyableElements() ||
          !ScheduleCopyableDataMap.empty())
        CheckIfNeedToClearDeps(Bundle);
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
                        << BB->getName() << "\n");
      calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
                            ControlDependentMembers);
    } else if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
                            ControlDependentMembers);
    }
    if (ReSchedule) {
      resetSchedule();
      initialFillReadyList(ReadyInsts);
    }

    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready" it means that there are
    // no cyclic dependencies and we can schedule it.
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleEntity *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isReady() && "must be ready to schedule");
      schedule(*SLP, S, EI, Picked, ReadyInsts);
      if (Picked == &Bundle)
        break;
    }
  };

  // Make sure that the scheduling region contains all instructions of the
  // bundle.
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    if (!extendSchedulingRegion(V, S)) {
      // If the region got new instructions at the lower end, all dependencies
      // must be recalculated even though the bundle itself failed.
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
      return std::nullopt;
    }
  }

  bool ReSchedule = false;
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    // ... (collection of copyable data for V elided)
    if (!CopyableData.empty()) {
      for (ScheduleCopyableData *SD : CopyableData)
        ReadyInsts.remove(SD);
    }
    ScheduleData *BundleMember = getScheduleData(V);
    assert((BundleMember || S.isCopyableElement(V)) &&
           "no ScheduleData for bundle member (maybe not in same basic block)");
    // ...
    ReadyInsts.remove(BundleMember);
    if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
        !Bundles.empty()) {
      for (ScheduleBundle *B : Bundles)
        ReadyInsts.remove(B);
    }
    if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
      continue;
    // A bundle member was scheduled as single instruction before and now needs
    // to be scheduled as part of the bundle. We just get rid of the existing
    // schedule.
    LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
                      << " was already scheduled\n");
    ReSchedule = true;
  }

  ScheduleBundle &Bundle = buildBundle(VL, S, EI);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle.isReady()) {
    for (ScheduleEntity *BD : Bundle.getBundle()) {
      // ...
      if (BD->isReady()) {
        ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
        if (Bundles.empty()) {
          ReadyInsts.insert(BD);
          continue;
        }
        for (ScheduleBundle *B : Bundles)
          if (B->isReady())
            ReadyInsts.insert(B);
      }
    }
    ScheduledBundlesList.pop_back();
    SmallVector<ScheduleData *> ControlDependentMembers;
    SmallPtrSet<Instruction *, 4> Visited;
    for (Value *V : VL) {
      if (S.isNonSchedulable(V))
        continue;
      auto *I = cast<Instruction>(V);
      if (S.isCopyableElement(I)) {
        // Drop the ScheduleCopyableData that was created for this bundle.
        auto KV = std::make_pair(EI, I);
        assert(ScheduleCopyableDataMap.contains(KV) &&
               "no ScheduleCopyableData for copyable element");
        ScheduleCopyableData *SD =
            ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
        ScheduleCopyableDataMapByUsers[I].remove(SD);
        if (EI.UserTE) {
          ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
          const auto *It = find(Op, I);
          assert(It != Op.end() && "Lane not set");
          SmallPtrSet<Instruction *, 4> Visited;
          do {
            int Lane = std::distance(Op.begin(), It);
            assert(Lane >= 0 && "Lane not set");
            if (/* ... */ !EI.UserTE->ReorderIndices.empty())
              Lane = EI.UserTE->ReorderIndices[Lane];
            assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                   "Couldn't find extract lane");
            // ...
            if (!Visited.insert(In).second) {
              // ...
            } else {
              ScheduleCopyableDataMapByInstUser
                  [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
                  /* ... */;
            }
            It = find(std::next(It), Op.end(), I);
          } while (It != Op.end());
        }
        // ...
        if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
          ScheduleCopyableDataMapByUsers[I].insert(UserCD);
        if (ScheduleCopyableDataMapByUsers[I].empty())
          ScheduleCopyableDataMapByUsers.erase(I);
        ScheduleCopyableDataMap.erase(KV);
        // If the copyable data replaced the operand's dependencies, clear
        // them.
        if (ScheduleData *OpSD = getScheduleData(I);
            OpSD && OpSD->hasValidDependencies()) {
          OpSD->clearDirectDependencies();
          if (RegionHasStackSave ||
              !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
            ControlDependentMembers.push_back(OpSD);
        }
        continue;
      }
      ScheduledBundles.find(I)->getSecond().pop_back();
    }
    if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
                            ControlDependentMembers);
    }
    return std::nullopt;
  }
  return &Bundle;
}
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new ScheduleData for the instruction.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
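// Illustrative sketch (not part of the pass): the chunked allocator above
// hands out ScheduleData objects from fixed-size arrays so that pointers stay
// stable while the scheduling region grows; nothing is freed until the whole
// pool is destroyed. A minimal generic version of the same idea (the name
// ChunkPool is hypothetical); assumes <memory> and <vector>.
template <typename T, unsigned ChunkSize = 256> class ChunkPool {
  std::vector<std::unique_ptr<T[]>> Chunks;
  unsigned Pos = ChunkSize; // Forces a fresh chunk on first allocation.

public:
  T *allocate() {
    if (Pos >= ChunkSize) {
      Chunks.push_back(std::make_unique<T[]>(ChunkSize));
      Pos = 0;
    }
    // Chunks are never reallocated, so this address stays valid for the
    // lifetime of the pool.
    return &Chunks.back()[Pos++];
  }
};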
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  Instruction *I = dyn_cast<Instruction>(V);
  assert(I && "bundle member must be an instruction");
  if (getScheduleData(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region. Ignore
  // debug info and other assume-like intrinsics so they don't count against
  // the region size budget.
  BasicBlock::reverse_iterator UpIter =
      ++ScheduleStart->getIterator().getReverse();
  BasicBlock::reverse_iterator UpperEnd = BB->rend();
  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
  BasicBlock::iterator LowerEnd = BB->end();
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
      return false;
    }
    ++UpIter;
    ++DownIter;
    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    ScheduleStart = I;
    LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                   nullptr);
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
  return true;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    // ...
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    }
    assert(!isInSchedulingRegion(*SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);

    if (I->mayReadOrWriteMemory() &&
        (!isa<IntrinsicInst>(I) ||
         /* ... skip sideeffect/pseudoprobe intrinsics ... */
         cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::pseudoprobe)) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->setNextLoadStore(SD);
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }
    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->setNextLoadStore(NextLoadStore);
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
void BoUpSLP::BlockScheduling::calculateDependencies(
    ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
    ArrayRef<ScheduleData *> ControlDeps) {
  SmallVector<ScheduleEntity *> WorkList;
  auto ProcessNode = [&](ScheduleEntity *SE) {
    if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
      if (CD->hasValidDependencies())
        return;
      // ...
      CD->initDependencies();
      CD->resetUnscheduledDeps();
      const EdgeInfo &EI = CD->getEdgeInfo();
      if (EI.UserTE) {
        ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
        const auto *It = find(Op, CD->getInst());
        assert(It != Op.end() && "Lane not set");
        SmallPtrSet<Instruction *, 4> Visited;
        do {
          int Lane = std::distance(Op.begin(), It);
          assert(Lane >= 0 && "Lane not set");
          if (/* ... */ !EI.UserTE->ReorderIndices.empty())
            Lane = EI.UserTE->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                 "Couldn't find extract lane");
          auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
          if (EI.UserTE->isCopyableElement(In)) {
            // The user is itself a copyable element: depend on its copyable
            // data.
            if (ScheduleCopyableData *UseSD =
                    getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
              CD->incDependencies();
              if (!UseSD->isScheduled())
                CD->incrementUnscheduledDeps(1);
              if (!UseSD->hasValidDependencies() ||
                  (InsertInReadyList && UseSD->isReady()))
                WorkList.push_back(UseSD);
            }
          } else if (Visited.insert(In).second) {
            if (ScheduleData *UseSD = getScheduleData(In)) {
              CD->incDependencies();
              if (!UseSD->isScheduled())
                CD->incrementUnscheduledDeps(1);
              if (!UseSD->hasValidDependencies() ||
                  (InsertInReadyList && UseSD->isReady()))
                WorkList.push_back(UseSD);
            }
          }
          It = find(std::next(It), Op.end(), CD->getInst());
        } while (It != Op.end());
      }
      // A copyable element without real dependencies still must not be
      // scheduled before its user if they can end up in the same block.
      if (CD->isReady() && CD->getDependencies() == 0 &&
          (EI.UserTE->hasState() &&
           (EI.UserTE->getMainOp()->getParent() !=
                CD->getInst()->getParent() ||
            /* ... */
            (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
             any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
               auto *IU = dyn_cast<Instruction>(U);
               if (!IU)
                 return true;
               return IU->getParent() == EI.UserTE->getMainOp()->getParent();
             }))))) {
        CD->incDependencies();
        CD->incrementUnscheduledDeps(1);
      }
      return;
    }
    auto *BundleMember = cast<ScheduleData>(SE);
    if (BundleMember->hasValidDependencies())
      return;
    LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
    BundleMember->initDependencies();
    BundleMember->resetUnscheduledDeps();
    // Handle def-use chain dependencies.
    SmallDenseMap<Value *, unsigned> UserToNumOps;
    for (User *U : BundleMember->getInst()->users()) {
      // ...
      if (ScheduleData *UseSD = getScheduleData(U)) {
        unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
        ++NumOps;
        if (areAllOperandsReplacedByCopyableData(
                cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
          continue;
        BundleMember->incDependencies();
        if (!UseSD->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!UseSD->hasValidDependencies() ||
            (InsertInReadyList && UseSD->isReady()))
          WorkList.push_back(UseSD);
      }
    }
    for (ScheduleCopyableData *UseSD :
         getScheduleCopyableDataUsers(BundleMember->getInst())) {
      BundleMember->incDependencies();
      if (!UseSD->isScheduled())
        BundleMember->incrementUnscheduledDeps(1);
      if (!UseSD->hasValidDependencies() ||
          (InsertInReadyList && UseSD->isReady()))
        WorkList.push_back(UseSD);
    }

    SmallPtrSet<const Instruction *, 4> Visited;
    auto MakeControlDependent = [&](Instruction *I) {
      if (!Visited.insert(I).second)
        return;
      auto *DepDest = getScheduleData(I);
      assert(DepDest && "must be in schedule window");
      DepDest->addControlDependency(BundleMember);
      BundleMember->incDependencies();
      if (!DepDest->isScheduled())
        BundleMember->incrementUnscheduledDeps(1);
      if (!DepDest->hasValidDependencies() ||
          (InsertInReadyList && DepDest->isReady()))
        WorkList.push_back(DepDest);
    };

    // Any instruction that may throw or not transfer execution becomes a
    // control dependence barrier for everything below it.
    // ...
    for (Instruction *I = BundleMember->getInst()->getNextNode();
         I != ScheduleEnd; I = I->getNextNode()) {
      // ...
      MakeControlDependent(I);
      // ...
    }

    if (RegionHasStackSave) {
      // Stacksave/stackrestore must not be reordered with allocas or other
      // memory accesses in the region.
      if (match(BundleMember->getInst(),
                m_Intrinsic<Intrinsic::stacksave>()) ||
          match(BundleMember->getInst(),
                m_Intrinsic<Intrinsic::stackrestore>())) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          // ...
          MakeControlDependent(I);
        }
      }
    }

    // Conversely, memory accessing instructions must not sink below a later
    // stacksave/stackrestore.
    if (/* ... */ BundleMember->getInst()->mayReadOrWriteMemory()) {
      for (Instruction *I = BundleMember->getInst()->getNextNode();
           I != ScheduleEnd; I = I->getNextNode()) {
        // ...
        MakeControlDependent(I);
        // ...
      }
    }

    // Handle the memory dependencies (if any).
    ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
    if (!NextLoadStore)
      return;
    Instruction *SrcInst = BundleMember->getInst();
    assert(SrcInst->mayReadOrWriteMemory() &&
           "NextLoadStore list for non memory effecting bundle?");
    MemoryLocation SrcLoc = getLocation(SrcInst);
    bool SrcMayWrite = SrcInst->mayWriteToMemory();
    unsigned NumAliased = 0;
    unsigned DistToSrc = 1;
    bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);

    for (ScheduleData *DepDest = NextLoadStore; DepDest;
         DepDest = DepDest->getNextLoadStore()) {
      assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
      // Limit the alias analysis work with MaxMemDepDistance and
      // AliasedCheckLimit, conservatively assuming a dependency beyond the
      // budget.
      if (DistToSrc >= MaxMemDepDistance ||
          ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
           (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
            SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
        ++NumAliased;
        DepDest->addMemoryDependency(BundleMember);
        BundleMember->incDependencies();
        if (!DepDest->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!DepDest->hasValidDependencies() ||
            (InsertInReadyList && DepDest->isReady()))
          WorkList.push_back(DepDest);
      }
      ++DistToSrc;
    }
  };

  assert((!Bundle || !Bundle.getBundle().empty() || !ControlDeps.empty()) &&
         "expected at least one instruction to schedule");
  if (Bundle)
    WorkList.push_back(Bundle.getBundle().front());
  WorkList.append(ControlDeps.begin(), ControlDeps.end());
  SmallPtrSet<ScheduleBundle *, 16> Visited;
  while (!WorkList.empty()) {
    ScheduleEntity *SD = WorkList.pop_back_val();
    ArrayRef<ScheduleBundle *> Bundles;
    SmallVector<ScheduleBundle *> CopyableBundle;
    if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) {
      CopyableBundle.push_back(&CD->getBundle());
      Bundles = CopyableBundle;
    } else {
      Bundles = getScheduleBundles(SD->getInst());
    }
    if (Bundles.empty()) {
      if (!SD->hasValidDependencies())
        ProcessNode(SD);
      if (InsertInReadyList && SD->isReady()) {
        ReadyInsts.insert(SD);
        LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
      }
      continue;
    }
    for (ScheduleBundle *Bundle : Bundles) {
      if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
        continue;
      assert(isInSchedulingRegion(*Bundle) &&
             "ScheduleData not in scheduling region");
      for_each(Bundle->getBundle(), ProcessNode);
    }
    if (InsertInReadyList && SD->isReady()) {
      for (ScheduleBundle *Bundle : Bundles) {
        assert(isInSchedulingRegion(*Bundle) &&
               "ScheduleData not in scheduling region");
        if (!Bundle->isReady())
          continue;
        ReadyInsts.insert(Bundle);
        LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *Bundle << "\n");
      }
    }
  }
}
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for_each(ScheduleDataMap, [&](auto &P) {
    if (BB != P.first->getParent())
      return;
    ScheduleData *SD = P.second;
    if (isInSchedulingRegion(*SD)) {
      SD->setScheduled(/*Scheduled=*/false);
      SD->resetUnscheduledDeps();
    }
  });
  for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
    for_each(P.second, [&](ScheduleCopyableData *SD) {
      if (isInSchedulingRegion(*SD)) {
        SD->setScheduled(false);
        SD->resetUnscheduledDeps();
      }
    });
  });
  for_each(ScheduledBundles, [&](auto &P) {
    for_each(P.second, [&](ScheduleBundle *Bundle) {
      if (isInSchedulingRegion(*Bundle))
        Bundle->setScheduled(false);
    });
  });
  for (auto &P : ScheduleCopyableDataMap) {
    if (isInSchedulingRegion(*P.second)) {
      P.second->setScheduled(false);
      P.second->resetUnscheduledDeps();
    }
  }
  ReadyInsts.clear();
}
void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  // If we got here, pre-scheduling was able to find a valid scheduling of the
  // sub-graph of the scheduling window which consists of all vector bundles
  // and their transitive users.
  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
  struct ScheduleDataCompare {
    bool operator()(const ScheduleEntity *SD1,
                    const ScheduleEntity *SD2) const {
      return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
    }
  };
  std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated (for nodes in the sub-graph)
  // and fill the ready-list with initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
    if (!Bundles.empty()) {
      for (ScheduleBundle *Bundle : Bundles) {
        Bundle->setSchedulingPriority(Idx++);
        if (!Bundle->hasValidDependencies())
          BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this);
      }
      // ...
      for (ScheduleCopyableData *SD : reverse(SDs)) {
        ScheduleBundle &Bundle = SD->getBundle();
        Bundle.setSchedulingPriority(Idx++);
        if (!Bundle.hasValidDependencies())
          BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
      }
      continue;
    }
    SmallVector<ScheduleCopyableData *> CopyableData =
        BS->getScheduleCopyableDataUsers(I);
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
      assert((/* ... */ SDTEs.empty() ||
              SDTEs.front()->doesNotNeedToSchedule() ||
              /* ... */) &&
             "scheduler and vectorizer bundle mismatch");
      SD->setSchedulingPriority(Idx++);
      if (!SD->hasValidDependencies() &&
          (!CopyableData.empty() ||
           any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
             assert(TE->isGather() && "expected gather node");
             return TE->hasState() && TE->hasCopyableElements() &&
                    TE->isCopyableElement(I);
           }))) {
        // ...
        ScheduleBundle Bundle;
        Bundle.add(SD);
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
      }
    }
    for (ScheduleCopyableData *SD : reverse(CopyableData)) {
      ScheduleBundle &Bundle = SD->getBundle();
      Bundle.setSchedulingPriority(Idx++);
      if (!Bundle.hasValidDependencies())
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
    }
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  SmallPtrSet<Instruction *, 16> Scheduled;
  while (!ReadyInsts.empty()) {
    auto *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
      for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
        Instruction *PickedInst = BundleMember->getInst();
        // ...
        bool IsCopyable =
            Bundle->getTreeEntry()->isCopyableElement(PickedInst);
        if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
            (!IsCopyable && !Scheduled.insert(PickedInst).second))
          continue;
        if (PickedInst->getNextNode() != LastScheduledInst)
          PickedInst->moveBefore(LastScheduledInst->getIterator());
        LastScheduledInst = PickedInst;
        // ...
        EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
                                           LastScheduledInst);
      }
    } else {
      auto *SD = cast<ScheduleData>(Picked);
      Instruction *PickedInst = SD->getInst();
      if (PickedInst->getNextNode() != LastScheduledInst)
        PickedInst->moveBefore(LastScheduledInst->getIterator());
      LastScheduledInst = PickedInst;
    }
    auto Invalid = InstructionsState::invalid();
    // ...
  }

  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
  // ...
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled.
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
    assert(all_of(Bundles,
                  [](const ScheduleBundle *Bundle) {
                    return Bundle->isScheduled();
                  }) &&
           "must be scheduled at this point");
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value without
  // traversing the expression tree. This is the common case.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  // ...
  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;

  // If V is not a store, we can traverse the expression tree to find loads
  // that feed it. The type of the loaded value may indicate a more suitable
  // width than V's type. We want to base the vector element size on the width
  // of memory operations where possible.
  SmallVector<std::pair<Instruction *, BasicBlock *>, 16> Worklist;
  SmallPtrSet<Instruction *, 16> Visited;
  if (auto *I = dyn_cast<Instruction>(V)) {
    Worklist.emplace_back(I, I->getParent());
    Visited.insert(I);
  }
  unsigned Width = 0;
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    auto [I, Parent] = Worklist.pop_back_val();

    // We should only be looking at scalar instructions here. If the current
    // instruction has a vector type, skip.
    auto *Ty = I->getType();
    if (isa<VectorType>(Ty))
      continue;
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      FirstNonBool = I;

    // If the current instruction is a load/extract, take its width as
    // authoritative.
    if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I)) {
      Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
    } else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
                   BinaryOperator, UnaryOperator>(I)) {
      // Otherwise visit the operands we would also look at in buildTree.
      for (Use &U : I->operands()) {
        if (auto *J = dyn_cast<Instruction>(U.get()))
          if (Visited.insert(J).second &&
              (isa<PHINode>(I) || J->getParent() == Parent)) {
            Worklist.emplace_back(J, J->getParent());
            continue;
          }
        if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
          FirstNonBool = U.get();
      }
    } else {
      break;
    }
  }

  // If we didn't encounter a memory access in the expression tree, or if we
  // gave up for some reason, just return the width of V. Otherwise, return the
  // maximum width we found.
  if (Width == 0) {
    if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
      V = FirstNonBool;
    Width = DL->getTypeSizeInBits(V->getType());
  }

  for (Instruction *I : Visited)
    InstrElementSize[I] = Width;

  return Width;
}
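// Illustrative sketch (not part of the pass): getVectorElementSize() above
// prefers the width of memory operations reached through the operand tree
// over the width of V itself, so the chosen VF reflects the real data width
// rather than, say, a bool condition. A minimal model (names hypothetical)
// over plain bit-widths; unlike the real code it assumes an acyclic operand
// graph. Assumes <algorithm> and <vector>.
namespace slp_sketch {
struct WidthNode {
  unsigned Bits;                        // scalar width in bits
  bool IsMemOp;                         // load/extract: width is authoritative
  std::vector<const WidthNode *> Ops;   // operands to traverse otherwise
};
inline unsigned elementWidth(const WidthNode &Root) {
  unsigned Width = 0;
  std::vector<const WidthNode *> Worklist{&Root};
  while (!Worklist.empty()) {
    const WidthNode *N = Worklist.back();
    Worklist.pop_back();
    if (N->IsMemOp)
      Width = std::max(Width, N->Bits); // memory op: take its width
    else
      Worklist.insert(Worklist.end(), N->Ops.begin(), N->Ops.end());
  }
  return Width == 0 ? Root.Bits : Width; // no memory op found: use root width
}
} // namespace slp_sketch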
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
    const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
  // ...
  unsigned OrigBitWidth =
      DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
  // ...
  if (NodesToKeepBWs.contains(E.Idx))
    return false;

  bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
    if (isa<PoisonValue>(R))
      return false;
    return !isKnownNonNegative(R, SimplifyQuery(*DL));
  });
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    // ...
    if (getTreeEntries(V).size() > 1)
      return false;
    // ...
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      // ...
    }
    unsigned BitWidth1 = OrigBitWidth - NumSignBits;
    // ...
    unsigned BitWidth2 =
        std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
    while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
      // ...
      BitWidth2 *= 2;
    }
    BitWidth1 = std::min(BitWidth1, BitWidth2);
    // ...
  };
  using namespace std::placeholders;
  auto FinalAnalysis = [&, TTI = TTI]() {
    if (!IsProfitableToDemote)
      return false;
    bool Res = all_of(
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    // Demote gathers.
    if (Res && E.isGather()) {
      if (E.hasState()) {
        if (const TreeEntry *SameTE =
                getSameValuesTreeEntry(E.getMainOp(), E.Scalars))
          if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
                                    ToDemote, Visited, NodesToKeepBWs,
                                    MaxDepthLevel, IsProfitableToDemote,
                                    IsTruncRoot))
            return true;
      }
      SmallPtrSet<Value *, 4> UniqueBases;
      for (Value *V : E.Scalars) {
        auto *EE = dyn_cast<ExtractElementInst>(V);
        if (!EE)
          continue;
        UniqueBases.insert(EE->getVectorOperand());
      }
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      if (UniqueBases.size() <= 2 ||
          /* ... same register count at the narrower type ... */)
        ToDemote.push_back(E.Idx);
    }
    return Res;
  };
  if (E.isGather() || !Visited.insert(&E).second ||
      any_of(E.Scalars, [&](Value *V) {
        return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !isVectorized(U);
        });
      }))
    return FinalAnalysis();

  if (any_of(E.Scalars, [&](Value *V) {
        return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
          return isVectorized(U) ||
                 (E.Idx == 0 && UserIgnoreList &&
                  UserIgnoreList->contains(U)) ||
                 (!isa<CmpInst>(U) && U->getType()->isSized() &&
                  !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
      }))
    return false;

  auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, NodesToKeepBWs, Level,
                                 IsProfitableToDemote, IsTruncRoot)) {
        if (!IsProfitableToDemote)
          return false;
        NeedToExit = true;
        if (!FinalAnalysis())
          return false;
        continue;
      }
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
    }
    return true;
  };
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidths < OrigBitWidth.
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
        for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
          if (Checker(BitWidth, OrigBitWidth))
            return true;
          if (BestFailBitwidth == 0 && FinalAnalysis())
            BestFailBitwidth = BitWidth;
        }
        if (BestFailBitwidth == 0) {
          BitWidth = OrigBitWidth;
          return false;
        }
        MaxDepthLevel = 1;
        BitWidth = BestFailBitwidth;
        NeedToExit = true;
        return true;
      };
  auto TryProcessInstruction =
      [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        if (Operands.empty()) {
          if (!IsTruncRoot)
            MaxDepthLevel = 1;
          for (Value *V : E.Scalars)
            (void)IsPotentiallyTruncated(V, BitWidth);
        } else {
          // Several vectorized uses? Check if we can truncate it, otherwise -
          // exit.
          if (any_of(E.Scalars, [&](Value *V) {
                return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
              }))
            return false;
          bool NeedToExit = false;
          if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
          if (!ProcessOperands(Operands, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
        }

        ++MaxDepthLevel;
        // Record the entry that we can demote.
        ToDemote.push_back(E.Idx);
        return IsProfitableToDemote;
      };

  if (E.State == TreeEntry::SplitVectorize)
    return TryProcessInstruction(
        BitWidth,
        {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
         VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});

  switch (E.getOpcode()) {

  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
        E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
      return false;
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  }
  case Instruction::Freeze:
    return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
  case Instruction::Shl: {
    // If we are truncating the result of this SHL, and if it's a shift of an
    // inrange amount, we can always perform a SHL in a smaller type.
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
  }
  case Instruction::LShr: {
    // We can truncate to a smaller lshr iff we know that the bits we would
    // otherwise be shifting in are already zeros.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        if (E.isCopyableElement(V))
          return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    // We can truncate to a smaller ashr iff enough sign bits survive the
    // truncation.
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ShiftedBits <
                   ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  }

  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  }

  // We can demote phis if we can demote all their incoming operands.
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
    SmallVector<const TreeEntry *> Ops(NumOps);
    transform(seq<unsigned>(0, NumOps), Ops.begin(),
              [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
    return TryProcessInstruction(BitWidth, Ops);
  }

  case Instruction::Call: {
    // ...
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin &&
        ID != Intrinsic::umax)
      break;
    // ...
    function_ref<bool(unsigned, unsigned)> CallChecker;
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
        unsigned Op1SignBits =
            ComputeNumSignBits(I->getOperand(1), *DL, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask,
                                  SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
                 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
      });
    };
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
      });
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    } else {
      CallChecker = AbsChecker;
    }
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned BestBitWidth = BitWidth;
    unsigned VF = E.Scalars.size();
    // Choose the best bitwidth based on cost estimations.
    auto Checker = [&](unsigned BitWidth, unsigned) {
      InstructionCost Cost = /* ... intrinsic cost at this width ... */;
      if (Cost < BestCost) {
        BestCost = Cost;
        BestBitWidth = BitWidth;
      }
      return false;
    };
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    BitWidth = BestBitWidth;
    return TryProcessInstruction(BitWidth, Operands, CallChecker);
  }

  // Otherwise, conservatively give up.
  default:
    break;
  }
  MaxDepthLevel = 1;
  return FinalAnalysis();
}
void BoUpSLP::computeMinimumValueSizes() {
  // We only attempt to truncate integer expressions.
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->hasState() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;

  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
    NodeIdx = 1;

  // Ensure the roots of the vectorizable tree don't form a cycle.
  assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
          !VectorizableTree[NodeIdx]->UserTreeIndex) &&
         "Unexpected tree is graph.");

  // ...
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  SmallVector<unsigned> RootDemotes;
  SmallDenseSet<unsigned, 8> NodesToKeepBWs;
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    RootDemotes.push_back(NodeIdx);
    IsProfitableToDemoteRoot = true;
    ++NodeIdx;
  }

  // Analyzed the reduction already and not profitable - exit.
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;

  SmallVector<unsigned> ToDemote;
  auto ComputeMaxBitWidth =
      [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
          unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
    ToDemote.clear();
    // Check if the root is trunc and the next node is gather/buildvector, then
    // keep trunc in scalars, which is free in most cases.
    if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
        !NodesToKeepBWs.contains(E.Idx) &&
        E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
        all_of(E.Scalars, [&](Value *V) {
          return V->hasOneUse() || isa<Constant>(V) ||
                 (!V->hasNUsesOrMore(UsesLimit) &&
                  none_of(V->users(), [&](User *U) {
                    ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
                    const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
                    if (TEs.empty() || is_contained(TEs, UserTE))
                      return false;
                    if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                             SelectInst>(U) ||
                        isa<SIToFPInst, UIToFPInst>(U) ||
                        (UserTE->hasState() &&
                         (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                               SelectInst>(UserTE->getMainOp()) ||
                          isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
                      return true;
                    unsigned UserTESz = DL->getTypeSizeInBits(
                        UserTE->Scalars.front()->getType());
                    if (all_of(TEs, [&](const TreeEntry *TE) {
                          auto It = MinBWs.find(TE);
                          return It != MinBWs.end() &&
                                 It->second.first > UserTESz;
                        }))
                      return true;
                    return DL->getTypeSizeInBits(U->getType()) > UserTESz;
                  }));
        })) {
      ToDemote.push_back(E.Idx);
      const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
        MaxBitWidth = 8;
      return MaxBitWidth;
    }

    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
    // ...
    if (any_of(E.Scalars,
               [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
      return 0u;

    // The maximum bit width required to represent all the values that can be
    // demoted without loss of precision.
    unsigned MaxBitWidth = 1u;

    // True if the roots can be zero-extended back to their original type,
    // rather than sign-extended. We know that if the leading bits are not
    // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
    // True.
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      if (isa<PoisonValue>(R))
        return true;
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });

    if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
        E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
      MaxBitWidth =
          std::min(DL->getTypeSizeInBits(
                       E.UserTreeIndex.UserTE->Scalars.front()->getType()),
                   DL->getTypeSizeInBits(ScalarTy));

    // We first check if all the bits of the roots are demanded. If they're
    // not, we can truncate the roots to this narrower type.
    for (Value *Root : E.Scalars) {
      if (isa<PoisonValue>(Root))
        continue;
      unsigned NumSignBits = ComputeNumSignBits(Root, *DL, AC, nullptr, DT);
      TypeSize NumTypeBits =
          DL->getTypeSizeInBits(Root->getType()->getScalarType());
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      // If we can't prove that the sign bit is zero, we must add one to the
      // maximum bit width to account for the unknown sign bit. This preserves
      // the existing sign bit so we can safely sign-extend the root back to
      // the original type. Otherwise, if we know the sign bit is zero, we will
      // zero-extend the root instead.
      if (!IsKnownPositive)
        ++BitWidth1;

      auto *I = dyn_cast<Instruction>(Root);
      if (!I) {
        MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
        continue;
      }
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      MaxBitWidth =
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    }

    if (MaxBitWidth < 8 && MaxBitWidth > 1)
      MaxBitWidth = 8;

    // Narrowing is only profitable if it actually reduces the number of
    // vector registers used. (Register count check elided.)
    if (NumParts > 1 && /* ... */)
      return 0u;

    unsigned Opcode = E.getOpcode();
    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression. Collect the values that can be demoted in ToDemote.
    DenseSet<const TreeEntry *> Visited;
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;

    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
                               NeedToDemote, IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(
                         E.getMainOp()->getOperand(0)->getType()) >
                 2)))))
      return 0u;
    // Round MaxBitWidth up to the next power-of-two.
    MaxBitWidth = bit_ceil(MaxBitWidth);
    return MaxBitWidth;
  };

  // If we can truncate the root, we must collect additional values that might
  // be demoted as a result, i.e. those seeded by truncations we will modify.
  if (UserIgnoreList &&
      isa<IntegerType>(
          VectorizableTree.front()->Scalars.front()->getType())) {
    // Convert vector_reduce_add(ZExt(<n x i1>)) to
    // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
    if (all_of(*UserIgnoreList,
               /* ... all reduction ops are add ... */) &&
        VectorizableTree.front()->State == TreeEntry::Vectorize &&
        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
            Builder.getInt1Ty()) {
      ReductionBitWidth = 1;
    } else {
      for (Value *V : *UserIgnoreList) {
        if (isa<PoisonValue>(V))
          continue;
        unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
        TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
        unsigned BitWidth1 = NumTypeBits - NumSignBits;
        if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
          ++BitWidth1;
        unsigned BitWidth2 = BitWidth1;
        if (auto *I = dyn_cast<Instruction>(V)) {
          APInt Mask = DB->getDemandedBits(I);
          BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
        }
        ReductionBitWidth =
            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
      }
      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
        ReductionBitWidth = 8;
      ReductionBitWidth = bit_ceil(ReductionBitWidth);
    }
  }
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    RootDemotes.push_back(NodeIdx);
    ++NodeIdx;
    IsTruncRoot = true;
  }
  bool IsSignedCmp = false;
  if (UserIgnoreList /* ... signed min/max reduction check ... */)
    IsSignedCmp = true;
  while (NodeIdx < VectorizableTree.size()) {
    ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
    unsigned Limit = 2;
    if (IsTopRoot &&
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      Limit = 3;
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
        IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    }

    for (unsigned Idx : RootDemotes) {
      if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
            uint32_t OrigBitWidth =
                DL->getTypeSizeInBits(V->getType()->getScalarType());
            if (OrigBitWidth > MaxBitWidth) {
              // ...
            }
            return false;
          }))
        ToDemote.push_back(Idx);
    }
    RootDemotes.clear();
    IsTopRoot = false;
    IsProfitableToDemoteRoot = true;

    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
    } else {
      unsigned NewIdx = 0;
      do {
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
      NodeIdx = NewIdx;
      IsTruncRoot =
          NodeIdx < VectorizableTree.size() &&
          VectorizableTree[NodeIdx]->UserTreeIndex &&
          VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
              Instruction::Trunc &&
          !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
      IsSignedCmp =
          NodeIdx < VectorizableTree.size() &&
          VectorizableTree[NodeIdx]->UserTreeIndex &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
              Instruction::ICmp &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
                 [&](Value *V) {
                   auto *IC = dyn_cast<ICmpInst>(V);
                   return IC && (IC->isSigned() ||
                                 !isKnownNonNegative(IC->getOperand(0),
                                                     SimplifyQuery(*DL)) ||
                                 !isKnownNonNegative(IC->getOperand(1),
                                                     SimplifyQuery(*DL)));
                 });
    }

    // If the maximum bit width we compute is less than the width of the
    // roots' type, we can proceed with the narrowing. Otherwise, do nothing.
    if (MaxBitWidth == 0 ||
        /* ... MaxBitWidth >= original root width ... */) {
      if (UserIgnoreList)
        AnalyzedMinBWVals.insert_range(TreeRoot);
      // ...
      continue;
    }

    // Finally, map the values we can demote to the maximum bit width we
    // computed.
    for (unsigned Idx : ToDemote) {
      TreeEntry *TE = VectorizableTree[Idx].get();
      if (MinBWs.contains(TE))
        continue;
      bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
      MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
    }
  }
}
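// Illustrative sketch (not part of the pass): the demotion analysis above
// keeps a value only as wide as its used bits, combining two bounds per
// scalar: "original width minus redundant sign bits" and "highest demanded
// bit", then rounding up to a power of two and to at least i8. A simplified
// standalone version over a 64-bit demanded mask (the function name is
// hypothetical, and the real code handles signedness and per-node state this
// sketch omits). Assumes <algorithm>, <bit> (C++20), and <cstdint>.
inline unsigned minBitWidthFor(std::uint64_t DemandedMask,
                               unsigned NumSignBits, unsigned OrigBitWidth) {
  // Redundant sign bits can be dropped: the value sign-extends back exactly.
  unsigned FromSignBits = std::max(1u, OrigBitWidth - NumSignBits);
  // No user inspects bits above the highest demanded bit.
  unsigned FromDemanded =
      DemandedMask == 0
          ? 1u
          : 64u - static_cast<unsigned>(std::countl_zero(DemandedMask));
  unsigned Bits = std::min(FromSignBits, FromDemanded);
  if (Bits > 1 && Bits < 8)
    Bits = 8; // sub-byte vector elements rarely pay off
  return std::bit_ceil(Bits);
}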
bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
                                TargetTransformInfo *TTI_,
                                TargetLibraryInfo *TLI_, AAResults *AA_,
                                LoopInfo *LI_, DominatorTree *DT_,
                                AssumptionCache *AC_, DemandedBits *DB_,
                                OptimizationRemarkEmitter *ORE_) {
  // ...
  DL = &F.getDataLayout();
  // ...

  // Don't vectorize when the target has no vector registers.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
    LLVM_DEBUG(
        dbgs()
        << "SLP: Didn't find any vector registers for target, abort.\n");
    return false;
  }

  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

  // Use the bottom up slp vectorizer to construct chains that start with
  // store instructions.
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);

  bool Changed = false;

  // Update DFS numbers now so that we can use them for ordering.
  DT->updateDFSNumbers();

  // Scan the blocks in the function in post order.
  for (auto *BB : post_order(&F.getEntryBlock())) {
    // ...
    R.clearReductionData();
    collectSeedInstructions(BB);

    // Vectorize trees that end at stores.
    if (!Stores.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
                        << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    }

    // Vectorize trees that end at reductions.
    Changed |= vectorizeChainsInBlock(BB, R);

    // Vectorize the index computations of getelementptr instructions.
    if (!GEPs.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
                        << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
    }
  }

  if (Changed) {
    R.optimizeGatherSequence();
    LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
  }
  return Changed;
}
std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {
  // ...
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();

  if (/* ... unsupported element size or VF ... */
      VF < 2 || VF < MinVF) {
    // ...
    return false;
  }

  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                    << "\n");

  SetVector<Value *> ValOps;
  for (Value *V : Chain)
    ValOps.insert(cast<StoreInst>(V)->getValueOperand());
  // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(
      ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
  if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
    DenseSet<Value *> Stores(Chain.begin(), Chain.end());
    bool IsAllowedSize =
        hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
                                 ValOps.size()) ||
        (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
    if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
         (!S.getMainOp()->isSafeToRemove() ||
          any_of(ValOps.getArrayRef(),
                 [&](Value *V) {
                   return !isa<ExtractElementInst>(V) &&
                          (V->getNumUses() > Chain.size() ||
                           any_of(V->users(), [&](User *U) {
                             return !Stores.contains(U);
                           }));
                 }))) ||
        (ValOps.size() > Chain.size() / 2 && !S)) {
      Size = (!IsAllowedSize && S) ? 1 : 2;
      return false;
    }
  }
  if (R.isLoadCombineCandidate(Chain))
    return true;
  R.buildTree(Chain);
  // Check if tree tiny and store itself or its value is not vectorized.
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
        /* ... value operand not scheduled ... */)
      return std::nullopt;
    Size = R.getCanonicalGraphSize();
    return false;
  }
  if (R.isProfitableToReorder()) {
    R.reorderTopToBottom();
    R.reorderBottomToTop();
  }
  R.transformNodes();
  R.buildExternalUses();

  R.computeMinimumValueSizes();

  Size = R.getCanonicalGraphSize();
  if (S && S.getOpcode() == Instruction::Load)
    Size = 2; // cut off masked gather small trees
  InstructionCost Cost = R.getTreeCost();

  LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF
                    << "\n");
  if (Cost < -SLPCostThreshold) {
    LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");

    using namespace ore;

    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                        cast<StoreInst>(Chain[0]))
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));

    R.vectorizeTree();
    return true;
  }

  return false;
}
/// Checks that the deviation of the per-slice tree sizes is small relative to
/// their mean, i.e. the candidate slices build trees of roughly uniform size.
/// (Signature reconstructed from the uses below.)
static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
                           bool First) {
  const unsigned Mean =
      std::accumulate(
          Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
          [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
            unsigned Size = First ? Val.first : Val.second;
            // ...
            return V + Size;
          }) /
      Sizes.size();
  // ...
  const unsigned Dev =
      std::accumulate(
          Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
          [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
            unsigned P = First ? Val.first : Val.second;
            // ...
            return V + (P - Mean) * (P - Mean);
          }) /
      Sizes.size();
  return Dev * 96 / (Mean * Mean) == 0;
}
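// Illustrative sketch (not part of the pass): the check above accepts a set
// of tree sizes only when they are nearly uniform. It computes the mean and
// the variance and requires Dev * 96 / Mean^2 == 0 in integer arithmetic,
// i.e. the variance is below roughly 1% of Mean^2. A standalone version (name
// hypothetical); assumes <cstdint>, <numeric>, and <vector>.
inline bool sizesNearlyUniform(const std::vector<unsigned> &Sizes) {
  if (Sizes.empty())
    return false;
  std::uint64_t Sum =
      std::accumulate(Sizes.begin(), Sizes.end(), std::uint64_t(0));
  std::uint64_t Mean = Sum / Sizes.size();
  if (Mean == 0)
    return false;
  std::uint64_t Dev = 0;
  for (unsigned S : Sizes) {
    // Signed difference, so sizes below the mean don't wrap around.
    std::int64_t D = std::int64_t(S) - std::int64_t(Mean);
    Dev += std::uint64_t(D * D);
  }
  Dev /= Sizes.size();
  // Same shape as the check above: variance under ~1% of Mean^2.
  return Dev * 96 / (Mean * Mean) == 0;
}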
/// A group of stores that we'll try to bundle together using vector ops.
/// They are ordered using the signed distance of their address operand to the
/// address of this group's BaseInstr.
class RelatedStoreInsts {
public:
  RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
      : AllStores(AllStores) {
    reset(BaseInstrIdx);
  }

  void reset(unsigned NewBaseInstr) {
    assert(NewBaseInstr < AllStores.size() &&
           "Instruction index out of bounds");
    BaseInstrIdx = NewBaseInstr;
    Instrs.clear();
    insertOrLookup(NewBaseInstr, 0);
  }

  /// Tries to insert \p InstrIdx as the store with a pointer distance of
  /// \p PtrDist. Does nothing if there is already a store with that distance.
  /// \returns The previously associated instruction index, if any.
  std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
    auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
    return Inserted ? std::nullopt : std::make_optional(It->second);
  }

  using DistToInstMap = std::map<int64_t, unsigned>;
  const DistToInstMap &getStores() const { return Instrs; }

  /// If \p SI is related to this group of stores, return the distance of its
  /// pointer operand to the one of the group's BaseInstr.
  std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
                                        ScalarEvolution &SE) const {
    StoreInst &BaseStore = *AllStores[BaseInstrIdx];
    return getPointersDiff(
        BaseStore.getValueOperand()->getType(), BaseStore.getPointerOperand(),
        SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
        /*StrictCheck=*/true);
  }

  /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
  /// Stores whose index is less than \p MinSafeIdx will be dropped.
  void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
              int64_t DistFromCurBase) {
    DistToInstMap PrevSet = std::move(Instrs);
    reset(NewBaseInstIdx);

    // Re-insert stores that come after MinSafeIdx so they can be vectorized
    // again, with their distance rebased to NewBaseInstIdx.
    for (auto [Dist, InstIdx] : PrevSet) {
      if (InstIdx >= MinSafeIdx)
        insertOrLookup(InstIdx, Dist - DistFromCurBase);
    }
  }

  /// Remove all the stores that have already been vectorized from this group.
  void clearVectorizedStores(const DenseSet<Instruction *> &VectorizedStores) {
    DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
        reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
          return VectorizedStores.contains(AllStores[DistAndIdx.second]);
        });

    // Get a forward iterator pointing after the last vectorized store and
    // erase all stores before it so we don't try to vectorize them again.
    DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
    Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
  }

private:
  /// The index of the base instruction, i.e. the one with a 0 distance.
  unsigned BaseInstrIdx;

  /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
  DistToInstMap Instrs;

  /// Reference to all the stores in the BB being analyzed.
  ArrayRef<StoreInst *> AllStores;
};
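// Illustrative sketch (not part of the pass): RelatedStoreInsts keys stores
// by their pointer distance to a base store; walking the map in distance
// order and cutting whenever the gap is not exactly one element yields the
// consecutive chains that vectorizeStores() below feeds to the tree builder
// (its TryToVectorize lambda uses the same Dist - PrevDist == 1 test). A
// standalone model (name hypothetical); assumes <cstdint>, <map>, <vector>.
inline std::vector<std::vector<unsigned>>
consecutiveChains(const std::map<int64_t, unsigned> &DistToInst) {
  std::vector<std::vector<unsigned>> Chains;
  int64_t PrevDist = 0;
  for (const auto &[Dist, InstIdx] : DistToInst) {
    if (Chains.empty() || Dist != PrevDist + 1)
      Chains.emplace_back(); // gap in the addresses: start a new chain
    Chains.back().push_back(InstIdx);
    PrevDist = Dist;
  }
  return Chains;
}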
bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
        &Visited) {
  // We may run into multiple chains that merge into a single chain. We mark
  // the stores that we vectorized so that we don't visit the same store twice.
  DenseSet<Instruction *> VectorizedStores;
  bool Changed = false;

  auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
    int64_t PrevDist = -1;
    SmallVector<Value *> Operands;
    // Collect the chain into a list.
    for (auto [Idx, Data] : enumerate(StoreSeq)) {
      auto &[Dist, InstIdx] = Data;
      if (Operands.empty() || Dist - PrevDist == 1) {
        Operands.push_back(Stores[InstIdx]);
        PrevDist = Dist;
        if (Idx != StoreSeq.size() - 1)
          continue;
      }
      // ...
      Operands.push_back(Stores[InstIdx]);
      // ...
      if (Operands.size() <= 1 ||
          !Visited
               .insert({Operands.front(),
                        cast<StoreInst>(Operands.front())->getValueOperand(),
                        Operands.back(),
                        cast<StoreInst>(Operands.back())->getValueOperand(),
                        Operands.size()})
               .second)
        continue;

      unsigned MaxVecRegSize = R.getMaxVecRegSize();
      unsigned EltSize = R.getVectorElementSize(Operands[0]);
      unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);

      unsigned MaxVF =
          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
      auto *Store = cast<StoreInst>(Operands[0]);
      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
      if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
        ValueTy = Trunc->getSrcTy();
      // ...
      unsigned MinVF = TTI->getStoreMinimumVF(
          R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
          ValueScalarTy);
      MinVF = std::max<unsigned>(2, MinVF);

      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < " << "MinVF (" << MinVF << ")\n");
        continue;
      }

      unsigned NonPowerOf2VF = 0;
      if (VectorizeNonPowerOf2) {
        // First try vectorizing with a non-power-of-2 VF. At the moment, only
        // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
        // lanes are used.
        unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
        if (has_single_bit(CandVF + 1)) {
          NonPowerOf2VF = CandVF;
          assert(NonPowerOf2VF != MaxVF &&
                 "Non-power-of-2 VF should not be equal to MaxVF");
        }
      }

      // MaxRegVF is the largest VF that fits naturally into a single vector
      // register; larger VFs below are tried to build wider, multi-register
      // trees.
      unsigned MaxRegVF = MaxVF;
      MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < " << "MinVF (" << MinVF << ")\n");
        continue;
      }

      SmallVector<unsigned> CandidateVFs;
      for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
           /* ... step down to the next candidate VF ... */)
        CandidateVFs.push_back(VF);

      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
      OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
      for (std::pair<unsigned, unsigned> &P : RangeSizes)
        P.first = P.second = 1;
      DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
      auto IsNotVectorized = [](bool First,
                                const std::pair<unsigned, unsigned> &P) {
        return First ? P.first > 0 : P.second > 0;
      };
      auto IsVectorized = [](bool First,
                             const std::pair<unsigned, unsigned> &P) {
        return First ? P.first == 0 : P.second == 0;
      };
      auto VFIsProfitable = [](bool First, unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
        return First ? Size >= P.first : Size >= P.second;
      };
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
      };
      while (true) {
        ++Repeat;
        bool RepeatChanged = false;
        bool AnyProfitableGraph = false;
        for (unsigned VF : CandidateVFs) {
          AnyProfitableGraph = false;
          unsigned FirstUnvecStore = std::distance(
              RangeSizes.begin(),
              find_if(RangeSizes,
                      std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
          unsigned End = Operands.size();
          while (FirstUnvecStore < End) {
            unsigned FirstVecStore = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(FirstUnvecStore),
                        std::bind(IsVectorized, VF >= MaxRegVF, _1)));
            unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
            for (unsigned SliceStartIdx = FirstUnvecStore;
                 SliceStartIdx + VF <= MaxSliceEnd;) {
              ArrayRef<Value *> Slice =
                  ArrayRef(Operands).slice(SliceStartIdx, VF);
              assert(/* ... */ "Expected all operands of same type.");
              if (!NonSchedulable.empty()) {
                auto [NonSchedSizeMax, NonSchedSizeMin] =
                    NonSchedulable.lookup(Slice.front());
                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
                  // ...
                  SliceStartIdx += NonSchedSizeMax;
                  continue;
                }
              }
              unsigned TreeSize;
              std::optional<bool> Res =
                  vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
              if (!Res) {
                // Remember the non-schedulable slice so later attempts skip
                // it.
                NonSchedulable
                    .try_emplace(Slice.front(), std::make_pair(VF, VF))
                    .first->getSecond()
                    .second = VF;
              } else if (*Res) {
                // Mark the vectorized stores so that we don't vectorize them
                // again.
                VectorizedStores.insert_range(Slice);
                AnyProfitableGraph = RepeatChanged = Changed = true;
                for (std::pair<unsigned, unsigned> &P :
                     RangeSizes.slice(SliceStartIdx, VF))
                  P.first = P.second = 0;
                if (SliceStartIdx < FirstUnvecStore + MinVF) {
                  for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
                           FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
                    P.first = P.second = 0;
                  FirstUnvecStore = SliceStartIdx + VF;
                }
                if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
                  for (std::pair<unsigned, unsigned> &P :
                       RangeSizes.slice(SliceStartIdx + VF,
                                        MaxSliceEnd - (SliceStartIdx + VF)))
                    P.first = P.second = 0;
                  if (MaxSliceEnd == End)
                    End = SliceStartIdx;
                  MaxSliceEnd = SliceStartIdx;
                }
                SliceStartIdx += VF;
                continue;
              }
              if (VF > 2 && Res &&
                  !all_of(RangeSizes.slice(SliceStartIdx, VF),
                          std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
                                    _1))) {
                SliceStartIdx += VF;
                continue;
              }
              // Check for the very big VFs that we're not rebuilding the same
              // trees, just with a larger number of elements.
              if (VF > MaxRegVF && TreeSize > 1 &&
                  all_of(RangeSizes.slice(SliceStartIdx, VF),
                         std::bind(FirstSizeSame, TreeSize, _1))) {
                SliceStartIdx += VF;
                while (SliceStartIdx != MaxSliceEnd &&
                       RangeSizes[SliceStartIdx].first == TreeSize)
                  ++SliceStartIdx;
                continue;
              }
              if (TreeSize > 1) {
                for (std::pair<unsigned, unsigned> &P :
                     RangeSizes.slice(SliceStartIdx, VF)) {
                  if (VF >= MaxRegVF)
                    P.second = std::max(P.second, TreeSize);
                  else
                    P.first = std::max(P.first, TreeSize);
                }
              }
              ++SliceStartIdx;
              AnyProfitableGraph = true;
            }
            if (FirstUnvecStore >= End)
              break;
            if (MaxSliceEnd - FirstUnvecStore < VF &&
                MaxSliceEnd - FirstUnvecStore >= MinVF)
              AnyProfitableGraph = true;
            FirstUnvecStore = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(MaxSliceEnd),
                        std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
          }
          if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
            break;
        }
        // All values vectorized - exit.
        if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
              return P.first == 0 && P.second == 0;
            }))
          break;
        // Check if tried all attempts or no need for the last attempts at all.
        if (Repeat >= MaxAttempts ||
            (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
          break;
        constexpr unsigned StoresLimit = 64;
        const unsigned MaxTotalNum = std::min<unsigned>(
            Operands.size(),
            static_cast<unsigned>(
                End -
                std::distance(
                    RangeSizes.begin(),
                    find_if(RangeSizes,
                            std::bind(IsNotVectorized, true, _1))) +
                1));
        unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
        // ...
        CandidateVFs.clear();
        // ...
        CandidateVFs.push_back(Limit);
        if (VF > MaxTotalNum || VF >= StoresLimit)
          break;
        for (std::pair<unsigned, unsigned> &P : RangeSizes) {
          if (P.first != 0)
            P.first = std::max(P.second, P.first);
        }
        // Last attempt to vectorize max number of elements, if all previous
        // attempts were unsuccessful because of the cost issues.
        CandidateVFs.push_back(VF);
      }
    }
  };

  // ...
  SmallVector<RelatedStoreInsts> SortedStores;
  auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
    std::optional<int64_t> PtrDist;
    auto *RelatedStores = find_if(
        SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
          PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
          return PtrDist.has_value();
        });

    // We did not find a comparable store, start a new group.
    if (RelatedStores == SortedStores.end()) {
      SortedStores.emplace_back(Idx, Stores);
      return;
    }

    // If there is already a store with the same distance, try to vectorize the
    // existing instructions before adding the current store.
    if (std::optional<unsigned> PrevInst =
            RelatedStores->insertOrLookup(Idx, *PtrDist)) {
      TryToVectorize(RelatedStores->getStores());
      RelatedStores->clearVectorizedStores(VectorizedStores);
      RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
                            /*NewBaseInstIdx=*/Idx,
                            /*DistFromCurBase=*/*PtrDist);
    }
  };
  Type *PrevValTy = nullptr;
  for (auto [I, SI] : enumerate(Stores)) {
    if (R.isDeleted(SI))
      continue;
    if (!PrevValTy)
      PrevValTy = SI->getValueOperand()->getType();
    // Check that we do not try to vectorize stores of different types.
    if (PrevValTy != SI->getValueOperand()->getType()) {
      for (RelatedStoreInsts &StoreSeq : SortedStores)
        TryToVectorize(StoreSeq.getStores());
      SortedStores.clear();
      PrevValTy = SI->getValueOperand()->getType();
    }
    FillStoresSet(I, SI);
  }

  // Final vectorization attempt.
  for (RelatedStoreInsts &StoreSeq : SortedStores)
    TryToVectorize(StoreSeq.getStores());

  return Changed;
}
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // Initialize the collections. We will make a single pass over the block.
  Stores.clear();
  GEPs.clear();

  // Visit the store and getelementptr instructions in BB and organize them in
  // Stores and GEPs according to the underlying objects of their pointer
  // operands.
  for (Instruction &I : *BB) {
    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      if (!isValidElementType(SI->getValueOperand()->getType()))
        continue;
      Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
    }

    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a pointer operand that doesn't point to a scalar
    // type.
    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (GEP->getNumIndices() != 1)
        continue;
      Value *Idx = GEP->idx_begin()->get();
      if (isa<Constant>(Idx))
        continue;
      if (!isValidElementType(Idx->getType()))
        continue;
      if (GEP->getType()->isVectorTy())
        continue;
      GEPs[GEP->getPointerOperand()].push_back(GEP);
    }
  }
}
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           bool MaxVFOnly) {
  // ...
  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");

  // ...
  // Make sure invalid types (including vector type) are rejected before
  // determining the vectorization factor for scalar instructions.
  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
      // NOTE: the following will give user internal llvm type name, which may
      // not be useful.
      R.getORE()->emit([&]() {
        std::string TypeStr;
        llvm::raw_string_ostream OS(TypeStr);
        Ty->print(OS);
        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
               << "Cannot SLP vectorize list: type "
               << TypeStr + " is unsupported by vectorizer";
      });
      return false;
    }
  }

  // ...
  unsigned Sz = R.getVectorElementSize(I0);
  unsigned MinVF = R.getMinVF(Sz);
  unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
  if (MaxVF < 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
    });
    return false;
  }

  bool Changed = false;
  bool CandidateFound = false;
  // ...
  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
       /* ... step down to the next candidate VF ... */) {
    // No actual vectorization should happen, if number of parts is the same
    // as provided vectorization factor (i.e. the scalar type is used for
    // vector code during codegen).
    auto *VecTy = getWidenedType(ScalarTy, VF);
    if (TTI->getNumberOfParts(VecTy) == VF)
      continue;
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned ActualVF = std::min(MaxInst - I, VF);
      // ...
      if (MaxVFOnly && ActualVF < MaxVF)
        break;
      if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
        break;

      SmallVector<Value *> Ops(ActualVF, nullptr);
      unsigned Idx = 0;
      for (Value *V : VL.drop_front(I)) {
        // Check that a previous iteration of this loop did not delete the
        // Value.
        if (auto *Inst = dyn_cast<Instruction>(V);
            !Inst || !R.isDeleted(Inst)) {
          Ops[Idx] = V;
          ++Idx;
          if (Idx == ActualVF)
            break;
        }
      }
      // Not enough vectorizable instructions - exit.
      if (Idx != ActualVF)
        break;

      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                        << "\n");

      R.buildTree(Ops);
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;
      if (R.isProfitableToReorder()) {
        R.reorderTopToBottom();
        R.reorderBottomToTop();
      }
      R.transformNodes();
      R.buildExternalUses();

      R.computeMinimumValueSizes();
      InstructionCost Cost = R.getTreeCost();
      CandidateFound = true;
      MinCost = std::min(MinCost, Cost);

      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                        << " for VF=" << ActualVF << "\n");
      if (Cost < -SLPCostThreshold) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
                                            cast<Instruction>(Ops[0]))
                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                         << " and with tree size "
                         << ore::NV("TreeSize", R.getTreeSize()));

        R.vectorizeTree();
        // Move to the next bundle.
        I += VF - 1;
        NextInst = I + 1;
        Changed = true;
      }
    }
  }

  if (!Changed && CandidateFound) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Treshold", -SLPCostThreshold);
    });
  } else if (!Changed) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
    });
  }
  return Changed;
}
23449 using ReductionOpsType = SmallVector<Value *, 16>;
23450 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
23451 ReductionOpsListType ReductionOps;
23455 SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
23456 WeakTrackingVH ReductionRoot;
23461 bool IsSupportedHorRdxIdentityOp =
false;
23468 static bool isCmpSelMinMax(Instruction *
I) {
23476 static bool isBoolLogicOp(Instruction *
I) {
23482 static bool isVectorizable(
RecurKind Kind, Instruction *
I,
23483 bool TwoElementReduction =
false) {
23484 if (Kind == RecurKind::None)
23493 if (TwoElementReduction)
23496 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
23500 return I->getFastMathFlags().noNaNs();
23503 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
23506 return I->isAssociative();
23509 static Value *getRdxOperand(Instruction *
I,
unsigned Index) {
23515 return I->getOperand(2);
23516 return I->getOperand(Index);
23521 Value *
RHS,
const Twine &Name,
bool UseSelect) {
23525 case RecurKind::Or: {
23534 case RecurKind::And: {
23543 case RecurKind::Add:
23544 case RecurKind::Mul:
23545 case RecurKind::Xor:
23546 case RecurKind::FAdd:
23547 case RecurKind::FMul: {
23552 case RecurKind::SMax:
23553 case RecurKind::SMin:
23554 case RecurKind::UMax:
23555 case RecurKind::UMin:
23562 case RecurKind::FMax:
23563 case RecurKind::FMin:
23564 case RecurKind::FMaximum:
23565 case RecurKind::FMinimum:
23566 case RecurKind::FMaximumNum:
23567 case RecurKind::FMinimumNum: {
23580 const ReductionOpsListType &ReductionOps) {
23581 bool UseSelect = ReductionOps.size() == 2 ||
23583 (ReductionOps.size() == 1 &&
23585 assert((!UseSelect || ReductionOps.size() != 2 ||
23587 "Expected cmp + select pairs for reduction");
23588 Value *
Op = createOp(Builder, RdxKind,
LHS,
RHS, Name, UseSelect);
23606 return RecurKind::None;
23608 return RecurKind::Add;
23610 return RecurKind::Mul;
23613 return RecurKind::And;
23616 return RecurKind::Or;
23618 return RecurKind::Xor;
23620 return RecurKind::FAdd;
23622 return RecurKind::FMul;
23625 return RecurKind::FMax;
23627 return RecurKind::FMin;
23630 return RecurKind::FMaximum;
23632 return RecurKind::FMinimum;
23638 return RecurKind::SMax;
23640 return RecurKind::SMin;
23642 return RecurKind::UMax;
23644 return RecurKind::UMin;
23670 return RecurKind::None;
23674 return RecurKind::None;
23677 return RecurKind::None;
23681 return RecurKind::None;
23686 return RecurKind::None;
23689 return RecurKind::SMax;
23692 return RecurKind::SMin;
23695 return RecurKind::UMax;
23698 return RecurKind::UMin;
23701 return RecurKind::None;
  static unsigned getFirstOperandIndex(Instruction *I) {
    return isCmpSelMinMax(I) ? 1 : 0;
  }

  static unsigned getNumberOfOperands(Instruction *I) {
    return isCmpSelMinMax(I) ? 3 : 2;
  }

  static bool hasSameParent(Instruction *I, BasicBlock *BB) {
    if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
      auto *Sel = cast<SelectInst>(I);
      auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
      return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
    }
    return I->getParent() == BB;
  }

  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
    if (IsCmpSelMinMax) {
      // ...
        return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
      return I->hasNUses(2);
    }
    // Arithmetic reduction operation must be used once only.
    return I->hasOneUse();
  }
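// Rationale (added): in a cmp+select min/max chain an interior select is
// consumed twice, once by the next compare and once by the next select,
// hence hasNUses(2), while its own compare must feed only that select; a
// plain arithmetic reduction op is consumed exactly once by the next op in
// the chain.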
  void initReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I))
      ReductionOps.assign(2, ReductionOpsType());
    else
      ReductionOps.assign(1, ReductionOpsType());
  }

  void addReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I)) {
      // ...
      ReductionOps[1].emplace_back(I);
    } else {
      ReductionOps[0].emplace_back(I);
    }
  }

  static bool isGoodForReduction(ArrayRef<Value *> Data) {
    int Sz = Data.size();
    // ...
  }

  HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
      : ReductionRoot(I), ReductionLimit(2) {
    RdxKind = HorizontalReduction::getRdxKind(I);
    ReductionOps.emplace_back().push_back(I);
    // ...
    for (Value *V : Ops)
      ReducedValsToOps[V].push_back(I);
  }

  bool matchReductionForOperands() const {
    // ...
    assert(ReductionRoot && "Reduction root is not set!");
    // ...
    return Ops.size() == 2;
  }
  bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                 ScalarEvolution &SE, const DataLayout &DL,
                                 const TargetLibraryInfo &TLI) {
    RdxKind = HorizontalReduction::getRdxKind(Root);
    if (!isVectorizable(RdxKind, Root))
      return false;
    // ...
    if (!Sel->getCondition()->hasOneUse())
      return false;
    // ...
    ReductionRoot = Root;
    // ...
    bool IsCmpSelMinMax = isCmpSelMinMax(Root);
    SmallVector<std::pair<Instruction *, unsigned>> Worklist(
        1, std::make_pair(Root, 0));
    // ...
    auto CheckOperands = [&](Instruction *TreeN,
                             SmallVectorImpl<Value *> &PossibleReducedVals,
                             SmallVectorImpl<Instruction *> &ReductionOps,
                             unsigned Level) {
      for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
                                    getNumberOfOperands(TreeN)))) {
        Value *EdgeVal = getRdxOperand(TreeN, I);
        ReducedValsToOps[EdgeVal].push_back(TreeN);
        // ...
        if (/* ... */
            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
            !isVectorizable(RdxKind, EdgeInst) ||
            (R.isAnalyzedReductionRoot(EdgeInst) /* ... */)) {
          PossibleReducedVals.push_back(EdgeVal);
          continue;
        }
        ReductionOps.push_back(EdgeInst);
      }
    };
    // ...
    SmallMapVector<
        size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
        8>
        PossibleReducedVals;
    initReductionOps(Root);
    // ...
    SmallSet<size_t, 2> LoadKeyUsed;

    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      // ...
      auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
      if (LIt != LoadsMap.end()) {
        for (LoadInst *RLI : LIt->second) {
          // ...
        }
        for (LoadInst *RLI : LIt->second) {
          // ...
        }
        if (LIt->second.size() > 2) {
          // ...
          hash_value(LIt->second.back()->getPointerOperand());
          // ...
        }
      }
      // ...
          .first->second.push_back(LI);
      // ...
    };

    while (!Worklist.empty()) {
      auto [TreeN, Level] = Worklist.pop_back_val();
      // ...
      CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
      addReductionOps(TreeN);
      // ...
      for (Value *V : PossibleRedVals) {
        // ...
        ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
      }
      for (Instruction *I : reverse(PossibleReductionOps))
        Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
    }
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // ...
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      // ...
      for (auto &Slice : PossibleRedVals) {
        // ...
        auto RedValsVect = Slice.second.takeVector();
        // ...
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(Data.second, Data.first);
      }
      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
      });
      // ...
      } else if (!isGoodForReduction(Data)) {
        // ...
        if (!LI || !LastLI || /* ... */)
          // ...
      }
      // ...
      ReducedVals.back().append(Data.rbegin(), Data.rend());
    }
    // ...
    stable_sort(ReducedVals, [](const auto &P1, const auto &P2) {
      return P1.size() > P2.size();
    });
    return true;
  }
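// Worked example (added): for the scalar chain
//
//   %t0 = add i32 %a, %b
//   %t1 = add i32 %t0, %c
//   %r  = add i32 %t1, %d        ; Root
//
// the walk above records {%r, %t1, %t0} as reduction operations and gathers
// {%a, %b, %c, %d} as possible reduced values, bucketed by the key/subkey
// hashing so that look-alike values (e.g. loads from nearby pointers) land
// in the same group and can be vectorized together.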
  Value *tryToReduce(BoUpSLP &V, const DataLayout &DL,
                     TargetTransformInfo *TTI, const TargetLibraryInfo &TLI,
                     AssumptionCache *AC, DominatorTree &DT) {
    constexpr unsigned RegMaxNumber = 4;
    constexpr unsigned RedValsMaxNumber = 128;
    // ...
    if (unsigned NumReducedVals = std::accumulate(
            ReducedVals.begin(), ReducedVals.end(), 0,
            [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
              if (!isGoodForReduction(Vals))
                return Num;
              return Num + Vals.size();
            });
        NumReducedVals < ReductionLimit /* ... */) {
      for (ReductionOpsType &RdxOps : ReductionOps)
        for (Value *RdxOp : RdxOps)
          // ...
      // ...
    }

    IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
                                    /* ... */);
    // ...
    DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
                                                  ReducedVals.front().size());
    // ...
    auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
      // ...
             "Expected min/max reduction to have select root instruction");
      // ...
             "Expected min/max reduction to have compare condition");
      // ...
    };

    bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
      return isBoolLogicOp(cast<Instruction>(V));
    });
    // ...
    auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
      if (VectorizedTree) {
        // ...
        if (AnyBoolLogicOp) {
          auto It = ReducedValsToOps.find(VectorizedTree);
          auto It1 = ReducedValsToOps.find(Res);
          if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
              // ...
              (It != ReducedValsToOps.end() &&
               any_of(It->getSecond(), [&](Instruction *I) {
                 return isBoolLogicOp(I) &&
                        getRdxOperand(I, 0) == VectorizedTree;
               })) ||
              // ...
              (It1 != ReducedValsToOps.end() &&
               any_of(It1->getSecond(), [&](Instruction *I) {
                 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
               })))
            // ...
          // ...
            VectorizedTree = Builder.CreateFreeze(VectorizedTree);
        }
        // ...
        return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
                        /* ... */);
      }
      // ...
    };
    // ...
    SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
                                      ReductionOps.front().size());
    for (ReductionOpsType &RdxOps : ReductionOps)
      for (Value *RdxOp : RdxOps) {
        // ...
        IgnoreList.insert(RdxOp);
      }
    // ...
    FastMathFlags RdxFMF;
    // ...
    for (Value *U : IgnoreList)
      // ...
        RdxFMF &= FPMO->getFastMathFlags();
    // ...
    for (Value *V : Candidates)
      TrackedVals.try_emplace(V, V);
    auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
                 Value *V) -> unsigned & {
      auto *It = MV.find(V);
      assert(It != MV.end() && "Unable to find given key.");
      return It->second;
    };
    // ...
    DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
    // ...
    SmallPtrSet<Value *, 4> RequiredExtract;
    WeakTrackingVH VectorizedTree = nullptr;
    bool CheckForReusedReductionOps = false;
    // ...
    for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
      InstructionsState S = States[I];
      // ...
      DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
      for (Value *ReducedVal : OrigReducedVals) {
        Value *RdxVal = TrackedVals.at(ReducedVal);
        // ...
        if (/* ... */ (!S || !S.getMatchingMainOpOrAltOp(Inst))) ||
            /* ... */) {
          // ...
        }
        TrackedToOrig.try_emplace(RdxVal, ReducedVal);
      }
      bool ShuffledExtracts = false;
      // ...
      if (S && S.getOpcode() == Instruction::ExtractElement &&
          !S.isAltShuffle() && I + 1 < E) {
        // ...
        for (Value *RV : ReducedVals[I + 1]) {
          Value *RdxVal = TrackedVals.at(RV);
          // ...
          CommonCandidates.push_back(RdxVal);
          TrackedToOrig.try_emplace(RdxVal, RV);
        }
        SmallVector<int> Mask;
        // ...
          Candidates.swap(CommonCandidates);
          ShuffledExtracts = true;
        // ...
      }
      // ...
        Value *OrigV = TrackedToOrig.at(Candidates.front());
        ++VectorizedVals.try_emplace(OrigV).first->getSecond();
        // ...
          Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
          Value *OrigV = TrackedToOrig.at(VC);
          ++VectorizedVals.try_emplace(OrigV).first->getSecond();
        // ...
          V.analyzedReductionRoot(ResI);
        VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
        // ...

      unsigned NumReducedVals = Candidates.size();
      if (NumReducedVals < ReductionLimit &&
          (NumReducedVals < 2 || !isSplat(Candidates)))
        continue;
      // ...
      IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
                                    RdxKind != RecurKind::FMul &&
                                    RdxKind != RecurKind::FMulAdd;
      // ...
      SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
      if (IsSupportedHorRdxIdentityOp)
        for (Value *V : Candidates) {
          Value *OrigV = TrackedToOrig.at(V);
          ++SameValuesCounter.try_emplace(OrigV).first->second;
        }
      // ...
      bool SameScaleFactor = false;
      bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
                              SameValuesCounter.size() != Candidates.size();
      // ...
      if (OptReusedScalars) {
        SameScaleFactor =
            (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
             RdxKind == RecurKind::Xor) &&
            all_of(drop_begin(SameValuesCounter),
                   [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
                     return P.second == SameValuesCounter.front().second;
                   });
        Candidates.resize(SameValuesCounter.size());
        transform(SameValuesCounter, Candidates.begin(),
                  [&](const auto &P) { return TrackedVals.at(P.first); });
        NumReducedVals = Candidates.size();
        // ...
        if (NumReducedVals == 1) {
          Value *OrigV = TrackedToOrig.at(Candidates.front());
          unsigned Cnt = At(SameValuesCounter, OrigV);
          Value *RedVal =
              emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          VectorizedVals.try_emplace(OrigV, Cnt);
          ExternallyUsedValues.insert(OrigV);
          continue;
        }
      }

      unsigned MaxVecRegSize = V.getMaxVecRegSize();
      unsigned EltSize = V.getVectorElementSize(Candidates[0]);
      const unsigned MaxElts = std::clamp<unsigned>(
          /* ... */, RegMaxNumber * RedValsMaxNumber);

      unsigned ReduxWidth = NumReducedVals;
      auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
        unsigned NumParts, NumRegs;
        Type *ScalarTy = Candidates.front()->getType();
        // ...
        while (NumParts > NumRegs) {
          assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
          ReduxWidth = bit_floor(ReduxWidth - 1);
          // ...
        }
        // ...
        if (NumParts > NumRegs / 2)
          // ...
        return ReduxWidth;
      };
      ReduxWidth = GetVectorFactor(ReduxWidth);
      ReduxWidth = std::min(ReduxWidth, MaxElts);

      unsigned Start = 0;
      unsigned Pos = Start;
      // ...
      unsigned PrevReduxWidth = ReduxWidth;
      bool CheckForReusedReductionOpsLocal = false;
      auto AdjustReducedVals = [&](bool IgnoreVL = false) {
        bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
        if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
          // ...
          CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
        }
        // ...
        if (Pos < NumReducedVals - ReduxWidth + 1)
          return IsAnyRedOpGathered;
        // ...
        if (ReduxWidth > 1)
          ReduxWidth = GetVectorFactor(ReduxWidth);
        return IsAnyRedOpGathered;
      };
      bool AnyVectorized = false;
      SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
      while (Pos < NumReducedVals - ReduxWidth + 1 &&
             ReduxWidth >= ReductionLimit) {
        // ...
        if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
            /* ... */) {
          CheckForReusedReductionOps = true;
          break;
        }
        PrevReduxWidth = ReduxWidth;
        // ...
        if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
            // ...
                std::make_pair(Pos, bit_floor(ReduxWidth))) ||
            // ...
                std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
                               /* ... */) ||
            V.areAnalyzedReductionVals(VL)) {
          (void)AdjustReducedVals(/*IgnoreVL=*/true);
          continue;
        }
        // ...
          return RedValI && V.isDeleted(RedValI);
        // ...
        V.buildTree(VL, IgnoreList);
        if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        if (V.isLoadCombineReductionCandidate(RdxKind)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        V.reorderTopToBottom();
        // ...
            VL.front()->getType()->isIntOrIntVectorTy() ||
            ReductionLimit > 2);
        // ...
            ExternallyUsedValues);
        // ...
        LocalExternallyUsedValues.insert(ReductionRoot);
        for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
          if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
            continue;
          for (Value *V : ReducedVals[Cnt])
            // ...
              LocalExternallyUsedValues.insert(TrackedVals[V]);
        }
        if (!IsSupportedHorRdxIdentityOp) {
          // ...
                 "Reused values counter map is not empty");
          for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
            if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
              continue;
            Value *V = Candidates[Cnt];
            Value *OrigV = TrackedToOrig.at(V);
            ++SameValuesCounter.try_emplace(OrigV).first->second;
          }
        }
        V.transformNodes();
        // ...
        SmallPtrSet<Value *, 4> Visited;
        for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
          if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
            continue;
          Value *RdxVal = Candidates[Cnt];
          if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
            RdxVal = It->second;
          if (!Visited.insert(RdxVal).second)
            continue;
          // ...
          if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
            LocalExternallyUsedValues.insert(RdxVal);
            continue;
          }
          Value *OrigV = TrackedToOrig.at(RdxVal);
          unsigned NumOps =
              VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
          if (NumOps != ReducedValsToOps.at(OrigV).size())
            LocalExternallyUsedValues.insert(RdxVal);
        }
        // ...
        if (!IsSupportedHorRdxIdentityOp)
          SameValuesCounter.clear();
        for (Value *RdxVal : VL)
          if (RequiredExtract.contains(RdxVal))
            LocalExternallyUsedValues.insert(RdxVal);
        V.buildExternalUses(LocalExternallyUsedValues);

        V.computeMinimumValueSizes();

        // ...
        InstructionCost Cost = getReductionCost(TTI, VL, IsCmpSelMinMax,
                                                RdxFMF, V, DT, DL, TLI);
        LLVM_DEBUG(dbgs() << /* ... */ << " for reduction\n");
        // ...
          V.getORE()->emit([&]() {
            return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
                                            ReducedValsToOps.at(VL[0]).front())
                   << "Vectorizing horizontal reduction is possible "
                   << "but not beneficial with cost " << ore::NV("Cost", Cost)
                   << " and threshold " /* ... */;
          });
          if (!AdjustReducedVals()) {
            V.analyzedReductionVals(VL);
            // ...
            if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
              // Add subvectors of VL to the list of the analyzed values.
              for (unsigned VF = /* ... */(
                       *TTI, VL.front()->getType(), ReduxWidth - 1);
                   VF >= ReductionLimit;
                   VF = /* ... */(
                       *TTI, VL.front()->getType(), VF - 1)) {
                // ...
                if (V.getCanonicalGraphSize() != V.getTreeSize())
                  continue;
                // ...
                  IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
              }
            }
          }
          continue;
        // ...

        LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
                          << Cost << ". (HorRdx)\n");
        V.getORE()->emit([&]() {
          return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
                                    ReducedValsToOps.at(VL[0]).front())
                 << "Vectorized horizontal reduction with cost "
                 << ore::NV("Cost", Cost) << " and with tree size "
                 << ore::NV("TreeSize", V.getTreeSize());
        });
        // ...
        Instruction *InsertPt = RdxRootInst;
        if (IsCmpSelMinMax)
          InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
        // ...
        Value *VectorizedRoot = V.vectorizeTree(
            LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
        // ...
        for (Value *RdxVal : Candidates) {
          Value *OrigVal = TrackedToOrig.at(RdxVal);
          Value *TransformedRdxVal = TrackedVals.at(OrigVal);
          if (TransformedRdxVal != RdxVal)
            TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
        }
        // ...
          VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
        // ...
        if (OptReusedScalars && !SameScaleFactor) {
          VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
                                         SameValuesCounter, TrackedToOrig);
        }

        Type *ScalarTy = VL.front()->getType();
        // ...
        VectorValuesAndScales.emplace_back(
            VectorizedRoot,
            OptReusedScalars && SameScaleFactor
                ? SameValuesCounter.front().second
                : 1,
            /* ... */ ? V.isSignedMinBitwidthRootNode() : /* ... */);
        // ...
        for (Value *RdxVal : VL) {
          Value *OrigV = TrackedToOrig.at(RdxVal);
          if (IsSupportedHorRdxIdentityOp) {
            VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
            continue;
          }
          ++VectorizedVals.try_emplace(OrigV).first->getSecond();
          if (!V.isVectorized(RdxVal))
            RequiredExtract.insert(RdxVal);
        }
        Pos += ReduxWidth;
        Start = Pos;
        ReduxWidth = NumReducedVals - Pos;
        if (ReduxWidth > 1)
          ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
        AnyVectorized = true;
      }
      if (OptReusedScalars && !AnyVectorized) {
        for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
          Value *RdxVal = TrackedVals.at(P.first);
          Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          VectorizedVals.try_emplace(P.first, P.second);
        }
      }
    }
    // ...
    if (!VectorValuesAndScales.empty())
      VectorizedTree = GetNewVectorizedTree(
          VectorizedTree,
          emitReduction(Builder, *TTI, ReductionRoot->getType()));
    // ...
    if (!VectorizedTree) {
      if (!CheckForReusedReductionOps) {
        for (ReductionOpsType &RdxOps : ReductionOps)
          for (Value *RdxOp : RdxOps)
            // ...
      }
      // ...
    }
    // ...
    auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
                                                 Instruction *RedOp1,
                                                 Instruction *RedOp2,
                                                 bool InitStep) {
      if (!AnyBoolLogicOp)
        return;
      if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
                                    getRdxOperand(RedOp1, 0) == LHS ||
                                    /* ... */))
        return;
      if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
                                    getRdxOperand(RedOp2, 0) == RHS ||
                                    /* ... */)) {
        std::swap(LHS, RHS);
        return;
      }
      if (LHS != VectorizedTree)
        LHS = Builder.CreateFreeze(LHS);
    };
    // ...
    auto FinalGen = [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
                        bool InitStep) {
      unsigned Sz = InstVals.size();
      SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
                                                               Sz % 2);
      for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
        // ...
        Value *RdxVal1 = InstVals[I].second;
        Value *StableRdxVal1 = RdxVal1;
        auto It1 = TrackedVals.find(RdxVal1);
        if (It1 != TrackedVals.end())
          StableRdxVal1 = It1->second;
        Value *RdxVal2 = InstVals[I + 1].second;
        Value *StableRdxVal2 = RdxVal2;
        auto It2 = TrackedVals.find(RdxVal2);
        if (It2 != TrackedVals.end())
          StableRdxVal2 = It2->second;
        // ...
        FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
                          InstVals[I + 1].first, InitStep);
        Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
                                   StableRdxVal2, "op.rdx", ReductionOps);
        ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
      }
      if (Sz % 2 == 1)
        ExtraReds[Sz / 2] = InstVals.back();
      return ExtraReds;
    };
    SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
    // ...
    SmallPtrSet<Value *, 8> Visited;
    for (Value *RdxVal : Candidates) {
      if (!Visited.insert(RdxVal).second)
        continue;
      unsigned NumOps = VectorizedVals.lookup(RdxVal);
      for (Instruction *RedOp :
           ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
        ExtraReductions.emplace_back(RedOp, RdxVal);
    }
    // ...
    bool InitStep = true;
    while (ExtraReductions.size() > 1) {
      SmallVector<std::pair<Instruction *, Value *>> NewReds =
          FinalGen(ExtraReductions, InitStep);
      ExtraReductions.swap(NewReds);
      InitStep = false;
    }
    VectorizedTree = ExtraReductions.front().second;
    // ...
    ReductionRoot->replaceAllUsesWith(VectorizedTree);
    // ...
    SmallPtrSet<Value *, 4> IgnoreSet;
    // ...
    for (auto *U : Ignore->users()) {
      assert(IgnoreSet.count(U) &&
             "All users must be either in the reduction ops list.");
    }
    if (!Ignore->use_empty()) {
      // ...
      Ignore->replaceAllUsesWith(P);
    }
    // ...
    V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
    // ...
    return VectorizedTree;
  }
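// Summary (added): tryToReduce slides a window of ReduxWidth candidates over
// the gathered reduced values, builds an SLP tree per window, accepts the
// window only when getReductionCost() beats the threshold, and finally
// combines the per-window vector reductions plus any leftover scalars with
// createOp chains ("op.rdx"), freezing operands where the boolean-logic
// poison rules above require it.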
  Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
                        Value *Vec, unsigned Scale, bool IsSigned,
                        Type *DestTy) {
    // ...
          Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
    // ...
      Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
    if (Rdx->getType() != DestTy)
      // ...
    // ...
      Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
    return Rdx;
  }
  InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                   ArrayRef<Value *> ReducedVals,
                                   bool IsCmpSelMinMax, FastMathFlags FMF,
                                   const BoUpSLP &R, DominatorTree &DT,
                                   const DataLayout &DL,
                                   const TargetLibraryInfo &TLI) {
    // ...
    Type *ScalarTy = ReducedVals.front()->getType();
    unsigned ReduxWidth = ReducedVals.size();
    FixedVectorType *VectorTy = R.getReductionType();
    // ...
    auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
      // ...
      int Cnt = ReducedVals.size();
      for (Value *RdxVal : ReducedVals) {
        // ...
          Cost += GenCostFn();
        // ...
        for (User *U : RdxVal->users()) {
          // ...
          if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
            if (RdxKind == RecurKind::FAdd) {
              // ...
              FMACost -= FMulCost;
              // ...
              ScalarCost += FMACost;
            }
            // ...
          } else {
            ScalarCost = InstructionCost::getInvalid();
          }
        }
        // ...
          Cost += ScalarCost;
        // ...
          Cost += GenCostFn();
      }
      return Cost;
    };
    // ...
    bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
    switch (RdxKind) {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Or:
    case RecurKind::And:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      // ...
      if (DoesRequireReductionOp) {
        // ...
        unsigned ScalarTyNumElements = VecTy->getNumElements();
        // ...
            ReducedVals.size()),
        // ...
      } else {
        auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
            std::make_pair(RedTy, true));
        if (RType == RedTy) {
          // ...
        } else {
          // ...
              RdxOpcode, !IsSigned, RedTy, /* ... */);
        }
      }
      // ...
      auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
          std::make_pair(RedTy, true));
      // ...
      if (RdxKind == RecurKind::FAdd) {
        // ...
        for (Value *RdxVal : ReducedVals) {
          // ...
            FMF &= FPCI->getFastMathFlags();
          // ...
        }
        if (!Ops.empty()) {
          // ...
          IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
                                      {RVecTy, RVecTy, RVecTy}, FMF);
          // ...
          InstructionCost FMulCost = TTI->getArithmeticInstrCost(
              Instruction::FMul, RVecTy, CostKind);
          LLVM_DEBUG(dbgs() << "Minus vector FMul cost: " << FMulCost << "\n");
          FMACost -= FMulCost;
          // ...
        }
        if (FMACost.isValid())
          VectorCost += FMACost;
      }
      // ...
      if (RType != RedTy) {
        unsigned Opcode = Instruction::Trunc;
        // ...
          Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        // ...
      }
      ScalarCost = EvaluateScalarCost([&]() {
        // ...
      });
      break;
    }
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin: {
      // ...
      if (DoesRequireReductionOp) {
        // ...
      } else {
        auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
            std::make_pair(RedTy, true));
        // ...
        IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
        // ...
        if (RType != RedTy) {
          unsigned Opcode = Instruction::Trunc;
          // ...
            Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          // ...
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
        // ...
      });
      break;
    }
    // ...
    }

    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                      << /* ... */
                      << " (It is a splitting reduction)\n");
    return VectorCost - ScalarCost;
  }
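// Cost sketch (added): the function returns VectorCost - ScalarCost, so a
// negative result means the vector form wins. For an illustrative 4-wide
// integer add reduction, ScalarCost roughly models the three scalar adds
// being replaced, while VectorCost models one vector reduce-add over
// <4 x i32>; all concrete numbers come from TTI, not from this file.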
  Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
                       Type *DestTy) {
    Value *ReducedSubTree = nullptr;
    // ...
    auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
      Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
      if (ReducedSubTree)
        ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
                                  "op.rdx", ReductionOps);
      else
        ReducedSubTree = Rdx;
    };
    if (VectorValuesAndScales.size() == 1) {
      const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
      CreateSingleOp(Vec, Scale, IsSigned);
      return ReducedSubTree;
    }
    // ...
    Value *VecRes = nullptr;
    bool VecResSignedness = false;
    auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
      // ...
      switch (RdxKind) {
      case RecurKind::Add: {
        if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
          // ...
          LLVM_DEBUG(dbgs() << /* ... */ << ". (HorRdx)\n");
          // ...
          std::iota(std::next(Mask.begin(), VF * I),
                    std::next(Mask.begin(), VF * (I + 1)), 0);
          ++NumVectorInstructions;
          // ...
        }
        // ...
        LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
                          << ". (HorRdx)\n");
        ++NumVectorInstructions;
        break;
      }
      case RecurKind::Xor: {
        // ...
        LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << Vec
                          << ". (HorRdx)\n");
        break;
      }
      case RecurKind::FAdd: {
        // ...
        LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
                          << ". (HorRdx)\n");
        ++NumVectorInstructions;
        break;
      }
      case RecurKind::And:
      case RecurKind::Or:
      case RecurKind::SMax:
      case RecurKind::SMin:
      case RecurKind::UMax:
      case RecurKind::UMin:
      case RecurKind::FMax:
      case RecurKind::FMin:
      case RecurKind::FMaximum:
      case RecurKind::FMinimum:
        // ...
        break;
      case RecurKind::Sub:
      case RecurKind::AddChainWithSubs:
      case RecurKind::Mul:
      case RecurKind::FMul:
      case RecurKind::FMulAdd:
      case RecurKind::AnyOf:
      case RecurKind::FindFirstIVSMin:
      case RecurKind::FindFirstIVUMin:
      case RecurKind::FindLastIVSMax:
      case RecurKind::FindLastIVUMax:
      case RecurKind::FMaxNum:
      case RecurKind::FMinNum:
      case RecurKind::FMaximumNum:
      case RecurKind::FMinimumNum:
      case RecurKind::None:
        // ...
      }
      // ...
        VecResSignedness = IsSigned;
      // ...
      ++NumVectorInstructions;
      if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
          /* ... */) {
        // ...
        std::iota(Mask.begin(), Mask.end(), 0);
        // ...
        if (VecResVF < VecVF) {
          // ...
        }
        if (VecResVF != VecVF) {
          // ...
          std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF),
                    0);
          // ...
        }
      }
      // ...
      if (VecResVF < VecVF) {
        // ...
      }
      if (VecResVF != VecVF)
        // ...
      Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
      if (VecResVF != VecVF)
        // ...
      // ...
    };
    for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
      CreateVecOp(Vec, Scale, IsSigned);
    CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);

    return ReducedSubTree;
  }
  Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
                       const TargetTransformInfo *TTI, Type *DestTy) {
    assert(VectorizedValue && "Need to have a vectorized tree node");
    assert(RdxKind != RecurKind::FMulAdd &&
           "A call to the llvm.fmuladd intrinsic is not handled yet");
    // ...
    if (FTy->getScalarType() == Builder.getInt1Ty() &&
        RdxKind == RecurKind::Add &&
        /* ... */) {
      // ...
      Value *V = Builder.CreateBitCast(
          VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
      ++NumVectorInstructions;
      // ...
    }
    ++NumVectorInstructions;
    // ...
  }
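// Illustration (added): the i1/Add special case above appears to lower a
// boolean add reduction as a population count rather than a generic vector
// reduce, roughly:
//
//   %b = bitcast <8 x i1> %v to i8
//   %r = call i8 @llvm.ctpop.i8(i8 %b)     ; then extended to DestTy
//
// (a sketch; the exact intrinsic emission sits in the elided lines).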
  Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                               unsigned Cnt) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    // ...
      return VectorizedValue;
    switch (RdxKind) {
    case RecurKind::Add: {
      // res = mul vv, n
      Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << /* ... */ << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::Xor: {
      // res = n % 2 ? 0 : vv
      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
                        << ". (HorRdx)\n");
      if (Cnt % 2 == 0)
        return Constant::getNullValue(VectorizedValue->getType());
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // res = fmul v, n
      Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << /* ... */ << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // res = vv
      return VectorizedValue;
    case RecurKind::Sub:
    case RecurKind::AddChainWithSubs:
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::AnyOf:
    case RecurKind::FindFirstIVSMin:
    case RecurKind::FindFirstIVUMin:
    case RecurKind::FindLastIVSMax:
    case RecurKind::FindLastIVUMax:
    case RecurKind::FMaxNum:
    case RecurKind::FMinNum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for repeated scalar.");
    }
    return nullptr;
  }
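// Worked example (added): with RdxKind == RecurKind::Add and Cnt == 3, the
// repeated scalar x contributes x + x + x to the reduction, so the code
// above emits x * 3 instead. For Xor a value repeated an even number of
// times cancels out (x ^ x == 0) and an odd count contributes x once;
// And/Or and the min/max kinds are idempotent, so the value is returned
// unchanged.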
  Value *
  emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
                const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
                const DenseMap<Value *, Value *> &TrackedToOrig) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    // ...
    if (VTy->getElementType() != VL.front()->getType()) {
      // ...
          R.isSignedMinBitwidthRootNode());
    }
    switch (RdxKind) {
    case RecurKind::Add: {
      // root = mul prev_root, <1, 1, n, 1>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
      }
      Value *Scale = ConstantVector::get(Vals);
      LLVM_DEBUG(dbgs() << /* ... */ << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
      // No need for multiple and/or(s).
      LLVM_DEBUG(dbgs() << /* ... */ << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // No need for multiple min/max(s).
      LLVM_DEBUG(dbgs() << /* ... */ << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::Xor: {
      // Replace values with even number of repeats with 0, since
      // x ^ x = 0.
      // ...
      SmallVector<int> Mask(
          /* ... */);
      std::iota(Mask.begin(), Mask.end(), 0);
      bool NeedShuffle = false;
      for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
        // ...
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        if (Cnt % 2 == 0) {
          Mask[I] = VF;
          NeedShuffle = true;
        }
      }
      LLVM_DEBUG(/* ... */ dbgs() << "> of " << VectorizedValue
                                  << ". (HorRdx)\n");
      if (NeedShuffle)
        VectorizedValue = Builder.CreateShuffleVector(
            VectorizedValue,
            ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantFP::get(V->getType(), Cnt));
      }
      Value *Scale = ConstantVector::get(Vals);
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::Sub:
    case RecurKind::AddChainWithSubs:
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::AnyOf:
    case RecurKind::FindFirstIVSMin:
    case RecurKind::FindFirstIVUMin:
    case RecurKind::FindLastIVSMax:
    case RecurKind::FindLastIVUMax:
    case RecurKind::FMaxNum:
    case RecurKind::FMinNum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for reused scalars.");
    }
    return nullptr;
  }
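// Worked example (added): emitReusedOps is the vector analogue of
// emitScaleForReusedOps. For an Add reduction over {x, x, y, y, y} the
// deduplicated vector <x, y> is multiplied elementwise by the constant
// vector <2, 3> before the final reduce, preserving the scalar result
// 2*x + 3*y; for Xor, lanes with an even repeat count are shuffled to zero.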
static RecurKind getRdxKind(Value *V) {
  return HorizontalReduction::getRdxKind(V);
}

static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
  // ...
  unsigned AggregateSize = 1;
  auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  do {
    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
      for (auto *Elt : ST->elements())
        if (Elt != ST->getElementType(0))
          return std::nullopt;
      AggregateSize *= ST->getNumElements();
      CurrentType = ST->getElementType(0);
    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      AggregateSize *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
      AggregateSize *= VT->getNumElements();
      return AggregateSize;
    } else if (CurrentType->isSingleValueType()) {
      return AggregateSize;
    } else {
      return std::nullopt;
    }
  } while (true);
}
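// Example (added): for an aggregate type such as [2 x <2 x float>] the walk
// above multiplies the array count by the vector width and returns 4; a
// struct whose elements differ in type returns std::nullopt, since such an
// aggregate cannot be mapped onto a homogeneous vector.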
static void findBuildAggregateRec(Instruction *LastInsertInst,
                                  TargetTransformInfo *TTI,
                                  SmallVectorImpl<Value *> &BuildVectorOpds,
                                  SmallVectorImpl<Value *> &InsertElts,
                                  unsigned OperandOffset, const BoUpSLP &R) {
  do {
    // ...
    std::optional<unsigned> OperandIndex =
        getElementIndex(LastInsertInst, OperandOffset);
    if (!OperandIndex || R.isDeleted(LastInsertInst))
      return;
    if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
      findBuildAggregateRec(cast<Instruction>(InsertedOperand), TTI,
                            BuildVectorOpds, InsertElts, *OperandIndex, R);
    } else {
      BuildVectorOpds[*OperandIndex] = InsertedOperand;
      InsertElts[*OperandIndex] = LastInsertInst;
    }
    // ...
  } while (LastInsertInst != nullptr &&
           isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
           LastInsertInst->hasOneUse());
}

static bool findBuildAggregate(Instruction *LastInsertInst,
                               TargetTransformInfo *TTI,
                               SmallVectorImpl<Value *> &BuildVectorOpds,
                               SmallVectorImpl<Value *> &InsertElts,
                               const BoUpSLP &R) {
  assert((isa<InsertElementInst>(LastInsertInst) ||
          isa<InsertValueInst>(LastInsertInst)) &&
         "Expected insertelement or insertvalue instruction!");
  assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
         "Expected empty result vectors!");

  std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
  if (!AggregateSize)
    return false;
  BuildVectorOpds.resize(*AggregateSize);
  InsertElts.resize(*AggregateSize);

  // ...
  if (BuildVectorOpds.size() >= 2)
    return true;
  return false;
}
static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
                                      BasicBlock *ParentBB, LoopInfo *LI) {
  // ...
  auto DominatedReduxValue = [&](Value *R) {
    return isa<Instruction>(R) &&
           DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
  };

  Instruction *Rdx = nullptr;
  // Return the incoming value if it comes from the same BB as the phi node.
  if (P->getIncomingBlock(0) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }
  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  // Otherwise, check whether we have a loop latch to look at.
  // ...
  if (P->getIncomingBlock(0) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }
  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  return nullptr;
}
25395 "Expected binop, select, or intrinsic for reduction matching");
25397 Root->
getOperand(HorizontalReduction::getFirstOperandIndex(Root));
25399 Root->
getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
25410 Value *Op0 =
nullptr;
25411 Value *Op1 =
nullptr;
25420 Value *B0 =
nullptr, *B1 =
nullptr;
bool SLPVectorizerPass::vectorizeHorReduction(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
    SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
  // ...
  auto SelectRoot = [&]() {
    // ...
  };

  std::queue<std::pair<Instruction *, unsigned>> Stack;
  Stack.emplace(SelectRoot(), 0);
  SmallPtrSet<Value *, 8> VisitedInstrs;
  bool Res = false;
  auto TryToReduce = [&](Instruction *Inst) -> Value * {
    if (R.isAnalyzedReductionRoot(Inst))
      return nullptr;
    // ...
    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
      return nullptr;
    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
  };
  auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
    if (TryOperandsAsNewSeeds && FutureSeed == Root) {
      // ...
    }
    // ...
  };

  while (!Stack.empty()) {
    Instruction *Inst;
    unsigned Level;
    std::tie(Inst, Level) = Stack.front();
    Stack.pop();
    // ...
    if (R.isDeleted(Inst))
      continue;
    if (Value *VectorizedV = TryToReduce(Inst)) {
      Res = true;
      // ...
        Stack.emplace(I, Level);
      continue;
    }
    if (R.isDeleted(Inst))
      continue;
    // ...
    if (!TryAppendToPostponedInsts(Inst)) {
      // ...
    }
    // ...
    for (Value *Op : Inst->operands())
      if (VisitedInstrs.insert(Op).second)
        if (auto *I = dyn_cast<Instruction>(Op);
            I && /* ... */ !R.isDeleted(I) && I->getParent() == BB)
          Stack.emplace(I, Level);
  }
  return Res;
}
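// Traversal note (added): the queue-based walk first tries each instruction
// as a reduction root; when matching fails, the instruction may be postponed
// as a future vectorization seed, and its in-block operands are pushed (with
// a depth level that grows across block boundaries) so reductions feeding
// the original root are still discovered.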
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  // ...
  if ((I->getOpcode() == Instruction::FAdd ||
       I->getOpcode() == Instruction::FSub) &&
      /* ... */)
    return false;
  // ...
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(Op0) || R.isDeleted(Op1))
    return false;
  // ...
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
      Candidates.emplace_back(A, B0);
    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
      Candidates.emplace_back(A, B1);
  }
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
      Candidates.emplace_back(A0, B);
    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
      Candidates.emplace_back(A1, B);
  }

  auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
                                             ArrayRef<Value *> Ops) {
    // ...
    Type *Ty = Inst->getType();
    // ...
    HorizontalReduction HorRdx(Inst, Ops);
    if (!HorRdx.matchReductionForOperands())
      return false;
    // ...
    InstructionCost ScalarCost =
        TTI.getScalarizationOverhead(/* ... */) +
        TTI.getInstructionCost(Inst, CostKind);
    // ...
      FMF = FPCI->getFastMathFlags();
    RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
                                             CostKind);
    // ...
    if (RedCost >= ScalarCost)
      return false;
    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
  };
  if (Candidates.size() == 1)
    return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);

  // ...
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return (*BestCandidate == 0 &&
          TryToReduce(I, {Candidates[*BestCandidate].first,
                          Candidates[*BestCandidate].second})) ||
         tryToVectorizeList({Candidates[*BestCandidate].first,
                             Candidates[*BestCandidate].second},
                            R);
}
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
                                                 BasicBlock *BB, BoUpSLP &R) {
  // ...
  bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
  Res |= tryToVectorize(PostponedInsts, R);
  return Res;
}

bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
                                       BoUpSLP &R) {
  bool Res = false;
  for (Value *V : Insts)
    if (auto *Inst = dyn_cast_or_null<Instruction>(V);
        Inst && !R.isDeleted(Inst))
      Res |= tryToVectorize(Inst, R);
  return Res;
}
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 bool MaxVFOnly) {
  // ...
  if (!R.canMapToVector(IVI->getType()))
    return false;

  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<Value *, 16> BuildVectorInsts;
  // ...
  if (MaxVFOnly && BuildVectorOpds.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
             << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
}
bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R,
                                                   bool MaxVFOnly) {
  SmallVector<Value *, 16> BuildVectorInsts;
  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<int> Mask;
  // ...
  if (MaxVFOnly && BuildVectorInsts.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
             << "Cannot SLP vectorize list: only 2 elements of buildvector, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
}
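// Example (added): findBuildAggregate recognizes buildvector sequences such
// as
//
//   %v0 = insertelement <4 x float> poison, float %s0, i32 0
//   %v1 = insertelement <4 x float> %v0,    float %s1, i32 1
//   %v2 = insertelement <4 x float> %v1,    float %s2, i32 2
//   %v3 = insertelement <4 x float> %v2,    float %s3, i32 3
//
// and hands {%s0, %s1, %s2, %s3} to tryToVectorizeList; two-element
// buildvectors are skipped under MaxVFOnly so that a reduction gets the
// first attempt, as the remark above explains.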
template <typename T>
static bool tryToVectorizeSequence(
    SmallVectorImpl<T *> &Incoming, /* ... comparator and helper params ... */
    bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
  // ...
  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
       /* ... */) {
    // ...
    if (!I || R.isDeleted(I)) {
      // ...
      continue;
    }
    auto *SameTypeIt = IncIt;
    while (SameTypeIt != E && (/* ... */ ||
                               AreCompatible(VL, *SameTypeIt))) {
      // ...
      if (I && !R.isDeleted(I))
        VL.push_back(/* ... */);
      ++SameTypeIt;
    }
    unsigned NumElts = VL.size();
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    // ...
    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      Changed = true;
      // ...
      VL.swap(Candidates);
      Candidates.clear();
      // ...
    } else {
      // ...
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(2U, R.getMaxVecRegSize() / EltSize);
      };
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType())) {
        // ...
      }
    }
    // ...
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // ...
        Changed = true;
      } else if (MaxVFOnly) {
        // Try to vectorize using small vectors.
        for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
             /* ... */) {
          // ...
          if (!I || R.isDeleted(I)) {
            // ...
            continue;
          }
          auto *SameTypeIt = It;
          while (SameTypeIt != End &&
                 (/* ... */ ||
                  AreCompatible(*SameTypeIt, *It))) {
            // ...
            if (I && !R.isDeleted(I))
              VL.push_back(/* ... */);
            ++SameTypeIt;
          }
          unsigned NumElts = VL.size();
          if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
                                                  /*MaxVFOnly=*/false))
            Changed = true;
          // ...
        }
      }
      Candidates.clear();
    }
    // Start over at the next instruction of a different type (or the end).
    IncIt = SameTypeIt;
  }
  return Changed;
}
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
                       const DominatorTree &DT) {
  assert(/* ... */ &&
         "Expected valid element types only.");
  if (V == V2)
    return IsCompatibility;
  // ...
  if (CI1->getOperand(0)->getType()->getTypeID() <
      CI2->getOperand(0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getTypeID() >
      CI2->getOperand(0)->getType()->getTypeID())
    return false;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return false;
  // ...
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
    // ...
      return !IsCompatibility;
    // ...
      if (auto *I2 = dyn_cast<Instruction>(Op2)) {
        if (IsCompatibility) {
          if (I1->getParent() != I2->getParent())
            return false;
        } else {
          // ...
          if (!NodeI1)
            return NodeI2 != nullptr;
          // ...
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        }
        // ...
        if (S && (IsCompatibility || !S.isAltShuffle()))
          continue;
        if (IsCompatibility)
          return false;
        if (I1->getOpcode() != I2->getOpcode())
          return I1->getOpcode() < I2->getOpcode();
      }
  }
  return IsCompatibility;
}
template <typename ItT>
bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
                                          BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // Try to find reductions first.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    // ...
      Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
    if (R.isDeleted(I))
      break;
  }
  // Try to vectorize operands as vector bundles.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    // ...
  }
  // Try to vectorize list of compares.
  // ...
  for (Instruction *V : CmpInsts)
    // ...
  if (Vals.size() <= 1)
    return Changed;
  Changed |= tryToVectorizeSequence<Value>(
      Vals, CompareSorter, AreCompatibleCompares,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        // Exclude possible reductions from other blocks.
        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
          return any_of(V->users(), [V](User *U) {
            auto *Select = dyn_cast<SelectInst>(U);
            return Select &&
                   Select->getParent() != cast<Instruction>(V)->getParent();
          });
        });
        if (ArePossiblyReducedInOtherBlock)
          return false;
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
      },
      /*MaxVFOnly=*/true, R);
  return Changed;
}
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
  assert(/* ... */ &&
         "This function only accepts Insert instructions");
  bool OpsChanged = false;
  SmallVector<WeakTrackingVH> PostponedInsts;
  for (auto *I : reverse(Instructions)) {
    // pass1: try to vectorize reductions only
    // ...
      OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R,
                                             /*MaxVFOnly=*/true);
    // ...
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/true);
    // ...
    if (R.isDeleted(I))
      continue;
    OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
    // ...
      OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R,
                                             /*MaxVFOnly=*/false);
    // ...
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/false);
  }
  // Now try to vectorize postponed instructions.
  OpsChanged |= tryToVectorize(PostponedInsts, R);

  Instructions.clear();
  return OpsChanged;
}
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // ...
  SmallPtrSet<Value *, 16> VisitedInstrs;
  // ...
  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
    assert(/* ... */ &&
           "Expected vectorizable types only.");
    // ...
    if (V1->getType()->getScalarSizeInBits() <
        V2->getType()->getScalarSizeInBits())
      return true;
    if (V1->getType()->getScalarSizeInBits() >
        V2->getType()->getScalarSizeInBits())
      return false;
    // ...
    if (Opcodes1.size() < Opcodes2.size())
      return true;
    if (Opcodes1.size() > Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // ...
        if (!NodeI1)
          return NodeI2 != nullptr;
        // ...
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        // ...
        if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
          // ...
          NodeI1 = DT->getNode(V1->getParent());
          NodeI2 = DT->getNode(V2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          // ...
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
          return V1->comesBefore(V2);
        }
        // ...
          return *Id1 < *Id2;
        // ...
        if (I1->getOpcode() == I2->getOpcode())
          continue;
        return I1->getOpcode() < I2->getOpcode();
      // ...
      auto ValID1 = Opcodes1[I]->getValueID();
      auto ValID2 = Opcodes2[I]->getValueID();
      if (ValID1 == ValID2)
        continue;
      if (ValID1 < ValID2)
        return true;
      if (ValID1 > ValID2)
        return false;
      // ...
      assert(U1 && U2 && "The only thing left should be undef & undef.");
    }
    return false;
  };
  auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL,
                                                     Value *V1) {
    if (VL.empty() || V1 == VL.back())
      return true;
    Value *V2 = VL.back();
    // ...
    if (Opcodes1.size() != Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // ...
        if (R.isDeleted(I1) || R.isDeleted(I2))
          return false;
        if (I1->getParent() != I2->getParent())
          return false;
        // ...
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
        return false;
    }
    return true;
  };
  bool HaveVectorizedPhiNodes = false;
  do {
    // Collect the incoming values from the PHIs.
    // ...
    for (Instruction &I : *BB) {
      // ...
      // No need to analyze deleted, vectorized and non-vectorizable
      // instructions.
      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
          /* ... */)
        Incoming.push_back(P);
    }
    if (Incoming.size() <= 1)
      break;
    // ...
    for (Value *V : Incoming) {
      SmallVectorImpl<Value *> &Opcodes =
          PHIToOpcodes.try_emplace(V).first->getSecond();
      if (!Opcodes.empty())
        continue;
      // ...
      SmallPtrSet<Value *, 4> Visited;
      while (!Nodes.empty()) {
        // ...
        for (Value *V : PHI->incoming_values()) {
          // ...
            Nodes.push_back(PHI1);
          // ...
        }
      }
    }
    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, PHICompare, AreCompatiblePHIs,
        [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
          return tryToVectorizeList(Candidates, R, MaxVFOnly);
        },
        /*MaxVFOnly=*/true, R);
    Changed |= HaveVectorizedPhiNodes;
    if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
          // ...
          return !PHI || R.isDeleted(PHI);
        }))
      PHIToOpcodes.clear();
    // ...
  } while (HaveVectorizedPhiNodes);

  VisitedInstrs.clear();
  InstSetVector PostProcessInserts;
  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes Inserts in PostProcessInserts and, if VectorizeCmps is true,
  // also vectorizes PostProcessCmps.
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      // ...
      PostProcessCmps.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // ...
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    // ...
      return PostProcessCmps.contains(Cmp);
    return /* ... */ PostProcessInserts.contains(I);
  };
  // ...
  auto HasNoUsers = [](Instruction *I) {
    return I->use_empty() &&
           /* ... */;
  };
  for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
    if (R.isDeleted(&*It))
      continue;
    // We may go through BB multiple times so skip the one we have checked.
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // ...
      }
      continue;
    }
    // ...
    if (auto *P = dyn_cast<PHINode>(It)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() == 2) {
        // ...
        if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
          // ...
        }
      }
      // ...
      for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
        // ...
        if (BB == P->getIncomingBlock(I) ||
            !DT->isReachableFromEntry(P->getIncomingBlock(I)))
          continue;
        // Postponed instructions should not be vectorized here, delay their
        // vectorization.
        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
            PI && !IsInPostProcessInstrs(PI)) {
          bool Res =
              vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
          // ...
          if (Res && R.isDeleted(P)) {
            // ...
            break;
          }
        }
      }
      continue;
    }
    // ...
    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      // ...
      // Try to vectorize the chain in the store, if this is the only store
      // to the address in the block.
      TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                            SI->getValueOperand()->hasOneUse();
      // ...
      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          // Postponed instructions should not be vectorized here, delay
          // their vectorization.
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
        }
      }
      // ...
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
      // ...
    }
    // ...
    if (isa<InsertElementInst, InsertValueInst>(It))
      PostProcessInserts.insert(&*It);
    // ...
  }
  return Changed;
}
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  for (auto &Entry : GEPs) {
    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");

    // ...
    auto It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
      return !R.isDeleted(GEP);
    });
    if (It == Entry.second.end())
      continue;
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      // ...
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               /* ... */;
      });

      // Disallow candidates whose index difference is a known constant, and
      // candidates that share the same index value.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1;
           ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        // ...
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try and vectorize the indices.
      // ...
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointers and values operand.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return true;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return false;
    // ...
        DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
            DT->getNode(I1->getParent());
        DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
            DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        return I1->getOpcode() < I2->getOpcode();
    // ...
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  auto AreCompatibleStores = [this](ArrayRef<StoreInst *> VL, StoreInst *V1) {
    // ...
    bool SameParent = true;
    StoreInst *V2 = VL.back();
    // ...
    SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
    // ...
    for (auto [SI, V] : zip(VL, NewVL))
      V = SI->getValueOperand();
    NewVL.back() = V1->getValueOperand();
    InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
    InstructionsState S = Analysis.buildInstructionsState(
        /* ... */);
    // ...
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << /* ... */ << Pair.second.size() << ".\n");

    // ...
    // Reverse stores to do bottom-to-top analysis.
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R, Attempted);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
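// Design note (added): stores are bucketed per underlying object and then
// sorted with StoreSorter so that runs with the same value type, pointer
// type and value opcode become adjacent; AreCompatibleStores keeps a run
// together only while those properties match, which lets vectorizeStores
// consider contiguous candidates, and the Attempted set prevents re-running
// rejected combinations.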
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
ReachingDefInfo InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
static bool runImpl(Function &F, const TargetLowering &TLI, AssumptionCache *AC)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
static std::optional< OperandInfo > getOperandInfo(const MachineOperand &MO)
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static cl::opt< bool > SplitAlternateInstructions("slp-split-alternate-instructions", cl::init(true), cl::Hidden, cl::desc("Improve the code quality by splitting alternate instructions"))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static bool isMaskedLoadCompress(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, const DominatorTree &DT, const TargetLibraryInfo &TLI, const function_ref< bool(Value *)> AreAllUsersVectorized, bool &IsMasked, unsigned &InterleaveFactor, SmallVectorImpl< int > &CompressMask, VectorType *&LoadVecTy)
Checks if the VL can be transformed to a (masked)load + compress or (masked) interleaved load.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< bool > VectorizeCopyableElements("slp-copyable-elements", cl::init(true), cl::Hidden, cl::desc("Try to replace values with the idempotent instructions for " "better vectorization."))
Enables vectorization of copyable elements.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool isCommutative(Instruction *I, Value *ValWithUses)
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert fadd/fsub sequence to FMAD.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static const SCEV * calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static cl::opt< bool > ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden, cl::desc("Generate strided loads even if they are not " "profitable. Used for testing only."))
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an main operation for the given MainOp and AltOp instruction...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors with those masks.
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns the power-of-2 number of elements in a single register (part), given the total number of elements Size and the number of parts NumParts.
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
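A minimal sketch of that check, assuming Sizes carries (VF, tree size) pairs as in the signature above; the pass's real implementation also consults the First flag, which is omitted here, and the helper name is hypothetical:

  #include "llvm/ADT/ArrayRef.h"
  #include <cmath>
  #include <utility>

  // Accept when the quadratic (RMS) deviation of the tree sizes is
  // below 90% of their mean.
  static bool deviationIsSmall(
      llvm::ArrayRef<std::pair<unsigned, unsigned>> Sizes) {
    double Mean = 0.0;
    for (const auto &P : Sizes)
      Mean += P.second;
    Mean /= Sizes.size();
    double Dev = 0.0;
    for (const auto &P : Sizes)
      Dev += (P.second - Mean) * (P.second - Mean);
    Dev = std::sqrt(Dev / Sizes.size());
    return Dev < 0.9 * Mean;
  }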
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates a key/subkey pair for the given value to provide effective sorting of the values and better detection of vectorizable value sequences.
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorType, the overhead is computed for inserting/extracting whole subvectors instead of single elements.
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds compress-like mask for shuffles for the given PointerOps, ordered with Order.
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of a vectorized intrinsic (if possible) and a vectorized function (if possible) call.
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the argument types vector for the given call instruction with the given ID for the specified vector factor VF.
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms a type that splits by TTI into whole vector types.
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing a set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType, the extract is costed as a subvector operation instead of a single-element extract.
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find a subvector of loads and builds a new vector of only loads if it can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x i8> x, i32 0, etc.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(ArrayRef< T * >, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and, if not, packs the duplicates, building the ReuseShuffleIndices mask.
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned the special value (== size), which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates a subvector extract using Generator, or a default shuffle.
static cl::opt< bool > DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden, cl::desc("Disable tree reordering even if it is " "profitable. Used for testing only."))
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType, the insert/extract is costed as a subvector operation instead.
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope exit.
This file defines generic set operations that may be used on sets of different types.
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metrics from passes.
#define STATISTIC(VARNAME, DESC)
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds one more input vector and the mask for its shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds one more input vector and the mask for its shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds one more input vector and the order for its shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds a single input vector (in the form of a tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in the form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
~ShuffleInstructionBuilder()
void resetForSameNode()
Reset the builder to handle perfect diamond match.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
uint64_t getZExtValue() const
Get zero extended value.
void setBit(unsigned BitPosition)
Set to 1 the bit whose position is given by BitPosition.
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
unsigned getBitWidth() const
Return the number of bits in the APInt.
bool ult(const APInt &RHS) const
Unsigned less than comparison.
void clearAllBits()
Set every bit to 0.
void negate()
Negate this APInt in place.
unsigned logBase2() const
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
bool isOne() const
Determine if this is a value of 1.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
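An illustrative snippet exercising the APInt entries above, e.g. for tracking demanded vector lanes; the function name is hypothetical:

  #include "llvm/ADT/APInt.h"
  #include <cassert>
  using namespace llvm;

  void demandedLanesDemo() {
    APInt Demanded = APInt::getZero(8); // 8 lanes, none demanded yet
    Demanded.setBits(0, 4);             // demand lanes [0, 4)
    Demanded.setBit(7);
    Demanded.clearBit(1);
    assert(!Demanded.isZero() && !Demanded.isAllOnes());
    assert(APInt::getOneBitSet(8, 3).isPowerOf2());
    assert(APInt(8, 10).urem(APInt(8, 4)) == 2); // unsigned remainder
  }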
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
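A short sketch of the slicing entries above; ArrayRef never owns memory, so the underlying storage must outlive the view (the function name is hypothetical):

  #include "llvm/ADT/ArrayRef.h"
  #include <cassert>
  using namespace llvm;

  void sliceDemo() {
    int Data[] = {0, 1, 2, 3, 4, 5};
    ArrayRef<int> VL(Data);                // non-owning view
    assert(VL.front() == 0 && VL.back() == 5);
    ArrayRef<int> Head = VL.take_front(2); // {0, 1}
    ArrayRef<int> Tail = VL.drop_front(2); // {2, 3, 4, 5}
    ArrayRef<int> Mid = VL.slice(1, 3);    // {1, 2, 3}
    assert(Head.size() + Tail.size() == VL.size());
    assert(Mid.equals({1, 2, 3}));
  }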
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::reverse_iterator reverse_iterator
InstListType::iterator iterator
Instruction iterators...
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic, or a static alloca.
InstListType::const_reverse_iterator const_reverse_iterator
bool isEHPad() const
Return true if this basic block is an exception handling block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed, or null if the block is not well formed.
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst); holds everything related to calling a function.
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signature does not match the call signature.
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on each one.
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
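A quick check of how the predicate helpers above differ: swapping exchanges the operands (x < y is y > x), while inverting negates the result (!(x < y) is x >= y). CmpInst also provides static forms of these helpers; the function name is hypothetical:

  #include "llvm/IR/InstrTypes.h"
  #include <cassert>
  using namespace llvm;

  void predicateDemo() {
    assert(CmpInst::getSwappedPredicate(CmpInst::ICMP_SLT) ==
           CmpInst::ICMP_SGT);
    assert(CmpInst::getInversePredicate(CmpInst::ICMP_SLT) ==
           CmpInst::ICMP_SGE);
    assert(CmpInst::getSwappedPredicate(CmpInst::ICMP_ULE) ==
           CmpInst::ICMP_UGE);
  }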
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate for its type.
const APInt & getValue() const
Return the constant as an APInt value reference.
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string, and methods for querying it.
static bool shouldExecute(unsigned CounterName)
static DebugLoc getUnknown()
An analysis that produces DemandedBits for a function.
LLVM_ABI APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exists.
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
Base class for the actual dominator tree node.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
bool allowReassoc() const
Flag queries.
bool allowContract() const
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
An instruction for type-safe pointer arithmetic to access elements of arrays and structs.
For the node iterator we just need to turn the TreeEntry iterator into a TreeEntry* iterator so that ...
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
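A sketch of stitching the IRBuilder entry points above together to build a 2-lane vector from two scalars and reverse it, assuming BB is a valid insertion block and A and B share a type; the helper name is hypothetical:

  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  Value *buildPairAndReverse(BasicBlock *BB, Value *A, Value *B) {
    IRBuilder<> Builder(BB); // append to the end of BB
    auto *VecTy = FixedVectorType::get(A->getType(), 2);
    Value *Vec = PoisonValue::get(VecTy);
    Vec = Builder.CreateInsertElement(Vec, A, uint64_t(0));
    Vec = Builder.CreateInsertElement(Vec, B, uint64_t(1));
    // Single-input shuffle that swaps the two lanes.
    return Builder.CreateShuffleVector(Vec, Vec, ArrayRef<int>{1, 0}, "rev");
  }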
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos lives in, right after MovePos.
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instruction comes before Other.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
T & front() const
front - Get the first element.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address space zero).
A discriminated union of two or more pointer types, with the discriminator in the low bit of the pointer.
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience functions.
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
const value_type & front() const
Return the first element of the SetVector.
void insert_range(Range &&R)
Vector takeVector()
Clear the SetVector and return the underlying vector.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exactly one source vector.
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents a "clustered" mask of size VF, i.e. each index between [0, VF) is used exactly once in each submask of size VF.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor, like: <Index, Index+Factor, ..., Index+(Len-1)*Factor>.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossings.
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
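The mask predicates above classify constant shuffle masks without needing an instruction. A few concrete cases (the function name is hypothetical):

  #include "llvm/IR/Instructions.h"
  #include <cassert>
  using namespace llvm;

  void maskDemo() {
    int Index = 0;
    // <4,5,6,7> out of 8 source elements extracts the high half.
    assert(ShuffleVectorInst::isExtractSubvectorMask({4, 5, 6, 7}, 8, Index));
    assert(Index == 4);
    // <3,2,1,0> reverses a single 4-element source.
    assert(ShuffleVectorInst::isReverseMask({3, 2, 1, 0}, 4));
    // <0,0,0,0> splats element 0.
    assert(ShuffleVectorInst::isZeroEltSplatMask({0, 0, 0, 0}, 4));
  }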
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is small.
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
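Typical iteration over the set bits with find_first/find_next, as one might do when tracking covered lanes (the function name is hypothetical):

  #include "llvm/ADT/SmallBitVector.h"
  #include <cassert>
  using namespace llvm;

  void coveredLanesDemo() {
    SmallBitVector Covered(8); // 8 bits, all clear
    Covered.set(1);
    Covered.set(5);
    unsigned N = 0;
    for (int I = Covered.find_first(); I != -1; I = Covered.find_next(I))
      ++N; // visits bits 1 and 5
    assert(N == Covered.count());
    assert(Covered.any() && !Covered.all() && !Covered.none());
  }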
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across all SmallPtrSet instances.
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better as a string.
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
The instances of the Type class are immutable: once they are created, they are never changed.
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
bool isVectorTy() const
True if this is an instance of VectorType.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
iterator_range< use_iterator > uses()
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
void insert_range(Range &&R)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator I
iterator_adaptor_base()=default
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to a SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads by increasing pointer offsets to allow greater clustering.
LoadsState
Tracks how we can represent the loads in the given sequence.
void reorderTopToBottom()
Reorders the current graph to the most profitable order, starting from the root node down to the leaf nodes.
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
InstructionCost getSpillCost()
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers a non-vectorizable sequence of loads.
SmallVector< StoreInst *, 8 > StoreList
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
unsigned getTreeSize() const
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given sequence of loads is already known to be non-vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the backend.
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the backend.
unsigned getVectorElementSize(Value *V)
SmallVector< Instruction *, 16 > InstrList
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with a narrower bitwidth at codegen, and returns its signedness if so.
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, StridedPtrInfo &SPtrInfo, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e. the list of vectorized scalars that have users outside of the vectorized tree.
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool isStridedLoad(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, const int64_t Diff, StridedPtrInfo &SPtrInfo) const
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions, marking trivially dead operands for deletion.
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order?
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users in UserIgnoreLst for the purposes of scheduling and extraction.
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns the reduction type after min-bitwidth analysis.
unsigned getMaxVecRegSize() const
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibly) a permutation with other gathers.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
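Taken together, the BoUpSLP entries above are driven in an analyze/cost/codegen sequence. A condensed sketch of that sequence (only meaningful inside SLPVectorizer.cpp where BoUpSLP is visible; scheduling, node transforms, and several overloads are omitted, and the helper name is hypothetical):

  // R is an already-constructed analysis object; Ops is a candidate
  // bundle of scalars; Threshold stands for the -slp-threshold value.
  static bool tryVectorizeSketch(BoUpSLP &R, ArrayRef<Value *> Ops,
                                 int Threshold) {
    SmallDenseSet<Value *> UserIgnore;
    R.buildTree(Ops, UserIgnore);
    if (R.isTreeTinyAndNotFullyVectorizable())
      return false;
    R.reorderTopToBottom();
    R.reorderBottomToTop();
    R.buildExternalUses();
    R.computeMinimumValueSizes();
    InstructionCost Cost = R.getTreeCost();
    if (!Cost.isValid() || Cost >= -Threshold)
      return false; // not profitable enough
    R.vectorizeTree(); // emit vector code and extracts
    return true;
  }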
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is a bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
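A small illustration of composing the matchers above: capture the operands of a one-use (A + B) * C expression (the function name is hypothetical):

  #include "llvm/IR/PatternMatch.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;

  // Binds A, B, C on success; returns false otherwise.
  static bool matchMulOfAdd(Value *V, Value *&A, Value *&B, Value *&C) {
    return match(V, m_Mul(m_OneUse(m_Add(m_Value(A), m_Value(B))),
                          m_Value(C)));
  }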
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
friend class Instruction
Iterator for Instructions in a BasicBlock.
LLVM_ABI iterator begin() const
LLVM_ABI Instruction & front() const
A private "module" namespace for types and utilities used by this pass.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
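A short sketch (hypothetical mask values) showing enumerate in place of a manual index counter:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
// Returns the position of the first poison (-1) mask element, or -1 if none.
static int firstPoisonLane(llvm::ArrayRef<int> Mask) {
  for (const auto &[Idx, M] : llvm::enumerate(Mask))
    if (M < 0)
      return static_cast<int>(Idx);
  return -1;
}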
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
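As a minimal sketch (hypothetical helper), dyn_cast combines the type test and the cast, returning nullptr on mismatch:

#include "llvm/IR/Instructions.h"
#include "llvm/Support/Casting.h"
// Returns the value being stored if V is a store, otherwise nullptr.
static llvm::Value *storedValueOrNull(llvm::Value *V) {
  if (auto *SI = llvm::dyn_cast<llvm::StoreInst>(V))
    return SI->getValueOperand();
  return nullptr;
}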
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing the effect of MI in a DIExpression.
constexpr from_range_t from_range
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intrinsic.
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case of optionals) value is accepted.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A is a subset of B.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
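A sketch of the usual pattern (hypothetical cleanup helper): because the iterator is advanced before the loop body runs, the current element can be erased safely:

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/Transforms/Utils/Local.h"
static void dropDeadFromBlock(llvm::BasicBlock &BB) {
  for (llvm::Instruction &I : llvm::make_early_inc_range(BB))
    if (llvm::isInstructionTriviallyDead(&I))
      I.eraseFromParent(); // safe: the loop iterator has already moved on
}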
auto cast_or_null(const Y &Val)
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
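Worked values (the skewed form is rarely needed but follows the same rule):

#include "llvm/Support/MathExtras.h"
static_assert(llvm::alignDown(77u, 16u) == 64u);     // largest multiple of 16 <= 77
static_assert(llvm::alignDown(64u, 16u) == 64u);     // already aligned
static_assert(llvm::alignDown(70u, 16u, 6u) == 70u); // largest x <= 70 with x % 16 == 6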
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not have undefined behavior.
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool isa_and_nonnull(const Y &Val)
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicitly.
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
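Worked values, assuming the std::bit_ceil semantics this helper mirrors:

#include <cassert>
#include "llvm/ADT/bit.h"
void bitCeilExamples() {
  assert(llvm::bit_ceil(5u) == 8u);   // smallest power of two >= 5
  assert(llvm::bit_ceil(16u) == 16u); // powers of two map to themselves
}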
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
DomTreeNodeBase< BasicBlock > DomTreeNode
auto dyn_cast_or_null(const Y &Val)
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container: C.erase(remove(C.begin(), C.end(), V), C.end());
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
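A worked sketch: the mask has VF entries starting at Start and spaced Stride apart, e.g. for de-interleaving a two-member group:

#include "llvm/Analysis/VectorUtils.h"
void strideMaskExample() {
  // {0, 2, 4, 6}: member 0 of each pair in an 8-wide interleaved vector.
  llvm::SmallVector<int, 16> Even =
      llvm::createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
  // {1, 3, 5, 7}: member 1 of each pair.
  llvm::SmallVector<int, 16> Odd =
      llvm::createStrideMask(/*Start=*/1, /*Stride=*/2, /*VF=*/4);
  (void)Even;
  (void)Odd;
}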
auto reverse(ContainerTy &&C)
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
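Since this helper is local to the file, here is a sketch of its intended semantics, reimplemented for illustration and assuming the Mask[Indices[I]] = I inversion direction: source element Indices[I] ends up at position I.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
static void inversePermutationSketch(llvm::ArrayRef<unsigned> Indices,
                                     llvm::SmallVectorImpl<int> &Mask) {
  Mask.assign(Indices.size(), /*PoisonMaskElem=*/-1);
  for (unsigned I = 0, E = Indices.size(); I < E; ++I)
    Mask[Indices[I]] = I;
}
// e.g. Indices = {2, 0, 1} yields Mask = {1, 2, 0}.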
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
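Worked example: each of the VF source lanes is repeated ReplicationFactor times in order:

#include "llvm/Analysis/VectorUtils.h"
void replicatedMaskExample() {
  // createReplicatedMask(3, 2) == {0, 0, 0, 1, 1, 1}
  llvm::SmallVector<int, 16> Mask =
      llvm::createReplicatedMask(/*ReplicationFactor=*/3, /*VF=*/2);
  (void)Mask;
}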
auto find_if_not(R &&Range, UnaryPredicate P)
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type, i.e. adding an extra element results in extra parts upon type legalization.
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
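A usage sketch (hypothetical helper; the DataLayout and ScalarEvolution must come from the surrounding pass): two loads are consecutive exactly when their pointers differ by one element, which is essentially how isConsecutiveAccess is built on top of this.

#include <optional>
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/IR/Instructions.h"
static bool loadsAreConsecutive(llvm::LoadInst *A, llvm::LoadInst *B,
                                const llvm::DataLayout &DL,
                                llvm::ScalarEvolution &SE) {
  std::optional<int64_t> Diff = llvm::getPointersDiff(
      A->getType(), A->getPointerOperand(), B->getType(),
      B->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
  return Diff && *Diff == 1; // B is exactly one element past A
}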
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a comparator C.
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices, if reordering is required.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) that will be converted into a vector (I).
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
@ LLVM_MARK_AS_BITMASK_ENUM
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
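Worked values:

#include "llvm/Support/MathExtras.h"
static_assert(llvm::divideCeil(10u, 4u) == 3u); // 10/4 rounded up
static_assert(llvm::divideCeil(8u, 4u) == 2u);  // exact division is unchanged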
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ And
Bitwise or logical AND of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
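Worked values (a sketch using the Align overload listed here):

#include <cassert>
#include "llvm/Support/Alignment.h"
void alignToExamples() {
  assert(llvm::alignTo(10, llvm::Align(8)) == 16); // round 10 bytes up to 16
  assert(llvm::alignTo(16, llvm::Align(8)) == 16); // already a multiple of 8
}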
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the given range Range.
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly.
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one of its successors (including the next instruction that follows within a basic block).
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns the number of parts the type VecTy will be split into at the codegen phase.
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
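A short sketch: seq makes half-open integer ranges usable in range-based for loops:

#include "llvm/ADT/Sequence.h"
unsigned sumBelow(unsigned N) {
  unsigned Sum = 0;
  for (unsigned I : llvm::seq<unsigned>(0, N)) // visits 0 .. N-1, N excluded
    Sum += I;
  return Sum;
}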
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
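A sketch of the typical use, with a hypothetical struct mirroring the two fields of the EdgeInfo key hashed further down in this index (that DenseMapInfo specialization composes per-field hashes instead, but hash_combine is the general-purpose tool):

#include "llvm/ADT/Hashing.h"
struct EdgeKey {
  void *UserTE;     // stand-in for a TreeEntry pointer
  unsigned EdgeIdx; // operand index of the use
};
llvm::hash_code hashEdgeKey(const EdgeKey &K) {
  return llvm::hash_combine(K.UserTE, K.EdgeIdx); // order-sensitive combine
}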
template class LLVM_TEMPLATE_ABI DomTreeNodeBase< BasicBlock >
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal.address intrinsics from the specified value V, returning the original object being addressed.
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrower than C's type.
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through the def-use graph.
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdIdx.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot' graphs.
DefaultDOTGraphTraits(bool simple=false)
DenseMapInfo< BoUpSLP::TreeEntry * > FirstInfo
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static BoUpSLP::EdgeInfo getEmptyKey()
DenseMapInfo< unsigned > SecondInfo
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given value type.
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
BoUpSLP::TreeEntry::VecTreeTy ContainerTy
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are all defined.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
TargetTransformInfo * TTI
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKind::Vector with EC lanes.
Function object to check whether the first component of a container supported by std::get (like std::pair, std::tuple, etc.) compares less than the first component of another container.
Function object to check whether the second component of a container supported by std::get (like std::pair, std::tuple, etc.) compares less than the second component of another container.
This structure holds any data we need about the edges being traversed during buildTreeRec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const