#ifdef EXPENSIVE_CHECKS

using namespace std::placeholders;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions,
          "Number of vector instructions generated");

             "Controls which SLP graphs should be vectorized.");

    cl::desc("Run the SLP vectorization passes"));

    cl::desc("Enable vectorization for wider vector utilization"));

    cl::desc("Only vectorize if you gain more than this "

    cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
             "heuristics and makes vectorization decision via cost modeling."));

    cl::desc("Attempt to vectorize horizontal reductions"));

        "Attempt to vectorize horizontal reductions feeding into a store"));

    cl::desc("Improve the code quality by splitting alternate instructions"));

    cl::desc("Attempt to vectorize for this register size in bits"));

    cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

    cl::desc("Limit the size of the SLP scheduling region per block"));

    cl::desc("Attempt to vectorize for this register size in bits"));

    cl::desc("Limit the recursion depth when building a vectorizable tree"));

    cl::desc("Only vectorize small trees if they are fully vectorizable"));

    cl::desc("The maximum look-ahead depth for operand reordering scores"));

    cl::desc("The maximum look-ahead depth for searching best rooting option"));

    cl::desc("The minimum number of loads, which should be considered strided, "
             "if the stride is > 1 or is runtime value"));

    cl::desc("The maximum stride, considered to be profitable."));

    cl::desc("Disable tree reordering even if it is "
             "profitable. Used for testing only."));

    cl::desc("Generate strided loads even if they are not "
             "profitable. Used for testing only."));

    cl::desc("Display the SLP trees with Graphviz"));

    cl::desc("Try to vectorize with non-power-of-2 number of elements."));

    cl::desc("Try to replace values with the idempotent instructions for "
             "better vectorization."));
  Ty = Ty->getScalarType();
         !Ty->isPPC_FP128Ty();

    return SI->getValueOperand()->getType();
    return CI->getOperand(0)->getType();
    return IE->getOperand(1)->getType();

         "ScalableVectorType is not supported.");
  return VecTy->getNumElements();

                                            Type *Ty, unsigned Sz) {
  if (NumParts == 0 || NumParts >= Sz)
  if (NumParts == 0 || NumParts >= Sz)
  return (Sz / RegVF) * RegVF;
                          I * VecTyNumElements, VecTyNumElements)))
                   : Mask[I] * VecTyNumElements + J;
  unsigned SVNumElements =
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    Value *Src = SV->getOperand(0);
      if (SV->getOperand(0) != Src)
      if (!SV->isExtractSubvectorMask(Index))
      ExpectedIndex.set(Index / ShuffleMaskSize);
    if (!ExpectedIndex.all())
  assert(NumGroup == (VL.size() / GroupSize) &&
         "Unexpected number of groups");

  unsigned SVNumElements =
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    for (int M : SV->getShuffleMask())
                            : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);

  OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
      if (BB != II->getParent())

  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (!FirstNonUndef) {
    if (V != FirstNonUndef)
  return FirstNonUndef != nullptr;
    return Cmp->isCommutative();
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
                  if (match(U.getUser(),
                            m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                      (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                  return match(U.getUser(),
                               m_Intrinsic<Intrinsic::abs>(
                                   m_Specific(U.get()), m_ConstantInt(Flag))) &&
                         (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
           (BO->getOpcode() == Instruction::FSub &&
                  return match(U.getUser(),
                               m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
  return I->isCommutative();

    constexpr unsigned IntrinsicNumOperands = 2;
    return IntrinsicNumOperands;
  return I->getNumOperands();
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
    if (CI->getValue().uge(VT->getNumElements()))
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();

  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
  return std::all_of(It, VL.end(), [&](Value *V) {
    if (auto *CI = dyn_cast<CmpInst>(V))
      return BasePred == CI->getPredicate();
    if (auto *I = dyn_cast<Instruction>(V))
      return I->getOpcode() == Opcode;
    return isa<PoisonValue>(V);

      if (MaskArg == UseMask::UndefsAsMask)
      if (MaskArg == UseMask::FirstArg && Value < VF)
        UseMask.reset(Value);
      else if (MaskArg == UseMask::SecondArg && Value >= VF)
        UseMask.reset(Value - VF);
template <bool IsPoisonOnly = false>
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (!UseMask.empty()) {
        if (*Idx < UseMask.size() && !UseMask.test(*Idx))
    for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
      if (Constant *Elem = C->getAggregateElement(I))
            (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
static std::optional<TargetTransformInfo::ShuffleKind>
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        return std::max(S, VTy->getNumElements());
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
    Value *Vec = EE->getVectorOperand();
  ShuffleMode CommonShuffleMode = Unknown;
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    auto *Vec = EI->getVectorOperand();
    if (Idx->getValue().uge(Size))
    unsigned IntIdx = Idx->getValue().getZExtValue();
      if (!Vec1 || Vec1 == Vec) {
      } else if (!Vec2 || Vec2 == Vec) {
    if (CommonShuffleMode == Permute)
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
    CommonShuffleMode = Select;
  if (CommonShuffleMode == Select && Vec2)

  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    return CI->getZExtValue();
  if (EI->getNumIndices() != 1)
  return *EI->idx_begin();
bool isValidForAlternation(unsigned Opcode) {

class BinOpSameOpcodeHelper {
  using MaskType = std::uint_fast16_t;
  constexpr static std::initializer_list<unsigned> SupportedOp = {
      Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
      Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
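  // Sketch of the encoding (added comment, not from the original source):
  // each opcode in SupportedOp owns one bit of MaskType, so "the set of
  // opcodes this instruction could be rewritten as" is a plain bit mask and
  // narrowing that set is a single '&'. E.g. "shl x, 1" is equivalent to
  // "mul x, 2", so its mask keeps both ShlBIT and MulBIT set.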
    MainOpBIT = 0b100000000,

  static std::pair<ConstantInt *, unsigned>
  isBinOpWithConstantInt(const Instruction *I) {
    unsigned Opcode = I->getOpcode();
    if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
        Opcode == Instruction::AShr)

  struct InterchangeableInfo {
    MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
                    MulBIT | AShrBIT | ShlBIT;
    MaskType SeenBefore = 0;
    InterchangeableInfo(const Instruction *I) : I(I) {}
    bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
      if (Mask & InterchangeableMask) {
        SeenBefore |= OpcodeInMaskForm;
        Mask &= InterchangeableMask;
    bool equal(unsigned Opcode) {
      return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
      MaskType Candidate = Mask & SeenBefore;
      if (Candidate & MainOpBIT)
        return I->getOpcode();
      if (Candidate & ShlBIT)
        return Instruction::Shl;
      if (Candidate & AShrBIT)
        return Instruction::AShr;
      if (Candidate & MulBIT)
        return Instruction::Mul;
      if (Candidate & AddBIT)
        return Instruction::Add;
      if (Candidate & SubBIT)
        return Instruction::Sub;
      if (Candidate & AndBIT)
        return Instruction::And;
      if (Candidate & OrBIT)
        return Instruction::Or;
      if (Candidate & XorBIT)
        return Instruction::Xor;
    bool hasCandidateOpcode(unsigned Opcode) const {
      MaskType Candidate = Mask & SeenBefore;
      case Instruction::Shl:
        return Candidate & ShlBIT;
      case Instruction::AShr:
        return Candidate & AShrBIT;
      case Instruction::Mul:
        return Candidate & MulBIT;
      case Instruction::Add:
        return Candidate & AddBIT;
      case Instruction::Sub:
        return Candidate & SubBIT;
      case Instruction::And:
        return Candidate & AndBIT;
      case Instruction::Or:
        return Candidate & OrBIT;
      case Instruction::Xor:
        return Candidate & XorBIT;
      case Instruction::LShr:
      case Instruction::FAdd:
      case Instruction::FSub:
      case Instruction::FMul:
      case Instruction::SDiv:
      case Instruction::UDiv:
      case Instruction::FDiv:
      case Instruction::SRem:
      case Instruction::URem:
      case Instruction::FRem:
      unsigned FromOpcode = I->getOpcode();
      if (FromOpcode == ToOpcode)
      auto [CI, Pos] = isBinOpWithConstantInt(I);
      const APInt &FromCIValue = CI->getValue();
      unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
      switch (FromOpcode) {
      case Instruction::Shl:
        if (ToOpcode == Instruction::Mul) {
        assert(FromCIValue.isZero() && "Cannot convert the instruction.");
        ToCIValue = ToOpcode == Instruction::And
                        : APInt::getZero(FromCIValueBitWidth);
      case Instruction::Mul:
        if (ToOpcode == Instruction::Shl) {
          ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
        assert(FromCIValue.isOne() && "Cannot convert the instruction.");
        ToCIValue = ToOpcode == Instruction::And
                        : APInt::getZero(FromCIValueBitWidth);
      case Instruction::Add:
      case Instruction::Sub:
        if (FromCIValue.isZero()) {
                 "Cannot convert the instruction.");
          ToCIValue = FromCIValue;
      case Instruction::And:
        ToCIValue = ToOpcode == Instruction::Mul
                        : APInt::getZero(FromCIValueBitWidth);
        assert(FromCIValue.isZero() && "Cannot convert the instruction.");
      Value *LHS = I->getOperand(1 - Pos);
          ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
        (FromOpcode == Instruction::Add && ToOpcode == Instruction::Sub))
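  // Worked example for the conversion above (added comment, not from the
  // original source): rewriting "shl x, 3" as a multiply turns the constant
  // operand into 1 << 3, giving "mul x, 8"; the reverse direction uses
  // FromCIValue.logBase2() == 3 to recover the shift amount, which is only
  // valid when the multiplier is a power of two.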
  InterchangeableInfo MainOp;
  InterchangeableInfo AltOp;
  bool isValidForAlternation(const Instruction *I) const {
    return ::isValidForAlternation(MainOp.I->getOpcode()) &&
           ::isValidForAlternation(I->getOpcode());
  bool initializeAltOp(const Instruction *I) {
    if (!isValidForAlternation(I))

  BinOpSameOpcodeHelper(const Instruction *MainOp,
                        const Instruction *AltOp = nullptr)
      : MainOp(MainOp), AltOp(AltOp) {
  bool add(const Instruction *I) {
           "BinOpSameOpcodeHelper only accepts BinaryOperator.");
    unsigned Opcode = I->getOpcode();
    MaskType OpcodeInMaskForm;
    case Instruction::Shl:
      OpcodeInMaskForm = ShlBIT;
    case Instruction::AShr:
      OpcodeInMaskForm = AShrBIT;
    case Instruction::Mul:
      OpcodeInMaskForm = MulBIT;
    case Instruction::Add:
      OpcodeInMaskForm = AddBIT;
    case Instruction::Sub:
      OpcodeInMaskForm = SubBIT;
    case Instruction::And:
      OpcodeInMaskForm = AndBIT;
    case Instruction::Or:
      OpcodeInMaskForm = OrBIT;
    case Instruction::Xor:
      OpcodeInMaskForm = XorBIT;
      return MainOp.equal(Opcode) ||
             (initializeAltOp(I) && AltOp.equal(Opcode));
    MaskType InterchangeableMask = OpcodeInMaskForm;
    ConstantInt *CI = isBinOpWithConstantInt(I).first;
      constexpr MaskType CanBeAll =
          XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
      const APInt &CIValue = CI->getValue();
      case Instruction::Shl:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
      case Instruction::Mul:
        if (CIValue.isOne()) {
          InterchangeableMask = CanBeAll;
        InterchangeableMask = MulBIT | ShlBIT;
      case Instruction::Add:
      case Instruction::Sub:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
      case Instruction::And:
        InterchangeableMask = CanBeAll;
        InterchangeableMask = CanBeAll;
    return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
           (initializeAltOp(I) &&
            AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
  unsigned getMainOpcode() const { return MainOp.getOpcode(); }
  bool hasCandidateOpcode(unsigned Opcode) const {
    return MainOp.hasCandidateOpcode(Opcode);
  bool hasAltOp() const { return AltOp.I; }
  unsigned getAltOpcode() const {
    return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
    return MainOp.getOperand(I);
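// Context (added comment, not from the original source): InstructionsState
// describes a bundle of scalars by a main opcode plus an optional alternate
// opcode. A bundle like {add, sub, add, sub} gets MainOp = add and
// AltOp = sub and is emitted as two vector instructions blended by a
// shufflevector, while a uniform bundle has MainOp == AltOp, which is
// exactly what isAltShuffle() below tests.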
class InstructionsState {
  bool HasCopyables = false;
    assert(valid() && "InstructionsState is invalid.");
    assert(valid() && "InstructionsState is invalid.");
  unsigned getOpcode() const { return getMainOp()->getOpcode(); }
  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }
  Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
    assert(MainOp && "MainOp cannot be nullptr.");
    if (I->getOpcode() == MainOp->getOpcode())
    assert(AltOp && "AltOp cannot be nullptr.");
    if (I->getOpcode() == AltOp->getOpcode())
    if (!I->isBinaryOp())
    BinOpSameOpcodeHelper Converter(MainOp);
    if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
      BinOpSameOpcodeHelper AltConverter(AltOp);
      if (AltConverter.add(I) && AltConverter.add(AltOp) &&
          AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
    if (Converter.hasAltOp() && !isAltShuffle())
    return Converter.hasAltOp() ? AltOp : MainOp;
  bool isShiftOp() const {
    return getMainOp()->isShift() && getAltOp()->isShift();
    return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
  bool isMulDivLikeOp() const {
    constexpr std::array<unsigned, 8> MulDiv = {
        Instruction::Mul, Instruction::FMul, Instruction::SDiv,
        Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
        Instruction::URem, Instruction::FRem};
  bool isAddSubLikeOp() const {
    constexpr std::array<unsigned, 4> AddSub = {
        Instruction::Add, Instruction::Sub, Instruction::FAdd,
  bool isCmpOp() const {
    return (getOpcode() == Instruction::ICmp ||
  bool valid() const { return MainOp && AltOp; }
  explicit operator bool() const { return valid(); }
  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp,
                    bool HasCopyables = false)
      : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }
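  // Context (added comment, not from the original source): a "copyable"
  // element is a scalar that does not match the bundle's opcode but can be
  // modeled as if it did, e.g. treating a lone value 'x' in an add bundle as
  // the identity operation "add x, 0". The checks below sketch that rule as
  // far as this excerpt shows: same block as MainOp, a binary operation, and
  // convertible via BinOpSameOpcodeHelper.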
  bool isCopyableElement(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
    if (I->getParent() != MainOp->getParent() &&
    if (I->getOpcode() == MainOp->getOpcode())
    if (!I->isBinaryOp())
    BinOpSameOpcodeHelper Converter(MainOp);
  bool isNonSchedulable(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (getMainOp() == V)
    if (isCopyableElement(V)) {
      auto IsNonSchedulableCopyableElement = [this](Value *V) {
        return !I || isa<PHINode>(I) ||
               I->getParent() != MainOp->getParent() ||
                   !MainOp->comesBefore(I));
      return IsNonSchedulableCopyableElement(V);
  bool areInstructionsWithCopyableElements() const {
    assert(valid() && "InstructionsState is invalid.");
    return HasCopyables;
std::pair<Instruction *, SmallVector<Value *>>
  Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
  assert(SelectedOp && "Cannot convert the instruction.");
  if (I->isBinaryOp()) {
    return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));

  for (Value *V : VL) {
    if (Inst->getOpcode() == Opcode)
      BaseOp0 == Op0 || BaseOp1 == Op1 ||
         "Assessing comparisons of different types?");
  return (BasePred == Pred &&
         (BasePred == SwappedPred &&

    return InstructionsState::invalid();
    return InstructionsState::invalid();
      (VL.size() == 2 && InstCnt < 2))
    return InstructionsState::invalid();
  unsigned AltOpcode = Opcode;
  BinOpSameOpcodeHelper BinOpHelper(MainOp);
  bool SwappedPredsCompatible = IsCmpOp && [&]() {
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
    return InstructionsState::invalid();
  bool AnyPoison = InstCnt != VL.size();
    if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return InstructionsState::invalid();
    unsigned InstOpcode = I->getOpcode();
      if (BinOpHelper.add(I))
      Value *Op1 = I->getOperand(0);
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
      if (Opcode == AltOpcode) {
        assert(isValidForAlternation(Opcode) &&
               isValidForAlternation(InstOpcode) &&
               "Cast isn't safe for alternation, logic needs to be updated!");
        AltOpcode = InstOpcode;
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        assert(InstOpcode == AltOpcode &&
               "Alternate instructions are only supported by BinaryOperator "
      if ((VL.size() == 2 || SwappedPredsCompatible) &&
          (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
      if (MainOp != AltOp) {
      } else if (BasePred != CurrentPred) {
            isValidForAlternation(InstOpcode) &&
            "CmpInst isn't safe for alternation, logic needs to be updated!");
      if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
          AltPred == CurrentPred || AltPred == SwappedCurrentPred)
    } else if (InstOpcode == Opcode) {
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator and "
        if (Gep->getNumOperands() != 2 ||
          return InstructionsState::invalid();
          return InstructionsState::invalid();
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState::invalid();
          return InstructionsState::invalid();
        if (Call->hasOperandBundles() &&
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
          return InstructionsState::invalid();
          return InstructionsState::invalid();
        if (Mappings.size() != BaseMappings.size() ||
            Mappings.front().ISA != BaseMappings.front().ISA ||
            Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
            Mappings.front().VectorName != BaseMappings.front().VectorName ||
            Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
            Mappings.front().Shape.Parameters !=
                BaseMappings.front().Shape.Parameters)
          return InstructionsState::invalid();
      return InstructionsState::invalid();
  assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
  assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
         "Incorrect implementation of allSameOpcode.");
  InstructionsState S(MainOp, AltOp);
         "Invalid InstructionsState.");

  return all_of(VL, [&](Value *V) { return V->getType() == Ty; });
  unsigned Opcode = UserInst->getOpcode();
  case Instruction::Load: {
  case Instruction::Store: {
    return (SI->getPointerOperand() == Scalar);
  case Instruction::Call: {
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
             Arg.value().get() == Scalar;

    return LI->isSimple();
    return SI->isSimple();
    return !MI->isVolatile();
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
          (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
         "SubMask with many inputs support must be larger than the mask.");
    Mask.append(SubMask.begin(), SubMask.end());
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
    NewMask[I] = Mask[SubMask[I]];

  const size_t Sz = Order.size();
  for (unsigned I = 0; I < Sz; ++I) {
      UnusedIndices.reset(Order[I]);
      MaskedIndices.set(I);
  if (MaskedIndices.none())
         "Non-synced masked/available indices.");
    assert(Idx >= 0 && "Indices must be synced.");

                                unsigned Opcode0, unsigned Opcode1) {
      OpcodeMask.set(Lane * ScalarTyNumElements,
                     Lane * ScalarTyNumElements + ScalarTyNumElements);
         "Expected scalar constants.");
    std::fill_n(NewVal.begin() + I * VF, VF, V);

  const unsigned E = Indices.size();
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
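// Worked example (added comment, not from the original source): the loop
// above builds the inverse permutation of Indices. For Indices = {2, 0, 1}
// it produces Mask[2] = 0, Mask[0] = 1, Mask[1] = 2, i.e. Mask = {1, 2, 0},
// so applying Mask as a shuffle mask undoes the ordering that Indices
// describes.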
  assert(!Mask.empty() && "Expected non-empty mask.");
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
      Scalars[Mask[I]] = Prev[I];

    auto *IO = dyn_cast<Instruction>(V);
    return isa<PHINode>(IO) || IO->getParent() != I->getParent();

  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
           auto *IU = dyn_cast<Instruction>(U);
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);

  return !VL.empty() &&
  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
    const unsigned Limit = std::numeric_limits<unsigned>::max()) {
  unsigned NumParts = TTI.getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= Limit)
  if (NumParts >= Sz || Sz % NumParts != 0 ||
  class ScheduleEntity;
  class ScheduleCopyableData;
  class ScheduleBundle;
  struct StridedPtrInfo {
    Value *StrideVal = nullptr;
    const SCEV *StrideSCEV = nullptr;

      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),
    MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;
    const TreeEntry &Root = *VectorizableTree.front();
    if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
        !Root.Scalars.front()->getType()->isIntegerTy())
      return std::nullopt;
    auto It = MinBWs.find(&Root);
    if (It != MinBWs.end())
    if (Root.getOpcode() == Instruction::ZExt ||
        Root.getOpcode() == Instruction::SExt)
      return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
                            Root.getOpcode() == Instruction::SExt);
    return std::nullopt;
    return MinBWs.at(VectorizableTree.front().get()).second;
    if (ReductionBitWidth == 0 ||
        !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
        ReductionBitWidth >=
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
        VectorizableTree.front()->Scalars.front()->getType(),
        VectorizableTree.front()->getVectorFactor());
        VectorizableTree.front()->Scalars.front()->getContext(),
        VectorizableTree.front()->getVectorFactor());
    VectorizableTree.clear();
    ScalarToTreeEntries.clear();
    OperandsToTreeEntry.clear();
    ScalarsInSplitNodes.clear();
    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    LoadEntriesToVectorize.clear();
    IsGraphTransformMode = false;
    GatheredLoadsEntriesFirst.reset();
    CompressEntryToData.clear();
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    ExternalUsesWithNonUsers.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
    ReductionBitWidth = 0;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
    assert(!Order.empty() && "expected non-empty order");
    const unsigned Sz = Order.size();
      return P.value() == P.index() || P.value() == Sz;
                           bool IgnoreReorder);
  std::optional<OrdersType>
    return MaxVecRegSize;
    return MinVecRegSize;
    unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
                         MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
    return MaxVF ? MaxVF : UINT_MAX;
                       const int64_t Diff, StridedPtrInfo &SPtrInfo) const;
                    StridedPtrInfo &SPtrInfo, unsigned *BestVF = nullptr,
                    bool TryRecursiveCheck = true) const;
    ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
  template <typename T>
    return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));

    OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
       << " EdgeIdx:" << EdgeIdx << "}";

      : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
        MaxLevel(MaxLevel) {}
    auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
      auto AllUsersVectorized = [U1, U2, this](Value *V) {
          return U == U1 || U == U2 || R.isVectorized(U);
      return AllUsersVectorized(V1) && AllUsersVectorized(V2);
    if (R.TTI->isLegalBroadcastLoad(V1->getType(),
        ((int)V1->getNumUses() == NumLanes ||
         AllUsersAreInternal(V1, V2)))
    auto CheckSameEntryOrFail = [&]() {
          any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
      if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
        return CheckSameEntryOrFail();
          LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
          LI2->getPointerOperand(), DL, SE, true);
      if (!Dist || *Dist == 0) {
            R.TTI->isLegalMaskedGather(
          return CheckSameEntryOrFail();
      if (std::abs(*Dist) > NumLanes / 2)
      Value *EV2 = nullptr;
      int Dist = Idx2 - Idx1;
      if (std::abs(Dist) == 0)
      if (std::abs(Dist) > NumLanes / 2)
      return CheckSameEntryOrFail();
      if (I1->getParent() != I2->getParent())
        return CheckSameEntryOrFail();
          (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
           !S.isAltShuffle()) &&
          S.getMainOp()->getNumOperands();
      return CheckSameEntryOrFail();
    int ShallowScoreAtThisLevel =
    if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
        (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
         ShallowScoreAtThisLevel))
      return ShallowScoreAtThisLevel;
    assert(I1 && I2 && "Should have early exited.");
    for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
         OpIdx1 != NumOperands1; ++OpIdx1) {
      int MaxTmpScore = 0;
      unsigned MaxOpIdx2 = 0;
      bool FoundBest = false;
              ? I2->getNumOperands()
              : std::min(I2->getNumOperands(), OpIdx1 + 1);
      assert(FromIdx <= ToIdx && "Bad index");
      for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
        if (Op2Used.count(OpIdx2))
            I1, I2, CurrLevel + 1, {});
            TmpScore > MaxTmpScore) {
          MaxTmpScore = TmpScore;
        Op2Used.insert(MaxOpIdx2);
        ShallowScoreAtThisLevel += MaxTmpScore;
    return ShallowScoreAtThisLevel;
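  // Illustrative example (added comment, not from the original source): this
  // recursion is what lets the operand-reordering heuristic look through
  // operands. Given lanes
  //   A[0] = X[0] + Y[0];  A[1] = Y[1] + X[1];
  // the shallow scores of both operand orders can tie, but descending one
  // level shows that swapping the operands of lane 1 pairs X with X and Y
  // with Y (e.g. consecutive loads), so that order wins the look-ahead score.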
  struct OperandData {
    OperandData() = default;
    OperandData(Value *V, bool APO, bool IsUsed)
        : V(V), APO(APO), IsUsed(IsUsed) {}
    bool IsUsed = false;

  enum class ReorderingMode {

  unsigned ArgSize = 0;
  const Loop *L = nullptr;

  OperandData &getData(unsigned OpIdx, unsigned Lane) {
    return OpsVec[OpIdx][Lane];
  const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
    return OpsVec[OpIdx][Lane];
    for (unsigned OpIdx = 0, NumOperands = getNumOperands();
      for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
        OpsVec[OpIdx][Lane].IsUsed = false;
  void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
    std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);

  int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
    Value *IdxLaneV = getData(Idx, Lane).V;
    unsigned UniquesCount = Uniques.size();
    auto IdxIt = Uniques.find(IdxLaneV);
    unsigned UniquesCntWithIdxLaneV =
        IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
    auto OpIdxIt = Uniques.find(OpIdxLaneV);
    unsigned UniquesCntWithOpIdxLaneV =
        OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
    if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
    return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
                        UniquesCntWithOpIdxLaneV,
                    UniquesCntWithOpIdxLaneV -
           ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
                ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
                : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
  int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
    Value *IdxLaneV = getData(Idx, Lane).V;
    return R.areAllUsersVectorized(IdxLaneI)

  static const int ScoreScaleFactor = 10;

                        int Lane, unsigned OpIdx, unsigned Idx,
      int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
      if (Score <= -SplatScore) {
        Score += SplatScore;
      Score *= ScoreScaleFactor;
      Score += getExternalUseScore(Lane, OpIdx, Idx);

  std::optional<unsigned>
  getBestOperand(unsigned OpIdx, int Lane, int LastLane,
    unsigned NumOperands = getNumOperands();
    Value *OpLastLane = getData(OpIdx, LastLane).V;
    ReorderingMode RMode = ReorderingModes[OpIdx];
    if (RMode == ReorderingMode::Failed)
      return std::nullopt;
    bool OpIdxAPO = getData(OpIdx, Lane).APO;
    std::optional<unsigned> Idx;
        BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
    bool IsUsed = RMode == ReorderingMode::Splat ||
                  RMode == ReorderingMode::Constant ||
                  RMode == ReorderingMode::Load;
    for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
      OperandData &OpData = getData(Idx, Lane);
      bool OpAPO = OpData.APO;
      if (OpAPO != OpIdxAPO)
      case ReorderingMode::Load:
      case ReorderingMode::Opcode: {
        bool LeftToRight = Lane > LastLane;
        Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
        Value *OpRight = (LeftToRight) ? Op : OpLastLane;
        int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                      OpIdx, Idx, IsUsed, UsedLanes);
        if (Score > static_cast<int>(BestOp.Score) ||
            (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
          BestOp.Score = Score;
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
      case ReorderingMode::Constant:
          (!BestOp.Score && L && L->isLoopInvariant(Op))) {
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
      case ReorderingMode::Splat:
        IsUsed = Op == OpLastLane;
        if (Op == OpLastLane) {
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
      case ReorderingMode::Failed:
      getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
    return std::nullopt;
  unsigned getBestLaneToStartReordering() const {
    unsigned Min = UINT_MAX;
    unsigned SameOpNumber = 0;
    for (int I = getNumLanes(); I > 0; --I) {
      unsigned Lane = I - 1;
      OperandsOrderData NumFreeOpsHash =
          getMaxNumOperandsThatCanBeReordered(Lane);
      if (NumFreeOpsHash.NumOfAPOs < Min) {
        Min = NumFreeOpsHash.NumOfAPOs;
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
        auto [It, Inserted] =
            HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
    unsigned BestLane = 0;
    unsigned CntMin = UINT_MAX;
      if (Data.second.first < CntMin) {
        CntMin = Data.second.first;
        BestLane = Data.second.second;
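  // Rationale (added comment, not from the original source): the lane chosen
  // here seeds the reordering in reorder() below, which then walks outward
  // from it in both directions. Starting from the lane whose operands are
  // most constrained (per the counts hashed above) leaves the most freedom
  // when matching operands in the remaining lanes.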
  struct OperandsOrderData {
    unsigned NumOfAPOs = UINT_MAX;
    unsigned NumOpsWithSameOpcodeParent = 0;

  OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
    unsigned CntTrue = 0;
    unsigned NumOperands = getNumOperands();
    bool AllUndefs = true;
    unsigned NumOpsWithSameOpcodeParent = 0;
      const OperandData &OpData = getData(OpIdx, Lane);
          I->getParent() != Parent) {
        if (NumOpsWithSameOpcodeParent == 0) {
          NumOpsWithSameOpcodeParent = 1;
        Parent = I->getParent();
        --NumOpsWithSameOpcodeParent;
        ++NumOpsWithSameOpcodeParent;
    OperandsOrderData Data;
    Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
    Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;

                      const InstructionsState &S) {
    return VL.size() == getNumLanes();
           "Expected same number of lanes");
    assert(S.valid() && "InstructionsState is invalid.");
    OpsVec.resize(ArgSize);
    unsigned NumLanes = VL.size();
    for (OperandDataVec &Ops : OpsVec)
      Ops.resize(NumLanes);
      bool IsInverseOperation = false;
      if (S.isCopyableElement(VL[Lane])) {
        assert(I && "Expected instruction");
        auto [SelectedOp, Ops] = convertTo(I, S);
        bool APO = (OpIdx == 0) ? false : IsInverseOperation;
  unsigned getNumOperands() const { return ArgSize; }
  unsigned getNumLanes() const { return OpsVec[0].size(); }
  Value *getValue(unsigned OpIdx, unsigned Lane) const {
    return getData(OpIdx, Lane).V;
  bool empty() const { return OpsVec.empty(); }
  void clear() { OpsVec.clear(); }

  bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
           "Op is expected to be getValue(OpIdx, Lane).");
    bool OpAPO = getData(OpIdx, Lane).APO;
    bool IsInvariant = L && L->isLoopInvariant(Op);
    for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
      bool FoundCandidate = false;
      for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
        OperandData &Data = getData(OpI, Ln);
        if (Data.APO != OpAPO || Data.IsUsed)
        Value *OpILane = getValue(OpI, Lane);
             L->isLoopInvariant(Data.V))) {
          FoundCandidate = true;
      if (!FoundCandidate)
    return getNumLanes() == 2 || Cnt > 1;
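  // Example (added comment, not from the original source): a true result here
  // switches the operand to ReorderingMode::Splat. For lanes
  // {x + a, x + b, x + c, x + d} the repeated 'x' cannot be paired with a
  // distinct value in every lane, so it is cheaper to broadcast a single 'x'
  // into all lanes than to gather four copies.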
3196 "Op is expected to be getValue(OpIdx, Lane).");
3197 bool OpAPO = getData(
OpIdx, Lane).APO;
3198 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3202 const OperandData &
Data = getData(OpI, Ln);
3203 if (
Data.APO != OpAPO ||
Data.IsUsed)
3205 Value *OpILn = getValue(OpI, Ln);
3206 return (L && L->isLoopInvariant(OpILn)) ||
3218 const InstructionsState &S,
const BoUpSLP &R)
3219 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3220 L(R.LI->getLoopFor(S.getMainOp()->
getParent())) {
3222 appendOperands(RootVL,
Operands, S);
3230 "Expected same num of lanes across all operands");
3231 for (
unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3232 OpVL[Lane] = OpsVec[
OpIdx][Lane].V;
3240 unsigned NumOperands = getNumOperands();
3241 unsigned NumLanes = getNumLanes();
3261 unsigned FirstLane = getBestLaneToStartReordering();
3270 if (shouldBroadcast(OpLane0,
OpIdx, FirstLane) ||
3271 !canBeVectorized(OpILane0,
OpIdx, FirstLane))
3272 ReorderingModes[
OpIdx] = ReorderingMode::Splat;
3274 ReorderingModes[
OpIdx] = ReorderingMode::Load;
3276 ReorderingModes[
OpIdx] = ReorderingMode::Opcode;
3278 ReorderingModes[
OpIdx] = ReorderingMode::Constant;
3281 ReorderingModes[
OpIdx] = ReorderingMode::Splat;
3291 auto &&SkipReordering = [
this]() {
3294 for (
const OperandData &
Data : Op0)
3297 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
3298 if (
any_of(
Op, [&UniqueValues](
const OperandData &
Data) {
3305 return UniqueValues.
size() != 2 &&
3307 UniqueValues.
size());
3319 if (SkipReordering())
3322 bool StrategyFailed =
false;
3330 for (
unsigned I = 0;
I < NumOperands; ++
I)
3331 MainAltOps[
I].push_back(getData(
I, FirstLane).V);
3334 UsedLanes.
set(FirstLane);
3335 for (
unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3337 for (
int Direction : {+1, -1}) {
3338 int Lane = FirstLane + Direction * Distance;
3339 if (Lane < 0 || Lane >= (
int)NumLanes)
3341 UsedLanes.
set(Lane);
3342 int LastLane = Lane - Direction;
3343 assert(LastLane >= 0 && LastLane < (
int)NumLanes &&
3348 std::optional<unsigned> BestIdx =
3349 getBestOperand(
OpIdx, Lane, LastLane, ReorderingModes,
3350 MainAltOps[
OpIdx], UsedLanes);
3357 swap(
OpIdx, *BestIdx, Lane);
3360 StrategyFailed =
true;
3364 OperandData &AltOp = getData(
OpIdx, Lane);
3365 InstructionsState OpS =
3367 if (OpS && OpS.isAltShuffle())
3374 if (!StrategyFailed)
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    case ReorderingMode::Load:
    case ReorderingMode::Opcode:
    case ReorderingMode::Constant:
    case ReorderingMode::Splat:
    case ReorderingMode::Failed:

    const unsigned Indent = 2;
    for (const OperandDataVec &OpDataVec : OpsVec) {
      OS << "Operand " << Cnt++ << "\n";
      for (const OperandData &OpData : OpDataVec) {
        OS.indent(Indent) << "{";
        if (Value *V = OpData.V)
        OS << ", APO:" << OpData.APO << "}\n";

    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
          Candidates[I].second,
      if (Score > BestScore) {

    DeletedInstructions.insert(I);
  template <typename T>
      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
    for (T *V : DeadVals) {
    for (T *V : DeadVals) {
      if (!V || !Processed.insert(V).second)
      for (Use &U : I->operands()) {
            OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
               return Entry->VectorizedValue == OpI;
      I->dropAllReferences();
    for (T *V : DeadVals) {
      if (!I->getParent())
            cast<Instruction>(U.getUser()));
             "trying to erase instruction with users.");
      I->removeFromParent();
    while (!DeadInsts.empty()) {
      if (!VI || !VI->getParent())
             "Live instruction found in dead worklist!");
      assert(VI->use_empty() && "Instructions with uses are not dead.");
      for (Use &OpU : VI->operands()) {
        Value *OpV = OpU.get();
        if (!DeletedInstructions.contains(OpI) &&
            (!OpI->getType()->isVectorTy() ||
             none_of(VectorValuesAndScales,
                     [&](const std::tuple<Value *, unsigned, bool> &V) {
                       return std::get<0>(V) == OpI;
      VI->removeFromParent();
      SE->forgetValue(VI);

    return AnalyzedReductionsRoots.count(I);
    AnalyzedReductionsRoots.insert(I);
    return AnalyzedReductionVals.contains(hash_value(VL));
    AnalyzedReductionVals.insert(hash_value(VL));
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();
    return MustGather.contains(V);
    return NonScheduledFirst.contains(V);
    assert(V && "V cannot be nullptr.");
    return ScalarToTreeEntries.contains(V);
  bool collectValuesToDemote(
      const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
      bool &IsProfitableToDemote, bool IsTruncRoot) const;
  void buildReorderableOperands(
  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
  bool areAllUsersVectorized(
  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
  TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
    return const_cast<TreeEntry *>(
        getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
  Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
  getCastContextHint(const TreeEntry &TE) const;
      const InstructionsState &LocalState,
      unsigned InterleaveFactor = 0);
      bool ResizeAllowed = false) const;
  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
  Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
  Instruction &getLastInstructionInBundle(const TreeEntry *E);
  std::optional<TargetTransformInfo::ShuffleKind>
      unsigned NumParts) const;
  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(
  isGatherShuffledEntry(
      unsigned NumParts, bool ForOrder = false);
      Type *ScalarTy) const;
  void setInsertPointAfterBundle(const TreeEntry *E);
  bool isFullyVectorizableTinyTree(bool ForReduction) const;
  void tryToVectorizeGatheredLoads(
      std::tuple<BasicBlock *, Value *, Type *>,
  collectUserStores(const BoUpSLP::TreeEntry *TE) const;
  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
  void reorderGatherNode(TreeEntry &TE);
    TreeEntry(VecTreeTy &Container) : Container(Container) {}
      if (State == TreeEntry::SplitVectorize)
      SmallVector<int> Mask;
    SmallVector<int> getSplitMask() const {
      assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
             "Expected only split vectorize node.");
      unsigned CommonVF = std::max<unsigned>(
          CombinedEntriesWithIndices.back().second,
          Scalars.size() - CombinedEntriesWithIndices.back().second);
      for (auto [Idx, I] : enumerate(ReorderIndices))
            Idx + (Idx >= CombinedEntriesWithIndices.back().second
                       ? CommonVF - CombinedEntriesWithIndices.back().second
    void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
                          ArrayRef<int> MaskOrder);
      if (Mask.size() != VL.size() && VL.size() == Scalars.size())
        return std::equal(VL.begin(), VL.end(), Scalars.begin());
          [Scalars](Value *V, int Idx) {
            return (isa<UndefValue>(V) && Idx == PoisonMaskElem) ||
                   (Idx != PoisonMaskElem && V == Scalars[Idx]);
      if (!ReorderIndices.empty()) {
        SmallVector<int> Mask;
        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          return IsSame(Scalars, Mask);
      return IsSame(Scalars, ReuseShuffleIndices);
    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
      SmallBitVector Used(getNumOperands());
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (getOperand(K) == TE.getOperand(I)) {
        if (PrevCount == Used.count())
    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    bool isGather() const { return State == NeedToGather; }
    WeakTrackingVH VectorizedValue = nullptr;
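    // Overview (added comment, not from the original source): a TreeEntry is
    // one node of the SLP graph. Judging from the dump() below, its State
    // distinguishes regular Vectorize nodes from ScatterVectorize,
    // StridedVectorize, CompressVectorize, NeedToGather (a buildvector of
    // scalars), CombinedVectorize and SplitVectorize nodes.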
    enum CombinedOpcode {
      MinMax = Instruction::OtherOpsEnd + 1,
    CombinedOpcode CombinedOp = NotCombinedOp;
    SmallVector<int, 4> ReuseShuffleIndices;
    SmallVector<unsigned, 4> ReorderIndices;
    VecTreeTy &Container;
    EdgeInfo UserTreeIndex;
    SmallPtrSet<const Value *, 4> CopyableElements;
    InstructionsState S = InstructionsState::invalid();
    unsigned InterleaveFactor = 0;
    bool DoesNotNeedToSchedule = false;
      if (Operands.size() < OpIdx + 1)
        Operands.resize(OpIdx + 1);
             "Number of operands is greater than the number of scalars.");
    unsigned getInterleaveFactor() const { return InterleaveFactor; }
    void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
    void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
    bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
        setOperand(I, Operands[I]);
    void reorderOperands(ArrayRef<int> Mask) {
      return Operands[OpIdx];
      return Operands[OpIdx];
    unsigned getNumOperands() const { return Operands.size(); }
    Value *getSingleOperand(unsigned OpIdx) const {
      return Operands[OpIdx][0];
    bool isAltShuffle() const { return S.isAltShuffle(); }
    Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
      return S.getMatchingMainOpOrAltOp(I);
      if (I && getMatchingMainOpOrAltOp(I))
      return S.getMainOp();
    void setOperations(const InstructionsState &S) {
      assert(S && "InstructionsState is invalid.");
    Instruction *getMainOp() const { return S.getMainOp(); }
    Instruction *getAltOp() const { return S.getAltOp(); }
    unsigned getOpcode() const { return S.getOpcode(); }
    unsigned getAltOpcode() const { return S.getAltOpcode(); }
    bool hasState() const { return S.valid(); }
    void addCopyableElement(Value *V) {
      assert(S.isCopyableElement(V) && "Not a copyable element.");
      CopyableElements.insert(V);
    bool isCopyableElement(Value *V) const {
      return CopyableElements.contains(V);
    bool hasCopyableElements() const { return !CopyableElements.empty(); }
    const InstructionsState &getOperations() const { return S; }
    unsigned findLaneForValue(Value *V) const {
      unsigned FoundLane = getVectorFactor();
      for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
           std::advance(It, 1)) {
        FoundLane = std::distance(Scalars.begin(), It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())
        if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
      assert(FoundLane < getVectorFactor() && "Unable to find given value.");
    buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
                          SmallVectorImpl<int> &Mask,
                          SmallVectorImpl<Value *> *OpScalars = nullptr,
                          SmallVectorImpl<Value *> *AltScalars = nullptr) const;
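    // Example (added comment, not from the original source): for an
    // alternate-opcode node such as {a0 + b0, a1 - b1, a2 + b2, a3 - b3},
    // both a full add and a full sub vector are emitted, and
    // buildAltOpShuffleMask() produces the blend mask that picks lane I from
    // the second vector exactly when IsAltOp(Scalars[I]) holds, e.g.
    // {0, 5, 2, 7} for the pattern above.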
    bool isNonPowOf2Vec() const {
      return IsNonPowerOf2;
    hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;
    Value *getOrdered(unsigned Idx) const {
      assert(isGather() && "Must be used only for buildvectors/gathers.");
      if (ReorderIndices.empty())
        return Scalars[Idx];
      SmallVector<int> Mask;
      return Scalars[Mask[Idx]];

      dbgs() << Idx << ".\n";
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
      dbgs() << "State: ";
      if (S && hasCopyableElements())
        dbgs() << "[[Copyable]] ";
        if (InterleaveFactor > 0) {
          dbgs() << "Vectorize with interleave factor " << InterleaveFactor
          dbgs() << "Vectorize\n";
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
      case CompressVectorize:
        dbgs() << "CompressVectorize\n";
        dbgs() << "NeedToGather\n";
      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";
      case SplitVectorize:
        dbgs() << "SplitVectorize\n";
        dbgs() << "MainOp: " << *S.getMainOp() << "\n";
        dbgs() << "AltOp: " << *S.getAltOp() << "\n";
        dbgs() << "MainOp: NULL\n";
        dbgs() << "AltOp: NULL\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
      for (int ReuseIdx : ReuseShuffleIndices)
        dbgs() << ReuseIdx << ", ";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "UserTreeIndex: ";
        dbgs() << UserTreeIndex;
        dbgs() << "<invalid>";
      if (!CombinedEntriesWithIndices.empty()) {
        dbgs() << "Combined entries: ";
          dbgs() << "Entry index " << P.first << " with offset " << P.second;

            StringRef Banner) const {
    dbgs() << "SLP: " << Banner << ":\n";
    dbgs() << "SLP: Costs:\n";
    dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
    dbgs() << "SLP: VectorCost = " << VecCost << "\n";
    dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
    dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
           << ReuseShuffleCost + VecCost - ScalarCost << "\n";
                          const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {}) {
    auto Invalid = ScheduleBundle::invalid();
    return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);

                          const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {},
                          unsigned InterleaveFactor = 0) {
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                                ReuseShuffleIndices, ReorderIndices);
    if (E && InterleaveFactor > 0)
      E->setInterleave(InterleaveFactor);

                          TreeEntry::EntryState EntryState,
                          ScheduleBundle &Bundle, const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {}) {
    assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
                         EntryState == TreeEntry::SplitVectorize)) ||
            (Bundle && EntryState != TreeEntry::NeedToGather &&
             EntryState != TreeEntry::SplitVectorize)) &&
           "Need to vectorize gather entry?");
    if (GatheredLoadsEntriesFirst.has_value() &&
        EntryState == TreeEntry::NeedToGather && S &&
        S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
        !UserTreeIdx.UserTE)
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    if (UserTreeIdx.UserTE)
      OperandsToTreeEntry.try_emplace(
          std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
            ReuseShuffleIndices.empty()) &&
           "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->setOperations(S);
      Last->Scalars.assign(VL.size(), nullptr);
                [VL](unsigned Idx) -> Value * {
                  if (Idx >= VL.size())
                    return UndefValue::get(VL.front()->getType());
      Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    if (EntryState == TreeEntry::SplitVectorize) {
      assert(S && "Split nodes must have operations.");
      Last->setOperations(S);
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        auto It = ScalarsInSplitNodes.find(V);
        if (It == ScalarsInSplitNodes.end()) {
          ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
    } else if (!Last->isGather()) {
          (!S.areInstructionsWithCopyableElements() &&
          all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
        Last->setDoesNotNeedToSchedule();
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        if (S.isCopyableElement(V)) {
          Last->addCopyableElement(V);
        auto It = ScalarToTreeEntries.find(V);
        if (It == ScalarToTreeEntries.end()) {
          ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
      assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
             "Bundle and VL out of sync");
      if (!Bundle.getBundle().empty()) {
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
        auto *BundleMember = Bundle.getBundle().begin();
        SmallPtrSet<Value *, 4> Processed;
        for (Value *V : VL) {
          if (S.isNonSchedulable(V) || !Processed.insert(V).second)
        assert(BundleMember == Bundle.getBundle().end() &&
               "Bundle and VL out of sync");
        Bundle.setTreeEntry(Last);
      bool AllConstsOrCasts = true;
      for (Value *V : VL) {
        if (S && S.areInstructionsWithCopyableElements() &&
            S.isCopyableElement(V))
          Last->addCopyableElement(V);
          AllConstsOrCasts &= I && I->getType()->isIntegerTy();
          if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
              !UserTreeIdx.UserTE->isGather())
            ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
      if (AllConstsOrCasts)
            std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert_range(VL);
    if (UserTreeIdx.UserTE)
      Last->UserTreeIndex = UserTreeIdx;
  TreeEntry::VecTreeTy VectorizableTree;

    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();

    assert(V && "V cannot be nullptr.");
    auto It = ScalarToTreeEntries.find(V);
    if (It == ScalarToTreeEntries.end())
    return It->getSecond();

    assert(V && "V cannot be nullptr.");
    auto It = ScalarsInSplitNodes.find(V);
    if (It == ScalarsInSplitNodes.end())
    return It->getSecond();

                             bool SameVF = false) const {
    assert(V && "V cannot be nullptr.");
    for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
      if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))

  bool areAltOperandsProfitable(const InstructionsState &S,

  class ScalarsVectorizationLegality {
    InstructionsState S;
    bool TryToFindDuplicates;
    bool TrySplitVectorize;
    ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
                                 bool TryToFindDuplicates = true,
                                 bool TrySplitVectorize = false)
        : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
          TrySplitVectorize(TrySplitVectorize) {
      assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
             "Inconsistent state");
    const InstructionsState &getInstructionsState() const { return S; };
    bool isLegal() const { return IsLegal; }
    bool tryToFindDuplicates() const { return TryToFindDuplicates; }
    bool trySplitVectorize() const { return TrySplitVectorize; }

  ScalarsVectorizationLegality
      bool TryCopyableElementsVectorization) const;
  TreeEntry::EntryState getScalarsVectorizationState(
      bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
      SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
  SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
      OperandsToTreeEntry;
  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
  SmallDenseMap<Value *, unsigned> InstrElementSize;
  SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
  SetVector<const TreeEntry *> PostponedGathers;
  using ValueToGatherNodesMap =
      DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
  ValueToGatherNodesMap ValueToGatherNodes;
  SetVector<unsigned> LoadEntriesToVectorize;
  bool IsGraphTransformMode = false;
  std::optional<unsigned> GatheredLoadsEntriesFirst;
  SmallDenseMap<const TreeEntry *,
                std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
      CompressEntryToData;

  struct ExternalUser {
    ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
        : Scalar(S), User(U), E(E), Lane(L) {}
    Value *Scalar = nullptr;
    llvm::User *User = nullptr;
  using UserList = SmallVector<ExternalUser, 16>;

  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
                 Instruction *Inst2) {
    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
    auto Res = AliasCache.try_emplace(Key);
      return Res.first->second;
    bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
    Res.first->getSecond() = Aliased;

  using AliasCacheKey = std::pair<Instruction *, Instruction *>;
  SmallDenseMap<AliasCacheKey, bool> AliasCache;
  BatchAAResults BatchAA;
  DenseSet<Instruction *> DeletedInstructions;
  SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
  DenseSet<size_t> AnalyzedReductionVals;
  DenseSet<Value *> AnalyzedMinBWVals;
  UserList ExternalUses;
  SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
  SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
  SmallPtrSet<const Value *, 32> EphValues;
  SetVector<Instruction *> GatherShuffleExtractSeq;
  DenseSet<BasicBlock *> CSEBlocks;
  DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4654 class ScheduleEntity {
4655 friend class ScheduleBundle;
4656 friend class ScheduleData;
4657 friend class ScheduleCopyableData;
4660 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
4661 Kind getKind()
const {
return K; }
4662 ScheduleEntity(Kind K) : K(K) {}
4666 int SchedulingPriority = 0;
4669 bool IsScheduled =
false;
4671 const Kind K = Kind::ScheduleData;
4674 ScheduleEntity() =
delete;
4676 void setSchedulingPriority(
int Priority) { SchedulingPriority = Priority; }
4677 int getSchedulingPriority()
const {
return SchedulingPriority; }
4678 bool isReady()
const {
4680 return SD->isReady();
4682 return CD->isReady();
4688 bool hasValidDependencies()
const {
4690 return SD->hasValidDependencies();
4692 return CD->hasValidDependencies();
4696 int getUnscheduledDeps()
const {
4698 return SD->getUnscheduledDeps();
4700 return CD->getUnscheduledDeps();
4704 int incrementUnscheduledDeps(
int Incr) {
4706 return SD->incrementUnscheduledDeps(Incr);
4710 int getDependencies()
const {
4712 return SD->getDependencies();
4718 return SD->getInst();
4723 bool isScheduled()
const {
return IsScheduled; }
4724 void setScheduled(
bool Scheduled) { IsScheduled = Scheduled; }
4726 static bool classof(
const ScheduleEntity *) {
return true; }
4728#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4729 void dump(raw_ostream &OS)
const {
4731 return SD->dump(OS);
4733 return CD->dump(OS);
4744#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4746 const BoUpSLP::ScheduleEntity &SE) {
4756 class ScheduleData final :
public ScheduleEntity {
4760 enum { InvalidDeps = -1 };
4762 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4763 static bool classof(
const ScheduleEntity *Entity) {
4764 return Entity->getKind() == Kind::ScheduleData;
4767 void init(
int BlockSchedulingRegionID, Instruction *
I) {
4768 NextLoadStore =
nullptr;
4769 IsScheduled =
false;
4770 SchedulingRegionID = BlockSchedulingRegionID;
4771 clearDependencies();
4777 if (hasValidDependencies()) {
4778 assert(UnscheduledDeps <= Dependencies &&
"invariant");
4780 assert(UnscheduledDeps == Dependencies &&
"invariant");
4784 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4785 "unexpected scheduled state");
4792 bool hasValidDependencies()
const {
return Dependencies != InvalidDeps; }
4796 bool isReady()
const {
return UnscheduledDeps == 0 && !IsScheduled; }
4801 int incrementUnscheduledDeps(
int Incr) {
4802 assert(hasValidDependencies() &&
4803 "increment of unscheduled deps would be meaningless");
4804 UnscheduledDeps += Incr;
4805 return UnscheduledDeps;
4810 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4813 void clearDependencies() {
4814 clearDirectDependencies();
4815 MemoryDependencies.clear();
4816 ControlDependencies.clear();
4823 void clearDirectDependencies() {
4824 Dependencies = InvalidDeps;
4825 resetUnscheduledDeps();
4826 IsScheduled =
false;
4830 int getUnscheduledDeps()
const {
return UnscheduledDeps; }
4832 int getDependencies()
const {
return Dependencies; }
4834 void initDependencies() { Dependencies = 0; }
4836 void incDependencies() { Dependencies++; }
4839 int getSchedulingRegionID()
const {
return SchedulingRegionID; }
4846 return MemoryDependencies;
4849 void addMemoryDependency(ScheduleData *Dep) {
4850 MemoryDependencies.push_back(Dep);
4854 return ControlDependencies;
4857 void addControlDependency(ScheduleData *Dep) {
4858 ControlDependencies.push_back(Dep);
4861 ScheduleData *getNextLoadStore()
const {
return NextLoadStore; }
4862 void setNextLoadStore(ScheduleData *
Next) { NextLoadStore =
Next; }
4864 void dump(raw_ostream &OS)
const { OS << *Inst; }
4876 ScheduleData *NextLoadStore =
nullptr;
4880 SmallVector<ScheduleData *> MemoryDependencies;
4886 SmallVector<ScheduleData *> ControlDependencies;
4890 int SchedulingRegionID = 0;
4896 int Dependencies = InvalidDeps;
4902 int UnscheduledDeps = InvalidDeps;
4907 const BoUpSLP::ScheduleData &SD) {
4913 class ScheduleBundle final :
public ScheduleEntity {
4917 bool IsValid =
true;
4919 TreeEntry *TE =
nullptr;
4920 ScheduleBundle(
bool IsValid)
4921 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
4924 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
4925 static bool classof(
const ScheduleEntity *Entity) {
4926 return Entity->getKind() == Kind::ScheduleBundle;
4931 for (
const ScheduleEntity *SD : Bundle) {
4932 if (SD->hasValidDependencies()) {
4933 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
4936 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
4940 if (isScheduled()) {
4941 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
4942 "unexpected scheduled state");
4948 int unscheduledDepsInBundle()
const {
4949 assert(*
this &&
"bundle must not be empty");
4951 for (
const ScheduleEntity *BundleMember : Bundle) {
4952 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
4953 return ScheduleData::InvalidDeps;
4954 Sum += BundleMember->getUnscheduledDeps();
4962 bool hasValidDependencies()
const {
4963 return all_of(Bundle, [](
const ScheduleEntity *SD) {
4964 return SD->hasValidDependencies();
4970 bool isReady()
const {
4971 assert(*
this &&
"bundle must not be empty");
4972 return unscheduledDepsInBundle() == 0 && !isScheduled();
4980 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
4983 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
4984 TreeEntry *getTreeEntry()
const {
return TE; }
4986 static ScheduleBundle invalid() {
return {
false}; }
4988 operator bool()
const {
return IsValid; }
4991 void dump(raw_ostream &OS)
const {
5000 OS << *SD->getInst();
5014 const BoUpSLP::ScheduleBundle &Bundle) {
5025 class ScheduleCopyableData final :
public ScheduleEntity {
5032 int SchedulingRegionID = 0;
5034 ScheduleBundle &Bundle;
5037 ScheduleCopyableData(
int BlockSchedulingRegionID,
Instruction *
I,
5038 const EdgeInfo &EI, ScheduleBundle &Bundle)
5039 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(
I), EI(EI),
5040 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5041 static bool classof(
const ScheduleEntity *Entity) {
5042 return Entity->getKind() == Kind::ScheduleCopyableData;
5047 if (hasValidDependencies()) {
5048 assert(UnscheduledDeps <= Dependencies &&
"invariant");
5050 assert(UnscheduledDeps == Dependencies &&
"invariant");
5054 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5055 "unexpected scheduled state");
5062 bool hasValidDependencies()
const {
5063 return Dependencies != ScheduleData::InvalidDeps;
5068 bool isReady()
const {
return UnscheduledDeps == 0 && !IsScheduled; }
5073 int incrementUnscheduledDeps(
int Incr) {
5074 assert(hasValidDependencies() &&
5075 "increment of unscheduled deps would be meaningless");
5076 UnscheduledDeps += Incr;
5077 assert(UnscheduledDeps >= 0 &&
"invariant");
5078 return UnscheduledDeps;
5083 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5086 int getUnscheduledDeps()
const {
return UnscheduledDeps; }
5088 int getDependencies()
const {
return Dependencies; }
5090 void initDependencies() { Dependencies = 0; }
5092 void incDependencies() { Dependencies++; }
5095 int getSchedulingRegionID()
const {
return SchedulingRegionID; }
5101 void clearDependencies() {
5102 Dependencies = ScheduleData::InvalidDeps;
5103 UnscheduledDeps = ScheduleData::InvalidDeps;
5104 IsScheduled =
false;
5108 const EdgeInfo &getEdgeInfo()
const {
return EI; }
5111 ScheduleBundle &getBundle() {
return Bundle; }
5112 const ScheduleBundle &getBundle()
const {
return Bundle; }
5114#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5115 void dump(raw_ostream &OS)
const { OS <<
"[Copyable]" << *getInst(); }
5126 int Dependencies = ScheduleData::InvalidDeps;
5132 int UnscheduledDeps = ScheduleData::InvalidDeps;
5162 struct BlockScheduling {
5164 : BB(BB), ChunkSize(BB->
size()), ChunkPos(ChunkSize) {}
5167 ScheduledBundles.clear();
5168 ScheduledBundlesList.
clear();
5169 ScheduleCopyableDataMap.clear();
5170 ScheduleCopyableDataMapByInst.clear();
5171 ScheduleCopyableDataMapByInstUser.clear();
5172 ScheduleCopyableDataMapByUsers.clear();
5174 ScheduleStart =
nullptr;
5175 ScheduleEnd =
nullptr;
5176 FirstLoadStoreInRegion =
nullptr;
5177 LastLoadStoreInRegion =
nullptr;
5178 RegionHasStackSave =
false;
5182 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5185 ScheduleRegionSize = 0;
5189 ++SchedulingRegionID;
5195 if (BB !=
I->getParent())
5198 ScheduleData *SD = ScheduleDataMap.lookup(
I);
5199 if (SD && isInSchedulingRegion(*SD))
5204 ScheduleData *getScheduleData(
Value *V) {
5210 ScheduleCopyableData *getScheduleCopyableData(
const EdgeInfo &EI,
5211 const Value *V)
const {
5212 if (ScheduleCopyableDataMap.empty())
5214 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5215 if (It == ScheduleCopyableDataMap.end())
5217 ScheduleCopyableData *SD = It->getSecond().get();
5218 if (!isInSchedulingRegion(*SD))
5226 getScheduleCopyableData(
const Value *User,
unsigned OperandIdx,
5228 if (ScheduleCopyableDataMapByInstUser.empty())
5230 const auto It = ScheduleCopyableDataMapByInstUser.find(
5231 std::make_pair(std::make_pair(User, OperandIdx), V));
5232 if (It == ScheduleCopyableDataMapByInstUser.end())
5235 for (ScheduleCopyableData *SD : It->getSecond()) {
5236 if (isInSchedulingRegion(*SD))
5250 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5254 if (ScheduleCopyableDataMap.empty())
5256 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5257 SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
5258 for (
const Use &U :
User->operands()) {
5262 if (Entries.
empty())
5266 for (TreeEntry *TE : Entries) {
5272 bool IsCommutativeUser =
5275 EdgeInfo EI(TE,
U.getOperandNo());
5278 OrderedEntriesCount.
try_emplace(TE, 0).first->getSecond();
5279 if (!getScheduleCopyableData(EI,
Op) && OpCnt <
NumOps)
5285 ++PotentiallyReorderedEntriesCount.
try_emplace(TE, 0)
5286 .first->getSecond();
5290 if (!PotentiallyReorderedEntriesCount.
empty()) {
5291 for (
auto &
P : PotentiallyReorderedEntriesCount) {
5292 auto *It =
find(
P.first->Scalars, User);
5293 assert(It !=
P.first->Scalars.end() &&
5294 "User is not in the tree entry");
5295 int Lane = std::distance(
P.first->Scalars.begin(), It);
5296 assert(Lane >= 0 &&
"Lane is not found");
5298 Lane =
P.first->ReorderIndices[Lane];
5299 assert(Lane <
static_cast<int>(
P.first->Scalars.size()) &&
5300 "Couldn't find extract lane");
5301 SmallVector<unsigned> OpIndices;
5302 for (
unsigned OpIdx :
5304 P.first->getMainOp()))) {
5305 if (
P.first->getOperand(
OpIdx)[Lane] ==
Op &&
5306 getScheduleCopyableData(EdgeInfo(
P.first,
OpIdx),
Op))
5310 return all_of(PotentiallyReorderedEntriesCount,
5311 [&](
const std::pair<const TreeEntry *, unsigned> &
P) {
5312 return P.second ==
NumOps - 1;
5319 getScheduleCopyableData(
const Instruction *
I)
const {
5320 if (ScheduleCopyableDataMapByInst.empty())
5322 const auto It = ScheduleCopyableDataMapByInst.find(
I);
5323 if (It == ScheduleCopyableDataMapByInst.end())
5326 for (ScheduleCopyableData *SD : It->getSecond()) {
5327 if (isInSchedulingRegion(*SD))
5334 getScheduleCopyableDataUsers(
const Instruction *User)
const {
5335 if (ScheduleCopyableDataMapByUsers.empty())
5337 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5338 if (It == ScheduleCopyableDataMapByUsers.end())
5341 for (ScheduleCopyableData *SD : It->getSecond()) {
5342 if (isInSchedulingRegion(*SD))
5348 ScheduleCopyableData &addScheduleCopyableData(
const EdgeInfo &EI,
5350 int SchedulingRegionID,
5351 ScheduleBundle &Bundle) {
5352 assert(!getScheduleCopyableData(EI,
I) &&
"already in the map");
5353 ScheduleCopyableData *CD =
5354 ScheduleCopyableDataMap
5355 .try_emplace(std::make_pair(EI,
I),
5356 std::make_unique<ScheduleCopyableData>(
5357 SchedulingRegionID,
I, EI, Bundle))
5360 ScheduleCopyableDataMapByInst[
I].push_back(CD);
5364 assert(It !=
Op.end() &&
"Lane not set");
5365 SmallPtrSet<Instruction *, 4> Visited;
5367 int Lane = std::distance(
Op.begin(), It);
5368 assert(Lane >= 0 &&
"Lane not set");
5370 !EI.UserTE->ReorderIndices.empty())
5371 Lane = EI.UserTE->ReorderIndices[Lane];
5372 assert(Lane <
static_cast<int>(EI.UserTE->Scalars.size()) &&
5373 "Couldn't find extract lane");
5375 if (!Visited.
insert(In).second) {
5379 ScheduleCopyableDataMapByInstUser
5380 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx),
I))
5383 ScheduleCopyableDataMapByUsers.try_emplace(
I)
5390 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5391 if (ScheduleCopyableData *UserCD =
5392 getScheduleCopyableData(UserEI, In))
5393 ScheduleCopyableDataMapByUsers[
I].remove(UserCD);
5396 }
while (It !=
Op.end());
5398 ScheduleCopyableDataMapByUsers.try_emplace(
I).first->getSecond().insert(
5408 auto It = ScheduledBundles.find(
I);
5409 if (It == ScheduledBundles.end())
5411 return It->getSecond();
5415 bool isInSchedulingRegion(
const ScheduleEntity &SD)
const {
5417 return Data->getSchedulingRegionID() == SchedulingRegionID;
5419 return CD->getSchedulingRegionID() == SchedulingRegionID;
5421 [&](
const ScheduleEntity *BundleMember) {
5422 return isInSchedulingRegion(*BundleMember);
5428 template <
typename ReadyListType>
5429 void schedule(
const BoUpSLP &R,
const InstructionsState &S,
5430 const EdgeInfo &EI, ScheduleEntity *
Data,
5431 ReadyListType &ReadyList) {
5432 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5437 auto DecrUnsched = [&](
auto *
Data,
bool IsControl =
false) {
5438 if ((IsControl ||
Data->hasValidDependencies()) &&
5439 Data->incrementUnscheduledDeps(-1) == 0) {
5446 CopyableBundle.
push_back(&CD->getBundle());
5447 Bundles = CopyableBundle;
5449 Bundles = getScheduleBundles(
Data->getInst());
5451 if (!Bundles.
empty()) {
5452 for (ScheduleBundle *Bundle : Bundles) {
5453 if (Bundle->unscheduledDepsInBundle() == 0) {
5454 assert(!Bundle->isScheduled() &&
5455 "already scheduled bundle gets ready");
5456 ReadyList.insert(Bundle);
5458 <<
"SLP: gets ready: " << *Bundle <<
"\n");
5464 "already scheduled bundle gets ready");
5466 "Expected non-copyable data");
5467 ReadyList.insert(
Data);
5474 if (!ScheduleCopyableDataMap.empty()) {
5476 getScheduleCopyableData(User,
OpIdx,
I);
5477 for (ScheduleCopyableData *CD : CopyableData)
5478 DecrUnsched(CD,
false);
5479 if (!CopyableData.empty())
5482 if (ScheduleData *OpSD = getScheduleData(
I))
5483 DecrUnsched(OpSD,
false);
5489 if (!Bundles.empty()) {
5490 auto *
In = BundleMember->getInst();
5492 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5493 unsigned TotalOpCount = 0;
5496 TotalOpCount = OperandsUses[
In] = 1;
5498 for (
const Use &U :
In->operands()) {
5501 ++Res.first->getSecond();
5508 auto DecrUnschedForInst = [&](
Instruction *
I, TreeEntry *UserTE,
5510 if (!ScheduleCopyableDataMap.empty()) {
5511 const EdgeInfo EI = {UserTE,
OpIdx};
5512 if (ScheduleCopyableData *CD = getScheduleCopyableData(EI,
I)) {
5513 DecrUnsched(CD,
false);
5517 auto It = OperandsUses.
find(
I);
5518 assert(It != OperandsUses.
end() &&
"Operand not found");
5519 if (It->second > 0) {
5521 assert(TotalOpCount > 0 &&
"No more operands to decrement");
5523 if (ScheduleData *OpSD = getScheduleData(
I))
5524 DecrUnsched(OpSD,
false);
5528 for (ScheduleBundle *Bundle : Bundles) {
5529 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5533 int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(),
5534 find(Bundle->getTreeEntry()->Scalars, In));
5535 assert(Lane >= 0 &&
"Lane not set");
5537 !Bundle->getTreeEntry()->ReorderIndices.empty())
5538 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5539 assert(Lane <
static_cast<int>(
5540 Bundle->getTreeEntry()->Scalars.size()) &&
5541 "Couldn't find extract lane");
5551 In->getNumOperands() ==
5552 Bundle->getTreeEntry()->getNumOperands() ||
5553 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5554 "Missed TreeEntry operands?");
5556 for (
unsigned OpIdx :
5559 Bundle->getTreeEntry()->getOperand(
OpIdx)[Lane])) {
5562 DecrUnschedForInst(
I, Bundle->getTreeEntry(),
OpIdx);
5568 for (Use &U : BundleMember->getInst()->operands()) {
5571 <<
"SLP: check for readiness (def): " << *
I <<
"\n");
5572 DecrUnschedForInst(BundleMember->getInst(),
U.getOperandNo(),
I);
5580 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
5581 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5582 if (!VisitedMemory.
insert(MemoryDep).second)
5587 << *MemoryDep <<
"\n");
5588 DecrUnsched(MemoryDep);
5591 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
5592 for (ScheduleData *Dep : SD->getControlDependencies()) {
5593 if (!VisitedControl.
insert(Dep).second)
5598 <<
"SLP: check for readiness (ctrl): " << *Dep <<
"\n");
5599 DecrUnsched(Dep,
true);
5603 SD->setScheduled(
true);
5608 if (
R.isVectorized(In)) {
5610 for (TreeEntry *TE : Entries) {
5612 In->getNumOperands() !=
TE->getNumOperands())
5615 PseudoBundles.
emplace_back(std::make_unique<ScheduleBundle>());
5616 BundlePtr->setTreeEntry(TE);
5621 ProcessBundleMember(SD, Bundles);
5624 Bundle.setScheduled(
true);
5626 auto AreAllBundlesScheduled =
5627 [&](
const ScheduleEntity *SD,
5631 return !SDBundles.empty() &&
5632 all_of(SDBundles, [&](
const ScheduleBundle *SDBundle) {
5633 return SDBundle->isScheduled();
5636 for (ScheduleEntity *SD : Bundle.getBundle()) {
5639 SDBundles = getScheduleBundles(SD->getInst());
5640 if (AreAllBundlesScheduled(SD, SDBundles)) {
5641 SD->setScheduled(
true);
5654 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5655 ScheduleStart->comesBefore(ScheduleEnd) &&
5656 "Not a valid scheduling region?");
5658 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
5660 if (!Bundles.
empty()) {
5661 for (ScheduleBundle *Bundle : Bundles) {
5662 assert(isInSchedulingRegion(*Bundle) &&
5663 "primary schedule data not in window?");
5668 auto *SD = getScheduleData(
I);
5671 assert(isInSchedulingRegion(*SD) &&
5672 "primary schedule data not in window?");
5677 [](
const ScheduleEntity *Bundle) {
5678 return Bundle->isReady();
5680 "item in ready list not ready?");
5684 template <
typename ReadyListType>
5685 void initialFillReadyList(ReadyListType &ReadyList) {
5686 SmallPtrSet<ScheduleBundle *, 16> Visited;
5687 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
5688 ScheduleData *SD = getScheduleData(
I);
5689 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5692 for (ScheduleBundle *Bundle : Bundles) {
5693 if (!Visited.
insert(Bundle).second)
5695 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5696 ReadyList.insert(Bundle);
5698 << *Bundle <<
"\n");
5703 ReadyList.insert(SD);
5705 <<
"SLP: initially in ready list: " << *SD <<
"\n");
5716 const InstructionsState &S,
const EdgeInfo &EI);
5723 std::optional<ScheduleBundle *>
5725 const InstructionsState &S,
const EdgeInfo &EI);
5728 ScheduleData *allocateScheduleDataChunks();
5732 bool extendSchedulingRegion(
Value *V,
const InstructionsState &S);
5736 void initScheduleData(Instruction *FromI, Instruction *ToI,
5737 ScheduleData *PrevLoadStore,
5738 ScheduleData *NextLoadStore);
5742 void calculateDependencies(ScheduleBundle &Bundle,
bool InsertInReadyList,
5747 void resetSchedule();
5764 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
5768 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
5769 std::unique_ptr<ScheduleCopyableData>>
5770 ScheduleCopyableDataMap;
5776 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
5777 ScheduleCopyableDataMapByInst;
5783 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>,
const Value *>,
5785 ScheduleCopyableDataMapByInstUser;
5805 SmallSetVector<ScheduleCopyableData *, 4>>
5806 ScheduleCopyableDataMapByUsers;
5809 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
5815 SetVector<ScheduleEntity *> ReadyInsts;
5825 ScheduleData *FirstLoadStoreInRegion =
nullptr;
5829 ScheduleData *LastLoadStoreInRegion =
nullptr;
5834 bool RegionHasStackSave =
false;
5837 int ScheduleRegionSize = 0;
5846 int SchedulingRegionID = 1;
5850 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
5854 void scheduleBlock(
const BoUpSLP &R, BlockScheduling *BS);
5857 const SmallDenseSet<Value *> *UserIgnoreList =
nullptr;
5861 struct OrdersTypeDenseMapInfo {
5874 static unsigned getHashValue(
const OrdersType &V) {
5885 ScalarEvolution *SE;
5886 TargetTransformInfo *TTI;
5887 TargetLibraryInfo *TLI;
5890 AssumptionCache *AC;
5892 const DataLayout *DL;
5893 OptimizationRemarkEmitter *ORE;
5895 unsigned MaxVecRegSize;
5896 unsigned MinVecRegSize;
5899 IRBuilder<TargetFolder> Builder;
5906 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
5911 unsigned ReductionBitWidth = 0;
5914 unsigned BaseGraphSize = 1;
5918 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
5922 DenseSet<unsigned> ExtraBitWidthNodes;
5932 SecondInfo::getEmptyKey());
5937 SecondInfo::getTombstoneKey());
5942 SecondInfo::getHashValue(Val.
EdgeIdx));
5963 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
5974 return R.VectorizableTree[0].get();
5978 return {&
N->UserTreeIndex,
N->Container};
5982 return {&
N->UserTreeIndex + 1,
N->Container};
6009 static unsigned size(
BoUpSLP *R) {
return R->VectorizableTree.size(); }
6020 OS << Entry->Idx <<
".\n";
6023 for (
auto *V : Entry->Scalars) {
6025 if (
llvm::any_of(R->ExternalUses, [&](
const BoUpSLP::ExternalUser &EU) {
6026 return EU.Scalar == V;
6036 if (Entry->isGather())
6038 if (Entry->State == TreeEntry::ScatterVectorize ||
6039 Entry->State == TreeEntry::StridedVectorize ||
6040 Entry->State == TreeEntry::CompressVectorize)
6041 return "color=blue";
6050 for (
auto *
I : DeletedInstructions) {
6051 if (!
I->getParent()) {
6056 I->insertBefore(F->getEntryBlock(),
6057 F->getEntryBlock().getFirstNonPHIIt());
6059 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6062 for (
Use &U :
I->operands()) {
6064 if (
Op && !DeletedInstructions.count(
Op) &&
Op->hasOneUser() &&
6068 I->dropAllReferences();
6070 for (
auto *
I : DeletedInstructions) {
6072 "trying to erase instruction with users.");
6073 I->eraseFromParent();
6079#ifdef EXPENSIVE_CHECKS
6090 assert(!Mask.empty() && Reuses.
size() == Mask.size() &&
6091 "Expected non-empty mask.");
6094 for (
unsigned I = 0,
E = Prev.
size();
I <
E; ++
I)
6096 Reuses[Mask[
I]] = Prev[
I];
6104 bool BottomOrder =
false) {
6105 assert(!Mask.empty() &&
"Expected non-empty mask.");
6106 unsigned Sz = Mask.size();
6109 if (Order.
empty()) {
6111 std::iota(PrevOrder.
begin(), PrevOrder.
end(), 0);
6113 PrevOrder.
swap(Order);
6116 for (
unsigned I = 0;
I < Sz; ++
I)
6118 Order[
I] = PrevOrder[Mask[
I]];
6120 return Data.value() == Sz ||
Data.index() ==
Data.value();
6129 if (Order.
empty()) {
6131 std::iota(MaskOrder.
begin(), MaskOrder.
end(), 0);
6141 for (
unsigned I = 0;
I < Sz; ++
I)
6143 Order[MaskOrder[
I]] =
I;
6147std::optional<BoUpSLP::OrdersType>
6149 bool TopToBottom,
bool IgnoreReorder) {
6150 assert(TE.isGather() &&
"Expected gather node only.");
6154 Type *ScalarTy = GatheredScalars.
front()->getType();
6155 size_t NumScalars = GatheredScalars.
size();
6157 return std::nullopt;
6164 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6166 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6169 if (GatherShuffles.
empty() && ExtractShuffles.
empty())
6170 return std::nullopt;
6171 OrdersType CurrentOrder(NumScalars, NumScalars);
6172 if (GatherShuffles.
size() == 1 &&
6174 Entries.
front().front()->isSame(TE.Scalars)) {
6178 return std::nullopt;
6180 if (Entries.
front().front()->UserTreeIndex.UserTE ==
6181 TE.UserTreeIndex.UserTE)
6182 return std::nullopt;
6185 if (!IgnoreReorder && Entries.
front().front()->Idx == 0)
6186 return std::nullopt;
6189 if (!Entries.
front().front()->ReuseShuffleIndices.empty() &&
6190 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6193 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6195 return std::nullopt;
6199 std::iota(CurrentOrder.
begin(), CurrentOrder.
end(), 0);
6200 return CurrentOrder;
6204 return all_of(Mask, [&](
int I) {
6211 if ((ExtractShuffles.
empty() && IsSplatMask(Mask) &&
6212 (Entries.
size() != 1 ||
6213 Entries.
front().front()->ReorderIndices.empty())) ||
6214 (GatherShuffles.
empty() && IsSplatMask(ExtractMask)))
6215 return std::nullopt;
6221 if (ShuffledSubMasks.
test(
I))
6223 const int VF = GetVF(
I);
6229 if (
any_of(Slice, [&](
unsigned I) {
return I != NumScalars; })) {
6231 ShuffledSubMasks.
set(
I);
6235 int FirstMin = INT_MAX;
6236 int SecondVecFound =
false;
6238 int Idx = Mask[
I * PartSz + K];
6240 Value *V = GatheredScalars[
I * PartSz + K];
6242 SecondVecFound =
true;
6251 SecondVecFound =
true;
6255 FirstMin = (FirstMin / PartSz) * PartSz;
6257 if (SecondVecFound) {
6259 ShuffledSubMasks.
set(
I);
6263 int Idx = Mask[
I * PartSz + K];
6267 if (Idx >= PartSz) {
6268 SecondVecFound =
true;
6271 if (CurrentOrder[
I * PartSz + Idx] >
6272 static_cast<unsigned>(
I * PartSz + K) &&
6273 CurrentOrder[
I * PartSz + Idx] !=
6274 static_cast<unsigned>(
I * PartSz + Idx))
6275 CurrentOrder[
I * PartSz + Idx] =
I * PartSz + K;
6278 if (SecondVecFound) {
6280 ShuffledSubMasks.
set(
I);
6286 if (!ExtractShuffles.
empty())
6287 TransformMaskToOrder(
6288 CurrentOrder, ExtractMask, PartSz, NumParts, [&](
unsigned I) {
6289 if (!ExtractShuffles[
I])
6292 unsigned Sz =
getNumElems(TE.getVectorFactor(), PartSz,
I);
6294 int K =
I * PartSz + Idx;
6297 if (!TE.ReuseShuffleIndices.empty())
6298 K = TE.ReuseShuffleIndices[K];
6301 if (!TE.ReorderIndices.empty())
6302 K = std::distance(TE.ReorderIndices.begin(),
6303 find(TE.ReorderIndices, K));
6309 .getKnownMinValue());
6314 if (GatherShuffles.
size() == 1 && NumParts != 1) {
6315 if (ShuffledSubMasks.
any())
6316 return std::nullopt;
6317 PartSz = NumScalars;
6320 if (!Entries.
empty())
6321 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](
unsigned I) {
6322 if (!GatherShuffles[
I])
6324 return std::max(Entries[
I].front()->getVectorFactor(),
6325 Entries[
I].back()->getVectorFactor());
6327 unsigned NumUndefs =
count(CurrentOrder, NumScalars);
6328 if (ShuffledSubMasks.
all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6329 return std::nullopt;
6330 return std::move(CurrentOrder);
6335 bool CompareOpcodes =
true) {
6341 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6342 (!GEP2 || GEP2->getNumOperands() == 2) &&
6343 (((!GEP1 ||
isConstant(GEP1->getOperand(1))) &&
6344 (!GEP2 ||
isConstant(GEP2->getOperand(1)))) ||
6347 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6351template <
typename T>
6356 return CommonAlignment;
6362 "Order is empty. Please check it before using isReverseOrder.");
6363 unsigned Sz = Order.
size();
6365 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6376 const SCEV *PtrSCEVLowest =
nullptr;
6377 const SCEV *PtrSCEVHighest =
nullptr;
6385 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6386 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6393 PtrSCEVLowest = PtrSCEV;
6400 PtrSCEVHighest = PtrSCEV;
6408 int Size =
DL.getTypeStoreSize(ElemTy);
6409 auto TryGetStride = [&](
const SCEV *Dist,
6410 const SCEV *Multiplier) ->
const SCEV * {
6412 if (M->getOperand(0) == Multiplier)
6413 return M->getOperand(1);
6414 if (M->getOperand(1) == Multiplier)
6415 return M->getOperand(0);
6418 if (Multiplier == Dist)
6423 const SCEV *Stride =
nullptr;
6424 if (
Size != 1 || SCEVs.
size() > 2) {
6426 Stride = TryGetStride(Dist, Sz);
6434 using DistOrdPair = std::pair<int64_t, int>;
6436 std::set<DistOrdPair,
decltype(Compare)> Offsets(Compare);
6438 bool IsConsecutive =
true;
6439 for (
const SCEV *PtrSCEV : SCEVs) {
6441 if (PtrSCEV != PtrSCEVLowest) {
6443 const SCEV *Coeff = TryGetStride(Diff, Stride);
6453 Dist = SC->getAPInt().getZExtValue();
6458 auto Res = Offsets.emplace(Dist, Cnt);
6462 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6465 if (Offsets.size() != SCEVs.
size())
6467 SortedIndices.
clear();
6468 if (!IsConsecutive) {
6472 for (
const std::pair<int64_t, int> &Pair : Offsets) {
6473 SortedIndices[Cnt] = Pair.second;
6480static std::pair<InstructionCost, InstructionCost>
6483 Type *ScalarTy, VectorType *VecTy);
6501 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6504 Mask, NumSrcElts, NumSubElts, Index)) {
6505 if (Index + NumSubElts > NumSrcElts &&
6506 Index + NumSrcElts <=
static_cast<int>(
Mask.size()))
6523 "ScalableVectorType is not supported.");
6526 "Incorrect usage.");
6531 unsigned ScalarTyNumElements = VecTy->getNumElements();
6534 if (!DemandedElts[
I])
6538 I * ScalarTyNumElements, VecTy);
6541 I * ScalarTyNumElements, VecTy);
6554 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6555 if (Opcode == Instruction::ExtractElement) {
6561 Index * VecTy->getNumElements(), VecTy);
6564 return TTI.getVectorInstrCost(Opcode, Val,
CostKind, Index, Scalar,
6577 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
6579 Index * ScalarTy->getNumElements(), SubTp) +
6583 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index,
CostKind);
6599 auto *Begin = std::next(
Mask.begin(), Index);
6600 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6604 std::iota(
Mask.begin(),
Mask.end(), 0);
6605 std::iota(std::next(
Mask.begin(), Index),
6606 std::next(
Mask.begin(), Index + SubVecVF), VecVF);
6608 return Generator(Vec, V, Mask);
6611 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6619 unsigned SubVecVF,
unsigned Index) {
6621 std::iota(Mask.begin(), Mask.end(), Index);
6622 return Builder.CreateShuffleVector(Vec, Mask);
6632 const unsigned Sz = PointerOps.
size();
6635 CompressMask[0] = 0;
6637 std::optional<unsigned> Stride = 0;
6641 std::optional<int64_t> OptPos =
6643 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6645 unsigned Pos =
static_cast<unsigned>(*OptPos);
6646 CompressMask[
I] = Pos;
6653 if (Pos != *Stride *
I)
6656 return Stride.has_value();
6669 InterleaveFactor = 0;
6671 const size_t Sz = VL.
size();
6679 if (AreAllUsersVectorized(V))
6682 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
CostKind,
6683 Mask.empty() ?
I : Mask[
I]);
6686 if (ExtractCost <= ScalarCost)
6691 if (Order.
empty()) {
6692 Ptr0 = PointerOps.
front();
6693 PtrN = PointerOps.
back();
6695 Ptr0 = PointerOps[Order.
front()];
6696 PtrN = PointerOps[Order.
back()];
6698 std::optional<int64_t> Diff =
6702 const size_t MaxRegSize =
6706 if (*Diff / Sz >= MaxRegSize / 8)
6710 Align CommonAlignment = LI->getAlign();
6712 Ptr0, LoadVecTy, CommonAlignment,
DL,
6715 if (IsMasked && !
TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
6716 LI->getPointerAddressSpace()))
6722 assert(CompressMask.
size() >= 2 &&
"At least two elements are required");
6726 auto [ScalarGEPCost, VectorGEPCost] =
6728 Instruction::GetElementPtr,
CostKind, ScalarTy, LoadVecTy);
6746 TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6747 LI->getPointerAddressSpace(),
CostKind);
6750 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6751 LI->getPointerAddressSpace(),
CostKind);
6753 if (IsStrided && !IsMasked && Order.
empty()) {
6760 AlignedLoadVecTy = LoadVecTy;
6761 if (
TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
6763 LI->getPointerAddressSpace())) {
6765 VectorGEPCost +
TTI.getInterleavedMemoryOpCost(
6766 Instruction::Load, AlignedLoadVecTy,
6767 CompressMask[1], {}, CommonAlignment,
6768 LI->getPointerAddressSpace(),
CostKind, IsMasked);
6769 if (InterleavedCost < GatherCost) {
6770 InterleaveFactor = CompressMask[1];
6771 LoadVecTy = AlignedLoadVecTy;
6778 if (!Order.
empty()) {
6781 NewMask[
I] = CompressMask[Mask[
I]];
6783 CompressMask.
swap(NewMask);
6785 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
6786 return TotalVecCost < GatherCost;
6799 unsigned InterleaveFactor;
6803 AreAllUsersVectorized, IsMasked, InterleaveFactor,
6804 CompressMask, LoadVecTy);
6825 StridedPtrInfo &SPtrInfo)
const {
6826 const size_t Sz = VL.
size();
6827 if (Diff % (Sz - 1) != 0)
6831 auto IsAnyPointerUsedOutGraph =
any_of(PointerOps, [&](
Value *V) {
6833 return !isVectorized(U) && !MustGather.contains(U);
6837 const uint64_t AbsoluteDiff = std::abs(Diff);
6840 if (IsAnyPointerUsedOutGraph ||
6841 (AbsoluteDiff > Sz &&
6844 AbsoluteDiff % Sz == 0 &&
has_single_bit(AbsoluteDiff / Sz)))) ||
6845 Diff == -(
static_cast<int64_t
>(Sz) - 1)) {
6846 int64_t Stride = Diff /
static_cast<int64_t
>(Sz - 1);
6847 if (Diff != Stride *
static_cast<int64_t
>(Sz - 1))
6852 if (!TTI.isLegalStridedLoadStore(VecTy, Alignment))
6856 if (Order.
empty()) {
6857 Ptr0 = PointerOps.
front();
6858 PtrN = PointerOps.
back();
6860 Ptr0 = PointerOps[Order.
front()];
6861 PtrN = PointerOps[Order.
back()];
6870 else if (
Ptr != Ptr0)
6874 if (((Dist / Stride) * Stride) != Dist || !Dists.
insert(Dist).second)
6877 if (Dists.
size() == Sz) {
6878 Type *StrideTy = DL.getIndexType(Ptr0->
getType());
6879 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
6890 unsigned *BestVF,
bool TryRecursiveCheck)
const {
6903 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
6909 const size_t Sz = VL.
size();
6911 auto *POIter = PointerOps.
begin();
6912 for (
Value *V : VL) {
6914 if (!L || !L->isSimple())
6916 *POIter = L->getPointerOperand();
6922 bool IsSorted =
sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
6928 if (
const SCEV *Stride =
6930 Stride && TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
6932 SPtrInfo.StrideSCEV = Stride;
6937 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
6938 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
6949 if (Order.
empty()) {
6950 Ptr0 = PointerOps.
front();
6951 PtrN = PointerOps.
back();
6953 Ptr0 = PointerOps[Order.
front()];
6954 PtrN = PointerOps[Order.
back()];
6956 std::optional<int64_t> Diff =
6959 if (
static_cast<uint64_t>(*Diff) == Sz - 1)
6962 *TLI, [&](
Value *V) {
6963 return areAllUsersVectorized(
6967 if (
isStridedLoad(VL, PointerOps, Order, *TTI, *DL, *SE, *Diff, SPtrInfo))
6970 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
6971 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
6976 auto CheckForShuffledLoads = [&, &TTI = *TTI](
Align CommonAlignment,
6978 bool ProfitableGatherPointers) {
6983 auto [ScalarGEPCost, VectorGEPCost] =
6985 Instruction::GetElementPtr,
CostKind, ScalarTy, VecTy);
6989 Type *PtrScalarTy = PointerOps.
front()->getType()->getScalarType();
6991 if (
static_cast<unsigned>(
count_if(
7010 return C + TTI.getInstructionCost(
7016 TTI.getGatherScatterOpCost(
7018 false, CommonAlignment,
CostKind) +
7019 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7027 constexpr unsigned ListLimit = 4;
7028 if (!TryRecursiveCheck || VL.
size() < ListLimit)
7037 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7047 for (
unsigned Cnt = 0, End = VL.
size(); Cnt + VF <= End; Cnt += VF) {
7052 PointerOps, SPtrInfo, BestVF,
7060 DemandedElts.
setBits(Cnt, Cnt + VF);
7076 if (!DemandedElts.
isZero()) {
7082 if (DemandedElts[Idx])
7093 LI0->getPointerOperand(),
7094 Instruction::GetElementPtr,
CostKind, ScalarTy,
7098 if (
static_cast<unsigned>(
7100 PointerOps.
size() - 1 ||
7119 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7120 LI0->getPointerAddressSpace(),
CostKind,
7125 VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
7126 LI0->getPointerOperand(),
7132 VecLdCost += TTI.getMaskedMemoryOpCost(
7133 Instruction::Load, SubVecTy, CommonAlignment,
7134 LI0->getPointerAddressSpace(),
CostKind) +
7140 VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
7141 LI0->getPointerOperand(),
7152 ShuffleMask[Idx] = Idx / VF ==
I ? VL.
size() + Idx % VF : Idx;
7161 if (MaskedGatherCost >= VecLdCost &&
7174 bool ProfitableGatherPointers =
7175 L && Sz > 2 &&
static_cast<unsigned>(
count_if(PointerOps, [L](
Value *V) {
7176 return L->isLoopInvariant(V);
7178 if (ProfitableGatherPointers ||
all_of(PointerOps, [](
Value *
P) {
7181 (
GEP &&
GEP->getNumOperands() == 2 &&
7189 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7190 ProfitableGatherPointers))
7202 all_of(VL, [](
const Value *V) {
return V->getType()->isPointerTy(); }) &&
7203 "Expected list of pointer operands.");
7208 std::pair<BasicBlock *, Value *>,
7212 .try_emplace(std::make_pair(
7216 SortedIndices.
clear();
7218 auto Key = std::make_pair(BBs[Cnt + 1],
7220 bool Found =
any_of(Bases.try_emplace(
Key).first->second,
7222 std::optional<int64_t> Diff =
7223 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7224 ElemTy, Ptr, DL, SE,
7229 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7235 if (Bases.size() > VL.
size() / 2 - 1)
7239 Bases.find(
Key)->second.emplace_back().emplace_back(
Ptr, 0, Cnt + 1);
7243 if (Bases.size() == VL.
size())
7246 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7247 Bases.front().second.size() == VL.
size()))
7252 auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
7261 FirstPointers.
insert(P1);
7262 SecondPointers.
insert(P2);
7268 "Unable to find matching root.");
7271 for (
auto &
Base : Bases) {
7272 for (
auto &Vec :
Base.second) {
7273 if (Vec.size() > 1) {
7275 int64_t InitialOffset = std::get<1>(Vec[0]);
7276 bool AnyConsecutive =
7278 return std::get<1>(
P.value()) ==
7279 int64_t(
P.index()) + InitialOffset;
7283 if (!AnyConsecutive)
7288 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7292 for (
auto &
T : Bases)
7293 for (
const auto &Vec :
T.second)
7294 for (
const auto &
P : Vec)
7298 "Expected SortedIndices to be the size of VL");
7302std::optional<BoUpSLP::OrdersType>
7304 assert(TE.isGather() &&
"Expected gather node only.");
7305 Type *ScalarTy = TE.Scalars[0]->getType();
7308 Ptrs.
reserve(TE.Scalars.size());
7310 BBs.
reserve(TE.Scalars.size());
7311 for (
Value *V : TE.Scalars) {
7313 if (!L || !L->isSimple())
7314 return std::nullopt;
7320 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7322 return std::move(Order);
7323 return std::nullopt;
7334 if (VU->
getType() != V->getType())
7337 if (!VU->
hasOneUse() && !V->hasOneUse())
7343 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7350 bool IsReusedIdx =
false;
7352 if (IE2 == VU && !IE1)
7354 if (IE1 == V && !IE2)
7355 return V->hasOneUse();
7356 if (IE1 && IE1 != V) {
7358 IsReusedIdx |= ReusedIdx.
test(Idx1);
7359 ReusedIdx.
set(Idx1);
7360 if ((IE1 != VU && !IE1->
hasOneUse()) || IsReusedIdx)
7365 if (IE2 && IE2 != VU) {
7367 IsReusedIdx |= ReusedIdx.
test(Idx2);
7368 ReusedIdx.
set(Idx2);
7369 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7374 }
while (!IsReusedIdx && (IE1 || IE2));
7382 const TargetLibraryInfo &TLI);
7384std::optional<BoUpSLP::OrdersType>
7386 bool IgnoreReorder) {
7389 if (!TE.ReuseShuffleIndices.empty()) {
7391 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7392 "Reshuffling scalars not yet supported for nodes with padding");
7395 return std::nullopt;
7403 unsigned Sz = TE.Scalars.size();
7404 if (TE.isGather()) {
7405 if (std::optional<OrdersType> CurrentOrder =
7410 ::addMask(Mask, TE.ReuseShuffleIndices);
7411 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7412 unsigned Sz = TE.Scalars.size();
7413 for (
int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7416 Res[Idx + K * Sz] =
I + K * Sz;
7418 return std::move(Res);
7421 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7423 2 * TE.getVectorFactor())) == 1)
7424 return std::nullopt;
7425 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7426 return std::nullopt;
7430 if (TE.ReorderIndices.empty())
7431 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
7434 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
7435 unsigned VF = ReorderMask.
size();
7439 for (
unsigned I = 0;
I < VF;
I += Sz) {
7441 unsigned UndefCnt = 0;
7442 unsigned Limit = std::min(Sz, VF -
I);
7451 Val >=
static_cast<int>(NumParts) || UsedVals.
test(Val) ||
7453 return std::nullopt;
7455 for (
unsigned K = 0; K < NumParts; ++K) {
7456 unsigned Idx = Val + Sz * K;
7457 if (Idx < VF &&
I + K < VF)
7458 ResOrder[Idx] =
I + K;
7461 return std::move(ResOrder);
7463 unsigned VF = TE.getVectorFactor();
7466 TE.ReuseShuffleIndices.end());
7467 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
7469 if (isa<PoisonValue>(V))
7471 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
7472 return Idx && *Idx < Sz;
7474 assert(!TE.isAltShuffle() &&
"Alternate instructions are only supported "
7475 "by BinaryOperator and CastInst.");
7477 if (TE.ReorderIndices.empty())
7478 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
7481 for (
unsigned I = 0;
I < VF; ++
I) {
7482 int &Idx = ReusedMask[
I];
7485 Value *V = TE.Scalars[ReorderMask[Idx]];
7487 Idx = std::distance(ReorderMask.
begin(),
find(ReorderMask, *EI));
7493 std::iota(ResOrder.
begin(), ResOrder.
end(), 0);
7494 auto *It = ResOrder.
begin();
7495 for (
unsigned K = 0; K < VF; K += Sz) {
7499 std::iota(SubMask.
begin(), SubMask.
end(), 0);
7501 transform(CurrentOrder, It, [K](
unsigned Pos) {
return Pos + K; });
7502 std::advance(It, Sz);
7505 return Data.index() ==
Data.value();
7507 return std::nullopt;
7508 return std::move(ResOrder);
7510 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
7511 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
7513 (TE.ReorderIndices.empty() ||
isReverseOrder(TE.ReorderIndices)))
7514 return std::nullopt;
7515 if (TE.State == TreeEntry::SplitVectorize ||
7516 ((TE.State == TreeEntry::Vectorize ||
7517 TE.State == TreeEntry::StridedVectorize ||
7518 TE.State == TreeEntry::CompressVectorize) &&
7521 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
7522 "Alternate instructions are only supported by "
7523 "BinaryOperator and CastInst.");
7524 return TE.ReorderIndices;
7526 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
7527 TE.isAltShuffle()) {
7528 assert(TE.ReuseShuffleIndices.empty() &&
7529 "ReuseShuffleIndices should be "
7530 "empty for alternate instructions.");
7532 TE.buildAltOpShuffleMask(
7534 assert(TE.getMatchingMainOpOrAltOp(
I) &&
7535 "Unexpected main/alternate opcode");
7539 const int VF = TE.getVectorFactor();
7544 ResOrder[Mask[
I] % VF] =
I;
7546 return std::move(ResOrder);
7548 if (!TE.ReorderIndices.empty())
7549 return TE.ReorderIndices;
7550 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
7551 if (!TE.ReorderIndices.empty())
7552 return TE.ReorderIndices;
7555 for (
auto [
I, V] :
zip(UserBVHead, TE.Scalars)) {
7563 while (
II &&
II->hasOneUse() &&
II->getParent() == BB) {
7571 assert(BB1 != BB2 &&
"Expected different basic blocks.");
7572 if (!DT->isReachableFromEntry(BB1))
7574 if (!DT->isReachableFromEntry(BB2))
7576 auto *NodeA = DT->getNode(BB1);
7577 auto *NodeB = DT->getNode(BB2);
7578 assert(NodeA &&
"Should only process reachable instructions");
7579 assert(NodeB &&
"Should only process reachable instructions");
7580 assert((NodeA == NodeB) ==
7581 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
7582 "Different nodes should have different DFS numbers");
7583 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
7585 auto PHICompare = [&](
unsigned I1,
unsigned I2) {
7586 Value *V1 = TE.Scalars[I1];
7587 Value *V2 = TE.Scalars[I2];
7600 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
7601 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
7602 FirstUserOfPhi2->getParent());
7612 if (UserBVHead[I1] && !UserBVHead[I2])
7614 if (!UserBVHead[I1])
7616 if (UserBVHead[I1] == UserBVHead[I2])
7619 return CompareByBasicBlocks(UserBVHead[I1]->
getParent(),
7621 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
7634 if (EE1->getOperand(0) == EE2->getOperand(0))
7636 if (!Inst1 && Inst2)
7638 if (Inst1 && Inst2) {
7646 "Expected either instructions or arguments vector operands.");
7647 return P1->getArgNo() < P2->getArgNo();
7652 std::iota(Phis.
begin(), Phis.
end(), 0);
7655 return std::nullopt;
7656 return std::move(Phis);
7658 if (TE.isGather() &&
7659 (!TE.hasState() || !TE.isAltShuffle() ||
7660 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
7664 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
7668 auto *EE = dyn_cast<ExtractElementInst>(V);
7669 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
7675 canReuseExtract(TE.Scalars, CurrentOrder,
true);
7676 if (Reuse || !CurrentOrder.
empty())
7677 return std::move(CurrentOrder);
7685 int Sz = TE.Scalars.size();
7689 if (It == TE.Scalars.begin())
7692 if (It != TE.Scalars.end()) {
7694 unsigned Idx = std::distance(TE.Scalars.begin(), It);
7709 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
7712 return std::move(Order);
7717 return std::nullopt;
7718 if (TE.Scalars.size() >= 3)
7723 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
7725 StridedPtrInfo SPtrInfo;
7728 CurrentOrder, PointerOps, SPtrInfo);
7731 return std::move(CurrentOrder);
7736 if (std::optional<OrdersType> CurrentOrder =
7738 return CurrentOrder;
7740 return std::nullopt;
7750 for (
unsigned I = Sz,
E = Mask.size();
I <
E;
I += Sz) {
7752 if (Cluster != FirstCluster)
7758void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask)
const {
7761 const unsigned Sz =
TE.Scalars.size();
7763 if (!
TE.isGather() ||
7768 SmallVector<int> NewMask;
7770 addMask(NewMask,
TE.ReuseShuffleIndices);
7772 TE.ReorderIndices.clear();
7774 ArrayRef<int> Slice =
ArrayRef(NewMask).slice(0, Sz);
7775 SmallVector<unsigned> NewOrder(Slice);
7779 for (
auto *It =
TE.ReuseShuffleIndices.begin(),
7780 *End =
TE.ReuseShuffleIndices.end();
7781 It != End; std::advance(It, Sz))
7782 std::iota(It, std::next(It, Sz), 0);
7788 "Expected same size of orders");
7789 size_t Sz = Order.
size();
7792 if (Order[Idx] != Sz)
7793 UsedIndices.
set(Order[Idx]);
7795 if (SecondaryOrder.
empty()) {
7797 if (Order[Idx] == Sz && !UsedIndices.
test(Idx))
7801 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
7802 !UsedIndices.
test(SecondaryOrder[Idx]))
7803 Order[Idx] = SecondaryOrder[Idx];
7811 constexpr unsigned TinyVF = 2;
7812 constexpr unsigned TinyTree = 10;
7813 constexpr unsigned PhiOpsLimit = 12;
7814 constexpr unsigned GatherLoadsLimit = 2;
7815 if (VectorizableTree.size() <= TinyTree)
7817 if (VectorizableTree.front()->hasState() &&
7818 !VectorizableTree.front()->isGather() &&
7819 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
7820 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
7821 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
7822 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
7823 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
7824 VectorizableTree.front()->ReorderIndices.empty()) {
7828 if (VectorizableTree.front()->hasState() &&
7829 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
7830 VectorizableTree.front()->Scalars.size() == TinyVF &&
7831 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
7834 if (VectorizableTree.front()->hasState() &&
7835 VectorizableTree.front()->getOpcode() == Instruction::Store &&
7836 VectorizableTree.front()->ReorderIndices.empty()) {
7837 const unsigned ReorderedSplitsCnt =
7838 count_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
7839 return TE->State == TreeEntry::SplitVectorize &&
7840 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
7841 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
7844 if (ReorderedSplitsCnt <= 1 &&
7846 VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
7847 return ((!TE->isGather() &&
7848 (TE->ReorderIndices.empty() ||
7849 (TE->UserTreeIndex.UserTE &&
7850 TE->UserTreeIndex.UserTE->State ==
7851 TreeEntry::Vectorize &&
7852 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
7854 (TE->isGather() && TE->ReorderIndices.empty() &&
7855 (!TE->hasState() || TE->isAltShuffle() ||
7856 TE->getOpcode() == Instruction::Load ||
7857 TE->getOpcode() == Instruction::ZExt ||
7858 TE->getOpcode() == Instruction::SExt))) &&
7859 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
7860 !TE->isGather() ||
none_of(TE->Scalars, [&](
Value *V) {
7861 return !isConstant(V) && isVectorized(V);
7863 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
7866 bool HasPhis =
false;
7867 bool HasLoad =
true;
7868 unsigned GatherLoads = 0;
7869 for (
const std::unique_ptr<TreeEntry> &TE :
7870 ArrayRef(VectorizableTree).drop_front()) {
7871 if (TE->State == TreeEntry::SplitVectorize)
7873 if (!TE->hasState()) {
7877 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7882 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
7883 if (!TE->isGather()) {
7890 if (GatherLoads >= GatherLoadsLimit)
7893 if (TE->getOpcode() == Instruction::GetElementPtr ||
7896 if (TE->getOpcode() != Instruction::PHI &&
7897 (!TE->hasCopyableElements() ||
7899 TE->Scalars.size() / 2))
7901 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7902 TE->getNumOperands() > PhiOpsLimit)
7911void BoUpSLP::TreeEntry::reorderSplitNode(
unsigned Idx,
ArrayRef<int> Mask,
7913 assert(State == TreeEntry::SplitVectorize &&
"Expected split user node.");
7916 std::iota(NewMask.
begin(), NewMask.
end(), 0);
7917 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
7920 copy(MaskOrder, NewMaskOrder.begin());
7922 assert(Idx == 1 &&
"Expected either 0 or 1 index.");
7923 unsigned Offset = CombinedEntriesWithIndices.
back().second;
7932 ReorderIndices.clear();
7951 ExternalUserReorderMap;
7955 for_each(VectorizableTree, [&, &TTIRef = *TTI](
7956 const std::unique_ptr<TreeEntry> &TE) {
7959 findExternalStoreUsersReorderIndices(TE.get());
7960 if (!ExternalUserReorderIndices.
empty()) {
7961 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
7963 std::move(ExternalUserReorderIndices));
7969 if (TE->hasState() && TE->isAltShuffle() &&
7970 TE->State != TreeEntry::SplitVectorize) {
7971 Type *ScalarTy = TE->Scalars[0]->getType();
7973 unsigned Opcode0 = TE->getOpcode();
7974 unsigned Opcode1 = TE->getAltOpcode();
7978 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
7979 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
7985 bool IgnoreReorder =
7986 !UserIgnoreList && VectorizableTree.front()->hasState() &&
7987 (VectorizableTree.front()->
getOpcode() == Instruction::InsertElement ||
7988 VectorizableTree.front()->getOpcode() == Instruction::Store);
7989 if (std::optional<OrdersType> CurrentOrder =
7999 const TreeEntry *UserTE = TE.get();
8001 if (!UserTE->UserTreeIndex)
8003 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8004 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8005 UserTE->UserTreeIndex.UserTE->Idx != 0)
8007 UserTE = UserTE->UserTreeIndex.UserTE;
8010 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
8011 if (!(TE->State == TreeEntry::Vectorize ||
8012 TE->State == TreeEntry::StridedVectorize ||
8013 TE->State == TreeEntry::SplitVectorize ||
8014 TE->State == TreeEntry::CompressVectorize) ||
8015 !TE->ReuseShuffleIndices.empty())
8016 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
8017 if (TE->State == TreeEntry::Vectorize &&
8018 TE->getOpcode() == Instruction::PHI)
8019 PhisToOrders.
try_emplace(TE.get(), *CurrentOrder);
8024 for (
unsigned VF = VectorizableTree.front()->getVectorFactor();
8025 !VFToOrderedEntries.
empty() && VF > 1; VF -= 2 - (VF & 1U)) {
8026 auto It = VFToOrderedEntries.
find(VF);
8027 if (It == VFToOrderedEntries.
end())
8041 for (
const TreeEntry *OpTE : OrderedEntries) {
8044 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE) &&
8045 OpTE->State != TreeEntry::SplitVectorize)
8048 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8050 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8051 auto It = GathersToOrders.find(OpTE);
8052 if (It != GathersToOrders.end())
8055 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8056 auto It = AltShufflesToOrders.find(OpTE);
8057 if (It != AltShufflesToOrders.end())
8060 if (OpTE->State == TreeEntry::Vectorize &&
8061 OpTE->getOpcode() == Instruction::PHI) {
8062 auto It = PhisToOrders.
find(OpTE);
8063 if (It != PhisToOrders.
end())
8066 return OpTE->ReorderIndices;
8069 auto It = ExternalUserReorderMap.
find(OpTE);
8070 if (It != ExternalUserReorderMap.
end()) {
8071 const auto &ExternalUserReorderIndices = It->second;
8075 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8076 OrdersUses.try_emplace(
OrdersType(), 0).first->second +=
8077 ExternalUserReorderIndices.size();
8079 for (
const OrdersType &ExtOrder : ExternalUserReorderIndices)
8080 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
8087 if (OpTE->State == TreeEntry::Vectorize &&
8088 OpTE->getOpcode() == Instruction::Store && !Order.
empty()) {
8089 assert(!OpTE->isAltShuffle() &&
8090 "Alternate instructions are only supported by BinaryOperator "
8094 unsigned E = Order.
size();
8097 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8100 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
8102 ++OrdersUses.try_emplace(Order, 0).first->second;
8105 if (OrdersUses.empty())
8108 unsigned IdentityCnt = 0;
8109 unsigned FilledIdentityCnt = 0;
8111 for (
auto &Pair : OrdersUses) {
8113 if (!Pair.first.empty())
8114 FilledIdentityCnt += Pair.second;
8115 IdentityCnt += Pair.second;
8120 unsigned Cnt = IdentityCnt;
8121 for (
auto &Pair : OrdersUses) {
8125 if (Cnt < Pair.second ||
8126 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
8127 Cnt == Pair.second && !BestOrder.
empty() &&
8130 BestOrder = Pair.first;
8143 unsigned E = BestOrder.
size();
8145 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8148 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8150 if (TE->Scalars.size() != VF) {
8151 if (TE->ReuseShuffleIndices.size() == VF) {
8152 assert(TE->State != TreeEntry::SplitVectorize &&
8153 "Split vectorized not expected.");
8158 (!TE->UserTreeIndex ||
8159 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8160 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8161 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8162 "All users must be of VF size.");
8169 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
8175 reorderNodeWithReuses(*TE, Mask);
8177 if (TE->UserTreeIndex &&
8178 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8179 TE->UserTreeIndex.UserTE->reorderSplitNode(
8180 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
8184 if ((TE->State == TreeEntry::SplitVectorize &&
8185 TE->ReuseShuffleIndices.empty()) ||
8186 ((TE->State == TreeEntry::Vectorize ||
8187 TE->State == TreeEntry::StridedVectorize ||
8188 TE->State == TreeEntry::CompressVectorize) &&
8193 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
8194 TE->ReuseShuffleIndices.empty())) &&
8195 "Alternate instructions are only supported by BinaryOperator "
8201 TE->reorderOperands(Mask);
8204 TE->reorderOperands(Mask);
8205 assert(TE->ReorderIndices.empty() &&
8206 "Expected empty reorder sequence.");
8209 if (!TE->ReuseShuffleIndices.empty()) {
8216 addMask(NewReuses, TE->ReuseShuffleIndices);
8217 TE->ReuseShuffleIndices.swap(NewReuses);
8218 }
else if (TE->UserTreeIndex &&
8219 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8221 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
void BoUpSLP::buildReorderableOperands(
    TreeEntry *UserTE,
    SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    const SmallPtrSetImpl<TreeEntry *> &ReorderableGathers,
    SmallVectorImpl<TreeEntry *> &GatherOps) {
  for (unsigned I : seq<unsigned>(UserTE->getNumOperands())) {
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 (OpData.second->State == TreeEntry::Vectorize ||
                  OpData.second->State == TreeEntry::StridedVectorize ||
                  OpData.second->State == TreeEntry::CompressVectorize ||
                  OpData.second->State == TreeEntry::SplitVectorize);
        }))
      continue;
    // Do not request operands, if they do not exist as separate nodes.
    if (UserTE->hasState()) {
      if (UserTE->getOpcode() == Instruction::ExtractElement ||
          UserTE->getOpcode() == Instruction::ExtractValue)
        continue;
      if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
        continue;
      if (UserTE->getOpcode() == Instruction::Store &&
          UserTE->State == TreeEntry::Vectorize && I == 1)
        continue;
      if (UserTE->getOpcode() == Instruction::Load &&
          (UserTE->State == TreeEntry::Vectorize ||
           UserTE->State == TreeEntry::StridedVectorize ||
           UserTE->State == TreeEntry::CompressVectorize))
        continue;
    }
    TreeEntry *TE = getOperandEntry(UserTE, I);
    assert(TE && "Expected operand entry.");
    if (!TE->isGather()) {
      // Add the node to the list of the ordered nodes with the identity
      // order.
      Edges.emplace_back(I, TE);
      // Add ScatterVectorize nodes to the list of operands, where just
      // reordering of the scalars is required.
      if (TE->State == TreeEntry::ScatterVectorize &&
          TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
        GatherOps.push_back(TE);
      continue;
    }
    if (ReorderableGathers.contains(TE))
      GatherOps.push_back(TE);
  }
}
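
// Reorders the tree bottom-to-top: groups operand nodes by their user,
// counts how many of them prefer each order, and propagates the most used
// non-identity order to the user node, queueing the user for further
// sinking of the reorder up the graph.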
void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
  struct TreeEntryCompare {
    bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
      if (LHS->UserTreeIndex && RHS->UserTreeIndex)
        return LHS->UserTreeIndex.UserTE->Idx <
               RHS->UserTreeIndex.UserTE->Idx;
      return LHS->Idx < RHS->Idx;
    }
  };
  PriorityQueue<TreeEntry *, SmallVector<TreeEntry *>, TreeEntryCompare>
      Queue;
  DenseSet<const TreeEntry *> GathersToOrders;
  SmallPtrSet<TreeEntry *, 4> NonVectorized;
  // Find all reorderable nodes with the given VF: currently vectorized
  // loads/extracts without alternate operands plus some gathers of extracts.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::StridedVectorize &&
        TE->State != TreeEntry::CompressVectorize &&
        TE->State != TreeEntry::SplitVectorize)
      NonVectorized.insert(TE.get());
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) {
      Queue.push(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.insert(TE.get());
    }
  }
  SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
  while (!Queue.empty()) {
    // 1. Filter the operands: pop all nodes with the same user from the
    // queue.
    std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>
        Users;
    TreeEntry *TE = Queue.top();
    const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
    SmallVector<TreeEntry *> OrderedOps(1, TE);
    Queue.pop();
    while (!Queue.empty()) {
      TE = Queue.top();
      if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
        break;
      OrderedOps.push_back(TE);
      Queue.pop();
    }
    for (TreeEntry *TE : OrderedOps) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize ||
            (TE->isGather() && GathersToOrders.contains(TE))) ||
          !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
          !Visited.insert(TE).second)
        continue;
      // Build a map between user nodes and their operands order to speedup
      // search. The graph currently does not provide this dependency
      // directly.
      Users.first = TE->UserTreeIndex.UserTE;
      Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
    }
    if (!Users.first)
      continue;
    auto &Data = Users;
    if (Data.first->State == TreeEntry::SplitVectorize) {
      assert(
          Data.second.size() <= 2 &&
          "Expected not greater than 2 operands for split vectorize node.");
      if (any_of(Data.second,
                 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
        continue;
      assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
             "Expected exactly 2 entries.");
      for (const auto &P : Data.first->CombinedEntriesWithIndices) {
        TreeEntry &OpTE = *VectorizableTree[P.first];
        OrdersType Order = OpTE.ReorderIndices;
        if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
          if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
            continue;
          const auto BestOrder =
              getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder);
          if (!BestOrder || BestOrder->empty())
            continue;
          Order = *BestOrder;
        }
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        const unsigned E = Order.size();
        SmallVector<int> MaskOrder(E, PoisonMaskElem);
        transform(Order, MaskOrder.begin(), [E](unsigned I) {
          return I < E ? static_cast<int>(I) : PoisonMaskElem;
        });
        Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
        // Clear ordering of the operand.
        if (!OpTE.ReorderIndices.empty()) {
          OpTE.ReorderIndices.clear();
        } else if (!OpTE.ReuseShuffleIndices.empty()) {
          reorderReuses(OpTE.ReuseShuffleIndices, Mask);
        } else {
          assert(OpTE.isGather() && "Expected only gather/buildvector node.");
          reorderScalars(OpTE.Scalars, Mask);
        }
      }
      if (Data.first->ReuseShuffleIndices.empty() &&
          !Data.first->ReorderIndices.empty()) {
        // Insert user node to the list to try to sink reordering deeper in
        // the graph.
        Queue.push(Data.first);
      }
      continue;
    }
    // 2. Collect the reorderable operands of the user node and count the
    // most used order among them.
    SmallVector<TreeEntry *> GatherOps;
    buildReorderableOperands(Data.first, Data.second, NonVectorized,
                             GatherOps);
    MapVector<OrdersType, unsigned,
              DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
        OrdersUses;
    SmallPtrSet<const TreeEntry *, 4> VisitedOps, VisitedUsers;
    for (const auto &Op : Data.second) {
      TreeEntry *OpTE = Op.second;
      if (!VisitedOps.insert(OpTE).second)
        continue;
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
        continue;
      const auto Order = [&]() -> const OrdersType {
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
          return getReorderingData(*OpTE, /*TopToBottom=*/false,
                                   IgnoreReorder)
              .value_or(OrdersType(1));
        return OpTE->ReorderIndices;
      }();
      // The order is partially ordered, skip it in favor of fully ordered
      // orders.
      if (Order.size() == 1)
        continue;
      Value *Root =
          OpTE->hasState() ? OpTE->getMainOp() : OpTE->Scalars.front();
      auto GetSameNodesUsers = [&](Value *Root) {
        SmallSetVector<TreeEntry *, 4> Res;
        for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
          if (TE != OpTE && TE->UserTreeIndex &&
              TE->getVectorFactor() == OpTE->getVectorFactor() &&
              TE->Scalars.size() == OpTE->Scalars.size() &&
              ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
               (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
            Res.insert(TE->UserTreeIndex.UserTE);
        }
        for (const TreeEntry *TE : getTreeEntries(Root)) {
          if (TE != OpTE && TE->UserTreeIndex &&
              TE->getVectorFactor() == OpTE->getVectorFactor() &&
              TE->Scalars.size() == OpTE->Scalars.size() &&
              ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
               (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
            Res.insert(TE->UserTreeIndex.UserTE);
        }
        return Res.takeVector();
      };
      auto GetNumOperands = [](const TreeEntry *TE) -> unsigned {
        if (TE->State == TreeEntry::SplitVectorize)
          return TE->getNumOperands();
        if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()))
          return CI->arg_size();
        return TE->getNumOperands();
      };
      auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
                                                   const TreeEntry *TE) {
        // The node should be reordered together with its operands, if they
        // either have orders or resolve to already vectorized entries.
        for (unsigned Idx : seq<unsigned>(GetNumOperands(TE))) {
          const TreeEntry *Op = getOperandEntry(TE, Idx);
          if (Op->isGather() && Op->hasState()) {
            if (const TreeEntry *VecOp =
                    getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars))
              Op = VecOp;
          }
          if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
            return false;
        }
        return true;
      };
      SmallVector<TreeEntry *> SameUsers = GetSameNodesUsers(Root);
      erase_if(SameUsers, [&](TreeEntry *UTE) {
        if (!RevisitedOps.insert(UTE).second)
          return true;
        return !(UTE == Data.first || !UTE->ReorderIndices.empty() ||
                 !UTE->ReuseShuffleIndices.empty() ||
                 (UTE->UserTreeIndex &&
                  UTE->UserTreeIndex.UserTE == Data.first) ||
                 (Data.first->UserTreeIndex &&
                  Data.first->UserTreeIndex.UserTE == UTE) ||
                 (IgnoreReorder && UTE->UserTreeIndex &&
                  UTE->UserTreeIndex.UserTE->Idx == 0) ||
                 NodeShouldBeReorderedWithOperands(UTE));
      });
      for (TreeEntry *UTE : SameUsers) {
        // Queue the operands of the nodes, which should be reordered
        // together with this node.
        for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
          const TreeEntry *Op = getOperandEntry(UTE, Idx);
          Queue.push(const_cast<TreeEntry *>(Op));
        }
      }
      unsigned NumOps = count_if(
          Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
            return P.second == OpTE;
          });
      // Stores actually store the mask, not the order, need to invert.
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        unsigned E = Order.size();
        OrdersType CurrentOrder(E, E);
        transform(Mask, CurrentOrder.begin(), [E](int Idx) {
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        });
        fixupOrderingIndices(CurrentOrder);
        OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
      } else {
        OrdersUses.try_emplace(Order, 0).first->second += NumOps;
      }
      auto Res = OrdersUses.try_emplace(OrdersType(), 0);
      const auto AllowsReordering = [&](const TreeEntry *TE) {
        if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
            (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
            (IgnoreReorder && TE->Idx == 0))
          return true;
        if (TE->isGather()) {
          if (GathersToOrders.contains(TE))
            return !getReorderingData(*TE, /*TopToBottom=*/false,
                                      IgnoreReorder)
                        .value_or(OrdersType(1))
                        .empty();
          return true;
        }
        return false;
      };
      if (OpTE->UserTreeIndex) {
        TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
        if (!VisitedUsers.insert(UserTE).second)
          continue;
        // May reorder user node if it requires reordering, has reused
        // scalars, is an alternate op vectorize node or its op nodes require
        // reordering.
        if (AllowsReordering(UserTE))
          continue;
        // Check if users allow reordering.
        ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Data.second;
        if (static_cast<unsigned>(count_if(
                Ops, [UserTE, &AllowsReordering](
                         const std::pair<unsigned, TreeEntry *> &Op) {
                  return AllowsReordering(Op.second) &&
                         Op.second->UserTreeIndex.UserTE == UserTE;
                })) <= Ops.size() / 2)
          ++Res.first->second;
      }
    }
    if (OrdersUses.empty()) {
      Visited.insert_range(OrderedOps);
      continue;
    }
    // Choose the most used order.
    unsigned IdentityCnt = 0;
    unsigned VF = Data.second.front().second->getVectorFactor();
    OrdersType IdentityOrder(VF, VF);
    for (auto &Pair : OrdersUses) {
      if (Pair.first.empty() || isIdentityOrder(Pair.first))
        IdentityCnt += Pair.second;
    }
    MutableArrayRef<unsigned> BestOrder = IdentityOrder;
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      // Prefer identity order. But, if filled identity found (non-empty
      // order) with same number of uses, as the new candidate order, we can
      // choose this candidate order.
      if (Cnt < Pair.second) {
        BestOrder = Pair.first;
        Cnt = Pair.second;
      }
    }
    // Set order of the user node.
    if (isIdentityOrder(BestOrder)) {
      Visited.insert_range(OrderedOps);
      continue;
    }
    fixupOrderingIndices(BestOrder);
    // Erase operands from the list and adjust their orders.
    VisitedOps.clear();
    SmallVector<int> Mask;
    inversePermutation(BestOrder, Mask);
    SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
    unsigned E = BestOrder.size();
    transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    });
    for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
      TreeEntry *TE = Op.second;
      if (!VisitedOps.insert(TE).second)
        continue;
      if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
        reorderNodeWithReuses(*TE, Mask);
        continue;
      }
      // Gathers are processed separately.
      if (TE->State != TreeEntry::Vectorize &&
          TE->State != TreeEntry::StridedVectorize &&
          TE->State != TreeEntry::CompressVectorize &&
          TE->State != TreeEntry::SplitVectorize &&
          (TE->State != TreeEntry::ScatterVectorize ||
           TE->ReorderIndices.empty()))
        continue;
      assert((BestOrder.size() == TE->ReorderIndices.size() ||
              TE->ReorderIndices.empty()) &&
             "Non-matching sizes of user/operand entries.");
      reorderOrder(TE->ReorderIndices, Mask);
      if (IgnoreReorder && TE == VectorizableTree.front().get())
        IgnoreReorder = false;
    }
    // For gathers just need to reorder its scalars.
    for (TreeEntry *Gather : GatherOps) {
      assert(Gather->ReorderIndices.empty() &&
             "Unexpected reordering of gathers.");
      if (!Gather->ReuseShuffleIndices.empty()) {
        // Just reorder reuses indices.
        reorderReuses(Gather->ReuseShuffleIndices, Mask);
        continue;
      }
      reorderScalars(Gather->Scalars, Mask);
    }
    // Reorder operands of the user node and set the ordering for the user
    // node itself.
    auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
      return TE.isAltShuffle() &&
             (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
              TE.ReorderIndices.empty());
    };
    if (Data.first->State != TreeEntry::Vectorize ||
        !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
            Data.first->getMainOp()) ||
        IsNotProfitableAltCodeNode(*Data.first))
      Data.first->reorderOperands(Mask);
    if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
        IsNotProfitableAltCodeNode(*Data.first) ||
        Data.first->State == TreeEntry::StridedVectorize ||
        Data.first->State == TreeEntry::CompressVectorize) {
      reorderScalars(Data.first->Scalars, Mask);
      reorderOrder(Data.first->ReorderIndices, MaskOrder,
                   /*BottomOrder=*/true);
      if (Data.first->ReuseShuffleIndices.empty() &&
          !Data.first->ReorderIndices.empty() &&
          !IsNotProfitableAltCodeNode(*Data.first)) {
        // Insert user node to the list to try to sink reordering deeper in
        // the graph.
        Queue.push(Data.first);
      }
    }
  }
  // If the reordering of the root node is not required - just remove it.
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
}
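
// Returns the instruction that acts as the root of the entry in program
// order; for reversed strided loads/stores that is the scalar at the
// position given by the first reorder index.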
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
  if (Entry.hasState() &&
      (Entry.getOpcode() == Instruction::Store ||
       Entry.getOpcode() == Instruction::Load) &&
      Entry.State == TreeEntry::StridedVectorize &&
      !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
    return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
  return dyn_cast<Instruction>(Entry.Scalars.front());
}
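
// Collects all scalars that are used outside the vectorizable tree (or have
// too many users) into ExternalUses, so that extractelements can be emitted
// for them after vectorization.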
void BoUpSLP::buildExternalUses(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
  DenseMap<Value *, unsigned> ScalarToExtUses;
  // A scalar with at least this many users is extracted eagerly, without
  // inspecting each user.
  const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
  // Collect the values that we need to extract from the tree.
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();
    // No need to handle users of gathered values.
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
      continue;
    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      if (!isa<Instruction>(Scalar))
        continue;
      // All uses must be replaced already? No need to do it again.
      auto It = ScalarToExtUses.find(Scalar);
      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
        continue;
      if (Scalar->hasNUsesOrMore(NumVectScalars)) {
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
                          << " from " << *Scalar << " for many users.\n");
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
        ExternalUsesWithNonUsers.insert(Scalar);
        continue;
      }
      // Check if the scalar is externally used as an extra arg.
      const auto ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
      }
      for (User *U : Scalar->users()) {
        auto *UserInst = dyn_cast<Instruction>(U);
        if (!UserInst || isDeleted(UserInst))
          continue;
        // Ignore users in the user ignore list.
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
          continue;
        // Skip in-tree scalars that become vectors.
        if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
            !UseEntries.empty()) {
          // Some in-tree scalars will remain as scalar in vectorized
          // instructions. If that is the case, the one in FoundLane will be
          // used.
          if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
                 isa<LoadInst, StoreInst>(UserInst)) ||
                isa<CallInst>(UserInst)) ||
              all_of(UseEntries, [&](TreeEntry *UseEntry) {
                return UseEntry->State == TreeEntry::ScatterVectorize ||
                       !doesInTreeUserNeedToExtract(
                           Scalar, getRootEntryInstruction(*UseEntry), TLI,
                           TTI);
              })) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                              << ".\n");
            assert(none_of(UseEntries,
                           [](TreeEntry *UseEntry) {
                             return UseEntry->isGather();
                           }) &&
                   "Scalar with extractelements not in Gather node.");
            continue;
          }
        }
        if (It != ScalarToExtUses.end()) {
          ExternalUses[It->second].User = nullptr;
          break;
        }
        if (U && Scalar->hasNUsesOrMore(UsesLimit))
          U = nullptr;
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
                          << " from lane " << FoundLane << " from " << *Scalar
                          << ".\n");
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
        ExternalUsesWithNonUsers.insert(Scalar);
        if (!U)
          break;
      }
    }
  }
}
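
// Groups the stores that use scalars of \p TE by basic block, stored type
// and underlying pointer object; each group is a candidate for forming a
// vectorized store bundle.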
SmallVector<SmallVector<StoreInst *>>
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
                SmallVector<StoreInst *>, 8>
      PtrToStoresMap;
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    // Don't iterate over the users of constant data.
    if (!isa<Instruction>(V))
      continue;
    // To save compilation time we don't visit if we have too many users.
    if (V->hasNUsesOrMore(UsesLimit))
      break;
    // Collect stores per pointer object.
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(U);
      // Test whether we can handle the store.
      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
          !isValidElementType(SI->getValueOperand()->getType()))
        continue;
      // Skip entry if already vectorized.
      if (isVectorized(U))
        continue;
      Value *Ptr =
          getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
      auto &StoresVec = PtrToStoresMap[{SI->getParent(),
                                        SI->getValueOperand()->getType(),
                                        Ptr}];
      // For now just keep one store per pointer object per lane.
      if (StoresVec.size() > Lane)
        continue;
      if (!StoresVec.empty()) {
        std::optional<int64_t> Diff = getPointersDiff(
            SI->getValueOperand()->getType(), SI->getPointerOperand(),
            SI->getValueOperand()->getType(),
            StoresVec.front()->getPointerOperand(), *DL, *SE,
            /*StrictCheck=*/true);
        // We failed to compare the pointers so just abandon this store.
        if (!Diff)
          continue;
      }
      StoresVec.push_back(SI);
    }
  }
  SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
  unsigned I = 0;
  for (auto &P : PtrToStoresMap)
    Res[I++].swap(P.second);
  return Res;
}
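
// Checks that the collected stores write to consecutive memory locations;
// on success fills \p ReorderIndices with the permutation that sorts them
// (left empty if the stores are already in order).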
bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
                            OrdersType &ReorderIndices) const {
  // To avoid calling getPointersDiff() while sorting we create a vector of
  // pairs {store, offset from first} and sort this instead.
  StoreInst *S0 = StoresVec[0];
  SmallVector<std::pair<int64_t, unsigned>> StoreOffsetVec;
  StoreOffsetVec.emplace_back(0, 0);
  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
    StoreInst *SI = StoresVec[Idx];
    std::optional<int64_t> Diff =
        getPointersDiff(S0->getValueOperand()->getType(),
                        S0->getPointerOperand(),
                        SI->getValueOperand()->getType(),
                        SI->getPointerOperand(), *DL, *SE,
                        /*StrictCheck=*/true);
    if (Diff)
      StoreOffsetVec.emplace_back(*Diff, Idx);
  }
  // Check if the stores are consecutive by checking if their difference is
  // 1.
  if (StoreOffsetVec.size() != StoresVec.size())
    return false;
  sort(StoreOffsetVec, llvm::less_first());
  unsigned Idx = 0;
  int64_t PrevDist = 0;
  for (const auto &P : StoreOffsetVec) {
    if (Idx > 0 && P.first != PrevDist + 1)
      return false;
    PrevDist = P.first;
    ++Idx;
  }
  // Calculate the shuffle indices according to their offset against the
  // sorted StoreOffsetVec.
  ReorderIndices.assign(StoresVec.size(), 0);
  bool IsIdentity = true;
  for (auto [I, P] : enumerate(StoreOffsetVec)) {
    ReorderIndices[P.second] = I;
    IsIdentity &= P.second == I;
  }
  // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
  // reorderTopToBottom() and reorderBottomToTop(), so we are done.
  if (IsIdentity)
    ReorderIndices.clear();
  return true;
}

#ifndef NDEBUG
LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
  for (unsigned Idx : Order)
    dbgs() << Idx << ", ";
  dbgs() << "\n";
}
#endif
SmallVector<BoUpSLP::OrdersType, 1>
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();

  SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);

  // Holds the reorder indices for each candidate store vector that is a user
  // of the current TreeEntry.
  SmallVector<OrdersType, 1> ExternalReorderIndices;

  // Now inspect the stores collected per pointer and look for vectorization
  // candidates. For each candidate calculate the reorder index vector and
  // push it into `ExternalReorderIndices`.
  for (ArrayRef<StoreInst *> StoresVec : Stores) {
    // If we have fewer than NumLanes stores, then we can't form a vector.
    if (StoresVec.size() != NumLanes)
      continue;
    // If the stores are not consecutive then abandon this StoresVec.
    OrdersType ReorderIndices;
    if (!canFormVector(StoresVec, ReorderIndices))
      continue;
    // We now know that the scalars in StoresVec can form a vector
    // instruction, so set the reorder indices.
    ExternalReorderIndices.push_back(ReorderIndices);
  }
  return ExternalReorderIndices;
}
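
// Entry points of the bottom-up SLP builder: reset the tree state and
// recursively build the vectorizable graph starting from the given roots.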
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        const SmallDenseSet<Value *> &UserIgnoreLst) {
  deleteTree();
  UserIgnoreList = &UserIgnoreLst;
  if (!allSameType(Roots))
    return;
  buildTreeRec(Roots, 0, EdgeInfo());
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
  deleteTree();
  if (!allSameType(Roots))
    return;
  buildTreeRec(Roots, 0, EdgeInfo());
}
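
// Clusters the loads in \p VL by base pointer (using constant pointer
// differences) and merges the resulting clusters into \p GatheredLoads for a
// later vectorization attempt.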
static void gatherPossiblyVectorizableLoads(
    const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
    ScalarEvolution &SE, const TargetTransformInfo &TTI,
    SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>>
        &GatheredLoads,
    bool AddNew = true) {
  if (VL.empty())
    return;
  SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>> ClusteredLoads;
  SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
  for (Value *V : VL) {
    auto *LI = dyn_cast<LoadInst>(V);
    if (!LI)
      continue;
    if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
      continue;
    bool IsFound = false;
    for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
      assert(LI->getParent() == Data.front().first->getParent() &&
             LI->getType() == Data.front().first->getType() &&
             "Expected loads with the same type, same parent and same "
             "underlying pointer.");
      std::optional<int64_t> Dist = getPointersDiff(
          LI->getType(), LI->getPointerOperand(),
          Data.front().first->getType(),
          Data.front().first->getPointerOperand(), DL, SE,
          /*StrictCheck=*/true);
      if (!Dist)
        continue;
      auto It = Map.find(*Dist);
      if (It != Map.end() && It->second != LI)
        continue;
      if (It == Map.end()) {
        Data.emplace_back(LI, *Dist);
        Map.try_emplace(*Dist, LI);
      }
      IsFound = true;
      break;
    }
    if (!IsFound) {
      ClusteredLoads.emplace_back().emplace_back(LI, 0);
      ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
    }
  }
  auto FindMatchingLoads =
      [&](ArrayRef<std::pair<LoadInst *, int64_t>> Loads,
          SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>>
              &GatheredLoads,
          SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
          int64_t &Offset, unsigned &Start) {
        if (Loads.empty())
          return GatheredLoads.end();
        LoadInst *LI = Loads.front().first;
        for (auto [Idx, Data] : enumerate(GatheredLoads)) {
          if (Idx < Start)
            continue;
          ToAdd.clear();
          if (LI->getParent() != Data.front().first->getParent() ||
              LI->getType() != Data.front().first->getType())
            continue;
          std::optional<int64_t> Dist =
              getPointersDiff(LI->getType(), LI->getPointerOperand(),
                              Data.front().first->getType(),
                              Data.front().first->getPointerOperand(), DL, SE,
                              /*StrictCheck=*/true);
          if (!Dist)
            continue;
          SmallSet<int64_t, 4> DataDists;
          SmallPtrSet<LoadInst *, 4> DataLoads;
          for (std::pair<LoadInst *, int64_t> P : Data) {
            DataDists.insert(P.second);
            DataLoads.insert(P.first);
          }
          // Found matching gathered loads - check if all loads are unique or
          // can be effectively vectorized.
          unsigned NumUniques = 0;
          for (auto [Cnt, Pair] : enumerate(Loads)) {
            bool Used = DataLoads.contains(Pair.first);
            if (!Used && !DataDists.contains(*Dist + Pair.second)) {
              ++NumUniques;
              ToAdd.insert(Cnt);
            } else if (Used) {
              Repeated.insert(Cnt);
            }
          }
          if (NumUniques > 0 &&
              (Loads.size() == NumUniques ||
               (Loads.size() - NumUniques >= 2 &&
                Loads.size() - NumUniques >= Loads.size() / 2 &&
                (hasFullVectorsOrPowerOf2(TTI, LI->getType(),
                                          Data.size() + NumUniques) ||
                 Data.size() + NumUniques >= MinProfitableStridedLoads)))) {
            Offset = *Dist;
            Start = Idx + 1;
            return std::next(GatheredLoads.begin(), Idx);
          }
        }
        ToAdd.clear();
        return GatheredLoads.end();
      };
  for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
    unsigned Start = 0;
    SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
    int64_t Offset = 0;
    auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
                                 Offset, Start);
    while (It != GatheredLoads.end()) {
      assert(!LocalToAdd.empty() && "Expected some elements to add.");
      for (unsigned Idx : LocalToAdd)
        It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
      ToAdd.insert_range(LocalToAdd);
      It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
                             Offset, Start);
    }
    if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
          return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
        })) {
      auto AddNewLoads =
          [&](SmallVectorImpl<std::pair<LoadInst *, int64_t>> &Loads) {
            for (unsigned Idx : seq<unsigned>(Data.size())) {
              if (ToAdd.contains(Idx) || Repeated.contains(Idx))
                continue;
              Loads.push_back(Data[Idx]);
            }
          };
      if (!AddNew) {
        LoadInst *LI = Data.front().first;
        auto *It = find_if(
            GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
              return PD.front().first->getParent() == LI->getParent() &&
                     PD.front().first->getType() == LI->getType();
            });
        while (It != GatheredLoads.end()) {
          AddNewLoads(*It);
          It = std::find_if(
              std::next(It), GatheredLoads.end(),
              [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
                return PD.front().first->getParent() == LI->getParent() &&
                       PD.front().first->getType() == LI->getType();
              });
        }
        continue;
      }
      GatheredLoads.emplace_back().append(Data.begin(), Data.end());
      AddNewLoads(GatheredLoads.emplace_back());
    }
  }
}
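
// Tries to build vectorized load nodes (consecutive, strided, compressed or
// masked-gather) out of the clustered loads collected above, retrying with
// smaller vector factors and a final masked-gather-only pass.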
void BoUpSLP::tryToVectorizeGatheredLoads(
    const SmallMapVector<
        std::tuple<BasicBlock *, Value *, Type *>,
        SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
        &GatheredLoads) {
  GatheredLoadsEntriesFirst = VectorizableTree.size();

  SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
      LoadEntriesToVectorize.size());
  for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
    Set.insert_range(VectorizableTree[Idx]->Scalars);

  // Sort loads by distance.
  auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
                       const std::pair<LoadInst *, int64_t> &L2) {
    return L1.second > L2.second;
  };

  auto IsMaskedGatherSupported = [&](ArrayRef<LoadInst *> Loads) {
    Align Alignment = computeCommonAlignment<LoadInst>(ArrayRef<Value *>(
        reinterpret_cast<Value *const *>(Loads.begin()), Loads.size()));
    auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
    return TTI->isLegalMaskedGather(Ty, Alignment) &&
           !TTI->forceScalarizeMaskedGather(Ty, Alignment);
  };

  auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
                                    SmallPtrSetImpl<const Value *>
                                        &VectorizedLoads,
                                    SmallVectorImpl<LoadInst *> &NonVectorized,
                                    bool Final, unsigned MaxVF) {
    SmallVector<std::pair<ArrayRef<Value *>, LoadsState>, 4> Results;
    unsigned StartIdx = 0;
    SmallVector<int> CandidateVFs;
    for (int NumElts = getFloorFullVectorNumberOfElements(
             *TTI, Loads.front()->getType(), MaxVF);
         NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
                          *TTI, Loads.front()->getType(), NumElts - 1))
      CandidateVFs.push_back(NumElts);
    if (Final && CandidateVFs.empty())
      return Results;
    unsigned BestVF = Final ? CandidateVFs.back() : 0;
    for (unsigned NumElts : CandidateVFs) {
      if (Final && NumElts > BestVF)
        continue;
      SmallVector<unsigned> MaskedGatherVectorized;
      for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E; ++Cnt) {
        ArrayRef<LoadInst *> Slice =
            ArrayRef(Loads).slice(Cnt, std::min<unsigned>(NumElts, E - Cnt));
        if (VectorizedLoads.count(Slice.front()) ||
            VectorizedLoads.count(Slice.back()) ||
            areKnownNonVectorizableLoads(Slice))
          continue;
        // Check if it is profitable to try vectorizing gathered loads.
        bool AllowToVectorize = false;
        // Check if it is profitable to vectorize 2-element loads.
        if (NumElts == 2) {
          bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
              Slice.front()->getType(), ElementCount::getFixed(NumElts));
          auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
            for (LoadInst *LI : Slice) {
              // If single use/user - allow to vectorize.
              if (LI->hasOneUse())
                continue;
              // 1. Check if number of uses equals number of users.
              // 2. All users are deleted.
              // 3. The load broadcasts are not allowed or the load is not
              // broadcasted.
              if (static_cast<unsigned int>(std::distance(
                      LI->user_begin(), LI->user_end())) != LI->getNumUses())
                return false;
              if (!IsLegalBroadcastLoad)
                continue;
              for (User *U : LI->users()) {
                if (auto *UI = dyn_cast<Instruction>(U);
                    UI && isDeleted(UI))
                  continue;
                for (const TreeEntry *UTE : getTreeEntries(U)) {
                  for (int I : seq<int>(UTE->getNumOperands())) {
                    if (all_of(UTE->getOperand(I), [LI](Value *V) {
                          return V == LI || isa<PoisonValue>(V);
                        }))
                      // Found legal broadcast - do not vectorize.
                      return false;
                  }
                }
              }
            }
            return true;
          };
          AllowToVectorize = CheckIfAllowed(Slice);
        } else {
          AllowToVectorize =
              (NumElts >= 3 ||
               any_of(ValueToGatherNodes.at(Slice.front()),
                      [=](const TreeEntry *TE) {
                        return TE->Scalars.size() == 2 &&
                               ((TE->Scalars.front() == Slice.front() &&
                                 TE->Scalars.back() == Slice.back()) ||
                                (TE->Scalars.front() == Slice.back() &&
                                 TE->Scalars.back() == Slice.front()));
                      })) &&
              hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
                                       Slice.size());
        }
        if (AllowToVectorize) {
          SmallVector<Value *> PointerOps;
          OrdersType CurrentOrder;
          // Try to build vector load.
          ArrayRef<Value *> Values(
              reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
          StridedPtrInfo SPtrInfo;
          LoadsState LS = canVectorizeLoads(Values, Slice.front(),
                                            CurrentOrder, PointerOps,
                                            SPtrInfo, &BestVF);
          if (LS != LoadsState::Gather ||
              (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
            if (LS == LoadsState::ScatterVectorize) {
              if (MaskedGatherVectorized.empty() ||
                  Cnt >= MaskedGatherVectorized.back() + NumElts)
                MaskedGatherVectorized.push_back(Cnt);
              continue;
            }
            if (LS != LoadsState::Gather) {
              Results.emplace_back(Values, LS);
              VectorizedLoads.insert_range(Slice);
              // If we vectorized initial block, no need to try to vectorize
              // it again.
              if (Cnt == StartIdx)
                StartIdx += NumElts;
            }
            // Check if the whole array was vectorized already - exit.
            if (StartIdx >= Loads.size())
              break;
            // Erase last masked gather candidate, if another candidate within
            // the range is found to be better.
            if (!MaskedGatherVectorized.empty() &&
                Cnt < MaskedGatherVectorized.back() + NumElts)
              MaskedGatherVectorized.pop_back();
            Cnt += NumElts - 1;
            continue;
          }
        }
        if (!AllowToVectorize || BestVF == 0)
          registerNonVectorizableLoads(Slice);
      }
      // Mark masked gathers candidates as vectorized, if any.
      for (unsigned Cnt : MaskedGatherVectorized) {
        ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
            Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
        ArrayRef<Value *> Values(
            reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
        Results.emplace_back(Values, LoadsState::ScatterVectorize);
        VectorizedLoads.insert_range(Slice);
        // If we vectorized initial block, no need to try to vectorize it
        // again.
        if (Cnt == StartIdx)
          StartIdx += NumElts;
      }
    }
    for (LoadInst *LI : Loads) {
      if (!VectorizedLoads.contains(LI))
        NonVectorized.push_back(LI);
    }
    return Results;
  };
  auto ProcessGatheredLoads =
      [&, &TTI = *TTI](
          ArrayRef<SmallVector<std::pair<LoadInst *, int64_t>>> GatheredLoads,
          bool Final = false) {
        SmallVector<LoadInst *> NonVectorized;
        for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
             GatheredLoads) {
          if (LoadsDists.size() <= 1) {
            NonVectorized.push_back(LoadsDists.back().first);
            continue;
          }
          SmallVector<std::pair<LoadInst *, int64_t>> LocalLoadsDists(
              LoadsDists);
          SmallVector<LoadInst *> OriginalLoads(LocalLoadsDists.size());
          transform(LoadsDists, OriginalLoads.begin(),
                    [](const std::pair<LoadInst *, int64_t> &L) {
                      return L.first;
                    });
          stable_sort(LocalLoadsDists, LoadSorter);
          SmallVector<LoadInst *> Loads;
          unsigned MaxConsecutiveDistance = 0;
          unsigned CurrentConsecutiveDist = 1;
          int64_t LastDist = LocalLoadsDists.front().second;
          bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
          for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
            if (isVectorized(L.first))
              continue;
            assert(LastDist >= L.second &&
                   "Expected first distance always not less than second");
            if (static_cast<uint64_t>(LastDist - L.second) ==
                CurrentConsecutiveDist) {
              ++CurrentConsecutiveDist;
              MaxConsecutiveDistance =
                  std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
              Loads.push_back(L.first);
              continue;
            }
            if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
                !Loads.empty())
              Loads.pop_back();
            CurrentConsecutiveDist = 1;
            LastDist = L.second;
            Loads.push_back(L.first);
          }
          if (Loads.size() <= 1)
            continue;
          if (AllowMaskedGather)
            MaxConsecutiveDistance = Loads.size();
          else if (MaxConsecutiveDistance < 2)
            continue;
          BoUpSLP::ValueSet VectorizedLoads;
          SmallVector<LoadInst *> SortedNonVectorized;
          SmallVector<std::pair<ArrayRef<Value *>, LoadsState>, 4> Results =
              GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
                                  Final, MaxConsecutiveDistance);
          if (!Results.empty() && !SortedNonVectorized.empty() &&
              OriginalLoads.size() == Loads.size() &&
              MaxConsecutiveDistance == Loads.size()) {
            // Retry with the original (unsorted) order of the loads.
            VectorizedLoads.clear();
            SmallVector<LoadInst *> UnsortedNonVectorized;
            SmallVector<std::pair<ArrayRef<Value *>, LoadsState>, 4>
                UnsortedResults =
                    GetVectorizedRanges(OriginalLoads, VectorizedLoads,
                                        UnsortedNonVectorized, Final,
                                        OriginalLoads.size());
            if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
              SortedNonVectorized.swap(UnsortedNonVectorized);
              Results.swap(UnsortedResults);
            }
          }
          for (auto [Slice, State] : Results) {
            LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
                              << Slice.size() << ")\n");
            if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
              for (Value *L : Slice)
                if (!isVectorized(L))
                  SortedNonVectorized.push_back(cast<LoadInst>(L));
              continue;
            }
            // Select maximum VF as a maximum of user gathered nodes and
            // distance between scalar loads in these nodes.
            unsigned MaxVF = Slice.size();
            unsigned UserMaxVF = 0;
            unsigned InterleaveFactor = 0;
            if (MaxVF == 2) {
              UserMaxVF = MaxVF;
            } else {
              // Found distance between segments of the interleaved loads.
              std::optional<unsigned> InterleavedLoadsDistance = 0;
              unsigned Order = 0;
              std::optional<unsigned> CommonVF = 0;
              DenseMap<const TreeEntry *, unsigned> EntryToPosition;
              SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
              for (auto [Idx, V] : enumerate(Slice)) {
                for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
                  UserMaxVF = std::max<unsigned>(UserMaxVF,
                                                 E->Scalars.size());
                  unsigned Pos =
                      EntryToPosition.try_emplace(E, Idx).first->second;
                  UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
                  if (CommonVF) {
                    if (*CommonVF == 0) {
                      CommonVF = E->Scalars.size();
                      continue;
                    }
                    if (*CommonVF != E->Scalars.size())
                      CommonVF.reset();
                  }
                  // Check if the load is the part of the interleaved load.
                  if (Pos != Idx && InterleavedLoadsDistance) {
                    if (!DeinterleavedNodes.contains(E) &&
                        any_of(E->Scalars, [&, Slice = Slice](Value *V) {
                          if (isa<Constant>(V))
                            return false;
                          if (isVectorized(V))
                            return true;
                          const auto &Nodes = ValueToGatherNodes.at(V);
                          return (Nodes.size() != 1 || !Nodes.contains(E)) &&
                                 !is_contained(Slice, V);
                        })) {
                      InterleavedLoadsDistance.reset();
                      continue;
                    }
                    DeinterleavedNodes.insert(E);
                    if (*InterleavedLoadsDistance == 0) {
                      InterleavedLoadsDistance = Idx - Pos;
                      continue;
                    }
                    if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
                        (Idx - Pos) / *InterleavedLoadsDistance < Order)
                      InterleavedLoadsDistance.reset();
                    Order =
                        (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
                  }
                }
              }
              DeinterleavedNodes.clear();
              // Check if the large load represents interleaved load
              // operation.
              if (InterleavedLoadsDistance.value_or(0) > 1 &&
                  CommonVF.value_or(0) != 0) {
                InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
                unsigned VF = *CommonVF;
                OrdersType Order;
                SmallVector<Value *> PointerOps;
                StridedPtrInfo SPtrInfo;
                // Segmented load detected - vectorize at maximum vector
                // factor.
                if (InterleaveFactor <= Slice.size() &&
                    TTI.isLegalInterleavedAccessType(
                        getWidenedType(Slice.front()->getType(), VF),
                        InterleaveFactor,
                        cast<LoadInst>(Slice.front())->getAlign(),
                        cast<LoadInst>(Slice.front())
                            ->getPointerAddressSpace()) &&
                    canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
                                      SPtrInfo) == LoadsState::Vectorize) {
                  UserMaxVF = InterleaveFactor * VF;
                } else {
                  InterleaveFactor = 0;
                }
              }
              // Cannot represent the loads as consecutive vectorizable nodes
              // - just exit.
              unsigned ConsecutiveNodesSize = 0;
              if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
                  any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                         [&, Slice = Slice](const auto &P) {
                           const auto *It = find_if(Slice, [&](Value *V) {
                             return std::get<1>(P).contains(V);
                           });
                           if (It == Slice.end())
                             return false;
                           const TreeEntry &TE =
                               *VectorizableTree[std::get<0>(P)];
                           ArrayRef<Value *> VL = TE.Scalars;
                           OrdersType Order;
                           SmallVector<Value *> PointerOps;
                           StridedPtrInfo SPtrInfo;
                           LoadsState State = canVectorizeLoads(
                               VL, VL.front(), Order, PointerOps, SPtrInfo);
                           if (State == LoadsState::ScatterVectorize)
                             return false;
                           ConsecutiveNodesSize += VL.size();
                           size_t Start = std::distance(Slice.begin(), It);
                           size_t Sz = Slice.size() - Start;
                           return Sz < VL.size() ||
                                  Slice.slice(Start, VL.size()) != VL;
                         }))
                continue;
              // Try to build long masked gather loads.
              UserMaxVF = bit_ceil(UserMaxVF);
              if (InterleaveFactor == 0 &&
                  any_of(seq<unsigned>(Slice.size() / UserMaxVF),
                         [&, Slice = Slice](unsigned Idx) {
                           OrdersType Order;
                           SmallVector<Value *> PointerOps;
                           StridedPtrInfo SPtrInfo;
                           return canVectorizeLoads(
                                      Slice.slice(Idx * UserMaxVF, UserMaxVF),
                                      Slice[Idx * UserMaxVF], Order,
                                      PointerOps,
                                      SPtrInfo) ==
                                  LoadsState::ScatterVectorize;
                         }))
                UserMaxVF = MaxVF;
              if (Slice.size() != ConsecutiveNodesSize)
                MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
            }
            for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
              bool IsVectorized = true;
              for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
                ArrayRef<Value *> SubSlice =
                    Slice.slice(I, std::min(VF, E - I));
                if (isVectorized(SubSlice.front()))
                  continue;
                // Check if the subslice is a to-be-vectorized entry, which is
                // not equal to this entry.
                if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                           [&](const auto &P) {
                             return !SubSlice.equals(
                                        VectorizableTree[std::get<0>(P)]
                                            ->Scalars) &&
                                    set_is_subset(SubSlice, std::get<1>(P));
                           }))
                  continue;
                unsigned Sz = VectorizableTree.size();
                buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
                if (Sz == VectorizableTree.size()) {
                  IsVectorized = false;
                  // Try non-interleaved vectorization with smaller vector
                  // factor.
                  if (InterleaveFactor > 0) {
                    VF = 2 * (MaxVF / InterleaveFactor);
                    InterleaveFactor = 0;
                  }
                  continue;
                }
              }
              if (IsVectorized)
                break;
            }
          }
          NonVectorized.append(SortedNonVectorized);
        }
        return NonVectorized;
      };
  for (const auto &GLs : GatheredLoads) {
    const auto &Ref = GLs.second;
    SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
    if (!Ref.empty() && !NonVectorized.empty() &&
        std::accumulate(
            Ref.begin(), Ref.end(), 0u,
            [](unsigned S,
               ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
                -> unsigned { return S + LoadsDists.size(); }) !=
            NonVectorized.size() &&
        IsMaskedGatherSupported(NonVectorized)) {
      SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>
          FinalGatheredLoads;
      for (LoadInst *LI : NonVectorized) {
        // Reinsert non-vectorized loads into the list of loads with the same
        // base pointer.
        gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
                                        FinalGatheredLoads,
                                        /*AddNew=*/false);
      }
      // Final attempt to vectorize non-vectorized loads.
      (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
    }
  }
  // Try to vectorize postponed load entries, previously marked as gathered.
  for (unsigned Idx : LoadEntriesToVectorize) {
    const TreeEntry &E = *VectorizableTree[Idx];
    SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
    // Avoid reordering, if possible.
    if (!E.ReorderIndices.empty()) {
      // Build a mask out of the reorder indices and reorder scalars per this
      // mask.
      SmallVector<int> ReorderMask;
      inversePermutation(E.ReorderIndices, ReorderMask);
      reorderScalars(GatheredScalars, ReorderMask);
    }
    buildTreeRec(GatheredScalars, 0, EdgeInfo());
  }
  // If no new entries created, consider it as no gathered loads entries must
  // be handled.
  if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
      VectorizableTree.size())
    GatheredLoadsEntriesFirst.reset();
}
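
// Computes a (key, subkey) hash pair used to group values that may be
// combined into one vectorizable bundle: same opcode family, alternating
// opcodes, compatible loads or GEPs with the same base pointer.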
static std::pair<size_t, size_t> generateKeySubkey(
    Value *V, const TargetLibraryInfo *TLI,
    function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
    bool AllowAlternate) {
  hash_code Key = hash_value(V->getValueID() + 2);
  hash_code SubKey = hash_value(V);
  if (auto *I = dyn_cast<Instruction>(V)) {
    if (AllowAlternate && isValidForAlternation(I->getOpcode()))
      Key = hash_value(I->isBinaryOp() ? 1 : 0);
    else
      Key = hash_combine(hash_value(I->getOpcode()), Key);
    if (auto *CI = dyn_cast<CmpInst>(I)) {
      std::pair<size_t, size_t> OpVals =
          generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
                            /*AllowAlternate=*/false);
      Key = hash_combine(OpVals.first, Key);
      if (CI->isCommutative())
        SubKey = hash_combine(hash_value(I->getOpcode()), SubKey);
    }
    if (auto *Gep = dyn_cast<GetElementPtrInst>(I))
      SubKey = hash_value(Gep->getPointerOperand());
  }
  return std::make_pair(Key, SubKey);
}

/// Checks if the specified instruction \p I is an alternate operation for
/// the given \p MainOp and \p AltOp instructions.
static bool isAlternateInstruction(const Instruction *I,
                                   const Instruction *MainOp,
                                   const Instruction *AltOp,
                                   const TargetLibraryInfo &TLI);
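
// Estimates whether vectorizing an alternate-opcode bundle is profitable by
// comparing the expected number of vector instructions and extra shuffles
// against the cost of building the operands as gathers.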
bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
                                       ArrayRef<Value *> VL) const {
  Type *ScalarTy = S.getMainOp()->getType();
  unsigned Opcode0 = S.getOpcode();
  unsigned Opcode1 = S.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
  // If this pattern is supported by the target then consider it profitable.
  if (TTI->isLegalAltInstr(getWidenedType(ScalarTy, VL.size()), Opcode0,
                           Opcode1, OpcodeMask))
    return true;
  SmallVector<ValueList> Operands;
  for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
    Operands.emplace_back();
    // Prepare the operand vector.
    for (Value *V : VL) {
      auto *Inst = dyn_cast<Instruction>(V);
      if (!Inst) {
        Operands.back().push_back(
            PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
        continue;
      }
      Operands.back().push_back(Inst->getOperand(I));
    }
  }
  if (Operands.size() == 2) {
    // Try to find the best candidates for the operand pairs.
    for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
      SmallVector<std::pair<Value *, Value *>> Candidates(3);
      Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
      Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
      Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
      std::optional<int> Res = findBestRootPair(Candidates);
      switch (Res.value_or(0)) {
      case 0:
        break;
      case 1:
        std::swap(Operands[0][I + 1], Operands[1][I + 1]);
        break;
      case 2:
        std::swap(Operands[0][I], Operands[1][I]);
        break;
      default:
        llvm_unreachable("Unexpected index.");
      }
    }
  }
  DenseSet<unsigned> UniqueOpcodes;
  constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
  unsigned NonInstCnt = 0;
  // Estimate number of instructions, required for the vectorized node and
  // for the buildvector node.
  unsigned UndefCnt = 0;
  // Count the number of extra shuffles, required for vector nodes.
  unsigned ExtraShuffleInsts = 0;
  // Check that operands do not contain same values and create either perfect
  // diamond match or shuffled match.
  if (Operands.size() == 2) {
    // Do not count same operands twice.
    if (Operands.front() == Operands.back()) {
      Operands.erase(Operands.begin());
    } else if (!allConstant(Operands.front()) &&
               all_of(Operands.front(), [&](Value *V) {
                 return is_contained(Operands.back(), V);
               })) {
      Operands.erase(Operands.begin());
      ++ExtraShuffleInsts;
    }
  }
  const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
  // Vectorize node, if:
  // 1. at least single operand is constant or splat.
  // 2. Operands have many loop invariants (the instructions are not loop
  // invariants).
  // 3. At least single unique operands is supposed to vectorized.
  return none_of(Operands,
                 [&](ArrayRef<Value *> Op) {
                   if (allConstant(Op) ||
                       (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
                        getSameOpcode(Op, *TLI)))
                     return false;
                   DenseMap<Value *, unsigned> Uniques;
                   for (Value *V : Op) {
                     if (isa<Constant, ExtractElementInst>(V) ||
                         isVectorized(V) || (L && L->isLoopInvariant(V))) {
                       if (isa<UndefValue>(V))
                         ++UndefCnt;
                       continue;
                     }
                     auto Res = Uniques.try_emplace(V, 0);
                     // Found first duplicate - need to add shuffle.
                     if (!Res.second && Res.first->second == 1)
                       ++ExtraShuffleInsts;
                     ++Res.first->getSecond();
                     if (auto *I = dyn_cast<Instruction>(V))
                       UniqueOpcodes.insert(I->getOpcode());
                     else if (Res.second)
                       ++NonInstCnt;
                   }
                   return none_of(Uniques, [&](const auto &P) {
                     return P.first->hasNUsesOrMore(P.second + 1) &&
                            none_of(P.first->users(), [&](User *U) {
                              return isVectorized(U) || Uniques.contains(U);
                            });
                   });
                 }) ||
         (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
           NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
}
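
// Computes the costs of the vector intrinsic and of the equivalent vector
// library call for a call bundle, so the caller can pick the cheaper form.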
static std::pair<InstructionCost, InstructionCost>
getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
                   TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
                   ArrayRef<Type *> ArgTys) {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  // Calculate the cost of the scalar and vector calls.
  FastMathFlags FMF;
  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
    FMF = FPCI->getFastMathFlags();
  IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF);
  InstructionCost IntrinsicCost =
      TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
  InstructionCost LibCost = TTI->getCallInstrCost(
      nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
  return std::make_pair(IntrinsicCost,
                        LibCost.isValid() ? LibCost : IntrinsicCost);
}
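
// Per-opcode legality checks deciding how a bundle will be vectorized:
// returns the tree entry state (Vectorize, Strided/Scatter/Compress
// vectorize for loads, or NeedToGather on failure).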
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
    const InstructionsState &S, ArrayRef<Value *> VL,
    bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
  assert(S.getMainOp() &&
         "Expected instructions with same/alternate opcodes only.");
  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  Instruction *VL0 = S.getMainOp();
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Too many operands - gather, most probably won't be vectorized.
    if (VL0->getNumOperands() > MaxPHINumOperands)
      return TreeEntry::NeedToGather;
    // Check for terminator values (e.g. invoke).
    for (Value *V : VL) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;
      for (Value *Incoming : PHI->incoming_values()) {
        auto *Term = dyn_cast<Instruction>(Incoming);
        if (Term && Term->isTerminator()) {
          LLVM_DEBUG(dbgs()
                     << "SLP: Need to swizzle PHINodes (terminator use).\n");
          return TreeEntry::NeedToGather;
        }
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ExtractElement:
    if (any_of(VL, [](Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          return EE && isa<ScalableVectorType>(EE->getVectorOperandType());
        }))
      return TreeEntry::NeedToGather;
    [[fallthrough]];
  case Instruction::ExtractValue: {
    bool Reuse = canReuseExtract(VL, CurrentOrder);
    // FIXME: support reused scalars better.
    if (!isValidElementType(VL0->getType()))
      return TreeEntry::NeedToGather;
    if (Reuse || !CurrentOrder.empty())
      return TreeEntry::Vectorize;
    LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::InsertElement: {
    // Check that we have a buildvector and not a shuffle of 2 or more
    // different vectors.
    ValueSet SourceVectors;
    for (Value *V : VL) {
      if (isa<PoisonValue>(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
        return TreeEntry::NeedToGather;
      }
      SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
      assert(getElementIndex(V) != std::nullopt &&
             "Non-constant or undef index?");
    }
    if (count_if(VL, [&SourceVectors](Value *V) {
          return !SourceVectors.contains(V);
        }) >= 2) {
      // Found 2nd source vector - cancel.
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "different source vectors.\n");
      return TreeEntry::NeedToGather;
    }
    if (any_of(VL, [&SourceVectors](Value *V) {
          // The last InsertElement can have multiple uses.
          return SourceVectors.contains(V) && !V->hasOneUse();
        })) {
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "multiple uses.\n");
      return TreeEntry::NeedToGather;
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Load: {
    // Check that a vectorized load would load the same memory as a scalar
    // load. For example, we don't want to vectorize loads that are smaller
    // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>}
    // LLVM treats loading/storing it as an i8 struct. If we vectorize
    // loads/stores from such a struct, we read/write packed bits disagreeing
    // with the unvectorized version.
    auto IsGatheredNode = [&]() {
      if (!GatheredLoadsEntriesFirst)
        return false;
      return any_of(VL, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return false;
        return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
          return TE->Idx >= *GatheredLoadsEntriesFirst;
        });
      });
    };
    switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
    case LoadsState::Vectorize:
      return TreeEntry::Vectorize;
    case LoadsState::CompressVectorize:
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::CompressVectorize;
    case LoadsState::ScatterVectorize:
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::ScatterVectorize;
    case LoadsState::StridedVectorize:
      if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::StridedVectorize;
    case LoadsState::Gather:
#ifndef NDEBUG
      Type *ScalarTy = VL0->getType();
      if (DL->getTypeSizeInBits(ScalarTy) !=
          DL->getTypeAllocSizeInBits(ScalarTy))
        LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
      else if (any_of(VL, [](Value *V) {
                 auto *LI = dyn_cast<LoadInst>(V);
                 return !LI || !LI->isSimple();
               }))
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
      else
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
#endif // NDEBUG
      registerNonVectorizableLoads(VL);
      return TreeEntry::NeedToGather;
    }
    llvm_unreachable("Unexpected state of loads");
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    Type *SrcTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      if (isa<PoisonValue>(V))
        continue;
      Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
      if (Ty != SrcTy || !isValidElementType(Ty)) {
        LLVM_DEBUG(
            dbgs() << "SLP: Gathering casts with different src types.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpPredicate P0 = cast<CmpInst>(VL0)->getPredicate();
    CmpPredicate SwapP0 = CmpInst::getSwappedPredicate(P0);
    Type *ComparedTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      if (isa<PoisonValue>(V))
        continue;
      auto *Cmp = cast<CmpInst>(V);
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze:
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && I->isBinaryOp() && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::GetElementPtr: {
    // We don't combine GEPs with complicated (nested) indexing.
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I)
        continue;
      if (I->getNumOperands() != 2) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }
    // We can't combine several GEPs into one vector if they operate on
    // different types.
    Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
    for (Value *V : VL) {
      auto *GEP = dyn_cast<GEPOperator>(V);
      if (!GEP)
        continue;
      Type *CurTy = GEP->getSourceElementType();
      if (Ty0 != CurTy) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
        return TreeEntry::NeedToGather;
      }
    }
    // We don't combine GEPs with non-constant indexes.
    Type *Ty1 = VL0->getOperand(1)->getType();
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I)
        continue;
      auto *Op = I->getOperand(1);
      if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
          (Op->getType() != Ty1 &&
           ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
            Op->getType()->getScalarSizeInBits() >
                DL->getIndexSizeInBits(
                    V->getType()->getPointerAddressSpace())))) {
        LLVM_DEBUG(
            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Store: {
    // Check if the stores are consecutive or if we need to swizzle them.
    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
    // Avoid types that are padded when being allocated as scalars, while
    // being packed together in a vector (such as i1).
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    }
    // Make sure all stores in the bundle are simple - we can't vectorize
    // atomic or volatile stores.
    for (Value *V : VL) {
      auto *SI = cast<StoreInst>(V);
      if (!SI->isSimple()) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
        return TreeEntry::NeedToGather;
      }
      PointerOps.push_back(SI->getPointerOperand());
    }
    // Check the order of pointer operands.
    if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
      Value *Ptr0;
      Value *PtrN;
      if (CurrentOrder.empty()) {
        Ptr0 = PointerOps.front();
        PtrN = PointerOps.back();
      } else {
        Ptr0 = PointerOps[CurrentOrder.front()];
        PtrN = PointerOps[CurrentOrder.back()];
      }
      std::optional<int64_t> Dist =
          getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
      // Check that the sorted pointer operands are consecutive.
      if (Dist && static_cast<uint64_t>(*Dist) == VL.size() - 1)
        return TreeEntry::Vectorize;
    }
    LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::Call: {
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    CallInst *CI = cast<CallInst>(VL0);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    VFShape Shape = VFShape::get(
        CI->getFunctionType(),
        ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
        /*HasGlobalPred=*/false);
    Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
    if (!VecFunc && !isTriviallyVectorizable(ID)) {
      LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
      return TreeEntry::NeedToGather;
    }
    Function *F = CI->getCalledFunction();
    unsigned NumArgs = CI->arg_size();
    SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
    for (unsigned J = 0; J != NumArgs; ++J)
      if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI))
        ScalarArgs[J] = CI->getArgOperand(J);
    for (Value *V : VL) {
      auto *CI2 = dyn_cast<CallInst>(V);
      if (!CI2 || CI2->getCalledFunction() != F ||
          getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
          (VecFunc &&
           VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
          !CI->hasIdenticalOperandBundleSchema(*CI2)) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
                          << "\n");
        return TreeEntry::NeedToGather;
      }
      // Some intrinsics have scalar arguments and should be same in order
      // for them to be vectorized.
      for (unsigned J = 0; J != NumArgs; ++J) {
        if (ScalarArgs[J]) {
          Value *A1J = CI2->getArgOperand(J);
          if (ScalarArgs[J] != A1J) {
            LLVM_DEBUG(dbgs()
                       << "SLP: mismatched arguments in call:" << *CI
                       << " argument " << ScalarArgs[J] << "!=" << A1J
                       << "\n");
            return TreeEntry::NeedToGather;
          }
        }
      }
      // Verify that the bundle operands are identical between the two calls.
      if (CI->hasOperandBundles() &&
          !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
                      CI->op_begin() + CI->getBundleOperandsEndIndex(),
                      CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
                          << *CI << "!=" << *V << '\n');
        return TreeEntry::NeedToGather;
      }
    }
    auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, {});
    if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  }
  case Instruction::ShuffleVector: {
    if (!S.isAltShuffle()) {
      // REVEC can support non alternate shuffle.
      if (SLPReVec && getShufflevectorNumGroups(VL))
        return TreeEntry::Vectorize;
      // If this is not an alternate sequence of opcode like add-sub
      // then do not vectorize this instruction.
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
    }
    if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
      LLVM_DEBUG(
          dbgs()
          << "SLP: ShuffleVector not vectorized, operands are buildvector and "
             "the whole alt sequence is not profitable.\n");
      return TreeEntry::NeedToGather;
    }
    return TreeEntry::Vectorize;
  }
  default:
    LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
    return TreeEntry::NeedToGather;
  }
}
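
// Helper that gathers PHI operands per incoming block, keeping the operand
// order consistent across all PHIs of the bundle even when the incoming
// blocks are listed in different order in the individual PHIs.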
namespace {
class PHIHandler {
  DominatorTree &DT;
  PHINode *Main = nullptr;
  SmallVector<Value *> Phis;
  SmallVector<SmallVector<Value *>> Operands;

public:
  PHIHandler() = delete;
  PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
      : DT(DT), Main(Main), Phis(Phis),
        Operands(Main->getNumIncomingValues(),
                 SmallVector<Value *>(Phis.size(), nullptr)) {}
  void buildOperands() {
    constexpr unsigned FastLimit = 4;
    if (Main->getNumIncomingValues() <= FastLimit) {
      for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
        BasicBlock *InBB = Main->getIncomingBlock(I);
        if (!DT.isReachableFromEntry(InBB)) {
          Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
          continue;
        }
        // Prepare the operand vector.
        for (auto [Idx, V] : enumerate(Phis)) {
          auto *P = dyn_cast<PHINode>(V);
          if (!P) {
            assert(isa<PoisonValue>(V) &&
                   "Expected isa instruction or poison value.");
            Operands[I][Idx] = V;
            continue;
          }
          if (P->getIncomingBlock(I) == InBB)
            Operands[I][Idx] = P->getIncomingValue(I);
          else
            Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
        }
      }
      return;
    }
    SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
    for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues()))
      Blocks.try_emplace(Main->getIncomingBlock(I))
          .first->second.push_back(I);
    for (auto [Idx, V] : enumerate(Phis)) {
      if (isa<PoisonValue>(V)) {
        for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
          Operands[I][Idx] = V;
        continue;
      }
      auto *P = cast<PHINode>(V);
      for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
        BasicBlock *InBB = P->getIncomingBlock(I);
        if (InBB == Main->getIncomingBlock(I)) {
          Operands[I][Idx] = P->getIncomingValue(I);
          continue;
        }
        auto *It = Blocks.find(InBB);
        if (It == Blocks.end())
          continue;
        Operands[It->second.front()][Idx] = P->getIncomingValue(I);
      }
    }
    for (const auto &P : Blocks) {
      ArrayRef<unsigned> IncomingValues = P.second;
      if (IncomingValues.size() <= 1)
        continue;
      unsigned BasicI = IncomingValues.front();
      for (unsigned I : IncomingValues.drop_front()) {
        assert(all_of(enumerate(Operands[I]),
                      [&](const auto &Data) {
                        return !Data.value() ||
                               Data.value() ==
                                   Operands[BasicI][Data.index()];
                      }) &&
               "Expected empty operands list.");
        Operands[I] = Operands[BasicI];
      }
    }
  }
  ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
};
} // namespace
static std::pair<Instruction *, Instruction *>
getMainAltOpsNoStateVL(ArrayRef<Value *> VL) {
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;
  for (Value *V : VL) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      continue;
    if (!MainOp) {
      MainOp = I;
      continue;
    }
    if (MainOp->getOpcode() == I->getOpcode())
      continue;
    if (!AltOp) {
      AltOp = I;
      continue;
    }
    if (AltOp->getOpcode() != I->getOpcode())
      return {};
  }
  if (!MainOp || !AltOp)
    return {};
  assert(MainOp != AltOp &&
         "Expected different main and alt instructions.");
  return std::make_pair(MainOp, AltOp);
}
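
// Detects duplicated scalars in the bundle and builds the reuse-shuffle
// mask; optionally pads the unique scalars with poison up to a full
// register size.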
static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
                                SmallVectorImpl<int> &ReuseShuffleIndices,
                                const TargetTransformInfo &TTI,
                                const TargetLibraryInfo &TLI,
                                const InstructionsState &S,
                                const BoUpSLP::EdgeInfo &UserTreeIdx,
                                bool TryPad = false) {
  // Check that every instruction appears once in this bundle.
  SmallVector<Value *> UniqueValues;
  SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
  for (Value *V : VL) {
    if (isConstant(V)) {
      ReuseShuffleIndices.emplace_back(
          isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
      UniqueValues.emplace_back(V);
      continue;
    }
    auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
    ReuseShuffleIndices.emplace_back(Res.first->second);
    if (Res.second)
      UniqueValues.emplace_back(V);
  }
  size_t NumUniqueScalarValues = UniqueValues.size();
  bool IsFullVectors = hasFullVectorsOrPowerOf2(
      TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
  if (NumUniqueScalarValues == VL.size() &&
      (VectorizeNonPowerOf2 || IsFullVectors)) {
    ReuseShuffleIndices.clear();
  } else {
    // FIXME: Reshuffling scalars is not supported yet for "copyable"
    // elements.
    if ((UserTreeIdx.UserTE &&
         UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
        !hasFullVectorsOrPowerOf2(TTI, VL.front()->getType(), VL.size())) {
      LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                           "for nodes with padding.\n");
      ReuseShuffleIndices.clear();
      return false;
    }
    LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
    if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
        (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
           return isa<UndefValue>(V) || !isConstant(V);
         }))) {
      if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
          S.getMainOp()->isSafeToRemove() &&
          (S.areInstructionsWithCopyableElements() ||
           all_of(UniqueValues, IsaPred<Instruction, PoisonValue>))) {
        // Find the number of elements, which forms full vectors.
        unsigned PWSz = getFullVectorNumberOfElements(
            TTI, UniqueValues.front()->getType(), UniqueValues.size());
        PWSz = std::min<unsigned>(PWSz, VL.size());
        if (PWSz == VL.size()) {
          // We ended up with the same size after the padding, cancel it.
          ReuseShuffleIndices.clear();
        } else {
          // Pad unique values with poison to grow the vector to a full
          // register size.
          SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
                                                  UniqueValues.end());
          PaddedUniqueValues.append(
              PWSz - UniqueValues.size(),
              PoisonValue::get(UniqueValues.front()->getType()));
          // Check that the padded bundle still has a consistent state.
          if (!S.areInstructionsWithCopyableElements() &&
              !getSameOpcode(PaddedUniqueValues, TLI)) {
            ReuseShuffleIndices.clear();
            return false;
          }
          VL = std::move(PaddedUniqueValues);
          return true;
        }
      }
      LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
      ReuseShuffleIndices.clear();
      return false;
    }
    VL = std::move(UniqueValues);
  }
  return true;
}
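
// Checks if an alternate-opcode bundle should be split into two separate
// nodes (one per opcode) and computes the reorder indices for the split,
// based on the relative costs of the blended and the split forms.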
bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
                                const InstructionsState &LocalState,
                                SmallVectorImpl<Value *> &Op1,
                                SmallVectorImpl<Value *> &Op2,
                                OrdersType &ReorderIndices) const {
  constexpr unsigned SmallNodeSize = 4;
  if (VL.size() <= SmallNodeSize ||
      TTI->preferAlternateOpcodeVectorization() ||
      !SplitAlternateInstructions)
    return false;

  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
                    << ".\n");
  // Check if this is a duplicate of another split entry.
  for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
    if (E->isSame(VL)) {
      LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
                        << *LocalState.getMainOp() << ".\n");
      return false;
    }
  }

  ReorderIndices.assign(VL.size(), VL.size());
  SmallBitVector Op1Indices(VL.size());
  for (auto [Idx, V] : enumerate(VL)) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I) {
      Op1.push_back(V);
      Op1Indices.set(Idx);
      continue;
    }
    if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
         I->getOpcode() == LocalState.getOpcode()) ||
        (LocalState.getAltOpcode() == LocalState.getOpcode() &&
         !isAlternateInstruction(I, LocalState.getMainOp(),
                                 LocalState.getAltOp(), *TLI))) {
      Op1.push_back(V);
      Op1Indices.set(Idx);
      continue;
    }
    Op2.push_back(V);
  }
  Type *ScalarTy = getValueType(VL.front());
  VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
  unsigned Opcode0 = LocalState.getOpcode();
  unsigned Opcode1 = LocalState.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
  // Enable split node, only if all nodes do not form a legal alternate
  // instruction (like X86 addsub).
  SmallPtrSet<Value *, 4> UOp1(llvm::from_range, Op1);
  SmallPtrSet<Value *, 4> UOp2(llvm::from_range, Op2);
  if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
      TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
      !hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), Op1.size()) ||
      !hasFullVectorsOrPowerOf2(*TTI, Op2.front()->getType(), Op2.size()))
    return false;
  // Build the reorder indices: first Op1 elements, then Op2 elements.
  unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
  for (unsigned Idx : seq<unsigned>(VL.size())) {
    if (Op1Indices.test(Idx)) {
      ReorderIndices[Op1Cnt] = Idx;
      ++Op1Cnt;
    } else {
      ReorderIndices[Op2Cnt] = Idx;
      ++Op2Cnt;
    }
  }
  if (isIdentityOrder(ReorderIndices))
    ReorderIndices.clear();
  SmallVector<int> Mask;
  if (!ReorderIndices.empty())
    inversePermutation(ReorderIndices, Mask);
  unsigned NumParts = TTI->getNumberOfParts(VecTy);
  VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
  VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
  // Check non-profitable case, when the mask requires more than one register.
  if (NumParts >= VL.size())
    return false;
  constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
  InstructionCost InsertCost = ::getShuffleCost(
      *TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
  FixedVectorType *SubVecTy =
      getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
  InstructionCost NewShuffleCost =
      ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
  if (!LocalState.isCmpOp() && NumParts <= 1 &&
      (Mask.empty() || InsertCost >= NewShuffleCost))
    return false;
  // For alternate binops/casts/unary ops compare the cost of a single
  // blended vector op against two narrower ops plus the shuffle.
  if ((LocalState.getMainOp()->isBinaryOp() &&
       LocalState.getAltOp()->isBinaryOp() &&
       (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
        LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
      (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
      (LocalState.getMainOp()->isUnaryOp() &&
       LocalState.getAltOp()->isUnaryOp())) {
    InstructionCost OriginalVecOpsCost =
        TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
        TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
    SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
    for (unsigned Idx : seq<unsigned>(VL.size())) {
      if (isa<PoisonValue>(VL[Idx]))
        continue;
      OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
    }
    InstructionCost OriginalCost =
        OriginalVecOpsCost + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
                                              VecTy, OriginalMask, Kind);
    InstructionCost NewVecOpsCost =
        TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
        TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
    InstructionCost NewCost =
        NewVecOpsCost + InsertCost +
        (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
                 VectorizableTree.front()->getOpcode() == Instruction::Store
             ? NewShuffleCost
             : 0);
    // If not profitable to split - exit.
    if (NewCost >= OriginalCost)
      return false;
  }
  return true;
}
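
// Analysis that models "copyable" elements: values that do not match the
// main opcode but can be rewritten as an idempotent instance of it (for
// example x -> x + 0), allowing the bundle to be vectorized anyway.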
class InstructionsCompatibilityAnalysis {
  DominatorTree &DT;
  const DataLayout &DL;
  const TargetTransformInfo &TTI;
  const TargetLibraryInfo &TLI;
  unsigned MainOpcode = 0;
  Instruction *MainOp = nullptr;

  /// Checks if the opcode is supported as the main opcode for copyable
  /// elements.
  static bool isSupportedOpcode(const unsigned Opcode) {
    return Opcode == Instruction::Add || Opcode == Instruction::LShr;
  }

  /// Identifies the best candidate value, which represents main opcode
  /// semantics.
  void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
    BasicBlock *Parent = nullptr;
    // Checks if the instruction has a supported opcode.
    auto IsSupportedInstruction = [&](Instruction *I) {
      return I && isSupportedOpcode(I->getOpcode()) &&
             !R.isVectorized(I);
    };
    // Exclude operands of the candidate instructions immediately, they
    // cannot act as the main instruction.
    SmallDenseSet<Value *, 8> Operands;
    SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
    for (Value *V : VL) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I || !DT.isReachableFromEntry(I->getParent()))
        continue;
      if (Candidates.empty()) {
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Parent = I->getParent();
        Operands.insert(I->op_begin(), I->op_end());
        continue;
      }
      if (Parent == I->getParent()) {
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Operands.insert(I->op_begin(), I->op_end());
        continue;
      }
      auto *NodeA = DT.getNode(Parent);
      auto *NodeB = DT.getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
        Candidates.clear();
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Parent = I->getParent();
        Operands.clear();
        Operands.insert(I->op_begin(), I->op_end());
      }
    }
    unsigned BestOpcodeNum = 0;
    for (const auto &P : Candidates) {
      if (P.second.size() < BestOpcodeNum)
        continue;
      for (Instruction *I : P.second) {
        if (IsSupportedInstruction(I) && !Operands.contains(I)) {
          MainOp = I;
          MainOpcode = I->getOpcode();
          BestOpcodeNum = P.second.size();
          break;
        }
      }
    }
    if (!MainOp)
      return;
    // The main instruction must come from the same block as the other
    // same-opcode candidates.
    if (none_of(VL, [&](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && I->getParent() == MainOp->getParent() &&
                 I->getOpcode() == MainOpcode;
        })) {
      MainOp = nullptr;
      MainOpcode = 0;
    }
  }
  /// Returns the idempotent value for the detected main opcode (e.g. 0 for
  /// add and lshr), used to substitute copyable elements.
  Value *selectBestIdempotentValue() const {
    assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
    return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
                                          !MainOp->isCommutative());
  }

  /// Returns the operand list for the given value \p V: the original
  /// operands for a regular instruction, or {V, idempotent value} for a
  /// copyable element.
  SmallVector<Value *> getOperands(const InstructionsState &S,
                                   Value *V) const {
    if (!S.isCopyableElement(V))
      return convertTo(cast<Instruction>(V), S).second;
    assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
    return {V, selectBestIdempotentValue()};
  }

  /// Builds operands for the original instructions.
  void
  buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
                        SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
    const unsigned NumOperands = S.getMainOp()->getNumOperands();
    unsigned ShuffleOrOp =
        S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector
                         : S.getOpcode();
    Instruction *VL0 = S.getMainOp();
    switch (ShuffleOrOp) {
    case Instruction::PHI: {
      auto *PH = cast<PHINode>(VL0);
      PHIHandler Handler(DT, PH, VL);
      Handler.buildOperands();
      Operands.assign(PH->getNumOperands(), {});
      for (unsigned I : seq<unsigned>(PH->getNumOperands()))
        Operands[I].assign(Handler.getOperands(I).begin(),
                           Handler.getOperands(I).end());
      return;
    }
    case Instruction::ExtractValue:
    case Instruction::ExtractElement:
      // This is a special case, as it does not gather, but at the same time
      // we are not extending buildTreeRec() towards the operands.
      Operands.assign(1, {VL.size(), VL0->getOperand(0)});
      return;
    case Instruction::InsertElement:
      Operands.assign(2, {VL.size(), nullptr});
      for (auto [Idx, V] : enumerate(VL)) {
        auto *IE = cast<InsertElementInst>(V);
        for (auto [OpIdx, Ops] : enumerate(Operands))
          Ops[Idx] = IE->getOperand(OpIdx);
      }
      return;
    case Instruction::Load:
      Operands.assign(
          1, {VL.size(),
              PoisonValue::get(
                  cast<LoadInst>(VL0)->getPointerOperand()->getType())});
      for (auto [V, Op] : zip(VL, Operands.back())) {
        auto *LI = dyn_cast<LoadInst>(V);
        if (!LI)
          continue;
        Op = LI->getPointerOperand();
      }
      return;
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast:
    case Instruction::ICmp:
    case Instruction::FCmp:
    case Instruction::Select:
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor:
    case Instruction::Freeze:
    case Instruction::Store:
    case Instruction::ShuffleVector: {
      Operands.assign(NumOperands, {VL.size(), nullptr});
      for (auto [Idx, V] : enumerate(VL)) {
        if (isa<PoisonValue>(V)) {
          for (auto [OpIdx, Ops] : enumerate(Operands))
            Ops[Idx] = PoisonValue::get(VL0->getOperand(OpIdx)->getType());
          continue;
        }
        auto [Op, ConvertedOps] = convertTo(cast<Instruction>(V), S);
        for (auto [OpIdx, Ops] : enumerate(Operands))
          Ops[Idx] = ConvertedOps[OpIdx];
      }
      return;
    }
    case Instruction::GetElementPtr: {
      Operands.assign(2, {VL.size(), nullptr});
      // Need to cast all indices to the same type before vectorization to
      // avoid crash.
      const unsigned IndexIdx = 1;
      Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
      Type *Ty = all_of(VL,
                        [VL0Ty](Value *V) {
                          auto *GEP = dyn_cast<GetElementPtrInst>(V);
                          return !GEP ||
                                 VL0Ty == GEP->getOperand(IndexIdx)->getType();
                        })
                     ? VL0Ty
                     : DL.getIndexType(cast<GetElementPtrInst>(VL0)
                                           ->getPointerOperandType()
                                           ->getScalarType());
      for (auto [Idx, V] : enumerate(VL)) {
        auto *GEP = dyn_cast<GetElementPtrInst>(V);
        if (!GEP) {
          Operands[0][Idx] = V;
          Operands[1][Idx] = ConstantInt::getNullValue(Ty);
          continue;
        }
        Operands[0][Idx] = GEP->getPointerOperand();
        auto *Op = GEP->getOperand(IndexIdx);
        auto *CI = dyn_cast<ConstantInt>(Op);
        Operands[1][Idx] = CI ? ConstantFoldIntegerCast(
                                    CI, Ty, CI->getValue().isSignBitSet(), DL)
                              : Op;
      }
      return;
    }
    case Instruction::Call: {
      auto *CI = cast<CallInst>(VL0);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, &TLI);
      for (unsigned Idx : seq<unsigned>(CI->arg_size())) {
        if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, &TTI))
          continue;
        auto &Ops = Operands.emplace_back();
        for (Value *V : VL) {
          auto *I = dyn_cast<CallInst>(V);
          Ops.push_back(I ? I->getOperand(Idx)
                          : PoisonValue::get(
                                CI->getOperand(Idx)->getType()));
        }
      }
      return;
    }
    default:
      break;
    }
    llvm_unreachable("Unexpected vectorization of the instructions.");
  }
public:
  InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
                                    const TargetTransformInfo &TTI,
                                    const TargetLibraryInfo &TLI)
      : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}

  InstructionsState
  buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
                         bool TryCopyableElementsVectorization,
                         bool WithProfitabilityCheck = false,
                         bool SkipSameCodeCheck = false) {
    InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
                              ? InstructionsState::invalid()
                              : getSameOpcode(VL, TLI);
    if (S)
      return S;
    if (!TryCopyableElementsVectorization)
      return S;
    findAndSetMainInstruction(VL, R);
    if (!MainOp)
      return InstructionsState::invalid();
    S = InstructionsState(MainOp, MainOp, /*HasCopyableElements=*/true);
    if (!WithProfitabilityCheck)
      return S;
    // Check if the vectorization of the copyable bundle is profitable.
    SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
    auto BuildCandidates =
        [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates,
           Value *V1, Value *V2) {
          auto *I1 = dyn_cast<Instruction>(V1);
          auto *I2 = dyn_cast<Instruction>(V2);
          if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
              I1->getParent() != I2->getParent())
            return;
          Candidates.emplace_back(V1, V2);
        };
    if (VL.size() == 2) {
      // Check if the operands allow better vectorization.
      SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
      BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
      BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
      bool Res = !Candidates1.empty() && !Candidates2.empty() &&
                 R.findBestRootPair(Candidates1) &&
                 R.findBestRootPair(Candidates2);
      if (!Res && isCommutative(MainOp)) {
        Candidates1.clear();
        Candidates2.clear();
        BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
        BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
        Res = !Candidates1.empty() && !Candidates2.empty() &&
              R.findBestRootPair(Candidates1) &&
              R.findBestRootPair(Candidates2);
      }
      if (!Res)
        return InstructionsState::invalid();
      // Compare the cost of the blended vector op against the scalar ops it
      // replaces.
      constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
      FixedVectorType *VecTy =
          getWidenedType(MainOp->getType(), VL.size());
      switch (MainOpcode) {
      case Instruction::Add:
      case Instruction::LShr: {
        InstructionCost ScalarCost = TTI.getInstructionCost(MainOp, Kind);
        InstructionCost VectorCost =
            TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
        if (VectorCost > ScalarCost)
          return InstructionsState::invalid();
        break;
      }
      default:
        break;
      }
      return S;
    }
    assert(Operands.size() == 2 && "Unexpected number of operands!");
    unsigned CopyableNum =
        count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
    if (CopyableNum < VL.size() / 2)
      return S;
    // Too many phi copyables - exit.
    const unsigned Limit = VL.size() / 24;
    if ((CopyableNum >= VL.size() - Limit ||
         (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
         CopyableNum >= MaxPHINumOperands) &&
        all_of(VL, [&](Value *V) {
          return isa<PHINode>(V) || !S.isCopyableElement(V);
        }))
      return InstructionsState::invalid();
    // Check the operand of the copyable node: it should either consist of
    // repeated values or form a vectorizable bundle itself.
    auto CheckOperand = [&](ArrayRef<Value *> Ops) {
      constexpr unsigned Limit = 4;
      if (Ops.size() >= Limit) {
        SmallDenseMap<const Value *, unsigned> Counters;
        for (Value *V : Ops)
          ++Counters[V];
        if (all_of(Counters, [](const auto &C) { return C.second == 1; }))
          return false;
      }
      InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
      InstructionsState OpS = Analysis.buildInstructionsState(
          Ops, R, /*TryCopyableElementsVectorization=*/true);
      if (!OpS ||
          (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
        return false;
      unsigned CopyableNum =
          count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
      return CopyableNum <= VL.size() / 2;
    };
    if (!CheckOperand(Operands.front()))
      return InstructionsState::invalid();
    return S;
  }

  SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
                                                ArrayRef<Value *> VL) {
    assert(S && "Invalid state!");
    SmallVector<BoUpSLP::ValueList> Operands;
    if (S.areInstructionsWithCopyableElements()) {
      MainOp = S.getMainOp();
      MainOpcode = S.getOpcode();
      Operands.assign(MainOp->getNumOperands(),
                      BoUpSLP::ValueList(VL.size(), nullptr));
      for (auto [Idx, V] : enumerate(VL)) {
        SmallVector<Value *> OperandsForValue = getOperands(S, V);
        for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
          Operands[OperandIdx][Idx] = Operand;
      }
    } else {
      buildOriginalOperands(S, VL, Operands);
    }
    return Operands;
  }
};
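
// Performs the generic (opcode-independent) legality checks for a bundle:
// recursion depth, duplicate entries, ephemeral and ignored values,
// unprofitable small alternate nodes and unschedulable blocks.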
BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
    ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
    bool TryCopyableElementsVectorization) const {
  assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");

  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(
      VL, *this, TryCopyableElementsVectorization,
      /*WithProfitabilityCheck=*/true, TryCopyableElementsVectorization);

  // Don't go into catchswitch blocks, which can happen with PHIs.
  // Such blocks can only have PHIs and the catchswitch. There is no
  // place to insert a shuffle if we need to, so just avoid that issue.
  if (S &&
      isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
    LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
                                        /*TryToFindDuplicates=*/false);
  }

  // Check if this is a duplicate of another entry.
  if (S) {
    LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp()
                      << ".\n");
    for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
      if (E->isSame(VL)) {
        LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
                          << *S.getMainOp() << ".\n");
        return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
      }
    }
    // Avoid vectorizing loop PHIs if some of the scalars are already
    // vectorized.
    if (any_of(VL, [&](Value *V) {
          return S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
                 LI->getLoopFor(S.getMainOp()->getParent()) &&
                 isVectorized(V);
        })) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering due to vectorized loop PHI.\n");
      return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
    }
  }

  // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext
  // of a load), in which case peek through to include it in the tree,
  // without ballooning over-budget.
  if (Depth >= RecursionMaxDepth &&
      !(S && !S.isAltShuffle() && VL.size() >= 4 &&
        (match(S.getMainOp(), m_Load(m_Value())) ||
         all_of(VL, [&S](const Value *I) {
           return match(I,
                        m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
                  cast<Instruction>(I)->getOpcode() == S.getOpcode();
         })))) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
  }

  // Don't handle scalable vectors.
  if (S && S.getOpcode() == Instruction::ExtractElement &&
      isa<ScalableVectorType>(
          cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
  }

  // Don't handle vectors.
  if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
                                        /*TryToFindDuplicates=*/false);
  }

  // If alternate op node with 2 elements with gathered operands - do not
  // vectorize.
  auto &&NotProfitableForVectorization = [&S, this,
                                          Depth](ArrayRef<Value *> VL) {
    if (!S || !S.isAltShuffle() || VL.size() > 2)
      return false;
    if (VectorizableTree.size() < MinTreeSize)
      return false;
    if (Depth >= RecursionMaxDepth - 1)
      return true;
    // Check if all operands are extracts, part of vector node or can build a
    // regular vectorize node.
    SmallVector<unsigned, 8> InstsCount;
    for (Value *V : VL) {
      auto *I = cast<Instruction>(V);
      InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
        return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
      }));
    }
    bool IsCommutative =
        isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
    if ((IsCommutative &&
         std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
        (!IsCommutative &&
         all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
      return true;
    assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    auto *I1 = cast<Instruction>(VL.front());
    auto *I2 = cast<Instruction>(VL.back());
    for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    if (static_cast<unsigned>(count_if(
            Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
              return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
            })) >= S.getMainOp()->getNumOperands() / 2)
      return false;
    if (S.getMainOp()->getNumOperands() > 2)
      return true;
    if (IsCommutative) {
      // Check permuted operands.
      Candidates.clear();
      for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
        Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                               I2->getOperand((Op + 1) % E));
      if (any_of(
              Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
                return findBestRootPair(Cand,
                                        LookAheadHeuristics::ScoreSplat);
              }))
        return false;
    }
    return true;
  };
  SmallVector<unsigned> SortedIndices;
  BasicBlock *BB = nullptr;
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  bool AreAllSameBlock = S.valid();
  bool AreScatterAllGEPSameBlock =
      (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
       VL.size() > 2 &&
       all_of(VL,
              [&BB](Value *V) {
                auto *I = dyn_cast<GetElementPtrInst>(V);
                if (!I)
                  return doesNotNeedToBeScheduled(V);
                if (!BB)
                  BB = I->getParent();
                return BB == I->getParent() && I->getNumOperands() == 2;
              }) &&
       BB &&
       sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL,
                       *SE, SortedIndices));
  bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
  if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
      (S &&
       isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
           S.getMainOp()) &&
       !all_of(VL, isVectorLikeInstWithConstOps)) ||
      NotProfitableForVectorization(VL)) {
    if (!S) {
      LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
                           "C,S,B,O, small shuffle. \n");
      return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
                                          /*TryToFindDuplicates=*/true,
                                          /*TrySplitVectorize=*/true);
    }
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
  }

  // Don't vectorize ephemeral values.
  if (S && !EphValues.empty()) {
    for (Value *V : VL) {
      if (EphValues.count(V)) {
        LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                          << ") is ephemeral.\n");
        return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
                                            /*TryToFindDuplicates=*/false);
      }
    }
  }

  // Check that none of the instructions in the bundle are already in the
  // tree and the node may be not profitable for the vectorization as the
  // small alternate node.
  if (S && S.isAltShuffle()) {
    auto GetNumVectorizedExtracted = [&]() {
      APInt Extracted = APInt::getZero(VL.size());
      APInt Vectorized = APInt::getAllOnes(VL.size());
      for (auto [Idx, V] : enumerate(VL)) {
        auto *I = dyn_cast<Instruction>(V);
        if (!I || doesNotNeedToBeScheduled(I) ||
            all_of(I->operands(), [&](const Use &U) {
              return isa<ExtractElementInst>(U.get());
            }))
          continue;
        if (isVectorized(I))
          Vectorized.clearBit(Idx);
        else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
          Extracted.setBit(Idx);
      }
      return std::make_pair(Vectorized, Extracted);
    };
    auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
    constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
    bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
    if (!Vectorized.isAllOnes() && !PreferScalarize) {
      // Rather roughly estimate the costs: if it is better to extract the
      // values from the vector nodes and scalarize - prefer to gather.
      Type *ScalarTy = VL.front()->getType();
      auto *VecTy = getWidenedType(ScalarTy, VL.size());
      InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
          *TTI, ScalarTy, VecTy, Extracted,
          /*Insert=*/false, /*Extract=*/true, Kind);
      InstructionCost VectorizeCostEstimate = ::getScalarizationOverhead(
          *TTI, ScalarTy, VecTy, Vectorized,
          /*Insert=*/true, /*Extract=*/false, Kind, /*ForPoisonSrc=*/false);
      PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
    }
    if (PreferScalarize) {
      LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
                           "node is not profitable.\n");
      return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
    }
  }

  // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
  if (UserIgnoreList && !UserIgnoreList->empty()) {
    for (Value *V : VL) {
      if (UserIgnoreList->contains(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
        return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
      }
    }
  }

  // Special processing for sorted pointers for ScatterVectorize node with
  // constant indices only.
  if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
    assert(VL.front()->getType()->isPointerTy() &&
           count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
           "Expected pointers only.");
    // Reset S to make it GetElementPtr kind of node.
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    assert(It != VL.end() && "Expected at least one GEP.");
    S = getSameOpcode(*It, *TLI);
  }

  // Don't go into unreachable blocks. They may contain instructions with
  // dependency cycles which confuse the final scheduling.
  BasicBlock *ParentBB = S.getMainOp()->getParent();
  if (S.getMainOp()->isTerminator() ||
      isa_and_nonnull<UnreachableInst>(ParentBB->getTerminator()) ||
      !DT->isReachableFromEntry(ParentBB)) {
    LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
  }
  return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
}
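
// Recursively builds the vectorizable tree for the bundle: tries split
// nodes, checks legality and per-opcode state, schedules the bundle and
// creates the tree entry plus its operand nodes.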
void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
                           const EdgeInfo &UserTreeIdx,
                           unsigned InterleaveFactor) {
  assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!");

  SmallVector<int> ReuseShuffleIndices;
  SmallVector<Value *> VL(VLRef.begin(), VLRef.end());

  // Tries to build a split node.
  auto TrySplitNode = [&](const InstructionsState &LocalState) {
    SmallVector<Value *> Op1, Op2;
    OrdersType ReorderIndices;
    if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
      return false;

    auto Invalid = ScheduleBundle::invalid();
    auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid,
                            LocalState, UserTreeIdx, {}, ReorderIndices);
    LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
    auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
      InstructionsState S = getSameOpcode(Op, *TLI);
      if (S && (isa<LoadInst>(S.getMainOp()) ||
                getSameValuesTreeEntry(S.getMainOp(), Op,
                                       /*SameVF=*/true))) {
        // Build gather node for loads, they will be gathered later.
        TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
                                                    Idx == 0 ? 0
                                                             : Op1.size());
        (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S,
                           {TE, Idx});
      } else {
        TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
                                                    Idx == 0 ? 0
                                                             : Op1.size());
        buildTreeRec(Op, Depth, {TE, Idx});
      }
    };
    AddNode(Op1, 0);
    AddNode(Op2, 1);
    return true;
  };

  // If all of the operands are constants or PHIs, the bundle is gathered.
  auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
    bool AreConsts = false;
    for (Value *V : VL) {
      if (isa<PoisonValue>(V))
        continue;
      if (isa<Constant>(V)) {
        AreConsts = true;
        continue;
      }
      if (!isa<PHINode>(V))
        return false;
    }
    return AreConsts;
  };
  if (AreOnlyConstsWithPHIs(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
    newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
    return;
  }

  ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
      VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
  InstructionsState S = Legality.getInstructionsState();
  if (!Legality.isLegal()) {
    if (Legality.trySplitVectorize()) {
      auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
      // Last chance to try to vectorize alternate node.
      if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
        return;
    }
    if (!S)
      Legality = getScalarsVectorizationLegality(
          VL, Depth, UserTreeIdx,
          /*TryCopyableElementsVectorization=*/true);
    if (!Legality.isLegal()) {
      if (Legality.tryToFindDuplicates())
        tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
                            UserTreeIdx);
      newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
      return;
    }
    S = Legality.getInstructionsState();
  }

  // FIXME: investigate if there are profitable cases for VL.size() <= 4.
  if (S.isAltShuffle() && TrySplitNode(S))
    return;

  // Check that every instruction appears once in this bundle.
  if (!tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
                           UserTreeIdx, /*TryPad=*/true)) {
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    return;
  }

  // Perform specific checks for each particular instruction kind.
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  OrdersType CurrentOrder;
  SmallVector<Value *> PointerOps;
  StridedPtrInfo SPtrInfo;
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
  if (State == TreeEntry::NeedToGather) {
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    return;
  }

  Instruction *VL0 = S.getMainOp();
  BasicBlock *BB = VL0->getParent();
  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef)
    BSRef = std::make_unique<BlockScheduling>(BB);
  BlockScheduling &BS = *BSRef;

  SetVector<Value *> UniqueValues(VL.begin(), VL.end());
  std::optional<ScheduleBundle *> BundlePtr =
      BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
#ifdef EXPENSIVE_CHECKS
  // Make sure we didn't break any internal invariants.
  BS.verify();
#endif
  if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    // Last chance to try to vectorize alternate node.
    if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
      return;
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    NonScheduledFirst.insert(VL.front());
    if (S.getOpcode() == Instruction::Load &&
        BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
      registerNonVectorizableLoads(VL);
    return;
  }
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
  ScheduleBundle Empty;
  ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
    // Postpone PHI nodes creation.
    SmallVector<unsigned> PHIOps;
    for (unsigned I : seq<unsigned>(Operands.size())) {
      ArrayRef<Value *> Op = Operands[I];
      if (Op.empty())
        continue;
      InstructionsState S = getSameOpcode(Op, *TLI);
      if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
        buildTreeRec(Op, Depth + 1, {TE, I});
      else
        PHIOps.push_back(I);
    }
    for (unsigned I : PHIOps)
      buildTreeRec(Operands[I], Depth + 1, {TE, I});
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
               TE->dump());

    TE->setOperands(Operands);
    CreateOperandNodes(TE, Operands);
    return;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    if (CurrentOrder.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
    } else {
      LLVM_DEBUG({
        dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
                  "with order";
        for (unsigned Idx : CurrentOrder)
          dbgs() << " " << Idx;
        dbgs() << "\n";
      });
    }
    // Insert new order with initial value 0, if it does not exist,
    // otherwise return the iterator to the existing one.
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
                         "(ExtractValueInst/ExtractElementInst).\n";
               TE->dump());
    TE->setOperands(Operands);
    return;
  }
  case Instruction::InsertElement: {
    assert(ReuseShuffleIndices.empty() && "All inserts should be unique");

    auto OrdCompare = [](const std::pair<int, int> &P1,
                         const std::pair<int, int> &P2) {
      return P1.first > P2.first;
    };
    PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
                  decltype(OrdCompare)>
        Indices(OrdCompare);
    for (int I = 0, E = VL.size(); I < E; ++I) {
      unsigned Idx = *getElementIndex(VL[I]);
      Indices.emplace(Idx, I);
    }
    OrdersType CurrentOrder(VL.size(), VL.size());
    bool IsIdentity = true;
    for (int I = 0, E = VL.size(); I < E; ++I) {
      CurrentOrder[Indices.top().second] = I;
      IsIdentity &= Indices.top().second == I;
      Indices.pop();
    }
    if (IsIdentity)
      CurrentOrder.clear();
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 {}, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
               TE->dump());

    TE->setOperands(Operands);
    buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
    return;
  }
  case Instruction::Load: {
    TreeEntry *TE = nullptr;
    switch (State) {
    case TreeEntry::Vectorize:
      TE = newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices,
                        CurrentOrder, InterleaveFactor);
      if (CurrentOrder.empty())
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n");
      else
        LLVM_DEBUG(dbgs()
                   << "SLP: added a new TreeEntry (jumbled LoadInst).\n");
      break;
    case TreeEntry::CompressVectorize:
      // Vectorizing non-consecutive loads with a masked load + compress.
      TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      LLVM_DEBUG(
          dbgs()
          << "SLP: added a new TreeEntry (masked LoadInst + compress).\n");
      break;
    case TreeEntry::StridedVectorize:
      // Vectorizing non-consecutive loads with a strided load.
      TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n");
      break;
    case TreeEntry::ScatterVectorize:
      // Vectorizing non-consecutive loads with a masked gather.
      TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices);
      LLVM_DEBUG(
          dbgs()
          << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n");
      break;
    case TreeEntry::CombinedVectorize:
    case TreeEntry::SplitVectorize:
    case TreeEntry::NeedToGather:
      llvm_unreachable("Unexpected loads state.");
    }
    if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
      assert(Operands.size() == 1 && "Expected a single operand only");
      SmallVector<int> Mask;
      // ...
    }
    if (State == TreeEntry::ScatterVectorize)
      buildTreeRec(PointerOps, Depth + 1, {TE, 0});
    return;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
        std::make_pair(std::numeric_limits<unsigned>::min(),
                       std::numeric_limits<unsigned>::max()));
    if (ShuffleOrOp == Instruction::ZExt ||
        ShuffleOrOp == Instruction::SExt) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             PrevMaxBW),
          std::min<unsigned>(
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
              PrevMinBW));
    } else if (ShuffleOrOp == Instruction::Trunc) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
              PrevMaxBW),
          std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             PrevMinBW));
    }
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n");

    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    if (ShuffleOrOp == Instruction::Trunc) {
      ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    } else if (ShuffleOrOp == Instruction::SIToFP ||
               ShuffleOrOp == Instruction::UIToFP) {
      unsigned NumSignBits =
          ComputeNumSignBits(VL0->getOperand(0), *DL, AC);
      if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
        APInt Mask = DB->getDemandedBits(OpI);
        NumSignBits = std::max(NumSignBits, Mask.countl_zero());
      }
      if (NumSignBits * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    }
    return;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    // ...
    assert(P0 == CmpInst::getSwappedPredicate(P0) &&
           "Commutative Predicate mismatch");
    // ...
    // Collect operands - commute if the compare uses the swapped predicate.
    if (Cmp->getPredicate() != P0)
      std::swap(LHS, RHS);
    // ...
    if (ShuffleOrOp == Instruction::ICmp) {
      unsigned NumSignBits0 =
          ComputeNumSignBits(VL0->getOperand(0), *DL, AC);
      if (NumSignBits0 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      unsigned NumSignBits1 =
          ComputeNumSignBits(VL0->getOperand(1), *DL, AC);
      if (NumSignBits1 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
    }
    return;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(
        dbgs() << "SLP: added a new TreeEntry "
                  "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n");
    // ...
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  case Instruction::GetElementPtr: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n");
    // ...
    return;
  }
  case Instruction::Store: {
    bool Consecutive = CurrentOrder.empty();
    if (!Consecutive)
      fixupOrderingIndices(CurrentOrder);
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    if (Consecutive)
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n");
    else
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n");
    buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
    return;
  }
  case Instruction::Call: {
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n");
    // ...
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  case Instruction::ShuffleVector: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    if (S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n");
    } else {
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n");
    }
    // Reorder operands if reordering would enable vectorization.
    // ...
    assert(P0 != AltP0 && "Expected different main/alternate predicates.");
    // ...
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  }
unsigned BoUpSLP::canMapToVector(Type *T) const {
  unsigned N = 1;
  Type *EltTy = T;

  while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
    if (auto *ST = dyn_cast<StructType>(EltTy)) {
      // Check that struct is homogeneous.
      for (const auto *Ty : ST->elements())
        if (Ty != *ST->element_begin())
          return 0;
      N *= ST->getNumElements();
      EltTy = *ST->element_begin();
    } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
      N *= AT->getNumElements();
      EltTy = AT->getElementType();
    } else {
      auto *VT = cast<FixedVectorType>(EltTy);
      N *= VT->getNumElements();
      EltTy = VT->getElementType();
    }
  }

  if (!isValidElementType(EltTy))
    return 0;
  size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
    return 0;
  return N;
}
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
                              SmallVectorImpl<unsigned> &CurrentOrder,
                              bool ResizeAllowed) const {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
  assert(It != VL.end() && "Expected at least one extract instruction.");
  auto *E0 = cast<Instruction>(*It);
  // Check if all of the extracts come from the same vector and from the
  // same position.
  Value *Vec = E0->getOperand(0);

  CurrentOrder.clear();

  // We have to extract from a vector/aggregate with the same number of
  // elements.
  unsigned NElts;
  if (E0->getOpcode() == Instruction::ExtractValue) {
    NElts = canMapToVector(Vec->getType());
    // ...
  } else {
    NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
  }

  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
    return false;
  SmallVector<int> Indices(E, PoisonMaskElem);
  unsigned MinIdx = NElts, MaxIdx = 0;
  for (auto [I, V] : enumerate(VL)) {
    auto *Inst = dyn_cast<Instruction>(V);
    if (!Inst)
      continue;
    if (Inst->getOperand(0) != Vec)
      return false;
    std::optional<unsigned> Idx = getExtractIndex(Inst);
    if (!Idx)
      return false;
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
      continue;
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
      MinIdx = ExtIdx;
    if (MaxIdx < ExtIdx)
      MaxIdx = ExtIdx;
  }
  if (MaxIdx - MinIdx + 1 > E)
    return false;
  if (MaxIdx + 1 <= E)
    MinIdx = 0;

  // Check that all of the indices extract from the correct offset.
  bool ShouldKeepOrder = true;
  // Assign to all items the initial value E, so we can check if the extract
  // instruction index was used already.
  CurrentOrder.assign(E, E);
  for (unsigned I = 0; I < E; ++I) {
    if (Indices[I] == PoisonMaskElem)
      continue;
    const unsigned ExtIdx = Indices[I] - MinIdx;
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
      return false;
    }
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  }
  if (ShouldKeepOrder)
    CurrentOrder.clear();

  return ShouldKeepOrder;
}
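
// Returns true if every user of instruction \p I is either vectorized
// already, is a vector-like instruction with constant operands, or is an
// extractelement that is known to be gathered, so the scalar instruction can
// be dropped after vectorization.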
bool BoUpSLP::areAllUsersVectorized(
    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
  return (I->hasOneUse() &&
          (!VectorizedVals || VectorizedVals->contains(I))) ||
         all_of(I->users(), [this](User *U) {
           return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
         });
}
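
// Builds the shuffle mask that blends this tree entry's main- and
// alternate-opcode values: lanes taken from the alternate operation are
// offset by the number of scalars (Sz), and the result is remapped through
// the reuse shuffle indices when present.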
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {
  unsigned Sz = Scalars.size();
  Mask.assign(Sz, PoisonMaskElem);
  SmallVector<int> OrderMask;
  if (!ReorderIndices.empty())
    inversePermutation(ReorderIndices, OrderMask);
  for (unsigned I = 0; I < Sz; ++I) {
    unsigned Idx = I;
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    // ...
    if (IsAltOp(OpInst)) {
      Mask[I] = Sz + Idx;
      if (AltScalars)
        AltScalars->push_back(OpInst);
    } else {
      Mask[I] = Idx;
      if (OpScalars)
        OpScalars->push_back(OpInst);
    }
  }
  if (!ReuseShuffleIndices.empty()) {
    SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
    transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    });
    Mask.swap(NewMask);
  }
}
  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) ==
         MainOp;
// ...
  assert(MainP != AltP && "Expected different main/alternate predicates.");
  // ...
  assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
         "CmpInst expected to match either main or alternate predicate or "
         "their swap.");
  return MainP != P && MainP != SwappedP;
// ...
  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
// ...
  const auto *Op0 = Ops.front();
  // ...
  const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isPowerOf2();
    return false;
  });
  const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isNegatedPowerOf2();
    return false;
  });
  // ...
  if (IsConstant && IsUniform)
    VK = TTI::OK_UniformConstantValue;
  else if (IsConstant)
    VK = TTI::OK_NonUniformConstantValue;
  else if (IsUniform)
    VK = TTI::OK_UniformValue;
  // ...
}
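
// Common base for the shuffle-tracking helpers used by both the cost
// estimation and the codegen paths. It records the scalar type being
// vectorized and provides the mask utilities shared by the derived builders.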
class BaseShuffleAnalysis {
protected:
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}

  /// V is expected to be a vectorized value. When REVEC is disabled there is
  /// no difference between VF and VNumElements; when it is enabled,
  /// VF = VNumElements / ScalarTyNumElements.
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    assert(isa<FixedVectorType>(V->getType()) &&
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    unsigned VNumElements =
        cast<FixedVectorType>(V->getType())->getNumElements();
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  }

  /// Checks if the mask is an identity mask.
  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
                             bool IsStrict) {
    int Limit = Mask.size();
    int VF = VecTy->getNumElements();
    // ...
    // Consider an extract-subvector starting from index 0: all VF-sized
    // submasks must be identity or fully poisoned.
    if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
          ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
          return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
                 ShuffleVectorInst::isIdentityMask(Slice, VF);
        }))
      return true;
    // ...
  }
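
  // Composes two shuffle masks: applies ExtMask on top of Mask, producing a
  // single mask over a permuted input vector of LocalVF elements.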
  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
                           ArrayRef<int> ExtMask) {
    unsigned VF = Mask.size();
    SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      if (ExtMask[I] == PoisonMaskElem)
        continue;
      int MaskedIdx = Mask[ExtMask[I] % VF];
      NewMask[I] =
          MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
    }
    Mask.swap(NewMask);
  }
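
  // Walks a chain of shufflevector instructions, folding their masks into
  // Mask and replacing V with the ultimate source operand whenever the
  // combined permutation simplifies to an identity, a splat, or a single
  // shuffle.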
  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
                                  bool SinglePermute) {
    Value *Op = V;
    ShuffleVectorInst *IdentityOp = nullptr;
    SmallVector<int> IdentityMask;
    while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
      // ...
      // Remember the identity or broadcast mask, if it is not a resizing
      // shuffle. If no better candidates are found, this Op and Mask will be
      // used in the final shuffle.
      if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
             !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
                                                    IdentityMask.size()))) {
          IdentityOp = SV;
          IdentityMask.assign(Mask);
        }
      }
      // Remember the broadcast mask as well.
      if (SV->isZeroEltSplat()) {
        IdentityOp = SV;
        IdentityMask.assign(Mask);
      }
      int LocalVF = Mask.size();
      if (auto *SVOpTy =
              dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
        LocalVF = SVOpTy->getNumElements();
      SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
      for (auto [Idx, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem ||
            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
          continue;
        ExtMask[Idx] = SV->getMaskValue(I);
      }
      bool IsOp1Undef = /*...*/;
      bool IsOp2Undef = /*...*/;
      if (!IsOp1Undef && !IsOp2Undef) {
        // Update mask and mark undef elems.
        for (int &I : Mask) {
          if (I == PoisonMaskElem)
            continue;
          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
              PoisonMaskElem)
            I = PoisonMaskElem;
        }
        break;
      }
      SmallVector<int> ShuffleMask(SV->getShuffleMask());
      combineMasks(LocalVF, ShuffleMask, Mask);
      Mask.swap(ShuffleMask);
      if (IsOp2Undef)
        Op = SV->getOperand(0);
      else
        Op = SV->getOperand(1);
    }
    if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
        !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
        /*...*/) {
      if (IdentityOp) {
        V = IdentityOp;
        assert(Mask.size() == IdentityMask.size() &&
               "Expected masks of same sizes.");
        // ...
        Mask.swap(IdentityMask);
        auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
        return SinglePermute &&
               (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
                               /*IsStrict=*/false) ||
                (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
                 Shuffle->isZeroEltSplat() &&
                 all_of(enumerate(Mask), [&](const auto &P) {
                   return P.value() == PoisonMaskElem ||
                          Shuffle->getShuffleMask()[P.index()] == 0;
                 })));
      }
      V = Op;
      return false;
    }
    V = Op;
    return true;
  }
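
  // Smart shuffle builder: given one or two source vectors and a combined
  // mask, it first looks through existing shuffles on both operands, then
  // emits the minimal shuffle (or none, for identity masks) via the supplied
  // Builder. The same code drives both cost estimation
  // (T = InstructionCost) and IR emission (T = Value *).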
  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder, Type *ScalarTy) {
    assert(V1 && "Expected at least one vector value.");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    SmallVector<int> NewMask(Mask);
    if (ScalarTyNumElements != 1) {
      // ...
    }
    if (V2) {
      Builder.resizeToMatch(V1, V2);
      int VF = Mask.size();
      if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
        VF = FTy->getNumElements();
      Value *Op1 = V1;
      Value *Op2 = V2;
      SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
      SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (Mask[I] < VF)
          CombinedMask1[I] = Mask[I];
        else
          CombinedMask2[I] = Mask[I] - VF;
      }
      Value *PrevOp1;
      Value *PrevOp2;
      do {
        PrevOp1 = Op1;
        PrevOp2 = Op2;
        (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
        (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
        // Check if we have 2 resizing shuffles - need to peek through the
        // operands again.
        if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
          if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
            SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(CombinedMask1)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask1[Idx] = SV1->getMaskValue(I);
            }
            SmallBitVector UseMask1 = buildUseMask(
                cast<FixedVectorType>(SV1->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask1, UseMask::SecondArg);
            SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(CombinedMask2)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask2[Idx] = SV2->getMaskValue(I);
            }
            SmallBitVector UseMask2 = buildUseMask(
                cast<FixedVectorType>(SV2->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask2, UseMask::SecondArg);
            if (SV1->getOperand(0)->getType() ==
                    SV2->getOperand(0)->getType() &&
                SV1->getOperand(0)->getType() != SV1->getType() &&
                isUndefVector(SV1->getOperand(1), UseMask1).all() &&
                isUndefVector(SV2->getOperand(1), UseMask2).all()) {
              Op1 = SV1->getOperand(0);
              Op2 = SV2->getOperand(0);
              SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
              int LocalVF = ShuffleMask1.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
              CombinedMask1.swap(ShuffleMask1);
              SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
              LocalVF = ShuffleMask2.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
              CombinedMask2.swap(ShuffleMask2);
            }
          }
      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
      Builder.resizeToMatch(Op1, Op2);
      VF = std::max(cast<VectorType>(Op1->getType())
                        ->getElementCount()
                        .getKnownMinValue(),
                    cast<VectorType>(Op2->getType())
                        ->getElementCount()
                        .getKnownMinValue());
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (CombinedMask2[I] != PoisonMaskElem) {
          assert(CombinedMask1[I] == PoisonMaskElem &&
                 "Expected undefined mask element");
          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
        }
      }
      if (Op1 == Op2 && /*...*/)
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
          Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
          CombinedMask1);
    }
    if (isa<PoisonValue>(V1))
      return Builder.createPoison(
          cast<VectorType>(V1->getType())->getElementType(), Mask.size());
    bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
    assert(V1 && "Expected non-null value after looking through shuffles.");
    if (!IsIdentity)
      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
  }

  /// Transforms the common mask after shuffle emission.
  static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
                                        ArrayRef<int> Mask) {
    // ...
  }
};
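
// Computes the scalar and vector costs of the pointer chains feeding a
// vectorized memory operation. For wide loads/stores only the base pointer
// survives vectorization; for scattered operations every retained GEP is
// costed.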
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy) {
  InstructionCost ScalarCost = 0;
  InstructionCost VecCost = 0;
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // Estimate pointer-related costs when vectorizing to a plain wide
    // load/store.
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
        CostKind);
    SmallVector<const Value *> PtrsRetainedInVecCode;
    for (Value *V : Ptrs) {
      if (V == BasePtr) {
        PtrsRetainedInVecCode.push_back(V);
        continue;
      }
      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
      if (!Ptr || !Ptr->hasOneUse())
        PtrsRetainedInVecCode.push_back(V);
    }
    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // If all pointers stay in vectorized code there are no savings.
      return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
    }
    VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
                                       TTI::PointersChainInfo::getKnownStride(),
                                       VecTy, CostKind);
  } else {
    // Ptrs are the operands of GEPs that themselves get vectorized.
    TTI::PointersChainInfo PtrsInfo =
        all_of(Ptrs,
               [](const Value *V) {
                 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
                 return Ptr && !Ptr->hasAllConstantIndices();
               })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();
    ScalarCost =
        TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
    auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
    if (!BaseGEP) {
      auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
      if (It != Ptrs.end())
        BaseGEP = cast<GEPOperator>(*It);
    }
    if (BaseGEP) {
      SmallVector<const Value *> Indices(BaseGEP->indices());
      VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                               BaseGEP->getPointerOperand(), Indices, VecTy,
                               CostKind);
    }
  }
  return std::make_pair(ScalarCost, VecCost);
}
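
// Tries to reorder the scalars of a gather node so that loads from the same
// base pointer and repeated values become adjacent, which makes the
// build-vector sequence cheaper; the reordering is kept only if the
// estimated shuffle cost does not exceed the build-vector cost.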
void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
  assert(TE.isGather() && TE.ReorderIndices.empty() &&
         "Expected gather node without reordering.");
  SmallSet<size_t, 2> LoadKeyUsed;

  // Do not reorder nodes that are small, all-constant or already uniform.
  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
      all_of(TE.Scalars, isConstant))
    return;

  if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
        return VectorizableTree[Idx]->isSame(TE.Scalars);
      }))
    return;

  auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
    // ...
    auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
    if (LIt != LoadsMap.end()) {
      for (LoadInst *RLI : LIt->second) {
        if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                            LI->getType(), LI->getPointerOperand(), *DL, *SE,
                            /*StrictCheck=*/true))
          return hash_value(RLI->getPointerOperand());
      }
      for (LoadInst *RLI : LIt->second) {
        if (arePointersCompatible(RLI->getPointerOperand(),
                                  LI->getPointerOperand(), *TLI))
          return hash_value(RLI->getPointerOperand());
      }
      if (LIt->second.size() > 2)
        return hash_value(LIt->second.back()->getPointerOperand());
    }
    // ...
  };
  MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
  SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
  bool IsOrdered = true;
  unsigned NumInstructions = 0;
  // Try to "cluster" the scalars so that equal values and related loads
  // become adjacent.
  for (auto [I, V] : enumerate(TE.Scalars)) {
    size_t Key = 1, Idx = 1;
    // ...
    auto &Container = SortedValues[Key];
    if (IsOrdered && !KeyToIndex.contains(V) &&
        // ...
        ((Container.contains(Idx) &&
          KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
         (!Container.empty() && !Container.contains(Idx) &&
          KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
      IsOrdered = false;
    auto &KTI = KeyToIndex[V];
    if (KTI.empty())
      Container[Idx].push_back(V);
    KTI.push_back(I);
  }
  if (!IsOrdered && NumInstructions > 1) {
    unsigned Cnt = 0;
    TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
    for (const auto &D : SortedValues) {
      for (const auto &P : D.second) {
        unsigned Sz = 0;
        for (Value *V : P.second) {
          ArrayRef<unsigned> Indices = KeyToIndex.at(V);
          for (auto [K, Idx] : enumerate(Indices)) {
            TE.ReorderIndices[Cnt + K] = Idx;
            TE.Scalars[Cnt + K] = V;
          }
          Sz += Indices.size();
          Cnt += Indices.size();
        }
        if (/*...*/) {
          const unsigned SubVF = getFloorFullVectorNumberOfElements(
              *TTI, TE.Scalars.front()->getType(), Sz);
          // ...
        } else if (!P.second.empty() && isConstant(P.second.front())) {
          // ...
        }
      }
    }
  }
  // Reuses always require shuffles, so consider reordering profitable only
  // without them.
  if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
    return;
  // Estimate the cost of the reordered gather vs. a plain build vector.
  auto *ScalarTy = TE.Scalars.front()->getType();
  for (auto [Idx, Sz] : SubVectors) {
    // ...
  }
  int Sz = TE.Scalars.size();
  SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                               TE.ReorderIndices.end());
  for (unsigned I : seq<unsigned>(Sz)) {
    // ...
    ReorderMask[I] = I + TE.ReorderIndices.size();
  }
  Cost += ::getShuffleCost(*TTI,
                           any_of(ReorderMask, [&](int I) { return I >= Sz; })
                               ? TTI::SK_PermuteTwoSrc
                               : TTI::SK_PermuteSingleSrc,
                           VecTy, ReorderMask);
  for (unsigned I : seq<unsigned>(Sz)) {
    if (/*lane is constant*/) {
      DemandedElts.clearBit(I);
      if (!isa<PoisonValue>(V))
        ReorderMask[I] = I;
    } else {
      ReorderMask[I] = I + Sz;
    }
  }
  if (!DemandedElts.isAllOnes())
    BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
  if (Cost >= BVCost) {
    SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
    reorderScalars(TE.Scalars, Mask);
    TE.ReorderIndices.clear();
  }
}
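
// Checks whether a bundle of fadd/fsub instructions whose fmul operands
// allow contraction can be emitted as fused multiply-add: the fast-math
// flags of all participating instructions are intersected, and the fused
// cost is compared against separate fmul + fadd costs.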
static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
                                       const InstructionsState &S,
                                       /*...*/) {
  auto CheckForContractable = [&](ArrayRef<Value *> VL) {
    assert(all_of(VL,
                  [](Value *V) {
                    return V->getType()->getScalarType()->isFloatingPointTy();
                  }) &&
           "Can only convert to FMA for floating point types");
    assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
    // ...
    for (Value *V : VL) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (S.isCopyableElement(I))
        continue;
      Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
      if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
        continue;
      if (auto *FPCI = dyn_cast<FPMathOperator>(I))
        FMF &= FPCI->getFastMathFlags();
      // ...
    }
    // ...
  };
  if (!CheckForContractable(VL))
    return InstructionCost::getInvalid();
  // The fmul operands must be contractable as well.
  InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
  // ...
  if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
    return InstructionCost::getInvalid();
  if (!CheckForContractable(Operands.front()))
    return InstructionCost::getInvalid();
  // Compare the costs.
  for (Value *V : VL) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      continue;
    if (!S.isCopyableElement(I))
      if (auto *FPCI = dyn_cast<FPMathOperator>(I))
        FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
  }
  // ...
  for (Value *V : Operands.front()) {
    if (S.isCopyableElement(V))
      continue;
    auto *I = dyn_cast<Instruction>(V);
    if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
      // This fmul cannot be fused - count it standalone.
      if (auto *OpI = dyn_cast<Instruction>(V))
        FMACost += TTI.getInstructionCost(OpI, CostKind);
      continue;
    }
    if (auto *FPCI = dyn_cast<FPMathOperator>(I))
      FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
  }
  // ...
}
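
// Post-processing of the built graph: gather nodes are re-split into
// vectorizable sub-nodes, loads/stores are re-tagged as strided or
// interleaved when that is cheaper, and select/fadd patterns are combined
// into min/max and fmuladd nodes.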
void BoUpSLP::transformNodes() {
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  BaseGraphSize = VectorizableTree.size();
  // Turn graph transforming mode on and off, when done.
  class GraphTransformModeRAAI {
    bool &SavedIsGraphTransformMode;

  public:
    GraphTransformModeRAAI(bool &IsGraphTransformMode)
        : SavedIsGraphTransformMode(IsGraphTransformMode) {
      IsGraphTransformMode = true;
    }
    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
  } TransformContext(IsGraphTransformMode);
  // Operands are profitable if they are constants, splats, or result in a
  // good vectorization opportunity.
  auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
                                           const InstructionsState &S) {
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    return all_of(
        Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
          return all_of(Cand,
                        [](const std::pair<Value *, Value *> &P) {
                          return isa<Constant>(P.first) ||
                                 isa<Constant>(P.second) ||
                                 P.first == P.second;
                        }) ||
                 /*...*/;
        });
  };

  // Try to reorder gather nodes for better vectorization opportunities.
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather())
      reorderGatherNode(E);
  }
  constexpr unsigned VFLimit = 16;
  bool ForceLoadGather =
      count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && TE->hasState() &&
               TE->getOpcode() == Instruction::Load &&
               TE->getVectorFactor() < VFLimit;
      }) > /*...*/;
  auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
                              auto GetVectorized) {
    return TE->isSame(VL) || all_of(VL, [&](Value *V) {
             return isa<PoisonValue>(V) || GetVectorized(V);
           });
  };
  // Check if the node is a copy of other vector nodes, to avoid extra
  // gathers.
  auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
    if (E.hasState()) {
      if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);
              });
            });
          }))
        return true;
      if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);
              });
            });
          }))
        return true;
    } else {
      const auto *It = find_if(E.Scalars, IsaPred<Instruction>);
      if (It != E.Scalars.end()) {
        if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
            !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
              return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
                ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
                return !VTEs.empty() &&
                       any_of(VTEs, [&](const TreeEntry *TE) {
                         return is_contained(TEs, TE);
                       });
              });
            }))
          return true;
      }
    }
    return false;
  };
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather()) {
      ArrayRef<Value *> VL = E.Scalars;
      const unsigned Sz = getVectorElementSize(VL.front());
      unsigned MinVF = getMinVF(2 * Sz);
      // Do not try partial vectorization for small nodes (<= 2), nodes with
      // the same opcode and same parent block, or all constants.
      if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
          !(!E.hasState() || E.getOpcode() == Instruction::Load ||
            /*...*/))
        continue;
      if (ForceLoadGather && E.hasState() &&
          E.getOpcode() == Instruction::Load)
        continue;
      // Check if the node is a copy of other vector nodes.
      if (CheckForSameVectorNodes(E))
        continue;
      // Try to find vectorizable sequences and transform them into a series
      // of insertvector instructions.
      unsigned StartIdx = 0;
      unsigned End = VL.size();
      for (unsigned VF = getFloorFullVectorNumberOfElements(
               *TTI, VL.front()->getType(), VL.size() - 1);
           VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
                            *TTI, VL.front()->getType(), VF - 1)) {
        if (StartIdx + VF > End)
          continue;
        SmallVector<std::pair<unsigned, unsigned>> Slices;
        bool AllStrided = true;
        for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
          // If any instruction is vectorized already - do not try again.
          if (/*...*/ !getSameValuesTreeEntry(Slice.front(), Slice,
                                              /*SameVF=*/true))
            continue;
          InstructionsState S = getSameOpcode(Slice, *TLI);
          // ...
          bool IsSplat = isSplat(Slice);
          bool IsTwoRegisterSplat = true;
          if (IsSplat && VF == 2) {
            unsigned NumRegs2VF = /*...*/;
            IsTwoRegisterSplat = NumRegs2VF == 2;
          }
          if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
              /*...*/
              (S.getOpcode() == Instruction::Load &&
               /*...*/) ||
              (S.getOpcode() != Instruction::Load &&
               /*...*/)) {
            if ((!UserIgnoreList || E.Idx != 0) &&
                TTI->getInstructionCost(S.getMainOp(), CostKind) <
                    /*...*/) {
              if (S.getOpcode() == Instruction::Load) {
                SmallVector<Value *> PointerOps;
                OrdersType CurrentOrder;
                StridedPtrInfo SPtrInfo;
                LoadsState Res =
                    canVectorizeLoads(Slice, Slice.front(), CurrentOrder,
                                      PointerOps, SPtrInfo);
                // ...
                // Do not vectorize gathered loads for the root node.
                if (UserIgnoreList && E.Idx == 0)
                  continue;
              } else if (S.getOpcode() == Instruction::ExtractElement ||
                         (TTI->getInstructionCost(S.getMainOp(), CostKind) <
                              /*...*/ &&
                          !CheckOperandsProfitability(
                              /*...*/)))
                continue;
            }
          }
          Slices.emplace_back(Cnt, VF);
        }
        if (VF == 2 && AllStrided && Slices.size() > 2)
          continue;
        auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
          E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
          if (StartIdx == Cnt)
            StartIdx = Cnt + Sz;
          if (End == Cnt + Sz)
            End = Cnt;
        };
        for (auto [Cnt, Sz] : Slices) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
          const TreeEntry *SameTE = nullptr;
          if (const auto *It = find_if(Slice, IsaPred<Instruction>);
              It != Slice.end()) {
            SameTE = getSameValuesTreeEntry(*It, Slice);
          }
          unsigned PrevSize = VectorizableTree.size();
          [[maybe_unused]] unsigned PrevEntriesSize =
              LoadEntriesToVectorize.size();
          buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
          if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
              VectorizableTree[PrevSize]->isGather() &&
              VectorizableTree[PrevSize]->hasState() &&
              VectorizableTree[PrevSize]->getOpcode() !=
                  Instruction::ExtractElement &&
              !isSplat(Slice)) {
            if (UserIgnoreList && E.Idx == 0 && VF == 2)
              analyzedReductionVals(Slice);
            VectorizableTree.pop_back();
            assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
                   "LoadEntriesToVectorize expected to remain the same");
            continue;
          }
          AddCombinedNode(PrevSize, Cnt, Sz);
        }
      }
    }
    // Restore ordering, if no extra vectorization happened.
    if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
      SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
      reorderScalars(E.Scalars, Mask);
      E.ReorderIndices.clear();
    }
    if (!E.hasState())
      continue;
    switch (E.getOpcode()) {
    case Instruction::Load: {
      // No need to handle masked gather vectorizations yet.
      if (E.State != TreeEntry::Vectorize)
        break;
      Type *ScalarTy = E.getMainOp()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      // Check if profitable to represent consecutive load + reverse as a
      // strided load with stride -1.
      if (/*...*/ TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        // ...
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
                                 BaseLI->getPointerAddressSpace(), CostKind,
                                 /*...*/);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
        if (StridedCost < OriginalVecCost) {
          // The strided load is more profitable than the consecutive
          // load + reverse - transform the node to a strided load.
          Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
                                                ->getPointerOperand()
                                                ->getType());
          StridedPtrInfo SPtrInfo;
          SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
          SPtrInfo.Ty = VecTy;
          TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
          E.State = TreeEntry::StridedVectorize;
        }
      }
      break;
    }
    case Instruction::Store: {
      // Check if profitable to represent consecutive store + reverse as a
      // strided store with stride -1.
      if (/*...*/ TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        // ...
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
                                 BaseSI->getPointerAddressSpace(), CostKind,
                                 /*...*/);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
        if (StridedCost < OriginalVecCost)
          // The strided store is more profitable than reverse + consecutive
          // store - transform the node to a strided store.
          E.State = TreeEntry::StridedVectorize;
      } else if (!E.ReorderIndices.empty()) {
        // Check for interleaved stores.
        auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
          auto *BaseSI = cast<StoreInst>(E.Scalars.front());
          assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
          if (Mask.size() < 4)
            return 0u;
          for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1))
            if (ShuffleVectorInst::isInterleaveMask(
                    Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
                TTI.isLegalInterleavedAccessType(
                    VecTy, Factor, BaseSI->getAlign(),
                    BaseSI->getPointerAddressSpace()))
              return Factor;
          return 0u;
        };
        SmallVector<int> Mask(E.ReorderIndices.begin(),
                              E.ReorderIndices.end());
        unsigned InterleaveFactor = IsInterleaveMask(Mask);
        if (InterleaveFactor != 0)
          E.setInterleave(InterleaveFactor);
      }
      break;
    }
    case Instruction::Select: {
      if (E.State != TreeEntry::Vectorize)
        break;
      // ...
      // This node is a minmax node.
      E.CombinedOp = TreeEntry::MinMax;
      TreeEntry *CondEntry = getOperandEntry(&E, 0);
      if (SelectOnly && CondEntry->UserTreeIndex &&
          CondEntry->State == TreeEntry::Vectorize) {
        // The condition node is part of the combined minmax node.
        CondEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    case Instruction::FSub:
    case Instruction::FAdd: {
      // Check if possible to convert (a*b)+c to fma.
      if (E.State != TreeEntry::Vectorize ||
          !E.getOperations().isAddSubLikeOp())
        break;
      // ...
      // This node is a fmuladd node.
      E.CombinedOp = TreeEntry::FMulAdd;
      TreeEntry *FMulEntry = getOperandEntry(&E, 0);
      if (FMulEntry->UserTreeIndex &&
          FMulEntry->State == TreeEntry::Vectorize) {
        // The fmul node is part of the combined fmuladd node.
        FMulEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    default:
      break;
    }
  }
  if (LoadEntriesToVectorize.empty()) {
    // Single load node - exit.
    if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Load)
      return;
    // Small graph with a small VF - exit.
    constexpr unsigned SmallTree = 3;
    constexpr unsigned SmallVF = 2;
    if ((VectorizableTree.size() <= SmallTree &&
         VectorizableTree.front()->Scalars.size() == SmallVF) ||
        (VectorizableTree.size() <= 2 && UserIgnoreList))
      return;

    if (VectorizableTree.front()->isNonPowOf2Vec() &&
        /*...*/
        count_if(/*...*/,
                 [](const std::unique_ptr<TreeEntry> &TE) {
                   return TE->isGather() && TE->hasState() &&
                          TE->getOpcode() == Instruction::Load &&
                          /*...*/;
                 }) == 1)
      return;
  }

  // A list of loads to be gathered during the vectorization process. We can
  // try to vectorize them at the end, if profitable.
  SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
      GatheredLoads;

  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    if (E.isGather() &&
        ((E.hasState() && E.getOpcode() == Instruction::Load) ||
         (!E.hasState() && any_of(E.Scalars,
                                  [&](Value *V) {
                                    return isa<LoadInst>(V) &&
                                           !isVectorized(V) &&
                                           !isDeleted(cast<Instruction>(V));
                                  }))) &&
        !isSplat(E.Scalars)) {
      for (Value *V : E.Scalars) {
        auto *LI = dyn_cast<LoadInst>(V);
        if (!LI)
          continue;
        // ...
        gatherPossiblyVectorizableLoads(
            *this, V, *DL, *SE, *TTI,
            GatheredLoads[std::make_tuple(
                LI->getParent(),
                getUnderlyingObject(LI->getPointerOperand()),
                LI->getType())]);
      }
    }
  }
  // Try to vectorize gathered loads if this is not just a gather of loads.
  if (!GatheredLoads.empty())
    tryToVectorizeGatheredLoads(GatheredLoads);
}
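
// Cost-modeling twin of the shuffle instruction emitter: it accumulates the
// cost of the shuffles and inserts needed to materialize gathered values,
// mirroring the structure of the codegen-side shuffle builder.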
class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  // ...
  /// While set, the common mask is not empty.
  bool SameNodesEstimated = true;

  static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
    if (Ty->getScalarType()->isPointerTy()) {
      Constant *Res = ConstantExpr::getIntToPtr(
          ConstantInt::getAllOnesValue(IntegerType::get(
              Ty->getContext(),
              DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
          Ty->getScalarType());
      // ...
    }
    return Constant::getAllOnesValue(Ty);
  }

  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
    // ...
    const auto *It = find_if_not(VL, IsaPred<UndefValue>);
    assert(It != VL.end() && "Expected at least one non-undef value.");
    bool NeedShuffle =
        count(VL, *It) > 1 &&
        /*...*/;
    if (!NeedShuffle) {
      if (isa<FixedVectorType>(ScalarTy))
        return TTI.getShuffleCost(/*...*/);
      return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                    CostKind, std::distance(VL.begin(), It),
                                    PoisonValue::get(VecTy), *It);
    }
    SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
    transform(VL, ShuffleMask.begin(), [](Value *V) {
      return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
    });
    InstructionCost InsertCost =
        TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
                               PoisonValue::get(VecTy), *It);
    return InsertCost + ::getShuffleCost(TTI, TTI::SK_Broadcast, VecTy,
                                         VecTy, ShuffleMask, CostKind);
    // ...
    return GatherCost +
           (all_of(Gathers, IsaPred<UndefValue>)
                ? TTI::TCC_Free
                : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
                                  ScalarTy));
  }
  /// Computes the cost of creating a vector containing the extracted values
  /// from \p VL.
  InstructionCost
  computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
                     ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                     unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
    unsigned NumElts =
        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          if (!EE)
            return Sz;
          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
          if (!VecTy)
            return Sz;
          return std::max(Sz, VecTy->getNumElements());
        });
    // ...
    auto CheckPerRegistersShuffle =
        [&](MutableArrayRef<int> Mask, SmallVectorImpl<unsigned> &Indices,
            SmallVectorImpl<unsigned> &SubVecSizes)
        -> std::optional<TTI::ShuffleKind> {
      if (NumElts <= EltsPerVector)
        return std::nullopt;
      int OffsetReg0 =
          alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
                                    [](int S, int I) {
                                      if (I == PoisonMaskElem)
                                        return S;
                                      return std::min(S, I);
                                    }),
                    EltsPerVector);
      int OffsetReg1 = OffsetReg0;
      // Check that it is possible to permute with at most two input
      // registers.
      int FirstRegId = -1;
      Indices.assign(1, OffsetReg0);
      for (auto [Pos, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem)
          continue;
        int Idx = I - OffsetReg0;
        int RegId =
            (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
        if (FirstRegId < 0)
          FirstRegId = RegId;
        RegIndices.insert(RegId);
        if (RegIndices.size() > 2)
          return std::nullopt;
        if (RegIndices.size() == 2) {
          ShuffleKind = TTI::SK_PermuteTwoSrc;
          if (Indices.size() == 1) {
            OffsetReg1 = alignDown(
                std::accumulate(
                    std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
                    [&](int S, int I) {
                      if (I == PoisonMaskElem)
                        return S;
                      int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
                                  ((I - OffsetReg0) % NumElts) / EltsPerVector;
                      if (RegId == FirstRegId)
                        return S;
                      return std::min(S, I);
                    }),
                EltsPerVector);
            unsigned Index = OffsetReg1 % NumElts;
            Indices.push_back(Index);
            SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
          }
          Idx = I - OffsetReg1;
        }
        I = (Idx % NumElts) % EltsPerVector +
            (RegId == FirstRegId ? 0 : EltsPerVector);
      }
      return ShuffleKind;
    };
    InstructionCost Cost = 0;

    // Process extracts in blocks of EltsPerVector to check if the source
    // vector operand can be re-used directly. If not, add the cost of
    // creating a shuffle to extract the values into a vector register.
    for (unsigned Part : seq<unsigned>(NumParts)) {
      if (!ShuffleKinds[Part])
        continue;
      ArrayRef<int> MaskSlice = Mask.slice(
          Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
      SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
      copy(MaskSlice, SubMask.begin());
      SmallVector<unsigned, 2> Indices;
      SmallVector<unsigned, 2> SubVecSizes;
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
      if (!RegShuffleKind) {
        if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
            !ShuffleVectorInst::isIdentityMask(
                MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
          Cost +=
              ::getShuffleCost(TTI, *ShuffleKinds[Part],
                               getWidenedType(ScalarTy, NumElts), MaskSlice);
        continue;
      }
      // ...
      const unsigned BaseVF = getFullVectorNumberOfElements(
          *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
      for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
        assert((Idx + SubVecSize) <= BaseVF &&
               "SK_ExtractSubvector index out of range");
        // ...
      }
      // Second attempt: check if just a permute is better estimated than a
      // subvector extract.
      InstructionCost OriginalCost = ::getShuffleCost(
          TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts),
          SubMask);
      if (OriginalCost < Cost)
        Cost = OriginalCost;
    }
    return Cost;
  }
  /// Estimates the cost of permuting the vectorized node \p E1 (maybe with
  /// \p E2) into the common mask.
  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
                                ArrayRef<int> Mask, unsigned Part,
                                unsigned SliceSize) {
    if (SameNodesEstimated) {
      // Delay the cost estimation if the same nodes are reshuffling: if we
      // already requested the cost of reshuffling E1 and E2, no need to
      // estimate it again.
      if ((InVectors.size() == 2 &&
           /*...*/) {
        unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
        assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
                      [](int Idx) { return Idx == PoisonMaskElem; }) &&
               "Expected all poisoned elements.");
        copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
        return;
      }
      // Found non-matching nodes - need to estimate the cost for the
      // matched nodes and transform the mask.
      Cost += createShuffle(InVectors.front(),
                            InVectors.size() == 1 ? nullptr : InVectors.back(),
                            CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      unsigned VF = E1.getVectorFactor();
      if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
        VF = std::max(VF, getVF(V1));
      } else {
        const auto *E = cast<const TreeEntry *>(InVectors.front());
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(InVectors.front(), &E1, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else {
      auto P = InVectors.front();
      Cost += createShuffle(&E1, E2, Mask);
      unsigned VF = Mask.size();
      if (Value *V1 = P.dyn_cast<Value *>()) {
        VF = std::max(VF, getVF(V1));
      } else {
        const auto *E = cast<const TreeEntry *>(P);
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
      Cost += createShuffle(P, InVectors.front(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
  }
  /// Shuffle builder that produces costs instead of instructions.
  class ShuffleCostBuilder {
    const TargetTransformInfo &TTI;

    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
      return Mask.empty() ||
             (VF == Mask.size() &&
              ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
             /*...*/;
    }

  public:
    ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
    ~ShuffleCostBuilder() = default;
    InstructionCost createShuffleVector(Value *V1, Value *,
                                        ArrayRef<int> Mask) const {
      // Empty and identity masks are free.
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      // ...
    }
    InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      // ...
    }
    void resizeToMatch(Value *&, Value *&) const {}
  };
  InstructionCost
  createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
                const PointerUnion<Value *, const TreeEntry *> &P2,
                ArrayRef<int> Mask) {
    ShuffleCostBuilder Builder(TTI);
    SmallVector<int> CommonMask(Mask);
    Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
    unsigned CommonVF = Mask.size();
    InstructionCost ExtraCost = 0;
    auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
                                        unsigned VF) -> InstructionCost {
      // ...
      Type *EScalarTy = E.Scalars.front()->getType();
      bool IsSigned = true;
      if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
        EScalarTy = IntegerType::get(EScalarTy->getContext(),
                                     It->second.first);
        IsSigned = It->second.second;
      }
      if (EScalarTy != ScalarTy) {
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
                                    getWidenedType(EScalarTy, VF),
                                    TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
      // ...
      auto *VecTy = cast<VectorType>(V->getType());
      Type *EScalarTy = VecTy->getElementType();
      if (EScalarTy != ScalarTy) {
        bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(
            CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
            VecTy, TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    if (!V1 && !V2 && !P2.isNull()) {
      // Shuffle 2 entry nodes.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E->Scalars.size() == E2->Scalars.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        SmallVector<int> E2Mask = E2->getCommonMask();
        if (!EMask.empty() || !E2Mask.empty()) {
          for (int &Idx : CommonMask) {
            if (Idx == PoisonMaskElem)
              continue;
            if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
              Idx = EMask[Idx];
            else if (Idx >= static_cast<int>(CommonVF))
              Idx = (E2Mask.empty() ? Idx - CommonVF
                                    : E2Mask[Idx - CommonVF]) +
                    E->Scalars.size();
          }
        }
        CommonVF = E->Scalars.size();
        ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
                     GetNodeMinBWAffectedCost(*E2, CommonVF);
      } else {
        ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
                     GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
      }
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && P2.isNull()) {
      // Shuffle a single entry node.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      CommonVF = VF;
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        assert(!EMask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx != PoisonMaskElem)
            Idx = EMask[Idx];
        }
        CommonVF = E->Scalars.size();
      } else if (unsigned Factor = E->getInterleaveFactor();
                 Factor > 0 && E->Scalars.size() != Mask.size() &&
                 /*...*/) {
        // Deinterleaved nodes are free.
        std::iota(CommonMask.begin(), CommonMask.end(), 0);
      }
      ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      // Not an identity/broadcast? Check if the original vector is better.
      if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
          CommonVF == CommonMask.size() &&
          any_of(enumerate(CommonMask),
                 [](const auto &&P) {
                   return P.value() != PoisonMaskElem &&
                          static_cast<unsigned>(P.value()) != P.index();
                 })) {
        // ...
      }
    } else if (V1 && P2.isNull()) {
      // Shuffle a single vector.
      ExtraCost += GetValueMinBWAffectedCost(V1);
      CommonVF = getVF(V1);
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
    } else if (V1 && !V2) {
      // Shuffle vector and tree node.
      unsigned VF = getVF(V1);
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E2->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E2Mask = E2->getCommonMask();
        assert(!E2Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E2Mask[Idx - CommonVF] + VF;
        }
        CommonVF = VF;
      }
      ExtraCost += GetValueMinBWAffectedCost(V1);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetNodeMinBWAffectedCost(
          *E2, std::min(CommonVF, E2->getVectorFactor()));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && V2) {
      // Shuffle tree node and vector.
      unsigned VF = getVF(V2);
      const TreeEntry *E1 = cast<const TreeEntry *>(P1);
      CommonVF = std::max(VF, E1->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E1->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E1Mask = E1->getCommonMask();
        assert(!E1Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E1Mask[Idx - CommonVF] + VF;
          else
            Idx = E1Mask[Idx];
        }
        CommonVF = VF;
      }
      ExtraCost += GetNodeMinBWAffectedCost(
          *E1, std::min(CommonVF, E1->getVectorFactor()));
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetValueMinBWAffectedCost(V2);
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else {
      assert(V1 && V2 && "Expected both vectors.");
      unsigned VF = getVF(V1);
      CommonVF = std::max(VF, getVF(V2));
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      ExtraCost +=
          GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
      if (V1->getType() != V2->getType()) {
        V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      } else {
        if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
          V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
          V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      }
    }
    InVectors.front() =
        Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    if (InVectors.size() == 2)
      InVectors.pop_back();
    return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
                           V1, V2, CommonMask, Builder, ScalarTy);
  }

public:
  ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
                       ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
                       SmallPtrSetImpl<Value *> &CheckedExtracts)
      : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
        CheckedExtracts(CheckedExtracts) {}
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    if (Mask.empty())
      return nullptr;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    // Check if it can be considered reused if the same extractelements were
    // vectorized already.
    bool PrevNodeFound = any_of(
        ArrayRef(R.VectorizableTree).take_front(E->Idx),
        [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((TE->hasState() && !TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  TE->isGather()) &&
                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
                 });
        });
    SmallPtrSet<Value *, 4> UniqueBases;
    SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
    for (unsigned Part : seq<unsigned>(NumParts)) {
      // ...
      for (auto [I, V] : enumerate(/*...*/)) {
        // If all users of an instruction are going to be vectorized and this
        // instruction itself is not going to be vectorized, consider this
        // extractelement as dead and remove its cost from the final cost of
        // the vectorized tree.
        auto *EE = cast<ExtractElementInst>(V);
        VecBase = EE->getVectorOperand();
        UniqueBases.insert(VecBase);
        if (!CheckedExtracts.insert(V).second ||
            !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
            any_of(EE->users(),
                   [&](User *U) {
                     return isa<GetElementPtrInst>(U) &&
                            !R.areAllUsersVectorized(cast<Instruction>(U),
                                                     &VectorizedVals);
                   }) ||
            /*...*/)
          continue;
        std::optional<unsigned> EEIdx = getExtractIndex(EE);
        if (!EEIdx)
          continue;
        unsigned Idx = *EEIdx;
        // Take credit for the instruction that will become dead.
        if (EE->hasOneUse() || !PrevNodeFound) {
          Instruction *Ext = EE->user_back();
          if (isa<SExtInst, ZExtInst>(Ext) &&
              all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
            // Use getExtractWithExtendCost() to calculate the cost of
            // the extractelement/ext pair.
            Cost -= TTI.getExtractWithExtendCost(
                Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
                Idx, CostKind);
            // Add back the cost of the s|zext, which is subtracted
            // separately.
            Cost += TTI.getCastInstrCost(
                Ext->getOpcode(), Ext->getType(), EE->getType(),
                TTI::getCastContextHint(Ext), CostKind, Ext);
            continue;
          }
        }
        APInt &DemandedElts =
            VectorOpsToExtracts
                .try_emplace(VecBase,
                             APInt::getZero(getNumElements(VecBase->getType())))
                .first->getSecond();
        DemandedElts.setBit(Idx);
      }
    }
    for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
      Cost -= TTI.getScalarizationOverhead(cast<VectorType>(Vec->getType()),
                                           DemandedElts, /*Insert=*/false,
                                           /*Extract=*/true, CostKind);
    // Check that a gather of extractelements can be represented as just a
    // shuffle of one or two vectors the scalars are extracted from.
    if (!PrevNodeFound)
      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
    // ...
    transformMaskAfterShuffle(CommonMask, CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      UseVecBaseAsInput = true;
      VecBase =
          Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    }
    return VecBase;
  }
  std::optional<InstructionCost>
  needToDelay(const TreeEntry *,
              ArrayRef<SmallVector<const TreeEntry *>>) const {
    // No need to delay the cost estimation during analysis.
    return std::nullopt;
  }
  void resetForSameNode() {
    IsFinalized = false;
    CommonMask.clear();
    InVectors.clear();
    Cost = 0;
    VectorizedVals.clear();
    SameNodesEstimated = true;
  }
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    if (&E1 == &E2) {
      assert(all_of(Mask,
                    [&](int Idx) {
                      return Idx < static_cast<int>(E1.getVectorFactor());
                    }) &&
             "Expected single vector shuffle mask.");
      add(E1, Mask);
      return;
    }
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign({&E1, &E2});
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    // ...
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
  }
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, &E1);
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    // ...
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
    if (!SameNodesEstimated && InVectors.size() == 1)
      InVectors.emplace_back(&E1);
  }
  /// Adds 2 input vectors (in the form of tree entries).
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    // May come only for shuffling of 2 vectors with extractelements, already
    // handled in adjustExtracts.
    assert(InVectors.size() == 1 &&
           all_of(enumerate(CommonMask),
                  [&](auto P) {
                    if (P.value() == PoisonMaskElem)
                      return Mask[P.index()] == PoisonMaskElem;
                    auto *EI = cast<ExtractElementInst>(
                        cast<const TreeEntry *>(InVectors.front())
                            ->getOrdered(P.index()));
                    return EI->getVectorOperand() == V1 ||
                           EI->getVectorOperand() == V2;
                  }) &&
           "Expected extractelement vectors.");
  }
  /// Adds another input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
    if (InVectors.empty()) {
      assert(CommonMask.empty() && !ForExtracts &&
             "Expected empty input mask/vectors.");
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, V1);
      return;
    }
    if (ForExtracts) {
      // No need to add vectors here, already handled them in adjustExtracts.
      assert(InVectors.size() == 1 && !CommonMask.empty() &&
             all_of(enumerate(CommonMask),
                    [&](auto P) {
                      Value *Scalar =
                          cast<const TreeEntry *>(InVectors.front())
                              ->getOrdered(P.index());
                      if (P.value() == PoisonMaskElem)
                        return P.value() == Mask[P.index()] ||
                               isa<UndefValue>(Scalar);
                      auto *EI = cast<ExtractElementInst>(Scalar);
                      return EI->getVectorOperand() == V1;
                    }) &&
             "Expected only tree entry for extractelement vectors.");
      return;
    }
    assert(!InVectors.empty() && !CommonMask.empty() &&
           "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = getVF(V1);
    if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      VF = std::max<unsigned>(VF, CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(VF, InTE->getVectorFactor());
    } else {
      VF = std::max(VF, cast<FixedVectorType>(
                            cast<Value *>(InVectors.front())->getType())
                            ->getNumElements());
    }
    InVectors.push_back(V1);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + VF;
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    Cost += getBuildVectorCost(VL, Root);
    if (!Root) {
      SmallVector<Constant *> Vals;
      unsigned VF = VL.size();
      if (MaskVF != 0)
        VF = std::min(VF, MaskVF);
      Type *VLScalarTy = VL.front()->getType();
      for (Value *V : VL.take_front(VF)) {
        if (isa<UndefValue>(V)) {
          Vals.push_back(cast<Constant>(V));
          continue;
        }
        Vals.push_back(Constant::getNullValue(V->getType()));
      }
      // ...
      return ConstantVector::get(Vals);
    }
    return ConstantVector::getSplat(
        ElementCount::getFixed(
            cast<FixedVectorType>(Root->getType())->getNumElements()),
        getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
  }
  InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
  /// Finalize emission of the shuffles.
  InstructionCost
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    if (Action) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      Value *V = cast<Value *>(Vec);
      Action(V, CommonMask);
      InVectors.front() = V;
    }
    if (!SubVectors.empty()) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      // Add subvectors permutation cost.
      if (!SubVectorsMask.empty()) {
        assert(SubVectorsMask.size() <= CommonMask.size() &&
               "Expected same size of masks for subvectors and common mask.");
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I2 != PoisonMaskElem) {
            assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
            I1 = I2 + CommonMask.size();
          }
        }
        // ...
      }
      for (auto [E, Idx] : SubVectors) {
        Type *EScalarTy = E->Scalars.front()->getType();
        bool IsSigned = true;
        if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
          EScalarTy =
              IntegerType::get(EScalarTy->getContext(), It->second.first);
          IsSigned = It->second.second;
        }
        if (ScalarTy != EScalarTy) {
          unsigned CastOpcode = Instruction::Trunc;
          unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
          if (DstSz > SrcSz)
            CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          Cost += TTI.getCastInstrCost(
              CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
              getWidenedType(EScalarTy, E->getVectorFactor()),
              TTI::CastContextHint::None, CostKind);
        }
        // ...
        if (!CommonMask.empty()) {
          std::iota(std::next(CommonMask.begin(), Idx),
                    std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
                    Idx);
        }
      }
    }

    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return Cost;
    }
    return Cost +
           createShuffle(InVectors.front(),
                         InVectors.size() == 2 ? InVectors.back() : nullptr,
                         CommonMask);
  }

  ~ShuffleCostEstimator() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
  assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
  return Op;
}

TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::CompressVectorize)
    return TTI::CastContextHint::Masked;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
      !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    // ...
  }
  return TTI::CastContextHint::None;
}
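
// Computes the cost contribution of a single tree entry as the difference
// between its vectorized cost and the cost of the original scalars, on top
// of the common shuffle cost implied by reused/reordered scalars.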
InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                      SmallPtrSetImpl<Value *> &CheckedExtracts) {
  ArrayRef<Value *> VL = E->Scalars;

  Type *ScalarTy = getValueType(VL[0]);
  if (!isValidElementType(ScalarTy))
    return InstructionCost::getInvalid();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // If we have computed a smaller type for the expression, update VecTy so
  // that the costs will be accurate.
  auto It = MinBWs.find(E);
  Type *OrigScalarTy = ScalarTy;
  if (It != MinBWs.end()) {
    auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
    if (VecTy)
      ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
  }
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  unsigned EntryVF = E->getVectorFactor();
  auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);

  if (E->isGather()) {
    if (allConstant(VL))
      return 0;
    if (isa<InsertElementInst>(VL[0]))
      return InstructionCost::getInvalid();
    if (isa<CmpInst>(VL.front()))
      ScalarTy = VL.front()->getType();
    return processBuildVector<ShuffleCostEstimator, InstructionCost>(
        E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
  }
  if (E->State == TreeEntry::SplitVectorize) {
    assert(E->CombinedEntriesWithIndices.size() == 2 &&
           "Expected exactly 2 combined entries.");
    assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
    InstructionCost VectorCost = 0;
    if (E->ReorderIndices.empty()) {
      VectorCost = ::getShuffleCost(
          *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
          E->CombinedEntriesWithIndices.back().second,
          getWidenedType(
              ScalarTy,
              VectorizableTree[E->CombinedEntriesWithIndices.back().first]
                  ->getVectorFactor()));
    } else {
      unsigned CommonVF =
          std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
                       ->getVectorFactor(),
                   VectorizableTree[E->CombinedEntriesWithIndices.back().first]
                       ->getVectorFactor());
      // ...
    }
    LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0,
                             "Calculated costs for Tree"));
    return VectorCost;
  }
  InstructionCost CommonCost = 0;
  SmallVector<int> Mask;
  if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
      (E->State != TreeEntry::StridedVectorize ||
       !isReverseOrder(E->ReorderIndices))) {
    SmallVector<int> NewMask;
    if (E->getOpcode() == Instruction::Store) {
      // For stores the order is actually a mask.
      NewMask.resize(E->ReorderIndices.size());
      copy(E->ReorderIndices, NewMask.begin());
    } else {
      inversePermutation(E->ReorderIndices, NewMask);
    }
    ::addMask(Mask, NewMask);
  }
  if (!E->ReuseShuffleIndices.empty())
    ::addMask(Mask, E->ReuseShuffleIndices);
  if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
    CommonCost =
        ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize ||
          E->State == TreeEntry::StridedVectorize ||
          E->State == TreeEntry::CompressVectorize) &&
         "Unhandled state");
  assert(E->getOpcode() &&
         ((allSameType(VL) && allSameBlock(VL)) ||
          (E->getOpcode() == Instruction::GetElementPtr &&
           E->getMainOp()->getType()->isPointerTy()) ||
          E->hasCopyableElements()) &&
         "Invalid VL");
  Instruction *VL0 = E->getMainOp();
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  if (E->CombinedOp != TreeEntry::NotCombinedOp)
    ShuffleOrOp = E->CombinedOp;
  SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
  const unsigned Sz = UniqueValues.size();
  SmallBitVector UsedScalars(Sz, false);
  for (unsigned I = 0; I < Sz; ++I) {
    if (isa<Instruction>(UniqueValues[I]) &&
        !E->isCopyableElement(UniqueValues[I]) &&
        getTreeEntries(UniqueValues[I]).front() == E)
      continue;
    UsedScalars.set(I);
  }
  auto GetCastContextHint = [&](Value *V) {
    if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
      return getCastContextHint(*OpTEs.front());
    InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
    if (SrcState && SrcState.getOpcode() == Instruction::Load &&
        !SrcState.isAltShuffle())
      return TTI::CastContextHint::GatherScatter;
    return TTI::CastContextHint::None;
  };
  auto GetCostDiff =
      [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
          function_ref<InstructionCost(InstructionCost)> VectorCost) {
        // Calculate the cost of this instruction.
        InstructionCost ScalarCost = 0;
        if (isa<CastInst, CallInst>(VL0)) {
          // For some instructions there is no need to calculate the cost for
          // each particular instruction; we can use the cost of the single
          // instruction x total number of scalar instructions.
          ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
        } else {
          for (unsigned I = 0; I < Sz; ++I) {
            if (UsedScalars.test(I))
              continue;
            ScalarCost += ScalarEltCost(I);
          }
        }

        InstructionCost VecCost = VectorCost(CommonCost);
        // Check if the current node must be resized, if the parent node is
        // not resized.
        if (It != MinBWs.end() &&
            (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
          const EdgeInfo &EI = E->UserTreeIndex;
          if (!EI.UserTE->hasState() ||
              EI.UserTE->getOpcode() != Instruction::Select ||
              EI.EdgeIdx != 0) {
            auto UserBWIt = MinBWs.find(EI.UserTE);
            Type *UserScalarTy =
                (EI.UserTE->isGather() ||
                 EI.UserTE->State == TreeEntry::SplitVectorize)
                    ? EI.UserTE->Scalars.front()->getType()
                    : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
            if (UserBWIt != MinBWs.end())
              UserScalarTy = IntegerType::get(ScalarTy->getContext(),
                                              UserBWIt->second.first);
            if (ScalarTy != UserScalarTy) {
              unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
              unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
              unsigned VecOpcode;
              auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
              if (BWSz > SrcBWSz)
                VecOpcode = Instruction::Trunc;
              else
                VecOpcode =
                    It->second.second ? Instruction::SExt : Instruction::ZExt;
              TTI::CastContextHint CCH = GetCastContextHint(VL0);
              VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
                                               CostKind);
            }
          }
        }
        LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
                                 ScalarCost, "Calculated costs for Tree"));
        return VecCost - ScalarCost;
      };
  // Calculate the cost difference of vectorizing a set of GEPs; a negative
  // value means vectorizing is profitable.
  auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
    assert((E->State == TreeEntry::Vectorize ||
            E->State == TreeEntry::StridedVectorize ||
            E->State == TreeEntry::CompressVectorize) &&
           "Entry state expected to be Vectorize, StridedVectorize or "
           "MaskedLoadCompressVectorize here.");
    auto [ScalarCost, VecCost] = getGEPCosts(
        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
    LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
                             "Calculated GEPs cost for Tree"));
    return VecCost - ScalarCost;
  };

  auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
    // ...
    Type *CanonicalType = Ty;
    if (CanonicalType->isPtrOrPtrVectorTy())
      CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
          CanonicalType->getContext(),
          DL->getTypeSizeInBits(CanonicalType->getScalarType())));
    IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
                                      {CanonicalType, CanonicalType});
    InstructionCost IntrinsicCost =
        TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
    // If the selects are the only uses of the compares, they will be dead,
    // so we can subtract their cost.
    if (VI && SelectOnly) {
      assert(!Ty->isVectorTy() && "Expected only for scalar type.");
      auto *CI = cast<CmpInst>(VI->getOperand(0));
      IntrinsicCost -= TTI->getCmpSelInstrCost(
          CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
          CostKind, {TTI::OK_AnyValue, TTI::OP_None},
          {TTI::OK_AnyValue, TTI::OP_None}, CI);
    }
    return IntrinsicCost;
  };
  auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
                                         Instruction *I) {
    // ...
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Count reused scalars.
    InstructionCost ScalarCost = 0;
    SmallPtrSet<const TreeEntry *, 4> CountedOps;
    for (Value *V : UniqueValues) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;
      ValueList Operands(PHI->getNumIncomingValues(), nullptr);
      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I)
        Operands[I] = PHI->getIncomingValue(I);
      if (const TreeEntry *OpTE =
              getSameValuesTreeEntry(Operands.front(), Operands))
        if (CountedOps.insert(OpTE).second &&
            !OpTE->ReuseShuffleIndices.empty())
          ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
                                          OpTE->Scalars.size());
    }

    return CommonCost - ScalarCost;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    APInt DemandedElts;
    VectorType *SrcVecTy = nullptr;
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *I = cast<Instruction>(UniqueValues[Idx]);
      if (ShuffleOrOp == Instruction::ExtractElement) {
        auto *EE = cast<ExtractElementInst>(I);
        SrcVecTy = EE->getVectorOperandType();
      } else {
        auto *EV = cast<ExtractValueInst>(I);
        Type *AggregateTy = EV->getAggregateOperand()->getType();
        unsigned NumElts;
        if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
          NumElts = ATy->getNumElements();
        else
          NumElts = AggregateTy->getStructNumElements();
        SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
      }
      if (I->hasOneUse()) {
        Instruction *Ext = I->user_back();
        if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
            all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
          // Use getExtractWithExtendCost() for the extractelement/ext pair,
          // and subtract the cost of the s|zext which is counted separately.
          // ...
          Cost -= TTI->getCastInstrCost(
              Ext->getOpcode(), Ext->getType(), I->getType(),
              TTI::getCastContextHint(Ext), CostKind, Ext);
          return Cost;
        }
      }
      if (DemandedElts.isZero())
        DemandedElts = APInt::getZero(getNumElements(SrcVecTy));
      DemandedElts.setBit(*getExtractIndex(I));
      return InstructionCost(TTI::TCC_Free);
    };
    auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
      return CommonCost - (DemandedElts.isZero()
                               ? TTI::TCC_Free
                               : TTI.getScalarizationOverhead(
                                     SrcVecTy, DemandedElts, /*Insert=*/false,
                                     /*Extract=*/true, CostKind));
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
    auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
    unsigned const NumElts = SrcVecTy->getNumElements();
    unsigned const NumScalars = VL.size();

    unsigned NumOfParts = ::getNumberOfParts(*TTI, SrcVecTy);

    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    unsigned OffsetBeg = *getElementIndex(VL.front());
    unsigned OffsetEnd = OffsetBeg;
    InsertMask[OffsetBeg] = 0;
    for (auto [I, V] : enumerate(VL.drop_front())) {
      unsigned Idx = *getElementIndex(V);
      if (OffsetBeg > Idx)
        OffsetBeg = Idx;
      else if (OffsetEnd < Idx)
        OffsetEnd = Idx;
      InsertMask[Idx] = I + 1;
    }
    unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
    if (NumOfParts > 0 && NumOfParts < NumElts)
      VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
                     VecScalarsSz;
    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
    unsigned InsertVecSz = std::min<unsigned>(
        PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
    bool IsWholeSubvector =
        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
    // Check if we can safely insert a subvector. If it is not possible, just
    // generate a whole-sized vector and shuffle the source vector and the
    // new subvector.
    if (OffsetBeg + InsertVecSz > VecSz) {
      // Align OffsetBeg to generate a correct mask.
      OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
      InsertVecSz = VecSz;
    }

    APInt DemandedElts = APInt::getZero(NumElts);
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
      Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
    } else {
      Mask.assign(VecSz, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
      DemandedElts.setBit(InsertIdx);
      IsIdentity &= InsertIdx - OffsetBeg == I;
      Mask[InsertIdx - OffsetBeg] = I;
    }
    assert(Offset < NumElts && "Failed to find vector index offset");

    // ...
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    // ...
    SmallBitVector InMask =
        isUndefVector(FirstInsert->getOperand(0),
                      buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
      if (InsertVecSz != VecSz) {
        // ...
      } else {
        for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
          Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
        for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
             I <= End; ++I)
          if (Mask[I] != PoisonMaskElem)
            Mask[I] = I + VecSz;
        for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
          Mask[I] =
              ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
        // ...
      }
    }
    return Cost;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    Type *SrcScalarTy = VL0->getOperand(0)->getType();
    auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
    unsigned Opcode = ShuffleOrOp;
    unsigned VecOpcode = Opcode;
    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
        SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
        SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
      }
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
      assert(Idx == 0 && "Expected 0 index only");
      return TTI->getCastInstrCost(Opcode, VL0->getType(),
                                   VL0->getOperand(0)->getType(),
                                   TTI::getCastContextHint(VL0), CostKind,
                                   VL0);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // Do not count the cost here if minimum bitwidth is in effect and it is
      // just a bitcast (here it is just a noop).
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
        return CommonCost;
      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
      TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));

      bool IsArithmeticExtendedReduction =
          E->Idx == 0 && UserIgnoreList &&
          all_of(*UserIgnoreList, [](Value *V) {
            auto *I = cast<Instruction>(V);
            return is_contained({Instruction::Add, Instruction::FAdd,
                                 Instruction::Mul, Instruction::FMul,
                                 Instruction::And, Instruction::Or,
                                 Instruction::Xor},
                                I->getOpcode());
          });
      if (IsArithmeticExtendedReduction &&
          (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
        return CommonCost;
      return CommonCost +
             TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
                                   VecOpcode == Opcode ? VI : nullptr);
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
    CmpPredicate VecPred, SwappedVecPred;
    auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
    if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
        match(VL0, MatchCmp))
      SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
    else
      SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
                                     ? CmpInst::BAD_FCMP_PREDICATE
                                     : CmpInst::BAD_ICMP_PREDICATE;
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      CmpPredicate CurrentPred = /*...*/;
      auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
      if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
           !match(VI, MatchCmp)) ||
          /*...*/)
        CurrentPred = VecPred;
      InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
          E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
          CostKind, getOperandInfo(VI->getOperand(0)),
          getOperandInfo(VI->getOperand(1)), VI);
      // Check if it is possible and profitable to use min/max for selects.
      InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
      if (IntrinsicCost.isValid())
        ScalarCost = IntrinsicCost;
      return ScalarCost;
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
      InstructionCost VecCost =
          TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
                                  CostKind, getOperandInfo(E->getOperand(0)),
                                  getOperandInfo(E->getOperand(1)), VL0);
      if (auto *SI = dyn_cast<SelectInst>(VL0)) {
        auto *CondType =
            getWidenedType(SI->getCondition()->getType(), VL.size());
        unsigned CondNumElements = CondType->getNumElements();
        unsigned VecTyNumElements = getNumElements(VecTy);
        assert(VecTyNumElements >= CondNumElements &&
               VecTyNumElements % CondNumElements == 0 &&
               "Cannot vectorize Instruction::Select");
        if (CondNumElements != VecTyNumElements) {
          // When the return type is i1 but the source is a fixed vector
          // type, the condition value must be replicated.
          // ...
        }
      }
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::MinMax: {
    auto GetScalarCost = [&](unsigned Idx) {
      return GetMinMaxCost(OrigScalarTy);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecCost = GetMinMaxCost(VecTy);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::FMulAdd: {
    auto GetScalarCost = [&](unsigned Idx) {
      return GetFMulAddCost(E->getOperations(),
                            cast<Instruction>(UniqueValues[Idx]));
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      FastMathFlags FMF;
      FMF.set();
      for (Value *V : E->Scalars) {
        if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
          FMF &= FPCI->getFastMathFlags();
          if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
            FMF &= FPCIOp->getFastMathFlags();
        }
      }
      IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
                                  {VecTy, VecTy, VecTy}, FMF);
      InstructionCost VecCost = TTI->getIntrinsicInstrCost(ICA, CostKind);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *VI = dyn_cast<Instruction>(UniqueValues[Idx]);
      Value *Op1 = E->getOperand(0)[Idx];
      SmallVector<const Value *, 2> Operands(1, Op1);
      Value *Op2 = nullptr;
      if (ShuffleOrOp != Instruction::FNeg) {
        Op2 = E->getOperand(1)[Idx];
        Operands.push_back(Op2);
      }
      return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
                                         getOperandInfo(Op1),
                                         getOperandInfo(Op2), Operands, VI);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      if (auto *I = dyn_cast<Instruction>(VL0);
          I && (ShuffleOrOp == Instruction::FAdd ||
                ShuffleOrOp == Instruction::FSub)) {
        // FP adds/subs that fold into fmuladd on every lane are costed via the
        // TreeEntry::FMulAdd path above; nothing extra to account for here.
      }
      TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
      TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(1));
      // An 'and' with a constant mask whose demanded low bits are all ones
      // becomes a noop after minimum-bitwidth demotion.
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
        if (all_of(E->getOperand(1), [&](Value *V) {
              auto *CI = dyn_cast<ConstantInt>(V);
              return CI && CI->getValue().countr_one() >= It->second.first;
            }))
          return CommonCost;
      }
      return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
                                         Op2Info, {}, nullptr, TLI) +
             CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::GetElementPtr: {
    return CommonCost + GetGEPCostDiff(VL, VL0);
  }
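  // Loads: compare per-lane scalar loads against the best wide form this node
  // supports (consecutive, interleaved, strided, compressed, or gathered).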
  case Instruction::Load: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<LoadInst>(UniqueValues[Idx]);
      return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, TTI::OperandValueInfo(), VI);
    };
    auto *LI0 = cast<LoadInst>(VL0);
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecLdCost;
      switch (E->State) {
      case TreeEntry::Vectorize:
        if (unsigned Factor = E->getInterleaveFactor()) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind);
        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, VecTy, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind,
              TTI::OperandValueInfo());
        }
        break;
      case TreeEntry::StridedVectorize: {
        Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
        VecLdCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        break;
      }
      case TreeEntry::CompressVectorize: {
        bool IsMasked;
        unsigned InterleaveFactor;
        SmallVector<int> CompressMask;
        VectorType *LoadVecTy;
        SmallVector<Value *> Scalars(E->Scalars.begin(), E->Scalars.end());
        if (!E->ReorderIndices.empty()) {
          SmallVector<int> Mask(E->ReorderIndices.begin(),
                                E->ReorderIndices.end());
          reorderScalars(Scalars, Mask);
        }
        SmallVector<Value *> PointerOps(Scalars.size());
        for (auto [I, V] : enumerate(Scalars))
          PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
        [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
            Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
            *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
            CompressMask, LoadVecTy);
        assert(IsVectorized && "Failed to vectorize load");
        CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
                                        InterleaveFactor, IsMasked);
        Align CommonAlignment = LI0->getAlign();
        if (InterleaveFactor) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, LoadVecTy, InterleaveFactor, {},
              CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
        } else if (IsMasked) {
          VecLdCost = TTI->getMaskedMemoryOpCost(
              Instruction::Load, LoadVecTy, CommonAlignment,
              LI0->getPointerAddressSpace(), CostKind);
          VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                                        LoadVecTy, CompressMask, CostKind);
        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, LoadVecTy, CommonAlignment,
              LI0->getPointerAddressSpace(), CostKind);
          VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                                        LoadVecTy, CompressMask, CostKind);
        }
        break;
      }
      case TreeEntry::ScatterVectorize: {
        Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
        VecLdCost = TTI->getGatherScatterOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        break;
      }
      case TreeEntry::CombinedVectorize:
      case TreeEntry::SplitVectorize:
      case TreeEntry::NeedToGather:
        llvm_unreachable("Unexpected vectorization state.");
      }
      return VecLdCost + CommonCost;
    };
    InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
    // A masked-gather node is not a terminal node: its address operand cost is
    // estimated separately, so do not add the GEP difference here.
    if (E->State == TreeEntry::ScatterVectorize)
      return Cost;
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(VL))
      PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
    return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
  }
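  // Stores: per-lane scalar stores versus one consecutive, interleaved, or
  // strided vector store, plus the pointer-computation difference.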
  case Instruction::Store: {
    bool IsReorder = !E->ReorderIndices.empty();
    auto GetScalarCost = [=](unsigned Idx) {
      auto *VI = cast<StoreInst>(VL[Idx]);
      TTI::OperandValueInfo OpInfo = getOperandInfo(VI->getValueOperand());
      return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, OpInfo, VI);
    };
    auto *BaseSI =
        cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // We know that we can merge the stores. Calculate the cost.
      InstructionCost VecStCost;
      if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment = computeCommonAlignment<StoreInst>(VL);
        VecStCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
      } else {
        assert(E->State == TreeEntry::Vectorize &&
               "Expected either strided or consecutive stores.");
        if (unsigned Factor = E->getInterleaveFactor()) {
          assert(E->ReuseShuffleIndices.empty() &&
                 !E->ReorderIndices.empty() && "No reused shuffles expected");
          VecStCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind);
        } else {
          TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
          VecStCost = TTI->getMemoryOpCost(
              Instruction::Store, VecTy, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
        }
      }
      return VecStCost + CommonCost;
    };
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(VL)) {
      unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
      PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
    }
    return GetCostDiff(GetScalarCost, GetVectorCost) +
           GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
  }
  case Instruction::Call: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *CI = cast<CallInst>(UniqueValues[Idx]);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      if (ID != Intrinsic::not_intrinsic) {
        IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
        return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
      }
      return TTI->getCallInstrCost(CI->getCalledFunction(),
                                   CI->getFunctionType()->getReturnType(),
                                   CI->getFunctionType()->params(), CostKind);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      auto *CI = cast<CallInst>(VL0);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      auto VecCallCosts = getVectorCallCosts(
          CI, ID, VecTy->getNumElements(),
          It != MinBWs.end() ? It->second.first : 0, TTI);
      return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
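  // Alternate-opcode nodes: cost both wide opcodes plus the blending shuffle,
  // reusing a matching "diamond" node when an identical one already exists.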
  case Instruction::ShuffleVector: {
    assert((E->isAltShuffle() || SLPReVec) &&
           (!E->isAltShuffle() ||
            (Instruction::isBinaryOp(E->getOpcode()) &&
             Instruction::isBinaryOp(E->getAltOpcode())) ||
            (Instruction::isCast(E->getOpcode()) &&
             Instruction::isCast(E->getAltOpcode())) ||
            (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
           "Invalid Shuffle Vector Operand");
    // Try to find a previous alternate node with the same operands and
    // main/alternate ops (a "diamond" match): its vector ops can be reused.
    auto TryFindNodeWithEqualOperands = [=]() {
      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
        if (TE.get() == E)
          break;
        if (TE->hasState() && TE->isAltShuffle() &&
            ((TE->getOpcode() == E->getOpcode() &&
              TE->getAltOpcode() == E->getAltOpcode()) ||
             (TE->getOpcode() == E->getAltOpcode() &&
              TE->getAltOpcode() == E->getOpcode())) &&
            TE->hasEqualOperands(*E))
          return true;
      }
      return false;
    };
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      assert(E->getMatchingMainOpOrAltOp(VI) &&
             "Unexpected main/alternate opcode");
      return TTI->getInstructionCost(VI, CostKind);
    };
    auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
      // VecCost is the cost of creating the 2 vector ops plus the shuffle that
      // blends them.
      InstructionCost VecCost = 0;
      if (TryFindNodeWithEqualOperands()) {
        LLVM_DEBUG({
          dbgs() << "SLP: diamond match for alternate node found.\n";
          E->dump();
        });
        // No new vector costs: reuse the same main/alternate vector ops and
        // just shuffle them differently.
      } else if (Instruction::isBinaryOp(E->getOpcode())) {
        VecCost =
            TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
        VecCost +=
            TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
        VecCost = TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            VL0);
        VecCost += TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy,
            cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            E->getAltOp());
      } else {
        Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
        auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
        if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
          auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcBWSz =
              DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
          if (SrcIt != MinBWs.end()) {
            SrcBWSz = SrcIt->second.first;
            SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
            SrcTy = getWidenedType(SrcSclTy, VL.size());
          }
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
              VecCost = TTIRef.getCastInstrCost(Instruction::Trunc, VecTy,
                                                SrcTy,
                                                TTI::CastContextHint::None,
                                                CostKind);
            LLVM_DEBUG({
              dbgs()
                  << "SLP: alternate extension, which should be truncated.\n";
              E->dump();
            });
            return VecCost;
          }
        }
        VecCost =
            TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
                                    TTI::CastContextHint::None, CostKind);
        VecCost +=
            TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
                                    TTI::CastContextHint::None, CostKind);
      }
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [&](Instruction *I) {
            assert(E->getMatchingMainOpOrAltOp(I) &&
                   "Unexpected main/alternate opcode");
            return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                          *TLI);
          },
          Mask);
      VecCost += ::getShuffleCost(TTIRef, TTI::SK_PermuteTwoSrc, FinalVecTy,
                                  Mask, CostKind);
      // Patterns like [fadd,fsub] can be combined into a single instruction on
      // some targets; if the target supports the alternate-instruction form,
      // take the cheaper of the two costs.
      unsigned Opcode0 = E->getOpcode();
      unsigned Opcode1 = E->getAltOpcode();
      SmallBitVector OpcodeMask(getAltInstrMask(E->Scalars, Opcode0, Opcode1));
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        InstructionCost AltVecCost = TTIRef.getAltInstrCost(
            VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
        return AltVecCost < VecCost ? AltVecCost : VecCost;
      }
      return VecCost;
    };
    if (SLPReVec && !E->isAltShuffle())
      return GetCostDiff(
          GetScalarCost, [&](InstructionCost) -> InstructionCost {
            // If every group extracts consecutive subvectors in order, the
            // shufflevectors are eliminated by instcombine, so the cost is 0.
            assert(isa<ShuffleVectorInst>(VL.front()) &&
                   "Not supported shufflevector usage.");
            auto *SV = cast<ShuffleVectorInst>(VL.front());
            unsigned SVNumElements =
                cast<FixedVectorType>(SV->getOperand(0)->getType())
                    ->getNumElements();
            unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
            for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
              int NextIndex = 0;
              if (!all_of(VL.slice(I, GroupSize), [&](Value *V) {
                    assert(isa<ShuffleVectorInst>(V) &&
                           "Not supported shufflevector usage.");
                    auto *SV = cast<ShuffleVectorInst>(V);
                    int Index;
                    [[maybe_unused]] bool IsExtractSubvectorMask =
                        SV->isExtractSubvectorMask(Index);
                    assert(IsExtractSubvectorMask &&
                           "Not supported shufflevector usage.");
                    if (NextIndex != Index)
                      return false;
                    NextIndex += SV->getShuffleMask().size();
                    return true;
                  }))
                return ::getShuffleCost(
                    *TTI, TTI::SK_PermuteSingleSrc, VecTy,
                    calculateShufflevectorMask(E->Scalars));
            }
            return TTI::TCC_Free;
          });
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::Freeze:
    return CommonCost;
  default:
    llvm_unreachable("Unknown instruction");
  }
}
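// A tiny tree (height 1 or 2) is fully vectorizable when its non-gather part
// is vectorizable and any gather operand is cheap: constants, splats, loads,
// or extracts that reduce to a fixed-vector shuffle.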
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size() << " is fully vectorizable .\n");

  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
    SmallVector<int> Mask;
    return TE->isGather() &&
           !any_of(TE->Scalars,
                   [this](Value *V) { return EphValues.contains(V); }) &&
           (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
            TE->Scalars.size() < Limit ||
            (((TE->hasState() &&
               TE->getOpcode() == Instruction::ExtractElement) ||
              all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
             isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
            (TE->hasState() && TE->getOpcode() == Instruction::Load &&
             !TE->isAltShuffle()) ||
            any_of(TE->Scalars, IsaPred<LoadInst>));
  };

  // We only handle trees of heights 1 and 2.
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
       (ForReduction &&
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
    return true;

  if (VectorizableTree.size() != 2)
    return false;

  // Handle splat and all-constants stores. Also try to vectorize tiny trees
  // with the second gather node if it can be fully matched to the first one.
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
    return true;

  // Gathering cost would be too much for tiny trees.
  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
       VectorizableTree[0]->State != TreeEntry::CompressVectorize))
    return false;

  return true;
}
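// Load-combine matching: if the scalar expression reduces to one wide load
// through or/shl chains, vectorizing it would only fight the backend's
// load-combine fold.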
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
                                       TargetTransformInfo *TTI,
                                       bool MustMatchOrInst) {
  // Look past the root to find a source value. Arbitrarily follow the path
  // through operand 0 of any 'or'. Also, peek through optional
  // shift-left-by-multiple-of-8-bits.
  Value *ZextLoad = Root;
  const APInt *ShAmtC;
  bool FoundOr = false;
  while (!isa<ConstantExpr>(ZextLoad) &&
         (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
          (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
           ShAmtC->urem(8) == 0))) {
    auto *BinOp = cast<BinaryOperator>(ZextLoad);
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)
      FoundOr = true;
  }
  // Check if the input is an extended load of the required or/shift
  // expression.
  Value *Load;
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
      !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
    return false;

  // Require that the total load bit width is a legal integer type.
  // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
  // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
  Type *SrcTy = Load->getType();
  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
  if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
    return false;

  // Everything matched - assume that we can fold the whole sequence using
  // load combining.
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    << *(cast<Instruction>(Root)) << "\n");

  return true;
}

bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
  if (RdxKind != RecurKind::Or)
    return false;

  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
                                    /*MustMatchOrInst=*/false);
}

bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
  // Peek through a final sequence of stores and check if all operations are
  // likely to be load-combined.
  unsigned NumElts = Stores.size();
  for (Value *Scalar : Stores) {
    Value *X;
    if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
        !isLoadCombineCandidateImpl(X, NumElts, TTI,
                                    /*MustMatchOrInst=*/true))
      return false;
  }
  return true;
}
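// Profitability gate: reject graphs that are mostly gathers, PHIs, or tiny
// buildvectors before spending time on full cost modeling.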
bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
  // No need to vectorize inserts of gathered values.
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");
    return true;
  }

  if (VectorizableTree.size() == 2 &&
      VectorizableTree.front()->State == TreeEntry::Vectorize &&
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||
         allConstant(VectorizableTree[1]->Scalars))))
    return true;

  // If the graph includes only PHI nodes and gathers, it is definitely not
  // profitable for vectorization: the cost of a vectorized PHI node is almost
  // always 0 plus the cost of the extractelements.
  constexpr int Limit = 4;
  if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
      !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                (!TE->hasState() ||
                 TE->getOpcode() != Instruction::ExtractElement) &&
                count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
               (TE->hasState() && TE->getOpcode() == Instruction::PHI);
      }))
    return true;

  // Do not vectorize small trees of phis/insertelements, gathered otherwise.
  if (VectorizableTree.size() <= Limit &&
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return (TE->isGather() &&
                       (!TE->hasState() ||
                        TE->getOpcode() != Instruction::ExtractElement) &&
                       count_if(TE->Scalars, IsaPred<ExtractElementInst>) <=
                           Limit) ||
                      (TE->hasState() &&
                       (TE->getOpcode() == Instruction::InsertElement ||
                        (TE->getOpcode() == Instruction::PHI &&
                         all_of(TE->Scalars, [&](Value *V) {
                           return isa<PoisonValue>(V) ||
                                  MustGather.contains(V);
                         }))));
             }) &&
      any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->State == TreeEntry::Vectorize &&
               TE->getOpcode() == Instruction::PHI;
      }))
    return true;

  // Graphs with only a small number of store/load nodes are most probably
  // non-profitable.
  SmallVector<const TreeEntry *> StoreLoadNodes;
  unsigned NumGathers = 0;
  constexpr int LimitTreeSize = 36;
  if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               if (!TE->isGather() && TE->hasState() &&
                   (TE->getOpcode() == Instruction::Load ||
                    TE->getOpcode() == Instruction::Store)) {
                 StoreLoadNodes.push_back(TE.get());
                 return true;
               }
               if (TE->isGather())
                 ++NumGathers;
               return TE->State == TreeEntry::SplitVectorize ||
                      (TE->Idx == 0 && TE->Scalars.size() == 2 &&
                       TE->hasState() &&
                       TE->getOpcode() == Instruction::ICmp &&
                       VectorizableTree.size() > LimitTreeSize) ||
                      (TE->hasState() &&
                       (TE->getOpcode() == Instruction::PHI ||
                        (TE->hasCopyableElements() &&
                         static_cast<unsigned>(count_if(
                             TE->Scalars, IsaPred<PHINode, Constant>)) >=
                             TE->Scalars.size() / 2) ||
                        ((!TE->ReuseShuffleIndices.empty() ||
                          !TE->ReorderIndices.empty() ||
                          TE->isAltShuffle()) &&
                         TE->Scalars.size() == 2)));
             }) &&
      (StoreLoadNodes.empty() ||
       (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
        (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
           return TE->getOpcode() == Instruction::Store ||
                  all_of(TE->Scalars, [&](Value *V) {
                    return !isa<LoadInst>(V) ||
                           areAllUsersVectorized(cast<Instruction>(V));
                  });
         })))))
    return true;

  // A split root with only 2 non-gather operand nodes and the rest gathers is
  // most likely not worth it.
  if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
      VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
      VectorizableTree.size() >= Limit &&
      count_if(ArrayRef(VectorizableTree).drop_front(),
               [&](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
                        TE->UserTreeIndex.UserTE->Idx == 0;
               }) == 2)
    return true;

  // An insertelement+phi chain where everything else is gathered - skip.
  if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
      VectorizableTree.size() > 2 &&
      VectorizableTree.front()->State == TreeEntry::Vectorize &&
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
      VectorizableTree[1]->State == TreeEntry::Vectorize &&
      VectorizableTree[1]->getOpcode() == Instruction::PHI &&
      all_of(ArrayRef(VectorizableTree).drop_front(2),
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return TE->isGather();
             }))
    return true;

  // We can vectorize the tree if its size is greater than or equal to the
  // minimum size specified by the MinTreeSize command line option.
  if (VectorizableTree.size() >= MinTreeSize)
    return false;

  // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
  // can vectorize it if we can prove it fully vectorizable.
  if (isFullyVectorizableTinyTree(ForReduction))
    return false;

  // Check if any of the gather nodes forms an insertelement buildvector
  // somewhere.
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
       allSameBlock(VectorizableTree.front()->Scalars));
  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
                 return isa<ExtractElementInst, Constant>(V) ||
                        (IsAllowedSingleBVNode &&
                         !V->hasNUsesOrMore(UsesLimit) &&
                         any_of(V->users(), IsaPred<InsertElementInst>));
               });
      }))
    return false;

  if (VectorizableTree.back()->isGather() &&
      VectorizableTree.back()->hasState() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      allSameBlock(VectorizableTree.back()->Scalars) &&
      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
      TTI->getScalarizationOverhead(
          getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
                         VectorizableTree.back()->getVectorFactor()),
          APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
          /*Insert=*/true, /*Extract=*/false,
          TTI::TCK_RecipThroughput) > -SLPCostThreshold)
    return true;
  // Canonically small trees with a single out-of-block gathered load are also
  // not worth it for non-power-of-2 roots.
  constexpr unsigned SmallTree = 3;
  if (VectorizableTree.front()->isNonPowOf2Vec() &&
      getCanonicalGraphSize() <= SmallTree &&
      count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return TE->isGather() && TE->hasState() &&
                        TE->getOpcode() == Instruction::Load &&
                        !allSameBlock(TE->Scalars);
               }) == 1)
    return true;

  // Otherwise require that every remaining gather node is a gather of loads;
  // anything else makes the tree both tiny and not fully vectorizable.
  for (unsigned Idx : seq<unsigned>(VectorizableTree.size())) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.State == TreeEntry::SplitVectorize)
      continue;
    if (!E.isGather())
      continue;
    if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
        (!E.hasState() && !allConstant(E.Scalars)))
      return false;
  }
  return true;
}
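// Spill-cost model: walk the live tree entries between their emission points
// and charge the cost of keeping a vector register live across any call that
// is neither vectorized nor a cheap intrinsic.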
InstructionCost BoUpSLP::getSpillCost() {
  InstructionCost Cost = 0;
  const TreeEntry *Root = VectorizableTree.front().get();
  if (Root->isGather())
    return Cost;

  SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
      EntriesToOperands;
  SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
  SmallPtrSet<const Instruction *, 8> LastInstructions;
  for (const auto &TEPtr : VectorizableTree) {
    if (!TEPtr->isGather()) {
      Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
      EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
      LastInstructions.insert(LastInst);
    }
    if (TEPtr->UserTreeIndex)
      EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
  }

  auto NoCallIntrinsic = [this](const Instruction *I) {
    const auto *II = dyn_cast<IntrinsicInst>(I);
    if (!II)
      return false;
    if (II->isAssumeLikeIntrinsic())
      return true;
    IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
    InstructionCost IntrCost =
        TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
    InstructionCost CallCost = TTI->getCallInstrCost(
        nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
    return IntrCost < CallCost;
  };

  // Maps the last instruction of an entry to the last checked instruction in
  // its block and the result of the check: 1 - in budget, 0 - out of budget.
  DenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
      CheckedInstructions;
  unsigned Budget = 0;
  const unsigned BudgetLimit =
      ScheduleRegionSizeBudget / VectorizableTree.size();
  auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
                                            const Instruction *Last) {
    assert(First->getParent() == Last->getParent() &&
           "Expected instructions in same block.");
    if (auto It = CheckedInstructions.find(Last);
        It != CheckedInstructions.end()) {
      const Instruction *Checked = It->second.getPointer();
      if (Checked == First || Checked->comesBefore(First))
        return It->second.getInt() != 0;
      Last = Checked;
    }
    SmallVector<const Instruction *> LastInstsInRange;
    BasicBlock::const_reverse_iterator
        InstIt = ++First->getIterator().getReverse(),
        PrevInstIt = Last->getIterator().getReverse();
    while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
      // Debug information does not impact spill cost; vectorized calls
      // (vector intrinsics) do not either.
      if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
          CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
        for (const Instruction *LastInst : LastInstsInRange)
          CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
        return false;
      }
      if (LastInstructions.contains(&*PrevInstIt))
        LastInstsInRange.push_back(&*PrevInstIt);
      ++PrevInstIt;
      ++Budget;
    }
    for (const Instruction *LastInst : LastInstsInRange)
      CheckedInstructions.try_emplace(
          LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
          Budget <= BudgetLimit ? 1 : 0);
    return Budget <= BudgetLimit;
  };
  auto AddCosts = [&](const TreeEntry *Op) {
    Type *ScalarTy = Op->Scalars.front()->getType();
    auto It = MinBWs.find(Op);
    if (It != MinBWs.end())
      ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
    auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
    Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
    if (Op->getVectorFactor() != Op->Scalars.size())
      Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
  };

  // Memoize the relationship between blocks, i.e. whether there is (at least
  // one) non-vectorized call on any path between them.
  SmallDenseMap<std::pair<const BasicBlock *, const BasicBlock *>, bool>
      ParentOpParentToPreds;
  auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
                               BasicBlock *OpParent) {
    auto Key = std::make_pair(Root, OpParent);
    if (auto It = ParentOpParentToPreds.find(Key);
        It != ParentOpParentToPreds.end())
      return It->second;
    SmallVector<BasicBlock *> Worklist;
    if (Pred)
      Worklist.push_back(Pred);
    else
      Worklist.append(pred_begin(Root), pred_end(Root));
    SmallPtrSet<const BasicBlock *, 16> Visited;
    SmallDenseSet<std::pair<const BasicBlock *, const BasicBlock *>>
        ParentsPairsToAdd;
    bool Res = false;
    auto Cleanup = make_scope_exit([&]() {
      for (const auto &KeyPair : ParentsPairsToAdd) {
        assert(!ParentOpParentToPreds.contains(KeyPair) &&
               "Should not have been added before.");
        ParentOpParentToPreds.try_emplace(KeyPair, Res);
      }
    });
    while (!Worklist.empty()) {
      BasicBlock *BB = Worklist.pop_back_val();
      if (BB == OpParent || !Visited.insert(BB).second)
        continue;
      auto Pair = std::make_pair(BB, OpParent);
      if (auto It = ParentOpParentToPreds.find(Pair);
          It != ParentOpParentToPreds.end()) {
        Res = It->second;
        return Res;
      }
      ParentsPairsToAdd.insert(Pair);
      Budget += BB->size();
      if (Budget > BudgetLimit)
        return Res;
      if (any_of(*BB, [&](const Instruction &I) {
            return isa<CallBase>(&I) && !NoCallIntrinsic(&I) &&
                   !isVectorized(&I);
          }))
        return Res;
      Worklist.append(pred_begin(BB), pred_end(BB));
    }
    Res = true;
    return Res;
  };

  SmallVector<const TreeEntry *> LiveEntries(1, Root);
  while (!LiveEntries.empty()) {
    const TreeEntry *Entry = LiveEntries.pop_back_val();
    SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
    if (Operands.empty())
      continue;
    Instruction *LastInst = EntriesToLastInstruction.at(Entry);
    BasicBlock *Parent = LastInst->getParent();
    for (const TreeEntry *Op : Operands) {
      if (!Op->isGather())
        LiveEntries.push_back(Op);
      if (Entry->State == TreeEntry::SplitVectorize ||
          (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
          (Op->isGather() && allConstant(Op->Scalars)))
        continue;
      BasicBlock *Pred = nullptr;
      if (auto *Phi = dyn_cast_or_null<PHINode>(Entry->getMainOp()))
        Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
      BasicBlock *OpParent;
      Instruction *OpLastInst;
      if (Op->isGather()) {
        assert(Entry->getOpcode() == Instruction::PHI &&
               "Expected phi node only.");
        OpParent = cast<PHINode>(Entry->getMainOp())
                       ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
        OpLastInst = OpParent->getTerminator();
        for (Value *V : Op->Scalars) {
          auto *Inst = dyn_cast<Instruction>(V);
          if (!Inst)
            continue;
          if (isVectorized(Inst)) {
            OpParent = Inst->getParent();
            OpLastInst = Inst;
            break;
          }
        }
      } else {
        OpLastInst = EntriesToLastInstruction.at(Op);
        OpParent = OpLastInst->getParent();
      }
      // Check for call instructions within the same basic block.
      if (OpParent == Parent) {
        if (Entry->getOpcode() == Instruction::PHI) {
          if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
            AddCosts(Op);
          continue;
        }
        if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
          AddCosts(Op);
        continue;
      }
      // Check for calls between the blocks:
      // 1. The entry's block, from its head to the last instruction.
      if (Entry->getOpcode() != Instruction::PHI &&
          !CheckForNonVecCallsInSameBlock(
              &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
              LastInst)) {
        AddCosts(Op);
        continue;
      }
      // 2. The operand's block, from the operand to its terminator.
      if (!CheckForNonVecCallsInSameBlock(OpLastInst,
                                          OpParent->getTerminator())) {
        AddCosts(Op);
        continue;
      }
      // 3. All blocks on the paths between the two.
      if (!CheckPredecessors(Parent, Pred, OpParent)) {
        AddCosts(Op);
        continue;
      }
    }
  }

  return Cost;
}
/// Checks if the \p IE1 instruction comes before \p IE2 in the buildvector
/// sequence they both belong to.
static bool isFirstInsertElement(const InsertElementInst *IE1,
                                 const InsertElementInst *IE2) {
  if (IE1 == IE2)
    return false;
  const auto *I1 = IE1;
  const auto *I2 = IE2;
  const InsertElementInst *PrevI1;
  const InsertElementInst *PrevI2;
  unsigned Idx1 = *getElementIndex(IE1);
  unsigned Idx2 = *getElementIndex(IE2);
  do {
    if (I2 == IE1)
      return true;
    if (I1 == IE2)
      return false;
    PrevI1 = I1;
    PrevI2 = I2;
    if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
        getElementIndex(I1).value_or(Idx2) != Idx2)
      I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
    if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
        getElementIndex(I2).value_or(Idx1) != Idx1)
      I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
  llvm_unreachable("Two different buildvectors not expected.");
}
namespace {
/// Returns the incoming Value * if the requested type is Value * too, or a
/// default-constructed value otherwise.
struct ValueSelect {
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
    return V;
  }
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
    return U();
  }
};
} // namespace
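/// Folds a chain of per-source shuffle masks into at most one two-source
/// shuffle per step: first the base vector (if it is live), then every extra
/// source, resizing mismatched vector factors along the way.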
template <typename T>
static T *performExtractsShuffleAction(
    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
    function_ref<unsigned(T *)> GetVF,
    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
    function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  SmallVector<int> Mask(ShuffleMask.begin()->second);
  auto VMIt = std::next(ShuffleMask.begin());
  T *Prev = nullptr;
  SmallBitVector UseMask =
      buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
  SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
  if (!IsBaseUndef.all()) {
    // Base is not undef: combine it with the next subvectors.
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
    SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
      if (Mask[Idx] == PoisonMaskElem)
        Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
      else
        Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    }
    [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    // Base is undef and only 1 vector is shuffled: perform the action only for
    // the single vector, if the mask is not the identity mask.
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
                                            /*ForSingleMask=*/true);
    if (Res.second)
      // Identity mask is found.
      Prev = Res.first;
    else
      Prev = Action(Mask, {ShuffleMask.begin()->first});
  } else {
    // Base is undef and at least 2 input vectors are shuffled: perform
    // two-vector shuffles step by step, combining the masks between the steps.
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      // No need to resize the input vectors.
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (SecMask[I] != PoisonMaskElem) {
          assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
          Mask[I] = SecMask[I] + Vec1VF;
        }
      }
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
    } else {
      // Vectors of different sizes - resize and reshuffle.
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
                                               /*ForSingleMask=*/false);
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (Mask[I] != PoisonMaskElem) {
          assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
          if (Res1.second)
            Mask[I] = I;
        } else if (SecMask[I] != PoisonMaskElem) {
          assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
          Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
        }
      }
      Prev = Action(Mask, {Res1.first, Res2.first});
    }
    VMIt = std::next(VMIt);
  }
  [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
  // Perform the requested actions for the remaining masks/vectors.
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    // Shuffle other input vectors, if any.
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
    ArrayRef<int> SecMask = VMIt->second;
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
      if (SecMask[I] != PoisonMaskElem) {
        assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
               "Multiple uses of scalars.");
        Mask[I] = (Res.second ? I : SecMask[I]) + VF;
      } else if (Mask[I] != PoisonMaskElem) {
        Mask[I] = I;
      }
    }
    Prev = Action(Mask, {Prev, Res.first});
  }
  return Prev;
}
namespace {
/// Data type for handling buildvector sequences with reused scalars from
/// other tree entries.
template <typename T> struct ShuffledInsertData {
  /// List of insertelements to be replaced by shuffles.
  SmallVector<InsertElementInst *> InsertElements;
  /// The parent vectors and shuffle mask for the given list of inserts.
  MapVector<T, SmallVector<int>> ValueMasks;
};
} // namespace
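// Total cost = sum(entry costs) + external-use extract costs
//            - insertelement costs covered by final shuffles
//            + reduction resize cost + spill cost.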
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
                                     InstructionCost ReductionCost) {
  InstructionCost Cost = ReductionCost;
  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                    << VectorizableTree.size() << ".\n");

  SmallPtrSet<Value *, 4> CheckedExtracts;
  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
    TreeEntry &TE = *VectorizableTree[I];
    // No need to count the cost for combined entries: they are combined and
    // their cost is just skipped.
    if (TE.State == TreeEntry::CombinedVectorize) {
      LLVM_DEBUG(
          dbgs() << "SLP: Skipping cost for combined node that starts with "
                 << *TE.Scalars[0] << ".\n";
          TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
      continue;
    }
    if (TE.hasState() &&
        (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
      if (const TreeEntry *E =
              getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
          E && E->getVectorFactor() == TE.getVectorFactor()) {
        // Some gather nodes might be absolutely the same as some vectorizable
        // nodes after reordering; such nodes cost nothing extra.
        LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
                          << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
                          << "SLP: Current total cost = " << Cost << "\n");
        continue;
      }
    }

    // Exclude the cost of unused gathered-loads nodes, built as part of the
    // final attempt to vectorize gathered loads.
    assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
           "Expected gather nodes with users only.");

    InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
    Cost += C;
    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
                      << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
                      << "SLP: Current total cost = " << Cost << "\n");
  }

  if (Cost >= -SLPCostThreshold &&
      none_of(ExternalUses, [](const ExternalUser &EU) {
        return isa_and_nonnull<InsertElementInst>(EU.User);
      }))
    return Cost;
  // Measure the cost of extracting values from the external uses.
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  SmallPtrSet<Value *, 16> ExtractCostCalculated;
  InstructionCost ExtractCost = 0;
  SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
  SmallVector<APInt> DemandedElts;
  SmallDenseSet<Value *, 4> UsedInserts;
  DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
  DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
  SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
  // Keep track of {Scalar, Index, User} tuples. On AArch64, this helps fuse
  // the mov associated with an extractelement into an fmul in the backend so
  // that the extractelement becomes free.
  SmallVector<std::tuple<Value *, User *, int>> ScalarUserAndIdx;
  for (ExternalUser &EU : ExternalUses) {
    ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
  }
  SmallDenseSet<std::pair<Value *, User *>, 8> CheckedScalarUser;
  for (ExternalUser &EU : ExternalUses) {
    LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
                      << EU.E.Idx << " in lane " << EU.Lane << "\n");
    LLVM_DEBUG(if (EU.User) dbgs() << "  User: " << *EU.User << "\n";
               else dbgs() << "  User: nullptr\n");
    LLVM_DEBUG(dbgs() << "  Use: " << EU.Scalar->getNameOrAsOperand() << "\n");

    // Uses by ephemeral values are free: the ephemeral value is removed prior
    // to code generation, so the extraction disappears with it.
    if (EphValues.count(EU.User))
      continue;

    // Account each scalar/user pair only once.
    if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
        (EU.User &&
         CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
      continue;

    // Skip users in unreachable blocks, EH pads, or blocks terminated by
    // unreachable (rarely executed).
    if (BasicBlock *UserParent =
            EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
        UserParent &&
        (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
         isa_and_present<UnreachableInst>(UserParent->getTerminator())))
      continue;

    // We only add the extract cost once for the same scalar.
    if (!isa_and_present<InsertElementInst>(EU.User) &&
        !ExtractCostCalculated.insert(EU.Scalar).second)
      continue;
    // No extract cost for an insertelement user that will become part of a
    // vectorized buildvector sequence (replaced by a shuffle).
    if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
        VU && VU->getOperand(1) == EU.Scalar) {
      if (!UsedInserts.insert(VU).second)
        continue;
      auto *FTy = cast<FixedVectorType>(VU->getType());
      std::optional<unsigned> InsertIdx = getElementIndex(VU);
      if (InsertIdx) {
        const TreeEntry *ScalarTE = &EU.E;
        auto *It = find_if(
            ShuffledInserts,
            [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
              // Checks if 2 insertelements are from the same buildvector.
              InsertElementInst *VecInsert = Data.InsertElements.front();
              return areTwoInsertFromSameBuildVector(
                  VU, VecInsert, [this](InsertElementInst *II) -> Value * {
                    Value *Op0 = II->getOperand(0);
                    if (isVectorized(II) && !isVectorized(Op0))
                      return nullptr;
                    return Op0;
                  });
            });
        int VecId = -1;
        if (It == ShuffledInserts.end()) {
          auto &Data = ShuffledInserts.emplace_back();
          Data.InsertElements.emplace_back(VU);
          DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
          VecId = ShuffledInserts.size() - 1;
          auto It = MinBWs.find(ScalarTE);
          if (It != MinBWs.end() &&
              VectorCasts
                  .insert(std::make_pair(ScalarTE, FTy->getElementType()))
                  .second) {
            unsigned BWSz = It->second.first;
            unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
            unsigned VecOpcode;
            if (DstBWSz < BWSz)
              VecOpcode = Instruction::Trunc;
            else
              VecOpcode =
                  It->second.second ? Instruction::SExt : Instruction::ZExt;
            InstructionCost C = TTI->getCastInstrCost(
                VecOpcode, FTy,
                getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
                               FTy->getNumElements()),
                TTI::CastContextHint::None, CostKind);
            LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                              << " for extending externally used vector with "
                                 "non-equal minimum bitwidth.\n");
            Cost += C;
          }
        } else {
          if (isFirstInsertElement(VU, It->InsertElements.front()))
            It->InsertElements.front() = VU;
          VecId = std::distance(ShuffledInserts.begin(), It);
        }
        int InIdx = *InsertIdx;
        SmallVectorImpl<int> &Mask =
            ShuffledInserts[VecId].ValueMasks[ScalarTE];
        if (Mask.empty())
          Mask.assign(FTy->getNumElements(), PoisonMaskElem);
        Mask[InIdx] = EU.Lane;
        DemandedElts[VecId].setBit(InIdx);
        continue;
      }
    }
    // If the tree is rewritten in a smaller type, the extracted value must be
    // extended back to the original type; account for the extract and the
    // extension here.
    InstructionCost ExtraCost = TTI::TCC_Free;
    auto *ScalarTy = EU.Scalar->getType();
    const unsigned BundleWidth = EU.E.getVectorFactor();
    assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
    auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
    const TreeEntry *Entry = &EU.E;
    auto It = MinBWs.find(Entry);
    if (It != MinBWs.end()) {
      auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
      unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
                            ? Instruction::ZExt
                            : Instruction::SExt;
      VecTy = getWidenedType(MinTy, BundleWidth);
      ExtraCost =
          getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
      LLVM_DEBUG(dbgs() << "  ExtractExtend cost: " << ExtraCost << "\n");
    } else {
      ExtraCost =
          getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement,
                             VecTy, CostKind, EU.Lane, EU.Scalar,
                             ScalarUserAndIdx);
      LLVM_DEBUG(dbgs() << "  ExtractElement cost for " << *ScalarTy << " from "
                        << *VecTy << ": " << ExtraCost << "\n");
    }
    // Leave the scalar instructions as-is if they are cheaper than extracts.
    if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
        Entry->getOpcode() == Instruction::Load) {
      // Checks if the user of the external scalar is a phi in a loop body.
      auto IsPhiInLoop = [&](const ExternalUser &U) {
        if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
          auto *I = cast<Instruction>(U.Scalar);
          const Loop *L = LI->getLoopFor(Phi->getParent());
          return L && (Phi->getParent() == I->getParent() ||
                       L == LI->getLoopFor(I->getParent()));
        }
        return false;
      };
      if (!ValueToExtUses) {
        ValueToExtUses.emplace();
        for (const auto &P : enumerate(ExternalUses)) {
          // Ignore phis in loops.
          if (IsPhiInLoop(P.value()))
            continue;
          ValueToExtUses->try_emplace(P.value().Scalar, P.index());
        }
      }
      // The original instruction can stay scalar if none of its operands are
      // vectorized, or all vectorized operands are also externally used.
      auto *Inst = cast<Instruction>(EU.Scalar);
      InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
      auto OperandIsScalar = [&](Value *V) {
        if (!isVectorized(V)) {
          // Some extractelements might not be vectorized but transformed into
          // a shuffle and removed from the function; consider that here.
          if (auto *EE = dyn_cast<ExtractElementInst>(V))
            return !EE->hasOneUse() || !MustGather.contains(EE);
          return true;
        }
        return ValueToExtUses->contains(V);
      };
      bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
      bool CanBeUsedAsScalarCast = false;
      if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
        if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
            Op && all_of(Op->operands(), OperandIsScalar)) {
          InstructionCost OpCost =
              (isVectorized(Op) && !ValueToExtUses->contains(Op))
                  ? TTI->getInstructionCost(Op, CostKind)
                  : 0;
          if (ScalarCost + OpCost <= ExtraCost) {
            CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
            ScalarCost += OpCost;
          }
        }
      }
      if (CanBeUsedAsScalar) {
        bool KeepScalar = ScalarCost <= ExtraCost;
        // Check whether all corresponding instructions in the tree are going
        // to stay scalar; if so, it is better to keep this one scalar too.
        bool IsProfitablePHIUser =
            (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
                            VectorizableTree.front()->Scalars.size() > 2)) &&
            VectorizableTree.front()->hasState() &&
            VectorizableTree.front()->getOpcode() == Instruction::PHI &&
            !Inst->hasNUsesOrMore(UsesLimit) &&
            none_of(Inst->users(),
                    [&](User *U) {
                      auto *PHIUser = dyn_cast<PHINode>(U);
                      return (!PHIUser ||
                              PHIUser->getParent() !=
                                  cast<Instruction>(
                                      VectorizableTree.front()->getMainOp())
                                      ->getParent()) &&
                             !isVectorized(U);
                    }) &&
            count_if(Entry->Scalars, [&](Value *V) {
              return ValueToExtUses->contains(V);
            }) <= 2;
        if (IsProfitablePHIUser) {
          KeepScalar = true;
        } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
                   ExtraCost - ScalarCost <= TTI::TCC_Basic &&
                   (!GatheredLoadsEntriesFirst.has_value() ||
                    Entry->Idx < *GatheredLoadsEntriesFirst)) {
          unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
            return ValueToExtUses->contains(V);
          });
          auto It = ExtractsCount.find(Entry);
          if (It != ExtractsCount.end()) {
            assert(ScalarUsesCount >= It->getSecond().size() &&
                   "Expected total number of external uses not less than "
                   "number of scalar uses.");
            ScalarUsesCount -= It->getSecond().size();
          }
          // Keep the original scalar if the number of externally used
          // instructions in the same entry is not a power of 2; it may allow
          // extra vectorization later.
          KeepScalar =
              ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
        }
        if (KeepScalar) {
          ExternalUsesAsOriginalScalar.insert(EU.Scalar);
          for (Value *V : Inst->operands()) {
            auto It = ValueToExtUses->find(V);
            if (It != ValueToExtUses->end()) {
              // Replace all uses to avoid a compiler crash.
              ExternalUses[It->second].User = nullptr;
            }
          }
          ExtraCost = ScalarCost;
          if (!IsPhiInLoop(EU))
            ExtractsCount[Entry].insert(Inst);
          if (CanBeUsedAsScalarCast) {
            ScalarOpsFromCasts.insert(Inst->getOperand(0));
            // Update the users of the cast's operand as well.
            if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
              for (Value *V : IOp->operands()) {
                auto It = ValueToExtUses->find(V);
                if (It != ValueToExtUses->end()) {
                  // Replace all uses to avoid a compiler crash.
                  ExternalUses[It->second].User = nullptr;
                }
              }
            }
          }
        }
      }
    }

    ExtractCost += ExtraCost;
  }
  // Insert externally used values into the list of external uses, replacing
  // them by the original scalar values extracted from casts.
  for (Value *V : ScalarOpsFromCasts) {
    ExternalUsesAsOriginalScalar.insert(V);
    if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
      ExternalUses.emplace_back(V, nullptr, *TEs.front(),
                                TEs.front()->findLaneForValue(V));
    }
  }
  // Add the reduced value cost, if resized.
  if (!VectorizedVals.empty()) {
    const TreeEntry &Root = *VectorizableTree.front();
    auto BWIt = MinBWs.find(&Root);
    if (BWIt != MinBWs.end()) {
      Type *DstTy = Root.Scalars.front()->getType();
      unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
      unsigned SrcSz =
          ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
      if (OriginalSz != SrcSz) {
        unsigned Opcode = Instruction::Trunc;
        if (OriginalSz > SrcSz)
          Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
        Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
        Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
                                      TTI::CastContextHint::None,
                                      TTI::TCK_RecipThroughput);
      }
    }
  }

  Cost += ExtractCost;
  auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
                                    bool ForSingleMask) {
    InstructionCost C = 0;
    unsigned VF = Mask.size();
    unsigned VecVF = TE->getVectorFactor();
    bool HasLargeIndex =
        any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
    if ((VF != VecVF && HasLargeIndex) ||
        !ShuffleVectorInst::isIdentityMask(Mask, VF)) {
      if (HasLargeIndex) {
        SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
        std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
                  OrigMask.begin());
        C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                             getWidenedType(TE->getMainOp()->getType(), VecVF),
                             OrigMask);
        LLVM_DEBUG(
            dbgs() << "SLP: Adding cost " << C
                   << " for final shuffle of insertelement external users.\n";
            TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
        Cost += C;
        return std::make_pair(TE, true);
      }
      if (!ForSingleMask) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        for (unsigned I = 0; I < VF; ++I) {
          if (Mask[I] != PoisonMaskElem)
            ResizeMask[Mask[I]] = Mask[I];
        }
        if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF))
          C = ::getShuffleCost(
              *TTI, TTI::SK_PermuteSingleSrc,
              getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask);
        LLVM_DEBUG(
            dbgs() << "SLP: Adding cost " << C
                   << " for final shuffle of insertelement external users.\n";
            TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
        Cost += C;
      }
    }
    return std::make_pair(TE, false);
  };
  // Calculate the cost of the reshuffled vectors, if any.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    unsigned VF = 0;
    auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
                                    ArrayRef<const TreeEntry *> TEs) {
      assert((TEs.size() == 1 || TEs.size() == 2) &&
             "Expected exactly 1 or 2 tree entries.");
      if (TEs.size() == 1) {
        if (VF == 0)
          VF = TEs.front()->getVectorFactor();
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
            !all_of(enumerate(Mask), [=](const auto &Data) {
              return Data.value() == PoisonMaskElem ||
                     (Data.index() < VF &&
                      static_cast<int>(Data.index()) == Data.value());
            })) {
          InstructionCost C =
              ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
          LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                            << " for final shuffle of insertelement "
                               "external users.\n";
                     TEs.front()->dump();
                     dbgs() << "SLP: Current total cost = " << Cost << "\n");
          Cost += C;
        }
      } else {
        if (VF == 0) {
          if (TEs.front() &&
              TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
            VF = TEs.front()->getVectorFactor();
          else
            VF = Mask.size();
        }
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        InstructionCost C =
            ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                          << " for final shuffle of vector node and external "
                             "insertelement users.\n";
                   if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
        Cost += C;
      }
      VF = Mask.size();
      return TEs.back();
    };
    (void)performExtractsShuffleAction<const TreeEntry>(
        MutableArrayRef(Vector.data(), Vector.size()), Base,
        [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
        EstimateShufflesCost);
    InstructionCost InsertCost = TTI->getScalarizationOverhead(
        cast<FixedVectorType>(
            ShuffledInserts[I].InsertElements.front()->getType()),
        DemandedElts[I], /*Insert=*/true, /*Extract=*/false, CostKind);
    Cost -= InsertCost;
  }
  // Add the cost for the final resize of the reduced value, if required.
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize) {
        bool IsArithmeticExtendedReduction =
            all_of(*UserIgnoreList, [](Value *V) {
              auto *I = cast<Instruction>(V);
              return is_contained({Instruction::Add, Instruction::FAdd,
                                   Instruction::Mul, Instruction::FMul,
                                   Instruction::And, Instruction::Or,
                                   Instruction::Xor},
                                  I->getOpcode());
            });
        if (IsArithmeticExtendedReduction)
          // Handled by the extended-reduction cost; model as a noop bitcast.
          Opcode = Instruction::BitCast;
        else
          Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      }
      if (Opcode != Instruction::BitCast) {
        auto *SrcVecTy =
            getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
        auto *DstVecTy =
            getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
        TTI::CastContextHint CCH = TTI::CastContextHint::None;
        switch (E.getOpcode()) {
        case Instruction::SExt:
        case Instruction::ZExt:
        case Instruction::Trunc: {
          const TreeEntry *OpTE = getOperandEntry(&E, 0);
          CCH = getCastContextHint(*OpTE);
          break;
        }
        default:
          break;
        }
        InstructionCost CastCost =
            TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
                                  TTI::TCK_RecipThroughput);
        Cost += CastCost;
        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
                          << " for final resize for reduction from "
                          << SrcVecTy << " to " << DstVecTy << "\n";
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
      }
    }
  }
  std::optional<InstructionCost> SpillCost;
  if (Cost < -SLPCostThreshold) {
    SpillCost = getSpillCost();
    Cost += *SpillCost;
  }

#ifndef NDEBUG
  SmallString<256> Str;
  {
    raw_svector_ostream OS(Str);
    OS << "SLP: Spill Cost = ";
    if (SpillCost)
      OS << *SpillCost;
    else
      OS << "<skipped>";
    OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
       << "SLP: Total Cost = " << Cost << ".\n";
  }
  LLVM_DEBUG(dbgs() << Str);
  if (ViewSLPTree)
    ViewGraph(this, "SLP" + F->getName(), false, Str);
#endif

  return Cost;
}
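// Gathers built from extractelements: when all gathered scalars originate
// from one or two source vectors, the whole gather can be lowered to a
// shuffle of those vectors instead of a chain of scalar inserts.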
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
    MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
  // Scan the list of gathered scalars for extractelements that can be
  // represented as shuffles.
  MapVector<Value *, SmallVector<int>> VectorOpToIdx;
  SmallVector<int> UndefVectorExtracts;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI) {
      if (isa<UndefValue>(VL[I]))
        UndefVectorExtracts.push_back(I);
      continue;
    }
    auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
    if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
      continue;
    std::optional<unsigned> Idx = getExtractIndex(EI);
    // Undefined index.
    if (!Idx) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    if (*Idx >= VecTy->getNumElements()) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    SmallBitVector ExtractMask(VecTy->getNumElements(), true);
    ExtractMask.reset(*Idx);
    if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  }
  // Sort the vector operands by the maximum number of uses in
  // extractelements.
  SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
      VectorOpToIdx.takeVector();
  stable_sort(Vectors, [](const auto &P1, const auto &P2) {
    return P1.second.size() > P2.second.size();
  });
  // Find the best pair of vectors or a single vector.
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  unsigned PairMax = 0;
  if (!Vectors.empty()) {
    SingleMax = Vectors.front().second.size() + UndefSz;
    if (Vectors.size() > 1) {
      auto *ItNext = std::next(Vectors.begin());
      PairMax = SingleMax + ItNext->second.size();
    }
  }
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  // Check if it is better to perform a shuffle of 2 vectors or just of a
  // single vector.
  SmallVector<Value *> SavedVL(VL.begin(), VL.end());
  SmallVector<Value *> GatheredExtracts(
      VL.size(), PoisonValue::get(VL.front()->getType()));
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : Vectors.front().second)
      std::swap(GatheredExtracts[Idx], VL[Idx]);
  } else if (!Vectors.empty()) {
    for (unsigned Idx : {0, 1})
      for (int Idx : Vectors[Idx].second)
        std::swap(GatheredExtracts[Idx], VL[Idx]);
  }
  // Add extracts from undefs too.
  for (int Idx : UndefVectorExtracts)
    std::swap(GatheredExtracts[Idx], VL[Idx]);
  // Check that the gather of extractelements can be represented as a shuffle
  // of the single/two vectors the scalars are extracted from.
  std::optional<TTI::ShuffleKind> Res =
      isFixedVectorShuffle(GatheredExtracts, Mask, AC);
  if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
    // TODO: try to check other subsets if possible.
    // Restore the original VL if the attempt was not successful.
    copy(SavedVL, VL.begin());
    return std::nullopt;
  }
  // Restore unused scalars from the mask, if some of the extractelements were
  // not selected for shuffle.
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
        isa<UndefValue>(GatheredExtracts[I]))
      std::swap(VL[I], GatheredExtracts[I]);
  }
  return Res;
}
SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                                    SmallVectorImpl<int> &Mask,
                                    unsigned NumParts) const {
  assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
  SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
  Mask.assign(VL.size(), PoisonMaskElem);
  unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
  for (unsigned Part : seq<unsigned>(NumParts)) {
    // Scan the list of gathered scalars for extractelements that can be
    // represented as shuffles, one register at a time.
    MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
        Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
    SmallVector<int> SubMask;
    std::optional<TTI::ShuffleKind> Res =
        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
    ShufflesRes[Part] = Res;
    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
  }
  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
        return Res.has_value();
      }))
    ShufflesRes.clear();
  return ShufflesRes;
}
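// Single-register variant of the gather-shuffle analysis: for one
// register-wide slice of VL, find up to two existing tree entries whose
// vectorized values can be shuffled into the slice without violating def-use
// or scheduling order.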
std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
    bool ForOrder) {
  Entries.clear();
  // Find the user of this gather entry, skipping combined entries.
  auto GetUserEntry = [&](const TreeEntry *TE) {
    while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
      TE = TE->UserTreeIndex.UserTE;
    if (TE == VectorizableTree.front().get())
      return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
    return TE->UserTreeIndex;
  };
  auto HasGatherUser = [&](const TreeEntry *TE) {
    while (TE->Idx != 0 && TE->UserTreeIndex) {
      if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
        return true;
      TE = TE->UserTreeIndex.UserTE;
    }
    return false;
  };
  const EdgeInfo TEUseEI = GetUserEntry(TE);
  if (!TEUseEI)
    return std::nullopt;
  const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
  const BasicBlock *TEInsertBlock = nullptr;
  // The main node of PHI entries keeps the correct order of operands/incoming
  // blocks.
  if (auto *PHI = dyn_cast_or_null<PHINode>(
          TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
      PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
    TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
    TEInsertPt = TEInsertBlock->getTerminator();
  } else {
    TEInsertBlock = TEInsertPt->getParent();
  }
  if (!DT->isReachableFromEntry(TEInsertBlock))
    return std::nullopt;
  auto *NodeUI = DT->getNode(TEInsertBlock);
  assert(NodeUI && "Should only process reachable instructions");
  SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
  auto CheckOrdering = [&](const Instruction *InsertPt) {
    // InsertPt is where vector code for some other tree entry (one sharing
    // scalars with TE) will be generated. Returns true if TE's insertion
    // point dominates it; otherwise the dependency runs the other way.
    const BasicBlock *InsertBlock = InsertPt->getParent();
    auto *NodeEUI = DT->getNode(InsertBlock);
    if (!NodeEUI)
      return false;
    assert((NodeUI == NodeEUI) ==
               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    // Check the order of the gather nodes' users.
    if (TEInsertPt->getParent() != InsertBlock &&
        (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
      return false;
    if (TEInsertPt->getParent() == InsertBlock &&
        TEInsertPt->comesBefore(InsertPt))
      return false;
    return true;
  };
  SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
  SmallDenseMap<Value *, int> UsedValuesEntry;
  SmallPtrSet<const Value *, 16> VisitedValue;
  auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
    // Reuse the whole node, if it matches the gathered sequence exactly.
    if ((TEPtr->getVectorFactor() != VL.size() &&
         TEPtr->Scalars.size() != VL.size()) ||
        (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
      return false;
    Entries.push_back(TEPtr);
    for (Value *V : VL) {
      if (isConstant(V))
        continue;
      UsedValuesEntry.try_emplace(V, 0);
    }
    return true;
  };
  auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
                              unsigned EdgeIdx) {
    const TreeEntry *Ptr1 = User1;
    const TreeEntry *Ptr2 = User2;
    SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
    while (Ptr2) {
      PtrToIdx.try_emplace(Ptr2, EdgeIdx);
      EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
      Ptr2 = Ptr2->UserTreeIndex.UserTE;
    }
    while (Ptr1) {
      unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
      Ptr1 = Ptr1->UserTreeIndex.UserTE;
      if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
        return Idx < It->second;
    }
    return false;
  };
  for (Value *V : VL) {
    if (isConstant(V) || !VisitedValue.insert(V).second)
      continue;
    // Build a list of tree entries where V is used.
    SmallPtrSet<const TreeEntry *, 4> VToTEs;
    for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
      if (TEPtr == TE || TEPtr->Idx == 0)
        continue;
      assert(any_of(TEPtr->Scalars,
                    [&](Value *V) { return GatheredScalars.contains(V); }) &&
             "Must contain at least single gathered value.");
      assert(TEPtr->UserTreeIndex &&
             "Expected only single user of a gather node.");
      const EdgeInfo &UseEI = TEPtr->UserTreeIndex;

      PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
                          UseEI.UserTE->hasState())
                             ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
                             : nullptr;
      const Instruction *InsertPt =
          UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
                  : &getLastInstructionInBundle(UseEI.UserTE);
      if (TEInsertPt == InsertPt) {
        // The insertion point is shared: no known ordering at this stage,
        // available only after full vectorization.
        if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
            (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
             TEUseEI.UserTE->isAltShuffle()) &&
            (UseEI.UserTE->State != TreeEntry::Vectorize ||
             (UseEI.UserTE->hasState() &&
              UseEI.UserTE->getOpcode() == Instruction::PHI &&
              !UseEI.UserTE->isAltShuffle())) &&
            (TEUseEI.UserTE != UseEI.UserTE ||
             TEUseEI.EdgeIdx < UseEI.EdgeIdx))
          continue;
        // If the users are PHI nodes with the same incoming blocks - skip.
        if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
            TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
            UseEI.UserTE->State == TreeEntry::Vectorize &&
            UseEI.UserTE->getOpcode() == Instruction::PHI &&
            TEUseEI.UserTE != UseEI.UserTE)
          continue;
        // Check if the user node of TE comes after the user node of TEPtr;
        // otherwise TEPtr depends on TE.
        if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
          continue;
        if (TEUseEI.UserTE != UseEI.UserTE &&
            (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
             HasGatherUser(TEUseEI.UserTE)))
          continue;
        if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
          continue;
      }
      if (!TEUseEI.UserTE->isGather() && !UserPHI &&
          TEUseEI.UserTE->doesNotNeedToSchedule() !=
              UseEI.UserTE->doesNotNeedToSchedule() &&
          (TEInsertPt == InsertPt || !CheckOrdering(InsertPt)))
        continue;
      if ((TEInsertBlock != InsertPt->getParent() ||
           TEUseEI.EdgeIdx < UseEI.EdgeIdx ||
           TEUseEI.UserTE != UseEI.UserTE) &&
          !CheckOrdering(InsertPt))
        continue;
      // The node is reused - exit.
      if (CheckAndUseSameNode(TEPtr))
        return TTI::SK_PermuteSingleSrc;
      VToTEs.insert(TEPtr);
    }
    if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
      const auto *It = find_if(
          VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
      if (It != VTEs.end()) {
        const TreeEntry *VTE = *It;
        if (none_of(TE->CombinedEntriesWithIndices,
                    [&](const auto &P) { return P.first == VTE->Idx; })) {
          Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
          if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
            continue;
        }
        // The node is reused - exit.
        if (CheckAndUseSameNode(VTE))
          return TTI::SK_PermuteSingleSrc;
        VToTEs.insert(VTE);
      }
    }
    if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
      const TreeEntry *VTE = VTEs.front();
      if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
          VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
        VTEs = VTEs.drop_front();
        // Iterate through all vectorized nodes.
        const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
          return MTE->State == TreeEntry::Vectorize;
        });
        if (MIt == VTEs.end())
          continue;
        VTE = *MIt;
      }
      if (none_of(TE->CombinedEntriesWithIndices,
                  [&](const auto &P) { return P.first == VTE->Idx; })) {
        Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
        if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
          continue;
      }
      // The node is reused - exit.
      if (CheckAndUseSameNode(VTE))
        return TTI::SK_PermuteSingleSrc;
      VToTEs.insert(VTE);
    }
    if (VToTEs.empty())
      continue;
    if (UsedTEs.empty()) {
      // First iteration: just insert the list of nodes.
      UsedTEs.push_back(VToTEs);
      UsedValuesEntry.try_emplace(V, 0);
    } else {
      // Check if any previously used tree node also uses V. If not, we have
      // another input vector.
      SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
      unsigned Idx = 0;
      for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
        // Non-empty intersection of the previously listed tree entries and
        // the entries using the current V?
        set_intersect(VToTEs, Set);
        if (!VToTEs.empty()) {
          // Yes: narrow the set and continue with the next scalar.
          Set.swap(VToTEs);
          break;
        }
        VToTEs = SavedVToTEs;
        ++Idx;
      }
      // No non-empty intersection found: add a second set of possible source
      // vectors.
      if (Idx == UsedTEs.size()) {
        // More than 2 input vectors is not a permutation; fall back to a
        // regular gather.
        if (UsedTEs.size() == 2)
          continue;
        UsedTEs.push_back(SavedVToTEs);
        Idx = UsedTEs.size() - 1;
      }
      UsedValuesEntry.try_emplace(V, Idx);
    }
  }
  if (UsedTEs.empty()) {
    Entries.clear();
    return std::nullopt;
  }

  unsigned VF = 0;
  if (UsedTEs.size() == 1) {
    // Keep the order to avoid non-determinism.
    SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
                                                UsedTEs.front().end());
    sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    // Try to find a perfect match in another gather node first.
    auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
      return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
    });
    if (It != FirstEntries.end() &&
        ((*It)->getVectorFactor() == VL.size() ||
         ((*It)->getVectorFactor() == TE->Scalars.size() &&
          TE->ReuseShuffleIndices.size() == VL.size() &&
          (*It)->isSame(TE->Scalars)))) {
      Entries.push_back(*It);
      if ((*It)->getVectorFactor() == VL.size()) {
        std::iota(std::next(Mask.begin(), Part * VL.size()),
                  std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
      } else {
        SmallVector<int> CommonMask = TE->getCommonMask();
        copy(CommonMask, Mask.begin());
      }
      // Clear undef scalars.
      for (unsigned I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I]))
          Mask[Part * VL.size() + I] = PoisonMaskElem;
      return TTI::SK_PermuteSingleSrc;
    }
    // No perfect match: just shuffle, choosing the first tree node.
    Entries.push_back(FirstEntries.front());
    // Update the mapping between values and the corresponding tree entries.
    for (auto &P : UsedValuesEntry)
      P.second = 0;
    VF = FirstEntries.front()->getVectorFactor();
  } else {
    // Try to find nodes with the same vector factor.
    assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
    // Keep the order of tree nodes to avoid non-determinism.
    DenseMap<int, const TreeEntry *> VFToTE;
    for (const TreeEntry *TE : UsedTEs.front()) {
      unsigned VF = TE->getVectorFactor();
      auto It = VFToTE.find(VF);
      if (It != VFToTE.end()) {
        if (It->second->Idx > TE->Idx)
          It->getSecond() = TE;
        continue;
      }
      VFToTE.try_emplace(VF, TE);
    }
    // Same: keep the order to avoid non-determinism.
    SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
                                                 UsedTEs.back().end());
    sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    for (const TreeEntry *TE : SecondEntries) {
      auto It = VFToTE.find(TE->getVectorFactor());
      if (It != VFToTE.end()) {
        VF = It->first;
        Entries.push_back(It->second);
        Entries.push_back(TE);
        break;
      }
    }
    // No 2 source vectors with the same vector factor: just choose 2 with the
    // max index.
    if (Entries.empty()) {
      Entries.push_back(*llvm::max_element(
          UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
            return TE1->Idx < TE2->Idx;
          }));
      Entries.push_back(SecondEntries.front());
      VF = std::max(Entries.front()->getVectorFactor(),
                    Entries.back()->getVectorFactor());
    } else {
      VF = Entries.front()->getVectorFactor();
    }
  }
  SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries;
  for (const TreeEntry *E : Entries)
    ValuesToEntries.emplace_back().insert(E->Scalars.begin(),
                                          E->Scalars.end());
  // Update the mapping between values and the selected tree entries.
  for (auto &P : UsedValuesEntry) {
    unsigned Idx = 0;
    for (unsigned Sz = ValuesToEntries.size(); Idx < Sz; ++Idx)
      if (ValuesToEntries[Idx].contains(P.first))
        break;
    P.second = Idx;
  }

  bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
  auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
    auto *PHI = cast<PHINode>(V);
    auto *PHI1 = cast<PHINode>(V1);
    // The incoming values are compatible if they all are constants, or are
    // instructions with the same/alternate opcodes from the same basic block.
    for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
      Value *In = PHI->getIncomingValue(I);
      Value *In1 = PHI1->getIncomingValue(I);
      if (isConstant(In) && isConstant(In1))
        continue;
      if (!getSameOpcode({In, In1}, *TLI))
        return false;
      if (cast<Instruction>(In)->getParent() !=
          cast<Instruction>(In1)->getParent())
        return false;
    }
    return true;
  };
  // Check if a value can be ignored during the gather-shuffle analysis. We
  // prefer to ignore instructions that do not form splats, are not
  // vectorized/not extractelements, or may form a vector node in the future.
  auto MightBeIgnored = [=](Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    return I && !IsSplatOrUndefs && !isVectorized(I) &&
           !isVectorLikeInstWithConstOps(I) &&
           !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
  };
  // Check whether the neighbor instruction may form a full vector node with
  // the current instruction V: same/alternate opcode, same parent basic
  // block, not already used in the same vectorized tree entry.
  auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
    Value *V1 = VL[Idx];
    bool UsedInSameVTE = false;
    auto It = UsedValuesEntry.find(V1);
    if (It != UsedValuesEntry.end())
      UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
    return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
           getSameOpcode({V, V1}, *TLI) &&
           cast<Instruction>(V)->getParent() ==
               cast<Instruction>(V1)->getParent() &&
           (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
  };
  // Build a shuffle mask for better cost estimation and vector emission.
  SmallBitVector UsedIdxs(Entries.size());
  SmallVector<std::pair<unsigned, int>> EntryLanes;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    auto It = UsedValuesEntry.find(V);
    if (It == UsedValuesEntry.end())
      continue;
    // Do not try to shuffle scalars if they are constants, or instructions
    // that can be vectorized as part of a following buildvector.
    if (isConstant(V) || (MightBeIgnored(V) &&
                          ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
                           (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
      continue;
    unsigned Idx = It->second;
    EntryLanes.emplace_back(Idx, I);
    UsedIdxs.set(Idx);
  }
  // Iterate through all shuffled scalars and select entries usable for the
  // final shuffle.
  SmallVector<const TreeEntry *> TempEntries;
  for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
    if (!UsedIdxs.test(I))
      continue;
    // Fix the entry number for the given scalar: 0 for the first entry,
    // otherwise 1 (at most 2 nodes are selected).
    for (std::pair<unsigned, int> &Pair : EntryLanes)
      if (Pair.first == I)
        Pair.first = TempEntries.size();
    TempEntries.push_back(Entries[I]);
  }
  Entries.swap(TempEntries);
  if (EntryLanes.size() == Entries.size() &&
      !VL.equals(ArrayRef(TE->Scalars)
                     .slice(Part * VL.size(),
                            std::min<int>(VL.size(), TE->Scalars.size())))) {
    // One entry per scalar and VL differs from TE->Scalars means shuffles
    // exist already; this case is not profitable, cut it off.
    Entries.clear();
    return std::nullopt;
  }
  // Build the final mask and check for the identity shuffle, if possible.
  bool IsIdentity = Entries.size() == 1;
  // Pair.first is the offset to the vector; Pair.second is the index of the
  // scalar in the list.
  for (const std::pair<unsigned, int> &Pair : EntryLanes) {
    unsigned Idx = Part * VL.size() + Pair.second;
    Mask[Idx] =
        Pair.first * VF +
        (ForOrder ? std::distance(
                        Entries[Pair.first]->Scalars.begin(),
                        find(Entries[Pair.first]->Scalars, VL[Pair.second]))
                  : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
    IsIdentity &= Mask[Idx] == Pair.second;
  }
  if (ForOrder || IsIdentity || Entries.empty()) {
    switch (Entries.size()) {
    case 1:
      if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
        return TTI::SK_PermuteSingleSrc;
      break;
    case 2:
      if (EntryLanes.size() > 2 || VL.size() <= 2)
        return TTI::SK_PermuteTwoSrc;
      break;
    default:
      break;
    }
  } else if (!isa<VectorType>(VL.front()->getType()) &&
             (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
    // Do the cost estimation: is the shuffle beneficial over a buildvector?
    SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
                             std::next(Mask.begin(), (Part + 1) * VL.size()));
    // Find the minimum non-poison and the maximum mask elements.
    int MinElement = SubMask.front(), MaxElement = SubMask.front();
    for (int Idx : SubMask) {
      if (Idx == PoisonMaskElem)
        continue;
      if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
        MinElement = Idx;
      if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
        MaxElement = Idx;
    }
    assert(MaxElement >= 0 && MinElement >= 0 &&
           MaxElement % VF >= MinElement % VF &&
           "Expected at least single element.");
    unsigned NewVF = std::max<unsigned>(
        VL.size(),
        getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
                                      (MaxElement % VF) -
                                          (MinElement % VF) + 1));
    if (NewVF < VF) {
      for (int &Idx : SubMask) {
        if (Idx == PoisonMaskElem)
          continue;
        Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
              (Idx >= static_cast<int>(VF) ? NewVF : 0);
      }
      VF = NewVF;
    }

    constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    auto *VecTy = getWidenedType(VL.front()->getType(), VF);
    auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
    auto GetShuffleCost = [&,
                           &TTI = *TTI](ArrayRef<int> Mask,
                                        ArrayRef<const TreeEntry *> Entries,
                                        VectorType *VecTy) -> InstructionCost {
      if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
          ShuffleVectorInst::isDeInterleaveMaskOfFactor(
              Mask, Entries.front()->getInterleaveFactor()))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI,
                              Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
                                                 : TTI::SK_PermuteSingleSrc,
                              VecTy, Mask, CostKind);
    };
    InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
    InstructionCost FirstShuffleCost = 0;
    SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
    if (Entries.size() == 1 || !Entries[0]->isGather()) {
      FirstShuffleCost = ShuffleCost;
    } else {
      // Transform the mask to include only the first entry.
      APInt DemandedElts = APInt::getAllOnes(SubMask.size());
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(FirstMask)) {
        if (Idx >= static_cast<int>(NewVF)) {
          Idx = PoisonMaskElem;
        } else {
          DemandedElts.clearBit(I);
          if (Idx != PoisonMaskElem)
            IsIdentity &= static_cast<int>(I) == Idx;
        }
      }
      if (!IsIdentity)
        FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
      FirstShuffleCost += getScalarizationOverhead(
          *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
          /*Insert=*/true, /*Extract=*/false, CostKind);
    }
    InstructionCost SecondShuffleCost = 0;
    SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
    if (Entries.size() == 1 || !Entries[1]->isGather()) {
      SecondShuffleCost = ShuffleCost;
    } else {
      // Transform the mask to include only the second entry.
      APInt DemandedElts = APInt::getAllOnes(SubMask.size());
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(SecondMask)) {
        if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
          Idx = PoisonMaskElem;
        } else {
          DemandedElts.clearBit(I);
          if (Idx != PoisonMaskElem) {
            Idx -= NewVF;
            IsIdentity &= static_cast<int>(I) == Idx;
          }
        }
      }
      if (!IsIdentity)
        SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
      SecondShuffleCost += getScalarizationOverhead(
          *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
          /*Insert=*/true, /*Extract=*/false, CostKind);
    }
    APInt DemandedElts = APInt::getAllOnes(SubMask.size());
    for (auto [I, Idx] : enumerate(SubMask))
      if (Idx == PoisonMaskElem)
        DemandedElts.clearBit(I);
    InstructionCost BuildVectorCost = getScalarizationOverhead(
        *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
        /*Insert=*/true, /*Extract=*/false, CostKind);
    const TreeEntry *BestEntry = nullptr;
    if (FirstShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx >= static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                    });
      BestEntry = Entries.front();
      ShuffleCost = FirstShuffleCost;
    }
    if (SecondShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx < static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                      else
                        Idx -= VF;
                    });
      BestEntry = Entries[1];
      ShuffleCost = SecondShuffleCost;
    }
    if (BuildVectorCost >= ShuffleCost) {
      if (BestEntry) {
        Entries.clear();
        Entries.push_back(BestEntry);
      }
      return Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
                                : TTI::SK_PermuteSingleSrc;
    }
  }
  Entries.clear();
  // Clear the corresponding mask elements.
  std::fill(std::next(Mask.begin(), Part * VL.size()),
            std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
  return std::nullopt;
}
SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::isGatherShuffledEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
    unsigned NumParts, bool ForOrder) {
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  Entries.clear();
  // No need to check for the topmost gather node.
  if (TE == VectorizableTree.front().get() &&
      (!GatheredLoadsEntriesFirst.has_value() ||
       none_of(ArrayRef(VectorizableTree).drop_front(),
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather();
               })))
    return {};
  // FIXME: gathering for non-power-of-2 (non-whole-register) nodes is not
  // implemented yet.
  if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
    return {};
  Mask.assign(VL.size(), PoisonMaskElem);
  assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
         "Expected only single user of the gather node.");
  assert(VL.size() % NumParts == 0 &&
         "Number of scalars must be divisible by NumParts.");
  if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
      TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
      (TE->Idx == 0 ||
       (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
       isSplat(TE->Scalars) ||
       getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars)))
    return {};
  unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
  SmallVector<std::optional<TTI::ShuffleKind>> Res;
  for (unsigned Part : seq<unsigned>(NumParts)) {
    ArrayRef<Value *> SubVL =
        VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
    SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
                                            ForOrder);
    if (!SubRes)
      SubEntries.clear();
    Res.push_back(SubRes);
    if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
        SubEntries.front()->getVectorFactor() == VL.size() &&
        (SubEntries.front()->isSame(TE->Scalars) ||
         SubEntries.front()->isSame(VL))) {
      SmallVector<const TreeEntry *> LocalSubEntries;
      LocalSubEntries.swap(SubEntries);
      Entries.clear();
      Res.clear();
      std::iota(Mask.begin(), Mask.end(), 0);
      // Clear undef scalars.
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I]))
          Mask[I] = PoisonMaskElem;
      Entries.emplace_back(1, LocalSubEntries.front());
      Res.push_back(TTI::SK_PermuteSingleSrc);
      return Res;
    }
  }
  if (all_of(Res,
             [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
    Entries.clear();
    return {};
  }
  return Res;
}
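// Buildvector cost: inserts for the unique non-constant scalars, plus
// shuffles to blend the constant part and to deduplicate repeated values.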
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                       Type *ScalarTy) const {
  const unsigned VF = VL.size();
  auto *VecTy = getWidenedType(ScalarTy, VF);
  bool DuplicateNonConst = false;
  // Find the cost of inserting/extracting values from the vector. Check if
  // the same elements are inserted several times and count them as shuffle
  // candidates.
  APInt ShuffledElements = APInt::getZero(VF);
  APInt DemandedElements = APInt::getZero(VF);
  DenseMap<Value *, unsigned> UniqueElements;
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost;
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    if (V->getType() != ScalarTy)
      Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
                                    TTI::CastContextHint::None, CostKind);
    if (!ForPoisonSrc)
      DemandedElements.setBit(I);
  };
  SmallVector<int> ShuffleMask(VF, PoisonMaskElem);
  SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
  std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
  for (auto [I, V] : enumerate(VL)) {
    // No need to shuffle duplicates for constants.
    if (isConstant(V)) {
      ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
      ConstantShuffleMask[I] = I + VF;
      continue;
    }
    auto Res = UniqueElements.try_emplace(V, I);
    if (Res.second) {
      EstimateInsertCost(I, V);
      ShuffleMask[I] = I;
      continue;
    }
    DuplicateNonConst = true;
    ShuffledElements.setBit(I);
    ShuffleMask[I] = Res.first->second;
  }
  bool IsAnyNonUndefConst = any_of(
      VL, [](Value *V) { return isConstant(V) && !isa<UndefValue>(V); });
  if (!ForPoisonSrc && IsAnyNonUndefConst)
    Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
                             ConstantShuffleMask);
  if (!DemandedElements.isZero())
    Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
                                     /*Insert=*/true, /*Extract=*/false,
                                     CostKind,
                                     ForPoisonSrc && !IsAnyNonUndefConst, VL);
  if (DuplicateNonConst)
    Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, VecTy,
                             ShuffleMask);
  return Cost;
}
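// The "last instruction in bundle" is the point where vector code for a tree
// entry is emitted; results are memoized in EntryToLastInstruction.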
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto It = EntryToLastInstruction.find(E);
  if (It != EntryToLastInstruction.end())
    return *It->second;

  // Get the basic block this bundle is in. All instructions in the bundle
  // should be in this block (except for extractelement-like instructions with
  // constant indices, gathered loads, and copyable elements).
  Instruction *Front = nullptr;
  unsigned Opcode = 0;
  if (E->hasState()) {
    Front = E->getMainOp();
    Opcode = E->getOpcode();
  } else {
    Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>));
  }
  auto *BB = Front->getParent();
  assert(((GatheredLoadsEntriesFirst.has_value() &&
           Opcode == Instruction::Load && E->isGather() &&
           E->Idx < *GatheredLoadsEntriesFirst) ||
          E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
          all_of(E->Scalars,
                 [=](Value *V) -> bool {
                   if (Opcode == Instruction::GetElementPtr &&
                       !isa<GetElementPtrInst>(V))
                     return true;
                   auto *I = dyn_cast<Instruction>(V);
                   return !I || !E->getMatchingMainOpOrAltOp(I) ||
                          I->getParent() == BB ||
                          isVectorLikeInstWithConstOps(I);
                 })) &&
         "Expected gathered loads or GEPs or instructions from same basic "
         "block.");

  auto FindLastInst = [&]() {
    Instruction *LastInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (E->isCopyableElement(I))
        continue;
      if (LastInst->getParent() == I->getParent()) {
        if (LastInst->comesBefore(I))
          LastInst = I;
        continue;
      }
      assert(((Opcode == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              E->State == TreeEntry::SplitVectorize ||
              (isVectorLikeInstWithConstOps(LastInst) &&
               isVectorLikeInstWithConstOps(I)) ||
              (GatheredLoadsEntriesFirst.has_value() &&
               Opcode == Instruction::Load && E->isGather() &&
               E->Idx < *GatheredLoadsEntriesFirst)) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(LastInst->getParent())) {
        LastInst = I;
        continue;
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        continue;
      auto *NodeA = DT->getNode(LastInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
        LastInst = I;
    }
    return LastInst;
  };
  auto FindFirstInst = [&]() {
    Instruction *FirstInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (E->isCopyableElement(I))
        continue;
      if (FirstInst->getParent() == I->getParent()) {
        if (I->comesBefore(FirstInst))
          FirstInst = I;
        continue;
      }
      assert(((Opcode == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              (isVectorLikeInstWithConstOps(FirstInst) &&
               isVectorLikeInstWithConstOps(I))) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(FirstInst->getParent())) {
        FirstInst = I;
        continue;
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        continue;
      auto *NodeA = DT->getNode(FirstInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
        FirstInst = I;
    }
    return FirstInst;
  };

  Instruction *Res = nullptr;
  // A split node is emitted after both of the nodes it combines.
  if (E->State == TreeEntry::SplitVectorize) {
    Res = FindLastInst();
    SmallVector<const TreeEntry *> Entries;
    for (const auto &P : E->CombinedEntriesWithIndices)
      Entries.push_back(VectorizableTree[P.first].get());
    for (auto *E : Entries) {
      Instruction *I = &getLastInstructionInBundle(E);
      if (Res->getParent() == I->getParent() && Res->comesBefore(I))
        Res = I;
    }
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }

  // Set the insert point for gathered loads to the very first load.
  if (GatheredLoadsEntriesFirst.has_value() &&
      E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
      Opcode == Instruction::Load) {
    Res = FindFirstInst();
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }

  // Find the schedule bundle for this entry, if it was scheduled.
  auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
    if (E->isGather())
      return nullptr;
    const auto *It = BlocksSchedules.find(BB);
    if (It == BlocksSchedules.end())
      return nullptr;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I || isa<PHINode>(I) || doesNotNeedToBeScheduled(I))
        continue;
      ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
      if (Bundles.empty())
        continue;
      const auto *It = find_if(
          Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
      if (It != Bundles.end())
        return *It;
    }
    return nullptr;
  };
  const ScheduleBundle *Bundle = FindScheduleBundle(E);
  if (!E->isGather() && !Bundle) {
    if ((Opcode == Instruction::GetElementPtr &&
         any_of(E->Scalars,
                [](Value *V) {
                  return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
                })) ||
        all_of(E->Scalars, [&](Value *V) {
          return isa<PoisonValue>(V) || E->isCopyableElement(V) ||
                 (!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V));
        }))
      Res = FindLastInst();
    else
      Res = FindFirstInst();
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }

  if (Bundle) {
    assert(!E->isGather() && "Gathered instructions should not be scheduled");
    Res = Bundle->getBundle().back()->getInst();
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }

  // All other cases: use the last instruction among the scalars.
  Res = FindLastInst();
  assert(Res && "Failed to find last instruction in bundle");
  EntryToLastInstruction.try_emplace(E, Res);
  return *Res;
}
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  BasicBlock::iterator LastInstIt = LastInst->getIterator();
  // If the instruction is a PHI, set the insert point after all the PHIs.
  bool IsPHI = isa<PHINode>(LastInst);
  if (IsPHI) {
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
    if (LastInstIt != LastInst->getParent()->end() &&
        LastInstIt->getParent()->isLandingPad())
      LastInstIt = std::next(LastInstIt);
  }
  if (IsPHI ||
      (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
       E->doesNotNeedToSchedule()) ||
      (GatheredLoadsEntriesFirst.has_value() &&
       E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
       E->getOpcode() == Instruction::Load)) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  } else {
    // Set the insertion point after the last instruction in the bundle. Set
    // the debug location to Front.
    Builder.SetInsertPoint(
        LastInst->getParent(),
        LastInst->getNextNonDebugInstruction()->getIterator());
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
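// gather() materializes a buildvector; inserts of loop-resident scalars are
// postponed to the end so the loop-invariant prefix can be hoisted.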
Value *BoUpSLP::gather(
    ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  // Instructions/lanes from the current block and/or blocks inside the
  // current loop: these are inserted at the end so loops can still be
  // optimized and invariant instructions hoisted out of the loop body.
  SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           isVectorized(Inst) ||
           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(I).second)
        PostponedInsts.emplace_back(Inst, I);
  }

  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      Type *Ty) {
    Value *Scalar = V;
    if (Scalar->getType() != Ty) {
      assert(Scalar->getType()->isIntOrIntVectorTy() &&
             Ty->isIntOrIntVectorTy() && "Expected integer types only.");
      Scalar = Builder.CreateIntCast(
          Scalar, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
    }

    Instruction *InsElt;
    Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
    InsElt = dyn_cast<InsertElementInst>(Vec);
    if (!InsElt)
      return Vec;
    GatherShuffleExtractSeq.insert(InsElt);
    CSEBlocks.insert(InsElt->getParent());
    // Add to our 'need-to-extract' list.
    if (isa<Instruction>(V)) {
      if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V);
          !Entries.empty()) {
        // Find which lane we need to extract.
        User *UserOp = nullptr;
        if (Scalar != V) {
          if (auto *SI = dyn_cast<Instruction>(Scalar))
            UserOp = SI;
        } else {
          if (V->getType()->isVectorTy()) {
            if (auto *SV = dyn_cast<ShuffleVectorInst>(InsElt);
                SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
              // Find the shufflevector caused by a resize.
              auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
                if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
                  if (SV->getOperand(0) == V)
                    return SV;
                  if (SV->getOperand(1) == V)
                    return SV;
                }
                return nullptr;
              };
              if (Instruction *User = FindOperand(SV->getOperand(0), V))
                UserOp = User;
              else if (Instruction *User = FindOperand(SV->getOperand(1), V))
                UserOp = User;
              assert(UserOp &&
                     "Failed to find shufflevector, caused by resize.");
            }
          }
          if (!UserOp)
            UserOp = InsElt;
        }
        unsigned FoundLane = Entries.front()->findLaneForValue(V);
        ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
      }
    }
    return Vec;
  };
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  Value *Vec = PoisonValue::get(VecTy);
  SmallVector<int> NonConsts;
  SmallVector<int> Mask(VL.size());
  std::iota(Mask.begin(), Mask.end(), 0);
  Value *OriginalRoot = Root;
  if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
      SV && isa<PoisonValue>(SV->getOperand(1)) &&
      SV->getOperand(0)->getType() == VecTy) {
    Root = SV->getOperand(0);
    Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
  }
  // Insert constant values first.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (PostponedIndices.contains(I))
      continue;
    if (!isConstant(VL[I])) {
      NonConsts.push_back(I);
      continue;
    }
    if (isa<PoisonValue>(VL[I]))
      continue;
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
    Mask[I] = I + E;
  }
  if (Root) {
    if (isa<PoisonValue>(Vec)) {
      Vec = OriginalRoot;
    } else {
      Vec = CreateShuffle(Root, Vec, Mask);
      if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
          OI && OI->use_empty() &&
          none_of(VectorizableTree,
                  [&](const std::unique_ptr<TreeEntry> &TE) {
                    return TE->VectorizedValue == OI;
                  }))
        eraseInstruction(OI);
    }
  }
  // Insert non-constant values.
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append instructions that are (or may be) part of the loop, so that
  // non-loop-based instructions can be hoisted.
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);

  return Vec;
}
/// Merges shuffle masks and emits the final shuffle instruction, if required.
/// Supports shuffling of 2 input vectors. Implements lazy shuffle emission:
/// the actual shuffle instruction is generated only when it is really needed,
/// reducing the number of emitted instructions.
class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  /// Combined mask for all applied operands and masks.
  SmallVector<int> CommonMask;
  /// List of operands for the shuffle.
  SmallVector<Value *> InVectors;
  IRBuilderBase &Builder;
  BoUpSLP &R;

  /// Wraps the IR builder and records the created shuffle instructions for
  /// later CSE and extract handling.
  class ShuffleIRBuilder {
    IRBuilderBase &Builder;
    /// Holds all of the instructions that we gathered.
    SetVector<Instruction *> &GatherShuffleExtractSeq;
    /// A list of blocks that we are going to CSE.
    DenseSet<BasicBlock *> &CSEBlocks;
    /// Data layout.
    const DataLayout &DL;

  public:
    ShuffleIRBuilder(IRBuilderBase &Builder,
                     SetVector<Instruction *> &GatherShuffleExtractSeq,
                     DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
          CSEBlocks(CSEBlocks), DL(DL) {}
    ~ShuffleIRBuilder() = default;
    /// Creates a shufflevector for the 2 operands with the given mask.
    Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
      if (V1->getType() != V2->getType()) {
        assert(V1->getType()->isIntOrIntVectorTy() &&
               V2->getType()->isIntOrIntVectorTy() &&
               "Expected integer vector types only.");
        if (cast<VectorType>(V2->getType())
                ->getElementType()
                ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
                                             ->getElementType()
                                             ->getIntegerBitWidth())
          V2 = Builder.CreateIntCast(
              V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
        else
          V1 = Builder.CreateIntCast(
              V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
      }
      Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    /// Creates a permutation of the single vector operand with the given
    /// mask, if it is not the identity mask.
    Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
      if (Mask.empty())
        return V1;
      unsigned VF = Mask.size();
      unsigned LocalVF =
          cast<FixedVectorType>(V1->getType())->getNumElements();
      if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
        return V1;
      Value *Vec = Builder.CreateShuffleVector(V1, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    Value *createIdentity(Value *V) { return V; }
    Value *createPoison(Type *Ty, unsigned VF) {
      return PoisonValue::get(getWidenedType(Ty, VF));
    }
    /// Resizes the 2 input vectors to match, if they are not equal yet. The
    /// smaller vector is resized to the size of the larger one.
    void resizeToMatch(Value *&V1, Value *&V2) {
      if (V1->getType() == V2->getType())
        return;
      int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
      int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      SmallVector<int> IdentityMask(VF, PoisonMaskElem);
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
                0);
      Value *&Op = MinVF == V1VF ? V1 : V2;
      Op = Builder.CreateShuffleVector(Op, IdentityMask);
      if (auto *I = dyn_cast<Instruction>(Op)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }
  };

  /// Smart shuffle instruction emission: walks through the shuffle trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(
        V1, V2, Mask, ShuffleBuilder, ScalarTy);
  }

  /// Casts value \p V to the vector type with the same number of elements but
  /// with the base type \p ScalarTy.
  Value *castToScalarTyElem(Value *V,
                            std::optional<bool> IsSigned = std::nullopt) {
    auto *VecTy = cast<VectorType>(V->getType());
    if (VecTy->getElementType() == ScalarTy->getScalarType())
      return V;
    return Builder.CreateIntCast(
        V,
        VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
        IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
  }

  /// Returns the vectorized value of \p E, casted to the scalar type of the
  /// builder if required.
  Value *getVectorizedValue(const TreeEntry &E) {
    Value *Vec = E.VectorizedValue;
    if (!Vec->getType()->isIntOrIntVectorTy())
      return Vec;
    return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
                                return !isa<PoisonValue>(V) &&
                                       !isKnownNonNegative(
                                           V, SimplifyQuery(*R.DL));
                              }));
  }

public:
  ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
      : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}

  /// Adjusts extractelements after reusing them.
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    SmallPtrSet<Value *, 4> UniqueBases;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      int Idx = Mask[I];
      if (Idx == PoisonMaskElem)
        continue;
      auto *EI = cast<ExtractElementInst>(VL[I]);
      VecBase = EI->getVectorOperand();
      if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
        VecBase = TEs.front()->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(VecBase);
      // If its only use is vectorized, the extractelement itself can be
      // deleted.
      if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
          (NumParts != 1 && count(VL, EI) > 1) ||
          any_of(EI->users(), [&](User *U) {
            ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
            return UTEs.empty() || UTEs.size() > 1 ||
                   (isa<GetElementPtrInst>(U) &&
                    !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                   count_if(R.VectorizableTree,
                            [&](const std::unique_ptr<TreeEntry> &TE) {
                              return TE->UserTreeIndex.UserTE ==
                                         UTEs.front() &&
                                     is_contained(VL, EI);
                            }) != 1;
          }))
        continue;
      R.eraseInstruction(EI);
    }
    if (NumParts == 1 || UniqueBases.size() == 1) {
      assert(VecBase && "Expected vectorized value.");
      return castToScalarTyElem(VecBase);
    }
    UseVecBaseAsInput = true;
    auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
      for (auto [I, Idx] : enumerate(Mask))
        if (Idx != PoisonMaskElem)
          Idx = I;
    };
    // Perform a multi-register vector shuffle, joining the parts into a
    // single virtual long vector register: shuffle each part independently,
    // then insert all the parts into the long virtual vector.
    Value *Vec = nullptr;
    SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
    unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
    for (unsigned Part : seq<unsigned>(NumParts)) {
      unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
      ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
      MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      constexpr int MaxBases = 2;
      SmallVector<Value *, MaxBases> Bases(MaxBases);
      auto VLMask = zip(SubVL, SubMask);
      const unsigned VF = std::accumulate(
          VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
            if (std::get<1>(D) == PoisonMaskElem)
              return S;
            Value *VecOp =
                cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
            if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
                !TEs.empty())
              VecOp = TEs.front()->VectorizedValue;
            assert(VecOp && "Expected vectorized value.");
            const unsigned Size =
                cast<FixedVectorType>(VecOp->getType())->getNumElements();
            return std::max(S, Size);
          });
      for (const auto [V, I] : VLMask) {
        if (I == PoisonMaskElem)
          continue;
        Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
        if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
          VecOp = TEs.front()->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        VecOp = castToScalarTyElem(VecOp);
        Bases[I / VF] = VecOp;
      }
      if (!Bases.front())
        continue;
      Value *SubVec;
      if (Bases.back()) {
        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
        TransformToIdentity(SubMask);
      } else {
        SubVec = Bases.front();
      }
      if (!Vec) {
        Vec = SubVec;
        assert((Part == 0 ||
                all_of(seq<unsigned>(0, Part),
                       [&](unsigned P) {
                         ArrayRef<int> SubMask = Mask.slice(
                             P * SliceSize,
                             getNumElems(Mask.size(), SliceSize, P));
                         return all_of(SubMask, [](int Idx) {
                           return Idx == PoisonMaskElem;
                         });
                       })) &&
               "Expected first part or all previous parts masked.");
18033 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18038 unsigned SubVecVF =
18040 NewVF = std::max(NewVF, SubVecVF);
18043 for (
int &Idx : SubMask)
18046 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18047 Vec = createShuffle(Vec, SubVec, VecMask);
18048 TransformToIdentity(VecMask);
18056 std::optional<Value *>
18062 TEs, [](
const TreeEntry *TE) {
return TE->VectorizedValue; });
18064 return std::nullopt;
18067 auto *ResVecTy =
getWidenedType(ScalarTy, E->getVectorFactor());
18068 return Builder.CreateAlignedLoad(
18075 IsFinalized =
false;
18076 CommonMask.clear();
18082 Value *V1 = getVectorizedValue(E1);
18083 Value *V2 = getVectorizedValue(E2);
18089 Value *V1 = getVectorizedValue(E1);
18094 assert(V1 && V2 && !Mask.empty() &&
"Expected non-empty input vectors.");
18097 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
18098 V1 = castToScalarTyElem(V1);
18099 V2 = castToScalarTyElem(V2);
18100 if (InVectors.empty()) {
18101 InVectors.push_back(V1);
18102 InVectors.push_back(V2);
18103 CommonMask.assign(Mask.begin(), Mask.end());
18106 Value *Vec = InVectors.front();
18107 if (InVectors.size() == 2) {
18108 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18109 transformMaskAfterShuffle(CommonMask, CommonMask);
18112 Vec = createShuffle(Vec,
nullptr, CommonMask);
18113 transformMaskAfterShuffle(CommonMask, CommonMask);
18115 V1 = createShuffle(V1, V2, Mask);
18116 unsigned VF = std::max(getVF(V1), getVF(Vec));
18117 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18119 CommonMask[Idx] = Idx + VF;
18120 InVectors.front() = Vec;
18121 if (InVectors.size() == 2)
18122 InVectors.back() = V1;
18124 InVectors.push_back(V1);
18129 "castToScalarTyElem expects V1 to be FixedVectorType");
18130 V1 = castToScalarTyElem(V1);
18131 if (InVectors.empty()) {
18132 InVectors.push_back(V1);
18133 CommonMask.assign(Mask.begin(), Mask.end());
18136 const auto *It =
find(InVectors, V1);
18137 if (It == InVectors.end()) {
18138 if (InVectors.size() == 2 ||
18139 InVectors.front()->getType() != V1->
getType()) {
18140 Value *V = InVectors.front();
18141 if (InVectors.size() == 2) {
18142 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
18143 transformMaskAfterShuffle(CommonMask, CommonMask);
18145 CommonMask.size()) {
18146 V = createShuffle(InVectors.front(),
nullptr, CommonMask);
18147 transformMaskAfterShuffle(CommonMask, CommonMask);
18149 unsigned VF = std::max(CommonMask.size(), Mask.size());
18150 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18152 CommonMask[Idx] = V->getType() != V1->
getType()
18154 : Mask[Idx] + getVF(V1);
18155 if (V->getType() != V1->
getType())
18156 V1 = createShuffle(V1,
nullptr, Mask);
18157 InVectors.front() = V;
18158 if (InVectors.size() == 2)
18159 InVectors.back() = V1;
18161 InVectors.push_back(V1);
18166 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18168 InVectors.push_back(V1);
18173 for (
Value *V : InVectors)
18174 VF = std::max(VF, getVF(V));
18175 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18177 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
18186 Value *Root =
nullptr) {
18187 return R.gather(VL, Root, ScalarTy,
18189 return createShuffle(V1, V2, Mask);
18198 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
18203 IsFinalized =
true;
18206 if (InVectors.
size() == 2) {
18207 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
18210 Vec = createShuffle(Vec,
nullptr, CommonMask);
18212 transformMaskAfterShuffle(CommonMask, CommonMask);
18214 "Expected vector length for the final value before action.");
18218 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
18219 Vec = createShuffle(Vec,
nullptr, ResizeMask);
18222 return createShuffle(V1, V2, Mask);
18224 InVectors.
front() = Vec;
18226 if (!SubVectors.empty()) {
18228 if (InVectors.
size() == 2) {
18229 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
18232 Vec = createShuffle(Vec,
nullptr, CommonMask);
18234 transformMaskAfterShuffle(CommonMask, CommonMask);
18235 auto CreateSubVectors = [&](
Value *Vec,
18236 SmallVectorImpl<int> &CommonMask) {
18237 for (
auto [
E, Idx] : SubVectors) {
18238 Value *
V = getVectorizedValue(*
E);
18245 Type *OrigScalarTy = ScalarTy;
18248 Builder, Vec, V, InsertionIndex,
18249 std::bind(&ShuffleInstructionBuilder::createShuffle,
this, _1, _2,
18251 ScalarTy = OrigScalarTy;
18252 if (!CommonMask.
empty()) {
18253 std::iota(std::next(CommonMask.
begin(), Idx),
18254 std::next(CommonMask.
begin(), Idx +
E->getVectorFactor()),
18260 if (SubVectorsMask.
empty()) {
18261 Vec = CreateSubVectors(Vec, CommonMask);
18264 copy(SubVectorsMask, SVMask.begin());
18265 for (
auto [I1, I2] :
zip(SVMask, CommonMask)) {
18268 I1 = I2 + CommonMask.
size();
18273 Vec = createShuffle(InsertVec, Vec, SVMask);
18274 transformMaskAfterShuffle(CommonMask, SVMask);
18276 InVectors.
front() = Vec;
18279 if (!ExtMask.
empty()) {
18280 if (CommonMask.
empty()) {
18284 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
18287 NewMask[
I] = CommonMask[ExtMask[
I]];
18289 CommonMask.
swap(NewMask);
18292 if (CommonMask.
empty()) {
18293 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
18294 return InVectors.
front();
18296 if (InVectors.
size() == 2)
18297 return createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
18298 return createShuffle(InVectors.
front(),
nullptr, CommonMask);
18302 assert((IsFinalized || CommonMask.empty()) &&
18303 "Shuffle construction must be finalized.");
18307Value *BoUpSLP::vectorizeOperand(TreeEntry *E,
unsigned NodeIdx) {
18311template <
typename BVTy,
typename ResTy,
typename... Args>
18312ResTy BoUpSLP::processBuildVector(
const TreeEntry *E,
Type *ScalarTy,
18314 assert(E->isGather() &&
"Expected gather node.");
18315 unsigned VF = E->getVectorFactor();
18317 bool NeedFreeze =
false;
18320 for (
auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
18322 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
18325 E->CombinedEntriesWithIndices.size());
18326 transform(
E->CombinedEntriesWithIndices, SubVectors.begin(),
18327 [&](
const auto &
P) {
18328 return std::make_pair(VectorizableTree[P.first].get(), P.second);
18333 E->ReorderIndices.end());
18334 if (!ReorderMask.empty())
18340 if (!SubVectors.empty() && !SubVectorsMask.
empty()) {
18342 if (
E->Scalars[
I] == GatheredScalars[ReorderMask[
I]])
18345 SubVectorsMask.
clear();
18349 unsigned I,
unsigned SliceSize,
18350 bool IsNotPoisonous) {
18352 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
18355 TreeEntry *UserTE =
E->UserTreeIndex.UserTE;
18356 unsigned EdgeIdx =
E->UserTreeIndex.EdgeIdx;
18357 if (UserTE->getNumOperands() != 2)
18359 if (!IsNotPoisonous) {
18360 auto *It =
find_if(
ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
18361 [=](
const std::unique_ptr<TreeEntry> &TE) {
18362 return TE->UserTreeIndex.UserTE == UserTE &&
18363 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
18365 if (It == VectorizableTree.end())
18368 if (!(*It)->ReorderIndices.empty()) {
18372 if (!
all_of(
zip(GatheredScalars, GS), [&](
const auto &
P) {
18373 Value *V0 = std::get<0>(
P);
18374 Value *V1 = std::get<1>(
P);
18382 if ((
Mask.size() < InputVF &&
18385 (
Mask.size() == InputVF &&
18388 std::next(
Mask.begin(),
I * SliceSize),
18389 std::next(
Mask.begin(),
18396 std::next(
Mask.begin(),
I * SliceSize),
18397 std::next(
Mask.begin(),
18403 BVTy ShuffleBuilder(ScalarTy, Params...);
18404 ResTy Res = ResTy();
18405 SmallVector<int>
Mask;
18406 SmallVector<int> ExtractMask(GatheredScalars.size(),
PoisonMaskElem);
18408 Value *ExtractVecBase =
nullptr;
18409 bool UseVecBaseAsInput =
false;
18412 Type *OrigScalarTy = GatheredScalars.front()->getType();
18417 bool Resized =
false;
18419 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
18420 if (!ExtractShuffles.
empty()) {
18422 for (
auto [Idx,
I] :
enumerate(ExtractMask)) {
18428 ExtractEntries.
append(TEs.begin(), TEs.end());
18430 if (std::optional<ResTy> Delayed =
18431 ShuffleBuilder.needToDelay(
E, ExtractEntries)) {
18433 PostponedGathers.insert(
E);
18438 if (
Value *VecBase = ShuffleBuilder.adjustExtracts(
18439 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
18440 ExtractVecBase = VecBase;
18442 if (VF == VecBaseTy->getNumElements() &&
18443 GatheredScalars.size() != VF) {
18445 GatheredScalars.append(VF - GatheredScalars.size(),
18453 if (!ExtractShuffles.
empty() || !
E->hasState() ||
18454 E->getOpcode() != Instruction::Load ||
18455 (((
E->hasState() &&
E->getOpcode() == Instruction::Load) ||
18459 return isa<LoadInst>(V) && isVectorized(V);
18461 (
E->hasState() &&
E->isAltShuffle()) ||
18462 all_of(
E->Scalars, [
this](
Value *V) { return isVectorized(V); }) ||
18464 (
E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
18466 isGatherShuffledEntry(
E, GatheredScalars, Mask, Entries, NumParts);
18468 if (!GatherShuffles.
empty()) {
18469 if (std::optional<ResTy> Delayed =
18470 ShuffleBuilder.needToDelay(
E, Entries)) {
18472 PostponedGathers.insert(
E);
18477 if (GatherShuffles.
size() == 1 &&
18479 Entries.
front().front()->isSame(
E->Scalars)) {
18482 LLVM_DEBUG(
dbgs() <<
"SLP: perfect diamond match for gather bundle "
18485 Mask.resize(
E->Scalars.size());
18486 const TreeEntry *FrontTE = Entries.
front().front();
18487 if (FrontTE->ReorderIndices.empty() &&
18488 ((FrontTE->ReuseShuffleIndices.empty() &&
18489 E->Scalars.size() == FrontTE->Scalars.size()) ||
18490 (
E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
18491 std::iota(
Mask.begin(),
Mask.end(), 0);
18498 Mask[
I] = FrontTE->findLaneForValue(V);
18503 ShuffleBuilder.resetForSameNode();
18504 ShuffleBuilder.add(*FrontTE, Mask);
18506 Res = ShuffleBuilder.finalize(
E->getCommonMask(), {}, {});
18510 if (GatheredScalars.size() != VF &&
18512 return any_of(TEs, [&](
const TreeEntry *TE) {
18513 return TE->getVectorFactor() == VF;
18516 GatheredScalars.append(VF - GatheredScalars.size(),
18520 for (
int I = 0, Sz =
Mask.size();
I < Sz; ++
I) {
18526 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
18527 SmallVectorImpl<int> &ReuseMask,
18528 bool IsRootPoison) {
18531 bool IsSplat = IsRootPoison &&
isSplat(Scalars) &&
18534 SmallVector<int> UndefPos;
18535 DenseMap<Value *, unsigned> UniquePositions;
18538 int NumNonConsts = 0;
18557 Scalars.
front() = OrigV;
18560 const auto Res = UniquePositions.
try_emplace(OrigV,
I);
18561 Scalars[Res.first->second] = OrigV;
18562 ReuseMask[
I] = Res.first->second;
18565 if (NumNonConsts == 1) {
18570 if (!UndefPos.
empty() && UndefPos.
front() == 0)
18573 ReuseMask[SinglePos] = SinglePos;
18574 }
else if (!UndefPos.
empty() && IsSplat) {
18581 (
E->UserTreeIndex &&
any_of(
V->uses(), [
E](
const Use &U) {
18584 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
18585 is_contained(E->UserTreeIndex.UserTE->Scalars,
18589 if (It != Scalars.
end()) {
18591 int Pos = std::distance(Scalars.
begin(), It);
18592 for (
int I : UndefPos) {
18594 ReuseMask[
I] = Pos;
18603 for (
int I : UndefPos) {
18612 if (!ExtractShuffles.
empty() || !GatherShuffles.
empty()) {
18613 bool IsNonPoisoned =
true;
18614 bool IsUsedInExpr =
true;
18615 Value *Vec1 =
nullptr;
18616 if (!ExtractShuffles.
empty()) {
18620 Value *Vec2 =
nullptr;
18621 for (
unsigned I = 0, Sz = ExtractMask.size();
I < Sz; ++
I) {
18625 if (UseVecBaseAsInput) {
18626 Vec1 = ExtractVecBase;
18628 for (
unsigned I = 0, Sz = ExtractMask.size();
I < Sz; ++
I) {
18634 Value *VecOp = EI->getVectorOperand();
18636 !TEs.
empty() && TEs.
front()->VectorizedValue)
18637 VecOp = TEs.
front()->VectorizedValue;
18640 }
else if (Vec1 != VecOp) {
18641 assert((!Vec2 || Vec2 == VecOp) &&
18642 "Expected only 1 or 2 vectors shuffle.");
18648 IsUsedInExpr =
false;
18651 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
18654 IsUsedInExpr &= FindReusedSplat(
18657 ExtractMask.size(), IsNotPoisonedVec);
18658 ShuffleBuilder.add(Vec1, ExtractMask,
true);
18659 IsNonPoisoned &= IsNotPoisonedVec;
18661 IsUsedInExpr =
false;
18666 if (!GatherShuffles.
empty()) {
18667 unsigned SliceSize =
18671 for (
const auto [
I, TEs] :
enumerate(Entries)) {
18674 "No shuffles with empty entries list expected.");
18678 "Expected shuffle of 1 or 2 entries.");
18682 copy(SubMask, std::next(VecMask.begin(),
I * SliceSize));
18683 if (TEs.
size() == 1) {
18684 bool IsNotPoisonedVec =
18685 TEs.
front()->VectorizedValue
18689 FindReusedSplat(VecMask, TEs.
front()->getVectorFactor(),
I,
18690 SliceSize, IsNotPoisonedVec);
18691 ShuffleBuilder.add(*TEs.
front(), VecMask);
18692 IsNonPoisoned &= IsNotPoisonedVec;
18694 IsUsedInExpr =
false;
18695 ShuffleBuilder.add(*TEs.
front(), *TEs.
back(), VecMask);
18696 if (TEs.
front()->VectorizedValue && TEs.
back()->VectorizedValue)
18707 int EMSz = ExtractMask.size();
18708 int MSz =
Mask.size();
18711 bool IsSingleShuffle = ExtractShuffles.
empty() || GatherShuffles.
empty();
18712 bool IsIdentityShuffle =
18713 ((UseVecBaseAsInput ||
18715 [](
const std::optional<TTI::ShuffleKind> &SK) {
18719 none_of(ExtractMask, [&](
int I) {
return I >= EMSz; }) &&
18721 (!GatherShuffles.
empty() &&
18723 [](
const std::optional<TTI::ShuffleKind> &SK) {
18727 none_of(Mask, [&](
int I) {
return I >= MSz; }) &&
18729 bool EnoughConstsForShuffle =
18739 (!IsIdentityShuffle ||
18740 (GatheredScalars.size() == 2 &&
18748 for (
int I = 0, Sz = GatheredScalars.size();
I < Sz; ++
I) {
18749 if (EnoughConstsForShuffle &&
isa<Constant>(GatheredScalars[
I]))
18756 SmallVector<int> BVMask(GatheredScalars.size(),
PoisonMaskElem);
18757 TryPackScalars(GatheredScalars, BVMask,
true);
18758 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
18759 ShuffleBuilder.add(BV, BVMask);
18763 (IsSingleShuffle && ((IsIdentityShuffle &&
18766 Res = ShuffleBuilder.finalize(
E->ReuseShuffleIndices, SubVectors,
18769 Res = ShuffleBuilder.finalize(
18770 E->ReuseShuffleIndices, SubVectors, SubVectorsMask,
E->Scalars.size(),
18771 [&](
Value *&Vec, SmallVectorImpl<int> &Mask,
auto CreateShuffle) {
18772 bool IsSplat = isSplat(NonConstants);
18773 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
18774 TryPackScalars(NonConstants, BVMask, false);
18775 auto CheckIfSplatIsProfitable = [&]() {
18778 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
18779 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
18780 if (isa<ExtractElementInst>(V) || isVectorized(V))
18782 InstructionCost SplatCost = TTI->getVectorInstrCost(
18783 Instruction::InsertElement, VecTy, CostKind, 0,
18784 PoisonValue::get(VecTy), V);
18785 SmallVector<int> NewMask(Mask.begin(), Mask.end());
18786 for (auto [Idx, I] : enumerate(BVMask))
18787 if (I != PoisonMaskElem)
18788 NewMask[Idx] = Mask.size();
18789 SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
18790 NewMask, CostKind);
18791 InstructionCost BVCost = TTI->getVectorInstrCost(
18792 Instruction::InsertElement, VecTy, CostKind,
18793 *find_if(Mask, [](int I) { return I != PoisonMaskElem; }),
18796 if (count(BVMask, PoisonMaskElem) <
18797 static_cast<int>(BVMask.size() - 1)) {
18798 SmallVector<int> NewMask(Mask.begin(), Mask.end());
18799 for (auto [Idx, I] : enumerate(BVMask))
18800 if (I != PoisonMaskElem)
18802 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
18803 VecTy, NewMask, CostKind);
18805 return SplatCost <= BVCost;
18807 if (!IsSplat ||
Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
18811 Vec = ShuffleBuilder.gather(NonConstants,
Mask.size(), Vec);
18817 Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
18819 transform(BVMask, SplatMask.begin(), [](
int I) {
18820 return I == PoisonMaskElem ? PoisonMaskElem : 0;
18823 BV = CreateShuffle(BV,
nullptr, SplatMask);
18826 Mask[Idx] = BVMask.size() + Idx;
18827 Vec = CreateShuffle(Vec, BV, Mask);
18835 SmallVector<int> ReuseMask(GatheredScalars.size(),
PoisonMaskElem);
18836 TryPackScalars(GatheredScalars, ReuseMask,
true);
18837 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.
size());
18838 ShuffleBuilder.add(BV, ReuseMask);
18839 Res = ShuffleBuilder.finalize(
E->ReuseShuffleIndices, SubVectors,
18844 for (
auto [
I, V] :
enumerate(GatheredScalars)) {
18848 Value *BV = ShuffleBuilder.gather(GatheredScalars);
18849 ShuffleBuilder.add(BV, Mask);
18850 Res = ShuffleBuilder.finalize(
E->ReuseShuffleIndices, SubVectors,
18855 Res = ShuffleBuilder.createFreeze(Res);
18859Value *BoUpSLP::createBuildVector(
const TreeEntry *
E,
Type *ScalarTy) {
18860 for (
auto [EIdx,
_] :
E->CombinedEntriesWithIndices)
18862 return processBuildVector<ShuffleInstructionBuilder, Value *>(
E, ScalarTy,
18870 for (
Value *V : VL)
18883 IRBuilderBase::InsertPointGuard Guard(Builder);
18885 Value *
V =
E->Scalars.front();
18886 Type *ScalarTy =
V->getType();
18889 auto It = MinBWs.find(
E);
18890 if (It != MinBWs.end()) {
18896 if (
E->VectorizedValue)
18897 return E->VectorizedValue;
18899 if (
E->isGather()) {
18901 if (
E->hasState() &&
E->Idx == 0 && !UserIgnoreList)
18902 setInsertPointAfterBundle(
E);
18903 Value *Vec = createBuildVector(
E, ScalarTy);
18904 E->VectorizedValue = Vec;
18907 if (
E->State == TreeEntry::SplitVectorize) {
18908 assert(
E->CombinedEntriesWithIndices.size() == 2 &&
18909 "Expected exactly 2 combined entries.");
18910 setInsertPointAfterBundle(
E);
18912 *VectorizableTree[
E->CombinedEntriesWithIndices.front().first];
18914 ArrayRef(
E->Scalars).take_front(OpTE1.getVectorFactor())) &&
18915 "Expected same first part of scalars.");
18918 *VectorizableTree[
E->CombinedEntriesWithIndices.back().first];
18920 OpTE2.isSame(
ArrayRef(
E->Scalars).take_back(OpTE2.getVectorFactor())) &&
18921 "Expected same second part of scalars.");
18923 auto GetOperandSignedness = [&](
const TreeEntry *OpE) {
18924 bool IsSigned =
false;
18925 auto It = MinBWs.find(OpE);
18926 if (It != MinBWs.end())
18927 IsSigned = It->second.second;
18930 if (isa<PoisonValue>(V))
18932 return !isKnownNonNegative(R, SimplifyQuery(*DL));
18939 Op1 = Builder.CreateIntCast(
18944 GetOperandSignedness(&OpTE1));
18949 Op2 = Builder.CreateIntCast(
18954 GetOperandSignedness(&OpTE2));
18956 if (
E->ReorderIndices.empty()) {
18960 std::next(
Mask.begin(),
E->CombinedEntriesWithIndices.back().second),
18963 if (ScalarTyNumElements != 1) {
18967 Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
18969 E->CombinedEntriesWithIndices.back().second *
18970 ScalarTyNumElements);
18971 E->VectorizedValue = Vec;
18974 unsigned CommonVF =
18975 std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
18978 std::iota(
Mask.begin(), std::next(
Mask.begin(), OpTE1.getVectorFactor()),
18980 Op1 = Builder.CreateShuffleVector(Op1, Mask);
18984 std::iota(
Mask.begin(), std::next(
Mask.begin(), OpTE2.getVectorFactor()),
18986 Op2 = Builder.CreateShuffleVector(Op2, Mask);
18988 Value *Vec = Builder.CreateShuffleVector(Op1, Op2,
E->getSplitMask());
18989 E->VectorizedValue = Vec;
18993 bool IsReverseOrder =
18995 auto FinalShuffle = [&](
Value *
V,
const TreeEntry *
E) {
18997 if (
E->getOpcode() == Instruction::Store &&
18998 E->State == TreeEntry::Vectorize) {
18999 ArrayRef<int>
Mask =
19000 ArrayRef(
reinterpret_cast<const int *
>(
E->ReorderIndices.begin()),
19001 E->ReorderIndices.size());
19002 ShuffleBuilder.add(V, Mask);
19003 }
else if ((
E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
19004 E->State == TreeEntry::CompressVectorize) {
19005 ShuffleBuilder.addOrdered(V, {});
19007 ShuffleBuilder.addOrdered(V,
E->ReorderIndices);
19010 E->CombinedEntriesWithIndices.size());
19012 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](
const auto &
P) {
19013 return std::make_pair(VectorizableTree[P.first].get(), P.second);
19016 (
E->CombinedEntriesWithIndices.empty() ||
E->ReorderIndices.empty()) &&
19017 "Expected either combined subnodes or reordering");
19018 return ShuffleBuilder.finalize(
E->ReuseShuffleIndices, SubVectors, {});
19021 assert(!
E->isGather() &&
"Unhandled state");
19022 unsigned ShuffleOrOp =
19023 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector :
E->
getOpcode();
19025 auto GetOperandSignedness = [&](
unsigned Idx) {
19026 const TreeEntry *OpE = getOperandEntry(
E, Idx);
19027 bool IsSigned =
false;
19028 auto It = MinBWs.find(OpE);
19029 if (It != MinBWs.end())
19030 IsSigned = It->second.second;
19033 if (isa<PoisonValue>(V))
19035 return !isKnownNonNegative(R, SimplifyQuery(*DL));
19039 switch (ShuffleOrOp) {
19040 case Instruction::PHI: {
19041 assert((
E->ReorderIndices.empty() || !
E->ReuseShuffleIndices.empty() ||
19042 E != VectorizableTree.front().get() ||
E->UserTreeIndex) &&
19043 "PHI reordering is free.");
19045 Builder.SetInsertPoint(PH->getParent(),
19046 PH->getParent()->getFirstNonPHIIt());
19048 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
19052 Builder.SetInsertPoint(PH->getParent(),
19053 PH->getParent()->getFirstInsertionPt());
19056 V = FinalShuffle(V,
E);
19058 E->VectorizedValue =
V;
19065 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
19072 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
19076 if (!VisitedBBs.
insert(IBB).second) {
19079 TreeEntry *OpTE = getOperandEntry(
E,
I);
19080 assert(!OpTE->VectorizedValue &&
"Expected no vectorized value.");
19081 OpTE->VectorizedValue = VecOp;
19087 Value *Vec = vectorizeOperand(
E,
I);
19088 if (VecTy != Vec->
getType()) {
19090 MinBWs.contains(getOperandEntry(
E,
I))) &&
19091 "Expected item in MinBWs.");
19092 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(
I));
19098 "Invalid number of incoming values");
19099 assert(
E->VectorizedValue &&
"Expected vectorized value.");
19100 return E->VectorizedValue;
19103 case Instruction::ExtractElement: {
19104 Value *
V =
E->getSingleOperand(0);
19105 setInsertPointAfterBundle(
E);
19106 V = FinalShuffle(V,
E);
19107 E->VectorizedValue =
V;
19110 case Instruction::ExtractValue: {
19112 Builder.SetInsertPoint(LI);
19113 Value *
Ptr = LI->getPointerOperand();
19114 LoadInst *
V = Builder.CreateAlignedLoad(VecTy,
Ptr, LI->getAlign());
19116 NewV = FinalShuffle(NewV,
E);
19117 E->VectorizedValue = NewV;
19120 case Instruction::InsertElement: {
19121 assert(
E->ReuseShuffleIndices.empty() &&
"All inserts should be unique");
19123 Value *
V = vectorizeOperand(
E, 1);
19125 Type *ScalarTy =
Op.front()->getType();
19128 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(
E, 1));
19129 assert(Res.first > 0 &&
"Expected item in MinBWs.");
19130 V = Builder.CreateIntCast(
19140 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
19142 const unsigned NumElts =
19144 const unsigned NumScalars =
E->Scalars.size();
19147 assert(
Offset < NumElts &&
"Failed to find vector index offset");
19150 SmallVector<int>
Mask;
19151 if (!
E->ReorderIndices.empty()) {
19156 std::iota(
Mask.begin(), std::next(
Mask.begin(), NumScalars), 0);
19159 bool IsIdentity =
true;
19161 Mask.swap(PrevMask);
19162 for (
unsigned I = 0;
I < NumScalars; ++
I) {
19165 IsIdentity &= InsertIdx -
Offset ==
I;
19168 if (!IsIdentity || NumElts != NumScalars) {
19169 Value *V2 =
nullptr;
19170 bool IsVNonPoisonous =
19172 SmallVector<int> InsertMask(Mask);
19173 if (NumElts != NumScalars &&
Offset == 0) {
19182 InsertMask[*InsertIdx] = *InsertIdx;
19183 if (!
Ins->hasOneUse())
19186 Ins->getUniqueUndroppableUser());
19188 SmallBitVector UseMask =
19189 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19190 SmallBitVector IsFirstPoison =
19192 SmallBitVector IsFirstUndef =
19194 if (!IsFirstPoison.
all()) {
19196 for (
unsigned I = 0;
I < NumElts;
I++) {
19198 IsFirstUndef.
test(
I)) {
19199 if (IsVNonPoisonous) {
19200 InsertMask[
I] =
I < NumScalars ?
I : 0;
19205 if (Idx >= NumScalars)
19206 Idx = NumScalars - 1;
19207 InsertMask[
I] = NumScalars + Idx;
19220 V = Builder.CreateShuffleVector(V, V2, InsertMask);
19222 GatherShuffleExtractSeq.insert(
I);
19223 CSEBlocks.insert(
I->getParent());
19228 for (
unsigned I = 0;
I < NumElts;
I++) {
19232 SmallBitVector UseMask =
19233 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19234 SmallBitVector IsFirstUndef =
19236 if ((!IsIdentity ||
Offset != 0 || !IsFirstUndef.
all()) &&
19237 NumElts != NumScalars) {
19238 if (IsFirstUndef.
all()) {
19240 SmallBitVector IsFirstPoison =
19242 if (!IsFirstPoison.
all()) {
19243 for (
unsigned I = 0;
I < NumElts;
I++) {
19245 InsertMask[
I] =
I + NumElts;
19248 V = Builder.CreateShuffleVector(
19254 GatherShuffleExtractSeq.insert(
I);
19255 CSEBlocks.insert(
I->getParent());
19259 SmallBitVector IsFirstPoison =
19261 for (
unsigned I = 0;
I < NumElts;
I++) {
19265 InsertMask[
I] += NumElts;
19267 V = Builder.CreateShuffleVector(
19268 FirstInsert->getOperand(0), V, InsertMask,
19271 GatherShuffleExtractSeq.insert(
I);
19272 CSEBlocks.insert(
I->getParent());
19277 ++NumVectorInstructions;
19278 E->VectorizedValue =
V;
19281 case Instruction::ZExt:
19282 case Instruction::SExt:
19283 case Instruction::FPToUI:
19284 case Instruction::FPToSI:
19285 case Instruction::FPExt:
19286 case Instruction::PtrToInt:
19287 case Instruction::IntToPtr:
19288 case Instruction::SIToFP:
19289 case Instruction::UIToFP:
19290 case Instruction::Trunc:
19291 case Instruction::FPTrunc:
19292 case Instruction::BitCast: {
19293 setInsertPointAfterBundle(
E);
19295 Value *InVec = vectorizeOperand(
E, 0);
19300 auto SrcIt = MinBWs.find(getOperandEntry(
E, 0));
19302 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
19305 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
19306 if (SrcIt != MinBWs.end())
19307 SrcBWSz = SrcIt->second.first;
19308 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->
getScalarType());
19309 if (BWSz == SrcBWSz) {
19310 VecOpcode = Instruction::BitCast;
19311 }
else if (BWSz < SrcBWSz) {
19312 VecOpcode = Instruction::Trunc;
19313 }
else if (It != MinBWs.end()) {
19314 assert(BWSz > SrcBWSz &&
"Invalid cast!");
19315 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
19316 }
else if (SrcIt != MinBWs.end()) {
19317 assert(BWSz > SrcBWSz &&
"Invalid cast!");
19319 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
19321 }
else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
19322 !SrcIt->second.second) {
19323 VecOpcode = Instruction::UIToFP;
19325 Value *
V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
19327 : Builder.CreateCast(VecOpcode, InVec, VecTy);
19328 V = FinalShuffle(V,
E);
19330 E->VectorizedValue =
V;
19331 ++NumVectorInstructions;
19334 case Instruction::FCmp:
19335 case Instruction::ICmp: {
19336 setInsertPointAfterBundle(
E);
19338 Value *
L = vectorizeOperand(
E, 0);
19339 Value *
R = vectorizeOperand(
E, 1);
19340 if (
L->getType() !=
R->getType()) {
19343 MinBWs.contains(getOperandEntry(
E, 0)) ||
19344 MinBWs.contains(getOperandEntry(
E, 1))) &&
19345 "Expected item in MinBWs.");
19350 ->getIntegerBitWidth()) {
19351 Type *CastTy =
R->getType();
19352 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
19354 Type *CastTy =
L->getType();
19355 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
19360 Value *
V = Builder.CreateCmp(P0, L, R);
19363 ICmp->setSameSign(
false);
19366 V = FinalShuffle(V,
E);
19368 E->VectorizedValue =
V;
19369 ++NumVectorInstructions;
19372 case Instruction::Select: {
19373 setInsertPointAfterBundle(
E);
19376 Value *True = vectorizeOperand(
E, 1);
19377 Value *False = vectorizeOperand(
E, 2);
19381 MinBWs.contains(getOperandEntry(
E, 1)) ||
19382 MinBWs.contains(getOperandEntry(
E, 2))) &&
19383 "Expected item in MinBWs.");
19384 if (True->
getType() != VecTy)
19385 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
19386 if (False->
getType() != VecTy)
19387 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
19392 assert(TrueNumElements >= CondNumElements &&
19393 TrueNumElements % CondNumElements == 0 &&
19394 "Cannot vectorize Instruction::Select");
19396 "Cannot vectorize Instruction::Select");
19397 if (CondNumElements != TrueNumElements) {
19400 Cond = Builder.CreateShuffleVector(
19405 "Cannot vectorize Instruction::Select");
19406 Value *
V = Builder.CreateSelect(
Cond, True, False);
19407 V = FinalShuffle(V,
E);
19409 E->VectorizedValue =
V;
19410 ++NumVectorInstructions;
19413 case Instruction::FNeg: {
19414 setInsertPointAfterBundle(
E);
19416 Value *
Op = vectorizeOperand(
E, 0);
19418 Value *
V = Builder.CreateUnOp(
19424 V = FinalShuffle(V,
E);
19426 E->VectorizedValue =
V;
19427 ++NumVectorInstructions;
19431 case Instruction::Freeze: {
19432 setInsertPointAfterBundle(
E);
19434 Value *
Op = vectorizeOperand(
E, 0);
19436 if (
Op->getType() != VecTy) {
19438 MinBWs.contains(getOperandEntry(
E, 0))) &&
19439 "Expected item in MinBWs.");
19440 Op = Builder.CreateIntCast(
Op, VecTy, GetOperandSignedness(0));
19442 Value *
V = Builder.CreateFreeze(
Op);
19443 V = FinalShuffle(V,
E);
19445 E->VectorizedValue =
V;
19446 ++NumVectorInstructions;
19450 case Instruction::Add:
19451 case Instruction::FAdd:
19452 case Instruction::Sub:
19453 case Instruction::FSub:
19454 case Instruction::Mul:
19455 case Instruction::FMul:
19456 case Instruction::UDiv:
19457 case Instruction::SDiv:
19458 case Instruction::FDiv:
19459 case Instruction::URem:
19460 case Instruction::SRem:
19461 case Instruction::FRem:
19462 case Instruction::Shl:
19463 case Instruction::LShr:
19464 case Instruction::AShr:
19465 case Instruction::And:
19466 case Instruction::Or:
19467 case Instruction::Xor: {
19468 setInsertPointAfterBundle(
E);
19472 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
19477 return CI && CI->getValue().countr_one() >= It->second.first;
19479 V = FinalShuffle(
I == 0 ?
RHS :
LHS,
E);
19480 E->VectorizedValue =
V;
19481 ++NumVectorInstructions;
19489 MinBWs.contains(getOperandEntry(
E, 0)) ||
19490 MinBWs.contains(getOperandEntry(
E, 1))) &&
19491 "Expected item in MinBWs.");
19493 LHS = Builder.CreateIntCast(
LHS, VecTy, GetOperandSignedness(0));
19495 RHS = Builder.CreateIntCast(
RHS, VecTy, GetOperandSignedness(1));
19498 Value *
V = Builder.CreateBinOp(
19505 if (!MinBWs.contains(
E) && ShuffleOrOp == Instruction::Sub &&
19507 return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
19509 I->setHasNoUnsignedWrap(
false);
19512 V = FinalShuffle(V,
E);
19514 E->VectorizedValue =
V;
19515 ++NumVectorInstructions;
19519 case Instruction::Load: {
19522 setInsertPointAfterBundle(
E);
19526 FixedVectorType *StridedLoadTy =
nullptr;
19527 Value *PO = LI->getPointerOperand();
19528 if (
E->State == TreeEntry::Vectorize) {
19529 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
19530 }
else if (
E->State == TreeEntry::CompressVectorize) {
19531 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
19532 CompressEntryToData.at(
E);
19533 Align CommonAlignment = LI->getAlign();
19539 for (
int I : CompressMask)
19543 MaskValues =
replicateMask(MaskValues, VecTy->getNumElements());
19546 NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
19549 NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
19560 }
else if (
E->State == TreeEntry::StridedVectorize) {
19563 PO = IsReverseOrder ? PtrN : Ptr0;
19564 Type *StrideTy = DL->getIndexType(PO->
getType());
19566 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(
E);
19567 StridedLoadTy = SPtrInfo.Ty;
19568 assert(StridedLoadTy &&
"Missing StridedPoinerInfo for tree entry.");
19569 unsigned StridedLoadEC =
19572 Value *Stride = SPtrInfo.StrideVal;
19574 const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
19575 assert(StrideSCEV &&
"Neither StrideVal nor StrideSCEV were set.");
19576 SCEVExpander Expander(*SE, *DL,
"strided-load-vec");
19577 Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->
getType(),
19578 &*Builder.GetInsertPoint());
19581 Builder.CreateIntCast(Stride, StrideTy,
true);
19582 StrideVal = Builder.CreateMul(
19583 NewStride, ConstantInt::get(
19584 StrideTy, (IsReverseOrder ? -1 : 1) *
19586 DL->getTypeAllocSize(ScalarTy))));
19588 auto *Inst = Builder.CreateIntrinsic(
19589 Intrinsic::experimental_vp_strided_load,
19590 {StridedLoadTy, PO->
getType(), StrideTy},
19593 Builder.getInt32(StridedLoadEC)});
19594 Inst->addParamAttr(
19599 assert(
E->State == TreeEntry::ScatterVectorize &&
"Unhandled state");
19600 Value *VecPtr = vectorizeOperand(
E, 0);
19605 unsigned ScalarTyNumElements =
19607 unsigned VecTyNumElements =
19609 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
19610 "Cannot expand getelementptr.");
19611 unsigned VF = VecTyNumElements / ScalarTyNumElements;
19614 return Builder.getInt64(I % ScalarTyNumElements);
19616 VecPtr = Builder.CreateGEP(
19617 VecTy->getElementType(),
19618 Builder.CreateShuffleVector(
19624 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
19626 Value *
V =
E->State == TreeEntry::CompressVectorize
19630 V = FinalShuffle(V,
E);
19631 E->VectorizedValue =
V;
19632 ++NumVectorInstructions;
19635 case Instruction::Store: {
19638 setInsertPointAfterBundle(
E);
19640 Value *VecValue = vectorizeOperand(
E, 0);
19641 if (VecValue->
getType() != VecTy)
19643 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
19644 VecValue = FinalShuffle(VecValue,
E);
19648 if (
E->State == TreeEntry::Vectorize) {
19649 ST = Builder.CreateAlignedStore(VecValue,
Ptr,
SI->getAlign());
19651 assert(
E->State == TreeEntry::StridedVectorize &&
19652 "Expected either strided or consecutive stores.");
19653 if (!
E->ReorderIndices.empty()) {
19655 Ptr =
SI->getPointerOperand();
19658 Type *StrideTy = DL->getIndexType(
SI->getPointerOperandType());
19659 auto *Inst = Builder.CreateIntrinsic(
19660 Intrinsic::experimental_vp_strided_store,
19661 {VecTy,
Ptr->getType(), StrideTy},
19664 StrideTy, -
static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
19665 Builder.getAllOnesMask(VecTy->getElementCount()),
19666 Builder.getInt32(
E->Scalars.size())});
19667 Inst->addParamAttr(
19675 E->VectorizedValue =
V;
19676 ++NumVectorInstructions;
19679 case Instruction::GetElementPtr: {
19681 setInsertPointAfterBundle(
E);
19683 Value *Op0 = vectorizeOperand(
E, 0);
19686 for (
int J = 1,
N = GEP0->getNumOperands(); J <
N; ++J) {
19687 Value *OpVec = vectorizeOperand(
E, J);
19691 Value *
V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
19694 for (
Value *V :
E->Scalars) {
19701 V = FinalShuffle(V,
E);
19703 E->VectorizedValue =
V;
19704 ++NumVectorInstructions;
19708 case Instruction::Call: {
19710 setInsertPointAfterBundle(
E);
19715 CI,
ID, VecTy->getNumElements(),
19716 It != MinBWs.end() ? It->second.first : 0, TTI);
19719 VecCallCosts.first <= VecCallCosts.second;
19721 Value *ScalarArg =
nullptr;
19732 ScalarArg = CEI->getArgOperand(
I);
19735 if (
ID == Intrinsic::abs && It != MinBWs.end() &&
19736 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
19737 ScalarArg = Builder.getFalse();
19744 Value *OpVec = vectorizeOperand(
E,
I);
19745 ScalarArg = CEI->getArgOperand(
I);
19748 It == MinBWs.end()) {
19751 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(
I));
19752 }
else if (It != MinBWs.end()) {
19753 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(
I));
19762 if (!UseIntrinsic) {
19767 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
19774 Value *
V = Builder.CreateCall(CF, OpVecs, OpBundles);
19777 V = FinalShuffle(V,
E);
19779 E->VectorizedValue =
V;
19780 ++NumVectorInstructions;
19783 case Instruction::ShuffleVector: {
19786 setInsertPointAfterBundle(
E);
19787 Value *Src = vectorizeOperand(
E, 0);
19790 SmallVector<int> NewMask(ThisMask.size());
19792 return SVSrc->getShuffleMask()[Mask];
19794 V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
19795 SVSrc->getOperand(1), NewMask);
19797 V = Builder.CreateShuffleVector(Src, ThisMask);
19802 V = FinalShuffle(V,
E);
19810 "Invalid Shuffle Vector Operand");
19814 setInsertPointAfterBundle(
E);
19815 LHS = vectorizeOperand(
E, 0);
19816 RHS = vectorizeOperand(
E, 1);
19818 setInsertPointAfterBundle(
E);
19819 LHS = vectorizeOperand(
E, 0);
19825 assert((It != MinBWs.end() ||
19826 getOperandEntry(
E, 0)->State == TreeEntry::NeedToGather ||
19827 getOperandEntry(
E, 1)->State == TreeEntry::NeedToGather ||
19828 MinBWs.contains(getOperandEntry(
E, 0)) ||
19829 MinBWs.contains(getOperandEntry(
E, 1))) &&
19830 "Expected item in MinBWs.");
19831 Type *CastTy = VecTy;
19837 ->getIntegerBitWidth())
19843 LHS = Builder.CreateIntCast(
LHS, CastTy, GetOperandSignedness(0));
19845 RHS = Builder.CreateIntCast(
RHS, CastTy, GetOperandSignedness(1));
19850 V0 = Builder.CreateBinOp(
19852 V1 = Builder.CreateBinOp(
19855 V0 = Builder.CreateCmp(CI0->getPredicate(),
LHS,
RHS);
19858 V1 = Builder.CreateCmp(AltPred,
LHS,
RHS);
19861 unsigned SrcBWSz = DL->getTypeSizeInBits(
19863 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
19864 if (BWSz <= SrcBWSz) {
19865 if (BWSz < SrcBWSz)
19866 LHS = Builder.CreateIntCast(
LHS, VecTy, It->second.first);
19868 "Expected same type as operand.");
19872 E->VectorizedValue =
LHS;
19873 ++NumVectorInstructions;
19877 V0 = Builder.CreateCast(
19879 V1 = Builder.CreateCast(
19884 for (
Value *V : {V0, V1}) {
19886 GatherShuffleExtractSeq.insert(
I);
19887 CSEBlocks.insert(
I->getParent());
19895 SmallVector<int>
Mask;
19896 E->buildAltOpShuffleMask(
19897 [
E,
this](Instruction *
I) {
19898 assert(
E->getMatchingMainOpOrAltOp(
I) &&
19899 "Unexpected main/alternate opcode");
19903 Mask, &OpScalars, &AltScalars);
19907 auto DropNuwFlag = [&](
Value *Vec,
unsigned Opcode) {
19910 I && Opcode == Instruction::Sub && !MinBWs.contains(
E) &&
19912 if (isa<PoisonValue>(V))
19914 auto *IV = cast<Instruction>(V);
19915 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
19917 I->setHasNoUnsignedWrap(
false);
19919 DropNuwFlag(V0,
E->getOpcode());
19920 DropNuwFlag(V1,
E->getAltOpcode());
19926 V = Builder.CreateShuffleVector(V0, V1, Mask);
19929 GatherShuffleExtractSeq.insert(
I);
19930 CSEBlocks.insert(
I->getParent());
19934 E->VectorizedValue =
V;
19935 ++NumVectorInstructions;
19953 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
19956 EntryToLastInstruction.clear();
19958 for (
auto &BSIter : BlocksSchedules)
19959 scheduleBlock(*
this, BSIter.second.get());
19962 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
19963 if (TE->isGather())
19965 (void)getLastInstructionInBundle(TE.get());
19969 Builder.SetInsertPoint(ReductionRoot->
getParent(),
19972 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
19976 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
19977 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
19978 TE->UserTreeIndex.UserTE->hasState() &&
19979 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
19980 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
19981 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
19982 !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
19983 all_of(TE->UserTreeIndex.UserTE->Scalars,
19984 [](
Value *V) { return isUsedOutsideBlock(V); })) {
19986 getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
19990 for (
auto &Entry : GatherEntries) {
19992 Builder.SetInsertPoint(Entry.second);
19993 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
19998 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
19999 if (GatheredLoadsEntriesFirst.has_value() &&
20000 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
20001 (!TE->isGather() || TE->UserTreeIndex)) {
20002 assert((TE->UserTreeIndex ||
20003 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
20004 "Expected gathered load node.");
20013 for (
const TreeEntry *E : PostponedNodes) {
20014 auto *TE =
const_cast<TreeEntry *
>(E);
20016 TE->VectorizedValue =
nullptr;
20035 if (UI->comesBefore(InsertPt))
20038 Builder.SetInsertPoint(InsertPt);
20040 Builder.SetInsertPoint(PrevVec);
20042 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
20045 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
20046 Builder.GetInsertPoint()->comesBefore(VecI))
20047 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
20048 Builder.GetInsertPoint());
20049 if (Vec->
getType() != PrevVec->getType()) {
20051 PrevVec->getType()->isIntOrIntVectorTy() &&
20052 "Expected integer vector types only.");
20053 std::optional<bool> IsSigned;
20054 for (
Value *V : TE->Scalars) {
20056 for (
const TreeEntry *MNTE : getTreeEntries(V)) {
20057 auto It = MinBWs.find(MNTE);
20058 if (It != MinBWs.end()) {
20059 IsSigned = IsSigned.value_or(
false) || It->second.second;
20064 if (IsSigned.value_or(
false))
20067 for (
const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
20068 auto It = MinBWs.find(BVE);
20069 if (It != MinBWs.end()) {
20070 IsSigned = IsSigned.value_or(
false) || It->second.second;
20075 if (IsSigned.value_or(
false))
20079 IsSigned.value_or(
false) ||
20083 if (IsSigned.value_or(
false))
20087 if (IsSigned.value_or(
false)) {
20089 auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
20090 if (It != MinBWs.end())
20091 IsSigned = It->second.second;
20094 "Expected user node or perfect diamond match in MinBWs.");
20095 Vec = Builder.CreateIntCast(Vec, PrevVec->
getType(), *IsSigned);
20097 PrevVec->replaceAllUsesWith(Vec);
20098 PostponedValues.
try_emplace(Vec).first->second.push_back(TE);
20101 auto It = PostponedValues.
find(PrevVec);
20102 if (It != PostponedValues.
end()) {
20103 for (TreeEntry *VTE : It->getSecond())
20104 VTE->VectorizedValue = Vec;
20124 for (
const auto &ExternalUse : ExternalUses) {
20125 Value *Scalar = ExternalUse.Scalar;
20132 const TreeEntry *E = &ExternalUse.E;
20133 assert(E &&
"Invalid scalar");
20134 assert(!E->isGather() &&
"Extracting from a gather list");
20136 if (E->getOpcode() == Instruction::GetElementPtr &&
20140 Value *Vec = E->VectorizedValue;
20141 assert(Vec &&
"Can't find vectorizable value");
20143 Value *Lane = Builder.getInt32(ExternalUse.Lane);
20144 auto ExtractAndExtendIfNeeded = [&](
Value *Vec) {
20145 if (Scalar->getType() != Vec->
getType()) {
20146 Value *Ex =
nullptr;
20147 Value *ExV =
nullptr;
20149 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
20150 auto It = ScalarToEEs.
find(Scalar);
20151 if (It != ScalarToEEs.
end()) {
20154 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
20155 : Builder.GetInsertBlock());
20156 if (EEIt != It->second.end()) {
20157 Value *PrevV = EEIt->second.first;
20159 I && !ReplaceInst &&
20160 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
20161 Builder.GetInsertPoint()->comesBefore(
I)) {
20162 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
20163 Builder.GetInsertPoint());
20168 ExV = EEIt->second.second ? EEIt->second.second : Ex;
20177 IgnoredExtracts.
insert(EE);
20180 auto *CloneInst = Inst->clone();
20181 CloneInst->insertBefore(Inst->getIterator());
20182 if (Inst->hasName())
20183 CloneInst->takeName(Inst);
20188 Value *V = ES->getVectorOperand();
20191 V = ETEs.front()->VectorizedValue;
20193 !
IV ||
IV == Vec ||
IV->getParent() != IVec->getParent() ||
20194 IV->comesBefore(IVec))
20195 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
20197 Ex = Builder.CreateExtractElement(Vec, Lane);
20198 }
else if (
auto *VecTy =
20201 unsigned VecTyNumElements = VecTy->getNumElements();
20206 ExternalUse.Lane * VecTyNumElements);
20208 Ex = Builder.CreateExtractElement(Vec, Lane);
20213 if (Scalar->getType() != Ex->
getType())
20214 ExV = Builder.CreateIntCast(
20219 : &F->getEntryBlock(),
20220 std::make_pair(Ex, ExV));
20226 GatherShuffleExtractSeq.insert(ExI);
20227 CSEBlocks.insert(ExI->getParent());
20233 "In-tree scalar of vector type is not insertelement?");
20242 if (!ScalarsWithNullptrUser.
insert(Scalar).second)
20245 (ExternallyUsedValues.
count(Scalar) ||
20246 ExternalUsesWithNonUsers.count(Scalar) ||
20247 ExternalUsesAsOriginalScalar.contains(Scalar) ||
20251 if (ExternalUsesAsOriginalScalar.contains(U))
20253 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
20254 return !UseEntries.empty() &&
20255 (E->State == TreeEntry::Vectorize ||
20256 E->State == TreeEntry::StridedVectorize ||
20257 E->State == TreeEntry::CompressVectorize) &&
20258 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
20259 return (UseEntry->State == TreeEntry::Vectorize ||
20261 TreeEntry::StridedVectorize ||
20263 TreeEntry::CompressVectorize) &&
20264 doesInTreeUserNeedToExtract(
20265 Scalar, getRootEntryInstruction(*UseEntry),
20269 "Scalar with nullptr User must be registered in "
20270 "ExternallyUsedValues map or remain as scalar in vectorized "
20274 if (
PHI->getParent()->isLandingPad())
20275 Builder.SetInsertPoint(
20278 PHI->getParent()->getLandingPadInst()->getIterator()));
20280 Builder.SetInsertPoint(
PHI->getParent(),
20281 PHI->getParent()->getFirstNonPHIIt());
20283 Builder.SetInsertPoint(VecI->getParent(),
20284 std::next(VecI->getIterator()));
20287 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20289 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20291 if (Scalar != NewInst) {
20294 "Extractelements should not be replaced.");
20295 Scalar->replaceAllUsesWith(NewInst);
20305 if (!UsedInserts.
insert(VU).second)
20308 auto BWIt = MinBWs.find(E);
20310 auto *ScalarTy = FTy->getElementType();
20311 auto Key = std::make_pair(Vec, ScalarTy);
20312 auto VecIt = VectorCasts.
find(
Key);
20313 if (VecIt == VectorCasts.
end()) {
20316 if (IVec->getParent()->isLandingPad())
20317 Builder.SetInsertPoint(IVec->getParent(),
20318 std::next(IVec->getParent()
20319 ->getLandingPadInst()
20322 Builder.SetInsertPoint(
20323 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
20325 Builder.SetInsertPoint(IVec->getNextNode());
20327 Vec = Builder.CreateIntCast(
20332 BWIt->second.second);
20335 Vec = VecIt->second;
20342 ShuffledInserts, [VU](
const ShuffledInsertData<Value *> &
Data) {
20349 unsigned Idx = *InsertIdx;
20350 if (It == ShuffledInserts.
end()) {
20352 It = std::next(ShuffledInserts.
begin(),
20353 ShuffledInserts.
size() - 1);
20358 Mask[Idx] = ExternalUse.Lane;
20370 for (
unsigned I :
seq<unsigned>(0, PH->getNumIncomingValues())) {
20371 if (PH->getIncomingValue(
I) == Scalar) {
20373 PH->getIncomingBlock(
I)->getTerminator();
20375 Builder.SetInsertPoint(VecI->getParent(),
20376 std::next(VecI->getIterator()));
20378 Builder.SetInsertPoint(PH->getIncomingBlock(
I)->getTerminator());
20380 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20381 PH->setOperand(
I, NewInst);
20386 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20390 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20391 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20402 for (
int I = 0, E = Mask.size();
I < E; ++
I) {
20404 CombinedMask1[
I] = Mask[
I];
20406 CombinedMask2[
I] = Mask[
I] - VF;
20408 ShuffleInstructionBuilder ShuffleBuilder(
20410 ShuffleBuilder.add(V1, CombinedMask1);
20412 ShuffleBuilder.add(V2, CombinedMask2);
20413 return ShuffleBuilder.finalize({}, {}, {});
20416 auto &&ResizeToVF = [&CreateShuffle](
Value *Vec, ArrayRef<int>
Mask,
20417 bool ForSingleMask) {
20418 unsigned VF =
Mask.size();
20421 if (
any_of(Mask, [VF](
int Idx) {
return Idx >=
static_cast<int>(VF); })) {
20422 Vec = CreateShuffle(Vec,
nullptr, Mask);
20423 return std::make_pair(Vec,
true);
20425 if (!ForSingleMask) {
20427 for (
unsigned I = 0;
I < VF; ++
I) {
20431 Vec = CreateShuffle(Vec,
nullptr, ResizeMask);
20435 return std::make_pair(Vec,
false);
20439 for (
int I = 0,
E = ShuffledInserts.size();
I <
E; ++
I) {
20442 InsertElementInst *FirstInsert = ShuffledInserts[
I].InsertElements.front();
20443 InsertElementInst *LastInsert = ShuffledInserts[
I].InsertElements.back();
20444 Builder.SetInsertPoint(LastInsert);
20445 auto Vector = ShuffledInserts[
I].ValueMasks.takeVector();
20450 return cast<VectorType>(Vec->getType())
20451 ->getElementCount()
20452 .getKnownMinValue();
20455 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
20457 assert((Vals.size() == 1 || Vals.size() == 2) &&
20458 "Expected exactly 1 or 2 input values.");
20459 if (Vals.size() == 1) {
20462 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
20463 ->getNumElements() ||
20464 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
20465 return CreateShuffle(Vals.front(), nullptr, Mask);
20466 return Vals.front();
20468 return CreateShuffle(Vals.
front() ? Vals.
front()
20470 Vals.
back(), Mask);
20472 auto It = ShuffledInserts[
I].InsertElements.rbegin();
20474 InsertElementInst *
II =
nullptr;
20475 if (It != ShuffledInserts[
I].InsertElements.rend())
20478 while (It != ShuffledInserts[
I].InsertElements.rend()) {
20479 assert(
II &&
"Must be an insertelement instruction.");
20486 for (Instruction *
II :
reverse(Inserts)) {
20487 II->replaceUsesOfWith(
II->getOperand(0), NewInst);
20489 if (
II->getParent() == NewI->getParent() &&
II->comesBefore(NewI))
20490 II->moveAfter(NewI);
20494 for (InsertElementInst *IE :
reverse(ShuffledInserts[
I].InsertElements)) {
20495 IE->replaceUsesOfWith(
IE->getOperand(0),
20497 IE->replaceUsesOfWith(
IE->getOperand(1),
20501 CSEBlocks.insert(LastInsert->
getParent());
20506 for (
auto &TEPtr : VectorizableTree) {
20507 TreeEntry *
Entry = TEPtr.get();
20510 if (
Entry->isGather() ||
Entry->State == TreeEntry::SplitVectorize)
20513 assert(
Entry->VectorizedValue &&
"Can't find vectorizable value");
20516 for (
int Lane = 0, LE =
Entry->Scalars.size(); Lane != LE; ++Lane) {
20519 if (
Entry->getOpcode() == Instruction::GetElementPtr &&
20523 EE && IgnoredExtracts.contains(EE))
20530 for (User *U :
Scalar->users()) {
20535 (UserIgnoreList && UserIgnoreList->contains(U)) ||
20538 "Deleting out-of-tree value");
20542 LLVM_DEBUG(
dbgs() <<
"SLP: \tErasing scalar:" << *Scalar <<
".\n");
20551 V->mergeDIAssignID(RemovedInsts);
20554 if (UserIgnoreList) {
20555 for (Instruction *
I : RemovedInsts) {
20556 const TreeEntry *
IE = getTreeEntries(
I).front();
20557 if (
IE->Idx != 0 &&
20558 !(VectorizableTree.front()->isGather() &&
IE->UserTreeIndex &&
20559 (ValueToGatherNodes.lookup(
I).contains(
20560 VectorizableTree.front().get()) ||
20561 (
IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
20562 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
20563 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
20564 IE->UserTreeIndex &&
20566 !(GatheredLoadsEntriesFirst.has_value() &&
20567 IE->Idx >= *GatheredLoadsEntriesFirst &&
20568 VectorizableTree.front()->isGather() &&
20570 !(!VectorizableTree.front()->isGather() &&
20571 VectorizableTree.front()->isCopyableElement(
I)))
20576 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
20577 (match(U.getUser(), m_LogicalAnd()) ||
20578 match(U.getUser(), m_LogicalOr())) &&
20579 U.getOperandNo() == 0;
20580 if (IsPoisoningLogicalOp) {
20581 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
20584 return UserIgnoreList->contains(
U.getUser());
20588 for (SelectInst *SI : LogicalOpSelects)
20598 Builder.ClearInsertionPoint();
20599 InstrElementSize.clear();
20601 const TreeEntry &RootTE = *VectorizableTree.front();
20602 Value *Vec = RootTE.VectorizedValue;
20603 if (
auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
20604 It != MinBWs.end() &&
20605 ReductionBitWidth != It->second.first) {
20606 IRBuilder<>::InsertPointGuard Guard(Builder);
20607 Builder.SetInsertPoint(ReductionRoot->getParent(),
20608 ReductionRoot->getIterator());
20609 Vec = Builder.CreateIntCast(
20613 It->second.second);
20619 LLVM_DEBUG(
dbgs() <<
"SLP: Optimizing " << GatherShuffleExtractSeq.size()
20620 <<
" gather sequences instructions.\n");
  // LICM insertelement/shuffle sequences out of loops where possible.
  for (Instruction *I : GatherShuffleExtractSeq) {
    if (isDeleted(I))
      continue;

    // Check if this block is inside a loop.
    Loop *L = LI->getLoopFor(I->getParent());
    if (!L)
      continue;

    // Check if it has a preheader.
    BasicBlock *PreHeader = L->getLoopPreheader();
    if (!PreHeader)
      continue;

    // If the vector or any element that we insert into it are instructions
    // that are defined in this basic block then we can't hoist this
    // instruction.
    if (any_of(I->operands(), [L](Value *V) {
          auto *OpI = dyn_cast<Instruction>(V);
          return OpI && L->contains(OpI);
        }))
      continue;

    // We can hoist this instruction. Move it to the pre-header.
    I->moveBefore(PreHeader->getTerminator()->getIterator());
    CSEBlocks.insert(PreHeader);
  }

  // Make a list of all reachable blocks in our CSE queue.
  SmallVector<const DomTreeNode *, 8> CSEWorkList;
  CSEWorkList.reserve(CSEBlocks.size());
  for (BasicBlock *BB : CSEBlocks)
    if (DomTreeNode *N = DT->getNode(BB)) {
      assert(DT->isReachableFromEntry(N));
      CSEWorkList.push_back(N);
    }

  // Sort blocks by domination. This ensures we visit a block after all blocks
  // dominating it are visited.
  llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
  });

  // Less defined shuffles can be replaced by the more defined copies. Between
  // two shuffles over the same operands, one is less defined if its mask
  // indices match the other's or are poison.
  auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
                                                Instruction *I2,
                                                SmallVectorImpl<int> &NewMask) {
    if (I1->getType() != I2->getType())
      return false;
    auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
    auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
    if (!SI1 || !SI2)
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
      return true;
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
        return false;
    // Check if the second instruction is more defined than the first one.
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    ArrayRef<int> SM1 = SI1->getShuffleMask();
    // Count trailing undefs in the mask to check the final number of used
    // registers.
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
      if (SM1[I] == PoisonMaskElem)
        ++LastUndefsCnt;
      else
        LastUndefsCnt = 0;
      if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
          NewMask[I] != SM1[I])
        return false;
      if (NewMask[I] == PoisonMaskElem)
        NewMask[I] = SM1[I];
    }
    // Check if the last undefs actually change the final number of used
    // vector registers.
    return SM1.size() - LastUndefsCnt > 1 &&
           TTI->getNumberOfParts(SI1->getType()) ==
               TTI->getNumberOfParts(
                   getWidenedType(SI1->getType()->getElementType(),
                                  SM1.size() - LastUndefsCnt));
  };

  // Perform an O(N^2) search over the gather/shuffle sequences and merge
  // identical instructions.
  SmallVector<Instruction *, 16> Visited;
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert(*I &&
           (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");
    BasicBlock *BB = (*I)->getBlock();
    // For all instructions in blocks containing gather sequences:
    for (Instruction &In : llvm::make_early_inc_range(*BB)) {
      if (isDeleted(&In) || !GatherShuffleExtractSeq.contains(&In))
        continue;

      // Check if we can replace this instruction with any of the visited
      // instructions.
      bool Replaced = false;
      for (Instruction *&V : Visited) {
        SmallVector<int> NewMask;
        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
            DT->dominates(V->getParent(), In.getParent())) {
          In.replaceAllUsesWith(V);
          eraseInstruction(&In);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          Replaced = true;
          break;
        }
        if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
            GatherShuffleExtractSeq.contains(V) &&
            IsIdenticalOrLessDefined(V, &In, NewMask) &&
            DT->dominates(In.getParent(), V->getParent())) {
          In.moveAfter(V);
          V->replaceAllUsesWith(&In);
          eraseInstruction(V);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          V = &In;
          Replaced = true;
          break;
        }
      }
      if (!Replaced) {
        assert(!is_contained(Visited, &In));
        Visited.push_back(&In);
      }
    }
  }
  CSEBlocks.clear();
  GatherShuffleExtractSeq.clear();
}
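// Note on the "less defined" CSE above: between two shuffles over the same
// operands, one is less defined if its mask matches the other except for
// poison lanes. A sketch (illustrative IR, not from a test):
//   %s1 = shufflevector <4 x i32> %v, <4 x i32> poison, <0, poison, 2, poison>
//   %s2 = shufflevector <4 x i32> %v, <4 x i32> poison, <0, 1, 2, 3>
// Here %s1 is less defined than %s2, so (subject to domination) %s1 can be
// replaced by %s2; the merged mask keeps the defined lanes of both, and the
// trailing-poison check makes sure the rewrite does not grow the number of
// vector registers actually used.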
BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
    ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
  auto &BundlePtr =
      ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    auto *I = cast<Instruction>(V);
    if (S.isCopyableElement(V)) {
      // Copyable lanes get their own per-edge schedule data.
      ScheduleCopyableData &SD =
          addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
      BundlePtr->add(&SD);
      continue;
    }
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember && "no ScheduleData for bundle member "
                           "(maybe not in same basic block)");
    BundlePtr->add(BundleMember);
    ScheduledBundles.try_emplace(I).first->getSecond().push_back(
        BundlePtr.get());
  }
  assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
  return *BundlePtr;
}
// Groups the instructions into a bundle (which is then a single scheduling
// entity) and schedules instructions until the bundle gets ready.
std::optional<BoUpSLP::ScheduleBundle *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S,
                                            const EdgeInfo &EI) {
  bool HasCopyables = S.areInstructionsWithCopyableElements();
  if (isa<PHINode>(S.getMainOp()) ||
      isVectorLikeInstWithConstOps(S.getMainOp()) ||
      (!HasCopyables && doesNotNeedToSchedule(VL)) ||
      (HasCopyables &&
       all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) {
    // If some operands were replaced by copyable data, dependencies of the
    // previously calculated schedule data may need recalculation.
    SmallVector<ScheduleData *> ControlDependentMembers;
    for (Value *V : VL) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I || (HasCopyables && S.isCopyableElement(V)))
        continue;
      SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
      for (const Use &U : I->operands()) {
        unsigned &NumOps =
            UserOpToNumOps.try_emplace(std::make_pair(I, U.get()), 0)
                .first->getSecond();
        ++NumOps;
        if (auto *Op = dyn_cast<Instruction>(U.get());
            Op && areAllOperandsReplacedByCopyableData(I, Op, *SLP, NumOps)) {
          if (ScheduleData *OpSD = getScheduleData(Op);
              OpSD && OpSD->hasValidDependencies()) {
            OpSD->clearDirectDependencies();
            if (RegionHasStackSave || OpSD->getInst()->mayReadOrWriteMemory())
              ControlDependentMembers.push_back(OpSD);
          }
        }
      }
    }
    if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, /*InsertInReadyList=*/true, SLP,
                            ControlDependentMembers);
    }
    return nullptr;
  }

  // Initialize the instruction bundle.
  Instruction *OldScheduleEnd = ScheduleEnd;
  LLVM_DEBUG(dbgs() << "SLP:  bundle: " << *S.getMainOp() << "\n");

  auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
    SmallVector<ScheduleData *> ControlDependentMembers;
    auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
      SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
      for (ScheduleEntity *SE : Bundle.getBundle()) {
        if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
          // The instruction is modeled as a copy - invalidate the direct
          // dependencies of the real instruction's schedule data.
          if (ScheduleData *BundleMember = getScheduleData(CD->getInst());
              BundleMember && BundleMember->hasValidDependencies()) {
            BundleMember->clearDirectDependencies();
            if (RegionHasStackSave ||
                BundleMember->getInst()->mayReadOrWriteMemory())
              ControlDependentMembers.push_back(BundleMember);
          }
          continue;
        }
        auto *SD = cast<ScheduleData>(SE);
        if (SD->hasValidDependencies() &&
            (!S.areInstructionsWithCopyableElements() ||
             !S.isCopyableElement(SD->getInst())) &&
            !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
            EI.UserTE->hasState() &&
            (!EI.UserTE->hasCopyableElements() ||
             !EI.UserTE->isCopyableElement(SD->getInst())))
          SD->clearDirectDependencies();
        for (const Use &U : SD->getInst()->operands()) {
          unsigned &NumOps =
              UserOpToNumOps
                  .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
                  .first->getSecond();
          ++NumOps;
          if (auto *Op = dyn_cast<Instruction>(U.get());
              Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
                                                         *SLP, NumOps)) {
            if (ScheduleData *OpSD = getScheduleData(Op);
                OpSD && OpSD->hasValidDependencies()) {
              OpSD->clearDirectDependencies();
              if (RegionHasStackSave ||
                  OpSD->getInst()->mayReadOrWriteMemory())
                ControlDependentMembers.push_back(OpSD);
            }
          }
        }
      }
    };
    // The scheduling region got new instructions at the lower end (or it is a
    // new region for the first bundle). This makes it necessary to
    // recalculate all dependencies.
    if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
      for_each(ScheduleDataMap, [&](auto &P) {
        if (BB != P.first->getParent())
          return;
        ScheduleData *SD = P.second;
        if (isInSchedulingRegion(*SD))
          SD->clearDependencies();
      });
      for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
        for_each(P.second, [&](ScheduleCopyableData *SD) {
          if (isInSchedulingRegion(*SD))
            SD->clearDependencies();
        });
      });
      ReSchedule = true;
    }
    if (Bundle && !Bundle.getBundle().empty()) {
      if (S.areInstructionsWithCopyableElements() ||
          !ScheduleCopyableDataMap.empty())
        CheckIfNeedToClearDeps(Bundle);
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle
                        << " in block " << BB->getName() << "\n");
      calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
                            ControlDependentMembers);
    } else if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
                            ControlDependentMembers);
    }

    if (ReSchedule) {
      resetSchedule();
      initialFillReadyList(ReadyInsts);
    }

    // Now try to schedule the new bundle. As soon as the bundle is "ready" it
    // means that there are no cyclic dependencies and we can schedule it.
    // Note that's important that we don't "schedule" the bundle yet.
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleEntity *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isReady() && "must be ready to schedule");
      schedule(*SLP, S, EI, Picked, ReadyInsts);
      if (Picked == &Bundle)
        break;
    }
  };

  // Make sure that the scheduling region contains all instructions of the
  // bundle.
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    if (!extendSchedulingRegion(V, S)) {
      // If the region size limit is exceeded, still recalculate dependencies
      // for whatever was extended; otherwise the compiler may crash trying to
      // calculate dependencies incorrectly and emit instructions in the wrong
      // order at the actual scheduling.
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
      return std::nullopt;
    }
  }

  bool ReSchedule = false;
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    SmallVector<ScheduleCopyableData *> CopyableData =
        getScheduleCopyableData(cast<Instruction>(V));
    if (!CopyableData.empty()) {
      for (ScheduleCopyableData *SD : CopyableData)
        ReadyInsts.remove(SD);
    }
    ScheduleData *BundleMember = getScheduleData(V);
    assert((BundleMember || S.isCopyableElement(V)) &&
           "no ScheduleData for bundle member (maybe not in same basic block)");
    if (!BundleMember)
      continue;

    // Make sure we don't leave the pieces of the bundle in the ready list
    // when the whole bundle might not be ready.
    ReadyInsts.remove(BundleMember);
    if (ArrayRef<ScheduleBundle *> Bundles =
            getScheduleBundles(BundleMember->getInst());
        !Bundles.empty()) {
      for (ScheduleBundle *B : Bundles)
        ReadyInsts.remove(B);
    }

    if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
      continue;
    // A bundle member was scheduled as single instruction before and now
    // needs to be scheduled as part of the bundle. We just get rid of the
    // existing schedule.
    LLVM_DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
                      << " was already scheduled\n");
    ReSchedule = true;
  }

  ScheduleBundle &Bundle = buildBundle(VL, S, EI);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle.isReady()) {
    // Scheduling failed (cyclic dependency): undo the bundle.
    for (ScheduleEntity *BD : Bundle.getBundle()) {
      if (BD->isReady()) {
        ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
        if (Bundles.empty()) {
          ReadyInsts.insert(BD);
          continue;
        }
        for (ScheduleBundle *B : Bundles)
          if (B->isReady())
            ReadyInsts.insert(B);
      }
    }
    ScheduledBundlesList.pop_back();
    SmallVector<ScheduleData *> ControlDependentMembers;
    SmallPtrSet<Instruction *, 4> Visited;
    for (Value *V : VL) {
      if (S.isNonSchedulable(V))
        continue;
      auto *I = cast<Instruction>(V);
      if (S.isCopyableElement(I)) {
        // Remove the copyable data from the scheduling region.
        auto KV = std::make_pair(EI, I);
        assert(ScheduleCopyableDataMap.contains(KV) &&
               "no ScheduleCopyableData for copyable element");
        ScheduleCopyableData *SD =
            ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
        ScheduleCopyableDataMapByUsers[I].remove(SD);
        if (EI.UserTE) {
          ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
          const auto *It = find(Op, I);
          assert(It != Op.end() && "Lane not set");
          SmallPtrSet<Instruction *, 4> Visited;
          do {
            int Lane = std::distance(Op.begin(), It);
            assert(Lane >= 0 && "Lane not set");
            if (!EI.UserTE->ReorderIndices.empty())
              Lane = EI.UserTE->ReorderIndices[Lane];
            assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                   "Couldn't find extract lane");
            auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
            if (Visited.insert(In).second)
              ScheduleCopyableDataMapByInstUser
                  [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
                      .remove(SD);
            It = find(make_range(std::next(It), Op.end()), I);
          } while (It != Op.end());
        }
        const EdgeInfo &UserEI = EI.UserTE->UserTreeIndex;
        if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
          ScheduleCopyableDataMapByUsers[I].insert(UserCD);
        if (ScheduleCopyableDataMapByUsers[I].empty())
          ScheduleCopyableDataMapByUsers.erase(I);
        ScheduleCopyableDataMap.erase(KV);
        // The restored instruction's dependencies may need recalculation.
        if (ScheduleData *OpSD = getScheduleData(I);
            OpSD && OpSD->hasValidDependencies()) {
          OpSD->clearDirectDependencies();
          if (RegionHasStackSave || OpSD->getInst()->mayReadOrWriteMemory())
            ControlDependentMembers.push_back(OpSD);
        }
        continue;
      }
      ScheduledBundles.find(I)->getSecond().pop_back();
    }
    if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
                            ControlDependentMembers);
    }
    return std::nullopt;
  }
  return &Bundle;
}
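// Scheduling flow in a nutshell: tryScheduleBundle() only groups the lanes
// and computes dependencies; nothing is reordered yet. A bundle whose members
// still have unscheduled in-region dependencies is "not ready", and the loop
// in TryScheduleBundleImpl keeps retiring other ready entities until either
// the bundle becomes ready (success, return &Bundle) or the ready list dries
// up, which indicates a cyclic dependency and the bundle is taken apart
// again (return std::nullopt, the caller gathers instead).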
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new ScheduleData for the instruction.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  Instruction *I = dyn_cast<Instruction>(V);
  assert(I && "bundle member must be an instruction");
  if (getScheduleData(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region. Ignore
  // debug info and other assume-like intrinsics so that they are not counted
  // against the region size budget.
  BasicBlock::reverse_iterator UpIter =
      ++ScheduleStart->getIterator().getReverse();
  BasicBlock::reverse_iterator UpperEnd = BB->rend();
  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
  BasicBlock::iterator LowerEnd = BB->end();
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP:  exceeded schedule region size limit\n");
      return false;
    }
    ++UpIter;
    ++DownIter;
    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    ScheduleStart = I;
    LLVM_DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I
                      << "\n");
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                   nullptr);
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
  return true;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    if (doesNotNeedToBeScheduled(I))
      continue;
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    }
    assert(!isInSchedulingRegion(*SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);

    if (I->mayReadOrWriteMemory() &&
        (!isa<IntrinsicInst>(I) ||
         (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore)
        CurrentLoadStore->setNextLoadStore(SD);
      else
        FirstLoadStoreInRegion = SD;
      CurrentLoadStore = SD;
    }

    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->setNextLoadStore(NextLoadStore);
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
void BoUpSLP::BlockScheduling::calculateDependencies(
    ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
    ArrayRef<ScheduleData *> ControlDeps) {
  SmallVector<ScheduleEntity *> WorkList;
  auto ProcessNode = [&](ScheduleEntity *SE) {
    if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
      if (CD->hasValidDependencies())
        return;
      LLVM_DEBUG(dbgs() << "SLP:       update deps of " << *CD << "\n");
      CD->initDependencies();
      CD->resetUnscheduledDeps();
      // A copyable element depends on the (copyable) users of its lane in the
      // user tree node.
      const EdgeInfo &EI = CD->getEdgeInfo();
      if (EI.UserTE) {
        ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
        const auto *It = find(Op, CD->getInst());
        assert(It != Op.end() && "Lane not set");
        SmallPtrSet<Instruction *, 4> Visited;
        do {
          int Lane = std::distance(Op.begin(), It);
          assert(Lane >= 0 && "Lane not set");
          if (!EI.UserTE->ReorderIndices.empty())
            Lane = EI.UserTE->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                 "Couldn't find extract lane");
          auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
          if (EI.UserTE->isCopyableElement(In)) {
            if (ScheduleCopyableData *UseSD =
                    getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
              CD->incDependencies();
              if (!UseSD->isScheduled())
                CD->incrementUnscheduledDeps(1);
              if (!UseSD->hasValidDependencies() ||
                  (InsertInReadyList && UseSD->isReady()))
                WorkList.push_back(UseSD);
            }
          } else if (Visited.insert(In).second) {
            if (ScheduleData *UseSD = getScheduleData(In)) {
              CD->incDependencies();
              if (!UseSD->isScheduled())
                CD->incrementUnscheduledDeps(1);
              if (!UseSD->hasValidDependencies() ||
                  (InsertInReadyList && UseSD->isReady()))
                WorkList.push_back(UseSD);
            }
          }
          It = find(make_range(std::next(It), Op.end()), CD->getInst());
        } while (It != Op.end());
      }
      if (CD->isReady() && CD->getDependencies() == 0 &&
          (EI.UserTE->hasState() &&
           (EI.UserTE->getMainOp()->getParent() !=
                CD->getInst()->getParent() ||
            (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
             any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
               auto *IU = dyn_cast<Instruction>(U);
               if (!IU)
                 return true;
               return IU->getParent() == EI.UserTE->getMainOp()->getParent();
             }))))) {
        // No real uses in the block - add a pseudo dependency so the copy is
        // not scheduled prematurely.
        CD->incDependencies();
        CD->incrementUnscheduledDeps(1);
      }
      return;
    }
    auto *BundleMember = cast<ScheduleData>(SE);
    if (BundleMember->hasValidDependencies())
      return;
    LLVM_DEBUG(dbgs() << "SLP:       update deps of " << *BundleMember
                      << "\n");
    BundleMember->initDependencies();
    BundleMember->resetUnscheduledDeps();
    // Handle def-use chain dependencies.
    SmallDenseMap<Value *, unsigned> UserToNumOps;
    for (User *U : BundleMember->getInst()->users()) {
      if (ScheduleData *UseSD = getScheduleData(U)) {
        unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
        ++NumOps;
        // Skip the use if all its operands are replaced by copyable data.
        if (areAllOperandsReplacedByCopyableData(
                cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
          continue;
        BundleMember->incDependencies();
        if (!UseSD->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!UseSD->hasValidDependencies() ||
            (InsertInReadyList && UseSD->isReady()))
          WorkList.push_back(UseSD);
      }
    }
    for (ScheduleCopyableData *UseSD :
         getScheduleCopyableDataUsers(BundleMember->getInst())) {
      BundleMember->incDependencies();
      if (!UseSD->isScheduled())
        BundleMember->incrementUnscheduledDeps(1);
      if (!UseSD->hasValidDependencies() ||
          (InsertInReadyList && UseSD->isReady()))
        WorkList.push_back(UseSD);
    }

    SmallPtrSet<const Instruction *, 4> Visited;
    auto MakeControlDependent = [&](Instruction *I) {
      if (!Visited.insert(I).second)
        return;
      auto *DepDest = getScheduleData(I);
      assert(DepDest && "must be in schedule window");
      DepDest->addControlDependency(BundleMember);
      BundleMember->incDependencies();
      if (!DepDest->isScheduled())
        BundleMember->incrementUnscheduledDeps(1);
      if (!DepDest->hasValidDependencies() ||
          (InsertInReadyList && DepDest->isReady()))
        WorkList.push_back(DepDest);
    };

    // Any instruction which isn't safe to speculate at the beginning of the
    // block is control dependent on any early exit or non-willreturn call
    // which precedes it.
    if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->getInst())) {
      for (Instruction *I = BundleMember->getInst()->getNextNode();
           I != ScheduleEnd; I = I->getNextNode()) {
        if (isSafeToSpeculativelyExecute(I))
          continue;
        MakeControlDependent(I);
        if (!isGuaranteedToTransferExecutionToSuccessor(I))
          break;
      }
    }

    if (RegionHasStackSave) {
      // Allocas must not be reordered across stacksave/stackrestore
      // boundaries, in either direction.
      if (match(BundleMember->getInst(),
                m_Intrinsic<Intrinsic::stacksave>()) ||
          match(BundleMember->getInst(),
                m_Intrinsic<Intrinsic::stackrestore>())) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (!isa<AllocaInst>(I))
            continue;
          MakeControlDependent(I);
        }
      }
      if (isa<AllocaInst>(BundleMember->getInst()) ||
          BundleMember->getInst()->mayReadOrWriteMemory()) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
              !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
            continue;
          MakeControlDependent(I);
          break;
        }
      }
    }

    // Handle the memory dependencies (if any).
    ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
    if (!NextLoadStore)
      return;
    Instruction *SrcInst = BundleMember->getInst();
    assert(SrcInst->mayReadOrWriteMemory() &&
           "NextLoadStore list for non memory effecting bundle?");
    MemoryLocation SrcLoc = getLocation(SrcInst);
    bool SrcMayWrite = SrcInst->mayWriteToMemory();
    unsigned NumAliased = 0;
    unsigned DistToSrc = 1;
    bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);

    for (ScheduleData *DepDest = NextLoadStore; DepDest;
         DepDest = DepDest->getNextLoadStore()) {
      assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
      // Two limits reduce the complexity: AliasedCheckLimit caps the number
      // of alias queries, MaxMemDepDistance aborts for very large blocks.
      if (DistToSrc >= MaxMemDepDistance ||
          ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
           (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
            SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
        ++NumAliased;
        DepDest->addMemoryDependency(BundleMember);
        BundleMember->incDependencies();
        if (!DepDest->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!DepDest->hasValidDependencies() ||
            (InsertInReadyList && DepDest->isReady()))
          WorkList.push_back(DepDest);
      }
      ++DistToSrc;
    }
  };

  assert((Bundle || !ControlDeps.empty()) &&
         "expected at least one instruction to schedule");
  if (Bundle)
    WorkList.push_back(Bundle.getBundle().front());
  WorkList.append(ControlDeps.begin(), ControlDeps.end());
  SmallPtrSet<ScheduleBundle *, 16> Visited;
  while (!WorkList.empty()) {
    ScheduleEntity *SD = WorkList.pop_back_val();
    SmallVector<ScheduleBundle *, 1> CopyableBundle;
    ArrayRef<ScheduleBundle *> Bundles;
    if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) {
      CopyableBundle.push_back(&CD->getBundle());
      Bundles = CopyableBundle;
    } else {
      Bundles = getScheduleBundles(SD->getInst());
    }
    if (Bundles.empty()) {
      if (!SD->hasValidDependencies())
        ProcessNode(SD);
      if (InsertInReadyList && SD->isReady()) {
        ReadyInsts.insert(SD);
        LLVM_DEBUG(dbgs() << "SLP:     gets ready on update: " << *SD << "\n");
      }
      continue;
    }
    for (ScheduleBundle *Bundle : Bundles) {
      if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
        continue;
      assert(isInSchedulingRegion(*Bundle) &&
             "ScheduleData not in scheduling region");
      for_each(Bundle->getBundle(), ProcessNode);
    }
    if (InsertInReadyList && SD->isReady()) {
      for (ScheduleBundle *Bundle : Bundles) {
        assert(isInSchedulingRegion(*Bundle) &&
               "ScheduleData not in scheduling region");
        if (!Bundle->isReady())
          continue;
        ReadyInsts.insert(Bundle);
        LLVM_DEBUG(dbgs() << "SLP:     gets ready on update: " << *Bundle
                          << "\n");
      }
    }
  }
}
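// Dependency calculation distinguishes three edge kinds, all funneled through
// the same counters: def-use edges (in-region users of the instruction),
// control edges (MakeControlDependent: instructions that must not be
// reordered across early exits, non-willreturn calls, or
// stacksave/stackrestore and alloca pairs), and memory edges (may-alias
// load/store pairs found by walking the region's load/store list, throttled
// by AliasedCheckLimit and MaxMemDepDistance). An entity becomes "ready" once
// every counted dependency has been scheduled.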
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for_each(ScheduleDataMap, [&](auto &P) {
    if (BB != P.first->getParent())
      return;
    ScheduleData *SD = P.second;
    if (isInSchedulingRegion(*SD)) {
      SD->setScheduled(/*Scheduled=*/false);
      SD->resetUnscheduledDeps();
    }
  });
  for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
    for_each(P.second, [&](ScheduleCopyableData *SD) {
      if (isInSchedulingRegion(*SD)) {
        SD->setScheduled(false);
        SD->resetUnscheduledDeps();
      }
    });
  });
  for_each(ScheduledBundles, [&](auto &P) {
    for_each(P.second, [&](ScheduleBundle *Bundle) {
      if (isInSchedulingRegion(*Bundle))
        Bundle->setScheduled(false);
    });
  });
  for (auto &P : ScheduleCopyableDataMap) {
    if (isInSchedulingRegion(*P.second)) {
      P.second->setScheduled(false);
      P.second->resetUnscheduledDeps();
    }
  }
  ReadyInsts.clear();
}
void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  // A valid state must be re-established after the scheduling region grew.
  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This ensures the
  // instruction order stays deterministic.
  struct ScheduleDataCompare {
    bool operator()(const ScheduleEntity *SD1,
                    const ScheduleEntity *SD2) const {
      return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
    }
  };
  std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated (for nodes in the sub-graph)
  // and fill the ready-list with initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
    if (!Bundles.empty()) {
      for (ScheduleBundle *Bundle : Bundles) {
        Bundle->setSchedulingPriority(Idx++);
        if (!Bundle->hasValidDependencies())
          BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false,
                                    this);
      }
      ArrayRef<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
      for (ScheduleCopyableData *SD : reverse(SDs)) {
        ScheduleBundle &Bundle = SD->getBundle();
        Bundle.setSchedulingPriority(Idx++);
        if (!Bundle.hasValidDependencies())
          BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
      }
      continue;
    }
    SmallVector<ScheduleCopyableData *> CopyableData =
        BS->getScheduleCopyableDataUsers(I);
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
      assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
              SDTEs.front()->doesNotNeedToSchedule() ||
              doesNotNeedToSchedule(SDTEs.front()->Scalars)) &&
             "scheduler and vectorizer bundle mismatch");
      SD->setSchedulingPriority(Idx++);
      if (!SD->hasValidDependencies() &&
          (!CopyableData.empty() ||
           any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
             assert(TE->isGather() && "expected gather node");
             return TE->hasState() && TE->hasCopyableElements() &&
                    TE->isCopyableElement(I);
           }))) {
        // If there are copyable instructions in the tree - adjust the
        // dependencies via a temporary single-member bundle.
        ScheduleBundle Bundle;
        Bundle.add(SD);
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
      }
    }
    for (ScheduleCopyableData *SD : reverse(CopyableData)) {
      ScheduleBundle &Bundle = SD->getBundle();
      Bundle.setSchedulingPriority(Idx++);
      if (!Bundle.hasValidDependencies())
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
    }
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  SmallPtrSet<Instruction *, 16> Scheduled;
  while (!ReadyInsts.empty()) {
    auto *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
      for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
        Instruction *PickedInst = BundleMember->getInst();
        bool IsCopyable =
            Bundle->getTreeEntry()->isCopyableElement(PickedInst);
        if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
            (!IsCopyable && !Scheduled.insert(PickedInst).second))
          continue;
        if (PickedInst->getNextNode() != LastScheduledInst)
          PickedInst->moveBefore(LastScheduledInst->getIterator());
        LastScheduledInst = PickedInst;
        EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
                                           LastScheduledInst);
      }
    } else {
      Instruction *PickedInst = cast<ScheduleData>(Picked)->getInst();
      if (PickedInst->getNextNode() != LastScheduledInst)
        PickedInst->moveBefore(LastScheduledInst->getIterator());
      LastScheduledInst = PickedInst;
    }
    auto Invalid = InstructionsState::invalid();
    BS->schedule(R, Invalid, EdgeInfo(), Picked, ReadyInsts);
  }

  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
  BS->verify();
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled.
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
    assert(all_of(Bundles,
                  [](const ScheduleBundle *Bundle) {
                    return Bundle->isScheduled();
                  }) &&
           "must be scheduled at this point");
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
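// The "real" scheduling above is plain list scheduling: priorities are the
// original instruction positions, and the ready set is ordered by them, so
// ties always resolve to source order and the output is deterministic. Note
// the comparator makes the *bottom-most* ready entity come first, which
// matches the insertion point: LastScheduledInst starts at ScheduleEnd and
// walks upward as each picked instruction is moved directly before it.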
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;

  // Otherwise traverse the expression tree feeding V looking for loads (and
  // other memory reads): the width of a memory access is a better basis for
  // the vector element size than the width of V itself.
  SmallVector<std::pair<Instruction *, BasicBlock *>> Worklist;
  SmallPtrSet<Instruction *, 16> Visited;
  if (auto *I = dyn_cast<Instruction>(V)) {
    Worklist.emplace_back(I, I->getParent());
    Visited.insert(I);
  }
  unsigned Width = 0;
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    auto [I, Parent] = Worklist.pop_back_val();

    // We should only be looking at scalar instructions here.
    auto *Ty = I->getType();
    if (isa<VectorType>(Ty))
      continue;
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      FirstNonBool = I;

    // Loads (and load-like reads) determine the width directly.
    if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I)) {
      Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
      continue;
    }
    // Otherwise visit the operands (same-block only, to bound the walk).
    for (Use &U : I->operands()) {
      if (auto *J = dyn_cast<Instruction>(U.get());
          J && Visited.insert(J).second && J->getParent() == Parent) {
        Worklist.emplace_back(J, Parent);
        continue;
      }
      if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
        FirstNonBool = U.get();
    }
  }

  // If no memory access was found, fall back to the width of V itself (or of
  // the first non-boolean value seen, for i1 roots).
  if (!Width) {
    if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
      V = FirstNonBool;
    Width = DL->getTypeSizeInBits(V->getType());
  }

  for (Instruction *I : Visited)
    InstrElementSize[I] = Width;

  return Width;
}
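// Example: for the chain `%l = load i8; %z = zext i8 %l to i32;
// %a = add i32 %z, 1`, querying the element size of %a walks back through the
// zext to the i8 load and returns 8 rather than 32, so VF computations based
// on register width use the narrow memory width of the expression.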
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
    const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
  // We can always demote constants.
  if (all_of(E.Scalars, IsaPred<Constant>))
    return true;

  unsigned OrigBitWidth =
      DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
  if (OrigBitWidth == BitWidth) {
    MaxDepthLevel = 1;
    return true;
  }

  // Check if the node was analyzed already and must keep its original
  // bitwidth.
  if (NodesToKeepBWs.contains(E.Idx))
    return false;

  bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
    if (isa<PoisonValue>(R))
      return false;
    return !isKnownNonNegative(R, SimplifyQuery(*DL));
  });
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    if (isa<PoisonValue>(V))
      return true;
    // Do not demote values used in several vector nodes.
    if (getTreeEntries(V).size() > 1)
      return false;
    bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      // Keep enough bits for the significant (non-sign) part and for the
      // demanded bits.
      unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
      unsigned BitWidth1 = OrigBitWidth - NumSignBits;
      APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
      unsigned BitWidth2 =
          std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
      while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
        // ... (grow BitWidth2 until the value's sign is preserved)
        BitWidth2 *= 2;
      }
      BitWidth1 = std::min(BitWidth1, BitWidth2);
      BitWidth = std::max(BitWidth, BitWidth1);
    }
    return true;
  };
  auto FinalAnalysis = [&, TTI = TTI]() {
    if (!IsProfitableToDemote)
      return false;
    bool Res = all_of(
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    // Demote gathers.
    if (Res && E.isGather()) {
      if (E.hasState()) {
        if (const TreeEntry *SameTE =
                getSameValuesTreeEntry(E.getMainOp(), E.Scalars))
          if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot,
                                    BitWidth, ToDemote, Visited,
                                    NodesToKeepBWs, MaxDepthLevel,
                                    IsProfitableToDemote, IsTruncRoot)) {
            ToDemote.push_back(E.Idx);
            return true;
          }
      }
      // Check extracts: demote only if it does not increase the number of
      // vector registers for the sources.
      SmallPtrSet<Value *, 4> UniqueBases;
      for (Value *V : E.Scalars) {
        auto *EE = dyn_cast<ExtractElementInst>(V);
        if (!EE)
          continue;
        UniqueBases.insert(EE->getVectorOperand());
      }
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      if (UniqueBases.size() <= 2 ||
          TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF)) ==
              TTI->getNumberOfParts(getWidenedType(
                  IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
        ToDemote.push_back(E.Idx);
    }
    return Res;
  };
  if (E.isGather() || !Visited.insert(&E).second ||
      any_of(E.Scalars, [&](Value *V) {
        return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !isVectorized(U);
        });
      }))
    return FinalAnalysis();

  if (any_of(E.Scalars, [&](Value *V) {
        return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
          return isVectorized(U) ||
                 (E.Idx == 0 && UserIgnoreList &&
                  UserIgnoreList->contains(U)) ||
                 (!isa<CmpInst>(U) && U->getType()->isSized() &&
                  !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
      }))
    return false;

  auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, NodesToKeepBWs, Level,
                                 IsProfitableToDemote, IsTruncRoot)) {
        if (!IsProfitableToDemote)
          return false;
        NeedToExit = true;
        if (!FinalAnalysis())
          return false;
        continue;
      }
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
    }
    return true;
  };
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidths < OrigBitWidth.
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
        for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
          if (Checker(BitWidth, OrigBitWidth))
            return true;
          if (BestFailBitwidth == 0 && FinalAnalysis())
            BestFailBitwidth = BitWidth;
        }
        if (BestFailBitwidth == 0) {
          BitWidth = OrigBitWidth;
          return false;
        }
        MaxDepthLevel = 1;
        BitWidth = BestFailBitwidth;
        NeedToExit = true;
        return true;
      };
  auto TryProcessInstruction =
      [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        if (Operands.empty()) {
          if (!IsTruncRoot)
            MaxDepthLevel = 1;
          for (Value *V : E.Scalars)
            (void)IsPotentiallyTruncated(V, BitWidth);
        } else {
          // Several vectorized uses? Check if we can truncate them all,
          // otherwise - exit.
          if (any_of(E.Scalars, [&](Value *V) {
                return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
              }))
            return false;
          bool NeedToExit = false;
          if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
          if (!ProcessOperands(Operands, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
        }

        ++MaxDepthLevel;
        // Record the entry that we can demote.
        ToDemote.push_back(E.Idx);
        return IsProfitableToDemote;
      };

  if (E.State == TreeEntry::SplitVectorize)
    return TryProcessInstruction(
        BitWidth,
        {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
         VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});

  switch (E.getOpcode()) {
  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
        E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
      return false;
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  }
  case Instruction::Freeze:
    return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
  case Instruction::Shl: {
    // If we are truncating the result of this SHL, it is safe iff the shift
    // amount is provably less than the new bitwidth.
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        ShlChecker);
  }
  case Instruction::LShr: {
    // A truncated lshr is safe iff the bits we would otherwise shift in are
    // already zeros.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        if (E.isCopyableElement(V))
          return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    // A truncated ashr is safe iff all the shifted-out bits are copies of the
    // narrow type's sign bit.
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ShiftedBits <
                   ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return MaskedValueIsZero(I->getOperand(0), Mask,
                                 SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  }

  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  }

  // We can demote phis if we can demote all their incoming operands.
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
    SmallVector<const TreeEntry *> Ops(NumOps);
    transform(seq<unsigned>(0, NumOps), Ops.begin(),
              [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
    return TryProcessInstruction(BitWidth, Ops);
  }

  case Instruction::Call: {
    auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
    if (!IC)
      break;
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin &&
        ID != Intrinsic::umax)
      break;
    SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
    function_ref<bool(unsigned, unsigned)> CallChecker;
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask,
                                   SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
        unsigned Op1SignBits =
            ComputeNumSignBits(I->getOperand(1), *DL, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask,
                                  SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
                 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(1), Mask,
                                  SimplifyQuery(*DL)));
      });
    };
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask,
                                  SimplifyQuery(*DL)));
      });
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    } else {
      CallChecker = AbsChecker;
    }
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned BestBitWidth = BitWidth;
    unsigned VF = E.Scalars.size();
    // Choose the best bitwidth based on cost estimations.
    auto Checker = [&](unsigned BitWidth, unsigned) {
      // Compare the intrinsic cost at this width against the best so far
      // (cost computation elided).
      if (Cost < BestCost) {
        BestCost = Cost;
        BestBitWidth = BitWidth;
      }
      return false;
    };
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    BitWidth = BestBitWidth;
    return TryProcessInstruction(BitWidth, Operands, CallChecker);
  }

  // Otherwise, conservatively give up demoting this node.
  default:
    break;
  }
  MaxDepthLevel = 1;
  return FinalAnalysis();
}
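// Worked example for the LShr rule above: demoting a 32-bit `%x lshr 4` to 16
// bits is sound only if the shift amount is provably < 16 and bits [16..31]
// of the shifted operand are known zero; then every bit that survives the
// narrow shift equals the corresponding bit of the wide shift, so the whole
// tree can compute in i16 and zero-extend once at the boundary.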
void BoUpSLP::computeMinimumValueSizes() {
  // We only attempt to truncate integer expressions.
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->hasState() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;

  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
    NodeIdx = 1;

  // Ensure the roots of the vectorizable tree don't form a cycle.
  assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
          !VectorizableTree[NodeIdx]->UserTreeIndex) &&
         "Unexpected tree is graph.");

  // If the root is a trunc, remember it: the analysis is then performed on
  // its operand and the trunc itself becomes a demotion candidate.
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  SmallVector<unsigned> RootDemotes;
  SmallDenseSet<unsigned, 8> NodesToKeepBWs;
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt &&
           "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    RootDemotes.push_back(NodeIdx);
    IsProfitableToDemoteRoot = true;
    ++NodeIdx;
  }

  // Analyzed the reduction already and not profitable - exit.
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;

  SmallVector<unsigned> ToDemote;
  auto ComputeMaxBitWidth =
      [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
          unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
    ToDemote.clear();
    // If the root is a trunc and the next node is gather/buildvector, keep
    // the trunc in scalars, which is free in most cases.
    if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
        !NodesToKeepBWs.contains(E.Idx) &&
        E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
        all_of(E.Scalars, [&](Value *V) {
          return V->hasOneUse() || isa<Constant>(V) ||
                 (!V->hasNUsesOrMore(UsesLimit) &&
                  none_of(V->users(), [&](User *U) {
                    ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
                    const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
                    if (TEs.empty() || is_contained(TEs, UserTE))
                      return false;
                    if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                             SelectInst>(U) ||
                        isa<SIToFPInst, UIToFPInst>(U) ||
                        (UserTE->hasState() &&
                         (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                               SelectInst>(UserTE->getMainOp()) ||
                          isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
                      return true;
                    unsigned UserTESz = DL->getTypeSizeInBits(
                        UserTE->Scalars.front()->getType());
                    if (all_of(TEs, [&](const TreeEntry *TE) {
                          auto It = MinBWs.find(TE);
                          return It != MinBWs.end() &&
                                 It->second.first > UserTESz;
                        }))
                      return true;
                    return DL->getTypeSizeInBits(U->getType()) > UserTESz;
                  }));
        })) {
      ToDemote.push_back(E.Idx);
      const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
        MaxBitWidth = 8;
      return MaxBitWidth;
    }

    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
    auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
    if (!TreeRootIT)
      return 0u;

    if (any_of(E.Scalars,
               [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
      return 0u;

    // The maximum bit width required to represent all the values that can be
    // demoted without loss of precision.
    unsigned MaxBitWidth = 1u;

    // True if the roots can be zero-extended back to their original type,
    // rather than sign-extended. We know that if the leading bit of the
    // narrowed value is known zero, we can safely zero-extend, so we start
    // from the assumption that the values are non-negative and disprove it.
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      if (isa<PoisonValue>(R))
        return true;
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });

    if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
        E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
      MaxBitWidth =
          std::min(DL->getTypeSizeInBits(
                       E.UserTreeIndex.UserTE->Scalars.front()->getType()),
                   DL->getTypeSizeInBits(ScalarTy));

    // We first check if all the bits of the roots are demanded. If they're
    // not, we can truncate the roots to this narrower type.
    for (Value *Root : E.Scalars) {
      if (isa<PoisonValue>(Root))
        continue;
      unsigned NumSignBits = ComputeNumSignBits(Root, *DL, AC, nullptr, DT);
      TypeSize NumTypeBits = DL->getTypeSizeInBits(Root->getType());
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      // If we can't prove that the sign bit is zero, we must add one to the
      // maximum bit width to account for the unknown sign bit. This preserves
      // the existing sign bit so we can safely sign-extend the root back to
      // the original type.
      if (!IsKnownPositive)
        ++BitWidth1;

      auto *I = dyn_cast<Instruction>(Root);
      if (!I) {
        MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
        continue;
      }
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      MaxBitWidth =
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    }

    if (MaxBitWidth < 8 && MaxBitWidth > 1)
      MaxBitWidth = 8;

    // If the reduced type does not improve register use - ignore it.
    unsigned NumParts =
        ::getNumberOfParts(*TTI, getWidenedType(ScalarTy, VF));
    if (NumParts > 1 &&
        NumParts ==
            ::getNumberOfParts(
                *TTI, getWidenedType(IntegerType::get(F->getContext(),
                                                      bit_ceil(MaxBitWidth)),
                                     VF)))
      return 0u;

    unsigned Opcode = E.getOpcode();
    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression. Collect the values that can be demoted in ToDemote.
    DenseSet<const TreeEntry *> Visited;
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;

    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, NodesToKeepBWs,
                               MaxDepthLevel, NeedToDemote, IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(
                         E.getMainOp()->getOperand(0)->getType()) >
                 2)))))
      return 0u;
    // Round MaxBitWidth up to the next power-of-two.
    MaxBitWidth = bit_ceil(MaxBitWidth);

    return MaxBitWidth;
  };

  // Add reduction ops sizes, if any.
  if (UserIgnoreList &&
      isa<IntegerType>(
          VectorizableTree.front()->Scalars.front()->getType())) {
    // Convert vector_reduce_add(ZExt(<n x i1>)) to
    // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
    if (all_of(*UserIgnoreList,
               [](Value *V) {
                 return isa<PoisonValue>(V) ||
                        cast<Instruction>(V)->getOpcode() == Instruction::Add;
               }) &&
        VectorizableTree.front()->State == TreeEntry::Vectorize &&
        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
            Builder.getInt1Ty()) {
      ReductionBitWidth = 1;
    } else {
      for (Value *V : *UserIgnoreList) {
        if (isa<PoisonValue>(V))
          continue;
        unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
        TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
        unsigned BitWidth1 = NumTypeBits - NumSignBits;
        if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
          ++BitWidth1;
        unsigned BitWidth2 = BitWidth1;
        if (auto *I = dyn_cast<Instruction>(V)) {
          APInt Mask = DB->getDemandedBits(I);
          BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
        }
        ReductionBitWidth =
            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
      }
      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
        ReductionBitWidth = 8;

      ReductionBitWidth = bit_ceil(ReductionBitWidth);
    }
  }
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    RootDemotes.push_back(NodeIdx);
    ++NodeIdx;
    IsTruncRoot = true;
  }
  bool IsSignedCmp = false;
  if (UserIgnoreList && all_of(*UserIgnoreList, [](Value *V) {
        return match(V, m_SMin(m_Value(), m_Value())) ||
               match(V, m_SMax(m_Value(), m_Value()));
      }))
    IsSignedCmp = true;
  while (NodeIdx < VectorizableTree.size()) {
    ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
    unsigned Limit = 2;
    if (IsTopRoot &&
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      Limit = 3;
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot,
        Limit, IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    }

    for (unsigned Idx : RootDemotes) {
      if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
            uint32_t OrigBitWidth =
                DL->getTypeSizeInBits(V->getType()->getScalarType());
            if (OrigBitWidth > MaxBitWidth) {
              APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
              return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
            }
            return false;
          }))
        ToDemote.push_back(Idx);
    }
    RootDemotes.clear();
    IsTopRoot = false;
    IsProfitableToDemoteRoot = true;

    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
    } else {
      unsigned NewIdx = 0;
      do {
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
      NodeIdx = NewIdx;
      IsTruncRoot =
          NodeIdx < VectorizableTree.size() &&
          VectorizableTree[NodeIdx]->UserTreeIndex &&
          VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
              Instruction::Trunc &&
          !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
      IsSignedCmp =
          NodeIdx < VectorizableTree.size() &&
          VectorizableTree[NodeIdx]->UserTreeIndex &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
              Instruction::ICmp &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
                 [&](Value *V) {
                   auto *IC = dyn_cast<ICmpInst>(V);
                   return IC && (IC->isSigned() ||
                                 !isKnownNonNegative(IC->getOperand(0),
                                                     SimplifyQuery(*DL)) ||
                                 !isKnownNonNegative(IC->getOperand(1),
                                                     SimplifyQuery(*DL)));
                 });
    }

    // If the maximum bit width we compute is less than the width of the
    // roots' type, we can proceed with the narrowing. Otherwise, do nothing.
    if (MaxBitWidth == 0 ||
        MaxBitWidth >=
            cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
                ->getBitWidth()) {
      if (UserIgnoreList)
        AnalyzedMinBWVals.insert_range(TreeRoot);
      NodesToKeepBWs.insert_range(ToDemote);
      continue;
    }

    // Finally, map the values we can demote to the maximum bit width we
    // computed.
    for (unsigned Idx : ToDemote) {
      TreeEntry *TE = VectorizableTree[Idx].get();
      if (MinBWs.contains(TE))
        continue;
      bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
      MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
    }
  }
}
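// Putting it together (a sketch, not a test case): for a tree like
// `store i16 (trunc (add i32 (zext i8 %a), (zext i8 %b)))`,
// ComputeMaxBitWidth sees 8 significant bits plus one carry bit, rounds up
// with bit_ceil, and records 16 in MinBWs, so the add is vectorized as
// <N x i16> instead of <N x i32> and the zext/trunc pair collapses. The
// ExtraBitWidthNodes loop then repeats the analysis for sub-trees hanging
// off truncs and extensions that may admit an even narrower width.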
22580 DL = &
F.getDataLayout();
22588 if (!
TTI->getNumberOfRegisters(
TTI->getRegisterClassForType(
true))) {
22590 dbgs() <<
"SLP: Didn't find any vector registers for target, abort.\n");
22595 if (
F.hasFnAttribute(Attribute::NoImplicitFloat))
22598 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing blocks in " <<
F.getName() <<
".\n");
22602 BoUpSLP R(&
F,
SE,
TTI,
TLI,
AA,
LI,
DT,
AC,
DB,
DL, ORE_);
22608 DT->updateDFSNumbers();
22611 for (
auto *BB :
post_order(&
F.getEntryBlock())) {
22616 R.clearReductionData();
22617 collectSeedInstructions(BB);
22620 if (!Stores.empty()) {
22622 <<
" underlying objects.\n");
22623 Changed |= vectorizeStoreChains(R);
22627 Changed |= vectorizeChainsInBlock(BB, R);
22632 if (!GEPs.empty()) {
22634 <<
" underlying objects.\n");
22635 Changed |= vectorizeGEPIndices(BB, R);
22640 R.optimizeGatherSequence();
std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();

  if (!has_single_bit(Sz) ||
      !hasFullVectorsOrPowerOf2(
          *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
          VF) ||
      VF < 2 || VF < MinVF) {
    // ...
    return false;
  }

  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                    << "\n");

  SetVector<Value *> ValOps;
  for (Value *V : Chain)
    ValOps.insert(cast<StoreInst>(V)->getValueOperand());
  // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(
      ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
  if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
    DenseSet<Value *> Stores(Chain.begin(), Chain.end());
    bool IsAllowedSize =
        hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
                                 ValOps.size()) ||
        (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
    if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
         (!S.getMainOp()->isSafeToRemove() ||
          any_of(ValOps.getArrayRef(),
                 [&](Value *V) {
                   return !isa<ExtractElementInst>(V) &&
                          (V->getNumUses() > Chain.size() ||
                           any_of(V->users(), [&](User *U) {
                             return !Stores.contains(U);
                           }));
                 }))) ||
        (ValOps.size() > Chain.size() / 2 && !S)) {
      Size = (!IsAllowedSize && S) ? 1 : 2;
      return false;
    }
  }
  if (R.isLoadCombineCandidate(Chain))
    return true;
  R.buildTree(Chain);
  // Check if the tree is tiny and the store itself or its value is not
  // vectorized.
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
        R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
      return std::nullopt;
    Size = R.getCanonicalGraphSize();
    return false;
  }
  if (R.isProfitableToReorder()) {
    R.reorderTopToBottom();
    R.reorderBottomToTop();
  }
  R.transformNodes();
  R.buildExternalUses();

  R.computeMinimumValueSizes();

  Size = R.getCanonicalGraphSize();
  if (S && S.getOpcode() == Instruction::Load)
    Size = 2; // cut off masked gather small trees
  InstructionCost Cost = R.getTreeCost();

  LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF
                    << "\n");
  if (Cost < -SLPCostThreshold) {
    LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");

    using namespace ore;

    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                        cast<StoreInst>(Chain[0]))
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));

    R.vectorizeTree();
    return true;
  }

  return false;
}

/// Checks that the candidate tree sizes are nearly uniform: accepts only when
/// the variance Dev satisfies Dev * 96 < Mean^2, i.e. the standard deviation
/// is within roughly 10% of the mean. Sizes of 1 (not vectorized) are
/// ignored.
static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
                           bool First) {
  unsigned Num = 0;
  uint64_t Sum = std::accumulate(
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned Size = First ? Val.first : Val.second;
        if (Size == 1)
          return V;
        ++Num;
        return V + Size;
      });
  if (Num == 0)
    return true;
  uint64_t Mean = Sum / Num;
  if (Mean == 0)
    return true;
  uint64_t Dev = std::accumulate(
                     Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
                     [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
                       unsigned P = First ? Val.first : Val.second;
                       if (P == 1)
                         return V;
                       return V + (P - Mean) * (P - Mean);
                     }) /
                 Num;
  return Dev * 96 / (Mean * Mean) == 0;
}
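// Worked example of the final test: per-VF tree sizes {4, 4, 4, 4} give
// Mean = 4 and Dev = 0, so 0 * 96 / 16 == 0 and the sizes are accepted.
// Sizes {4, 4, 4, 6} give Mean = 18 / 4 = 4 (integer math),
// Dev = (0 + 0 + 0 + 4) / 4 = 1, and 96 / 16 = 6 != 0, so they are rejected:
// the candidate trees are too uneven to commit to one VF for the chain.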
/// A group of stores that we'll try to bundle together using vector ops.
/// They are ordered using the signed distance of their address pointer to the
/// address of this group's BaseInstr.
class RelatedStoreInsts {
public:
  RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
      : AllStores(AllStores) {
    reset(BaseInstrIdx);
  }

  void reset(unsigned NewBaseInstr) {
    assert(NewBaseInstr < AllStores.size() &&
           "Instruction index out of bounds");
    BaseInstrIdx = NewBaseInstr;
    Instrs.clear();
    insertOrLookup(NewBaseInstr, 0);
  }

  /// Tries to insert \p InstrIdx as the store with a pointer distance of
  /// \p PtrDist. Does nothing if there is already a store with that distance.
  /// \returns The previously associated instruction index, or std::nullopt.
  std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
    auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
    return Inserted ? std::nullopt : std::make_optional(It->second);
  }

  using DistToInstMap = std::map<int64_t, unsigned>;
  const DistToInstMap &getStores() const { return Instrs; }

  /// If \p SI is related to this group of stores, return the distance of its
  /// pointer operand to the one of the group's BaseInstr.
  std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
                                        ScalarEvolution &SE) const {
    StoreInst &BaseStore = *AllStores[BaseInstrIdx];
    return getPointersDiff(
        BaseStore.getValueOperand()->getType(), BaseStore.getPointerOperand(),
        SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
        /*StrictCheck=*/true);
  }

  /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
  /// Stores whose index is less than \p MinSafeIdx will be dropped.
  void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
              int64_t DistFromCurBase) {
    DistToInstMap PrevSet = std::move(Instrs);
    reset(NewBaseInstIdx);

    // Re-insert stores that come after MinSafeIdx to try and vectorize them
    // again. Their distance is "rebased" to use NewBaseInstIdx as reference.
    for (auto [Dist, InstIdx] : PrevSet) {
      if (InstIdx >= MinSafeIdx)
        insertOrLookup(InstIdx, Dist - DistFromCurBase);
    }
  }

  /// Remove all the stores that have already been vectorized from this group.
  void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) {
    DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
        reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
          return VectorizedStores.contains(AllStores[DistAndIdx.second]);
        });

    // Get a forward iterator pointing after the last vectorized store and
    // erase all stores before it so we don't try to vectorize them again.
    DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
    Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
  }

private:
  /// The index of the Base instruction, i.e. the one with a 0 pointer
  /// distance.
  unsigned BaseInstrIdx;

  /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
  DistToInstMap Instrs;

  /// Reference to all the stores in the BB being analyzed.
  ArrayRef<StoreInst *> AllStores;
};
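// Usage sketch (hypothetical indices): given stores to p+0, p+4, p+8 of i32,
// seeding a RelatedStoreInsts at the first store produces the distance map
// {0 -> 0, 1 -> 1, 2 -> 2} (distances are in elements, per getPointersDiff).
// A later store to an already-occupied distance makes insertOrLookup return
// the previous index, which triggers the caller's "vectorize what we have,
// then rebase onto the newer store" path; rebase keeps only instructions
// that are still safe to retry (index >= MinSafeIdx).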
bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    SmallVectorImpl<RelatedStoreInsts> &SortedStores) {
  // We may run into multiple chains that merge into a single chain. Mark the
  // stores that we vectorized so that we don't visit the same store twice.
  BoUpSLP::ValueSet VectorizedStores;
  bool Changed = false;

  // Stores the pair of stores (first_store, last_store) in a range, that were
  // already tried to be vectorized. Allows to skip the store ranges that were
  // already tried to be vectorized but the attempts were unsuccessful.
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
      TriedSequences;

  auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
    int64_t PrevDist = -1;
    SmallVector<Value *> Operands;
    // Collect the chains into lists of consecutive stores.
    for (auto [Idx, Data] : enumerate(StoreSeq)) {
      auto &[Dist, InstIdx] = Data;
      if (Operands.empty() || Dist - PrevDist == 1) {
        Operands.push_back(Stores[InstIdx]);
        PrevDist = Dist;
        if (Idx != StoreSeq.size() - 1)
          continue;
      }
      auto E = make_scope_exit([&]() {
        Operands.clear();
        Operands.push_back(Stores[InstIdx]);
        PrevDist = Dist;
      });
      if (Operands.size() <= 1 ||
          !TriedSequences
               .insert({Operands.front(),
                        cast<StoreInst>(Operands.front())->getValueOperand(),
                        Operands.back(),
                        cast<StoreInst>(Operands.back())->getValueOperand(),
                        Operands.size()})
               .second)
        continue;

      unsigned MaxVecRegSize = R.getMaxVecRegSize();
      unsigned EltSize = R.getVectorElementSize(Operands[0]);
      unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);

      unsigned MaxVF =
          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
      auto *Store = cast<StoreInst>(Operands[0]);
      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
      if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
        ValueTy = Trunc->getSrcTy();
      Type *StoreScalarTy = StoreTy->getScalarType();
      Type *ValueScalarTy = ValueTy->getScalarType();
      unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
          R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
          ValueScalarTy));
      MinVF = std::max<unsigned>(2, MinVF);

      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF ("
                          << MaxVF << ") < " << "MinVF (" << MinVF << ")\n");
        continue;
      }

      unsigned NonPowerOf2VF = 0;
      if (VectorizeNonPowerOf2) {
        // First try vectorizing with a non-power-of-2 VF. At the moment, only
        // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
        // registers are used.
        unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
        if (has_single_bit(CandVF + 1)) {
          NonPowerOf2VF = CandVF;
          assert(NonPowerOf2VF != MaxVF &&
                 "Non-power-of-2 VF should not be equal to MaxVF");
        }
      }

      unsigned MaxRegVF = MaxVF;
      MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF ("
                          << MaxVF << ") < " << "MinVF (" << MinVF << ")\n");
        continue;
      }

      SmallVector<unsigned> CandidateVFs;
      for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
           VF = divideCeil(VF, 2))
        CandidateVFs.push_back(VF);

      unsigned End = Operands.size();
      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
      OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(
          Operands.size());
      for (std::pair<unsigned, unsigned> &P : RangeSizes)
        P.first = P.second = 1;
      DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
      auto IsNotVectorized = [](bool First,
                                const std::pair<unsigned, unsigned> &P) {
        return First ? P.first > 0 : P.second > 0;
      };
      auto IsVectorized = [](bool First,
                             const std::pair<unsigned, unsigned> &P) {
        return First ? P.first == 0 : P.second == 0;
      };
      auto VFIsProfitable = [](bool First, unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
        return First ? Size >= P.first : Size >= P.second;
      };
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
      };
      while (true) {
        ++Repeat;
        bool RepeatChanged = false;
        bool AnyProfitableGraph = false;
        for (unsigned VF : CandidateVFs) {
          AnyProfitableGraph = false;
          unsigned FirstUnvecStore =
              std::distance(RangeSizes.begin(),
                            find_if(RangeSizes, std::bind(IsNotVectorized,
                                                          VF >= MaxRegVF, _1)));

          // Form slices of size VF starting from the first unvectorized
          // store, until all the stores are handled.
          while (FirstUnvecStore < End) {
            unsigned FirstVecStore = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(FirstUnvecStore),
                        std::bind(IsVectorized, VF >= MaxRegVF, _1)));
            unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
            for (unsigned SliceStartIdx = FirstUnvecStore;
                 SliceStartIdx + VF <= MaxSliceEnd;) {
              ArrayRef<Value *> Slice =
                  ArrayRef(Operands).slice(SliceStartIdx, VF);
              assert(all_of(Slice,
                            [&](Value *V) {
                              return cast<StoreInst>(V)
                                         ->getValueOperand()
                                         ->getType() ==
                                     cast<StoreInst>(Slice.front())
                                         ->getValueOperand()
                                         ->getType();
                            }) &&
                     "Expected all operands of same type.");
              if (!NonSchedulable.empty()) {
                auto [NonSchedSizeMax, NonSchedSizeMin] =
                    NonSchedulable.lookup(Slice.front());
                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
                  // Known not to be schedulable at this size: skip ahead.
                  SliceStartIdx += NonSchedSizeMax;
                  continue;
                }
              }
              unsigned TreeSize;
              std::optional<bool> Res =
                  vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF,
                                      TreeSize);
              if (!Res) {
                // Remember the failed-to-schedule size range.
                NonSchedulable
                    .try_emplace(Slice.front(), std::make_pair(VF, VF))
                    .first->getSecond()
                    .second = VF;
              } else if (*Res) {
                // Mark the vectorized stores so that we don't vectorize them
                // again.
                VectorizedStores.insert(Slice.begin(), Slice.end());
                AnyProfitableGraph = RepeatChanged = Changed = true;
                // If we vectorized an initial block, no need to try to
                // vectorize it again.
                for (std::pair<unsigned, unsigned> &P :
                     RangeSizes.slice(SliceStartIdx, VF))
                  P.first = P.second = 0;
                if (SliceStartIdx < FirstUnvecStore + MinVF) {
                  for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
                           FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
                    P.first = P.second = 0;
                  FirstUnvecStore = SliceStartIdx + VF;
                }
                if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
                  for (std::pair<unsigned, unsigned> &P :
                       RangeSizes.slice(SliceStartIdx + VF,
                                        MaxSliceEnd - (SliceStartIdx + VF)))
                    P.first = P.second = 0;
                  if (MaxSliceEnd == End)
                    End = SliceStartIdx;
                  MaxSliceEnd = SliceStartIdx;
                }
                SliceStartIdx += VF;
                continue;
              }
              if (VF > 2 && Res &&
                  !all_of(RangeSizes.slice(SliceStartIdx, VF),
                          std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
                                    _1))) {
                SliceStartIdx += VF;
                continue;
              }
              // Check for very big VFs that we're not rebuilding the same
              // trees, just with a larger number of elements.
              if (VF > MaxRegVF && TreeSize > 1 &&
                  all_of(RangeSizes.slice(SliceStartIdx, VF),
                         std::bind(FirstSizeSame, TreeSize, _1))) {
                SliceStartIdx += VF;
                while (SliceStartIdx != MaxSliceEnd &&
                       RangeSizes[SliceStartIdx].first == TreeSize)
                  ++SliceStartIdx;
                continue;
              }
              if (TreeSize > 1) {
                for (std::pair<unsigned, unsigned> &P :
                     RangeSizes.slice(SliceStartIdx, VF)) {
                  if (VF >= MaxRegVF)
                    P.second = std::max(P.second, TreeSize);
                  else
                    P.first = std::max(P.first, TreeSize);
                }
              }
              ++SliceStartIdx;
              AnyProfitableGraph = true;
            }
            if (FirstUnvecStore >= End)
              break;
            if (MaxSliceEnd - FirstUnvecStore < VF &&
                MaxSliceEnd - FirstUnvecStore >= MinVF)
              AnyProfitableGraph = true;
            FirstUnvecStore = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(MaxSliceEnd),
                        std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
          }
          if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
            break;
        }
        // All values vectorized - exit.
        if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
              return P.first == 0 && P.second == 0;
            }))
          break;
        // Check if we tried all attempts or if no need for the last attempts
        // at all.
        if (Repeat >= MaxAttempts ||
            (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
          break;
        constexpr unsigned StoresLimit = 64;
        const unsigned MaxTotalNum = std::min<unsigned>(
            Operands.size(),
            static_cast<unsigned>(
                End -
                std::distance(
                    RangeSizes.begin(),
                    find_if(RangeSizes,
                            std::bind(IsNotVectorized, true, _1))) +
                1));
        unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
        unsigned Limit =
            getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
        CandidateVFs.clear();
        if (bit_floor(Limit) == VF)
          CandidateVFs.push_back(Limit);
        if (VF > MaxTotalNum || VF >= StoresLimit)
          break;
        for (std::pair<unsigned, unsigned> &P : RangeSizes) {
          if (P.first != 0)
            P.first = std::max(P.second, P.first);
        }
        // Last attempt to vectorize max number of elements, if all previous
        // attempts were unsuccessful because of the cost issues.
        CandidateVFs.push_back(VF);
      }
    }
  };

  // Groups stores by base-pointer distance (see RelatedStoreInsts above) and
  // flushes a group once a distance collision shows the run is complete.
  auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
    std::optional<int64_t> PtrDist;
    auto *RelatedStores = find_if(
        SortedStores,
        [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
          PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
          return PtrDist.has_value();
        });

    // We did not find a comparable store, start a new group.
    if (RelatedStores == SortedStores.end()) {
      SortedStores.emplace_back(Idx, Stores);
      return;
    }

    // If there is already a store in the group with the same distance, try to
    // vectorize the existing instructions before adding the current store.
    if (std::optional<unsigned> PrevInst =
            RelatedStores->insertOrLookup(Idx, *PtrDist)) {
      TryToVectorize(RelatedStores->getStores());
      RelatedStores->clearVectorizedStores(VectorizedStores);
      RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
                            /*NewBaseInstIdx=*/Idx,
                            /*DistFromCurBase=*/*PtrDist);
    }
  };
  Type *PrevValTy = nullptr;
  for (auto [I, SI] : enumerate(Stores)) {
    if (R.isDeleted(SI))
      continue;
    if (!PrevValTy)
      PrevValTy = SI->getValueOperand()->getType();
    // Check that we do not try to vectorize stores of different types.
    if (PrevValTy != SI->getValueOperand()->getType()) {
      for (RelatedStoreInsts &StoreSeq : SortedStores)
        TryToVectorize(StoreSeq.getStores());
      SortedStores.clear();
      PrevValTy = SI->getValueOperand()->getType();
    }
    FillStoresSet(I, SI);
  }

  // Final vectorization attempt.
  for (RelatedStoreInsts &StoreSeq : SortedStores)
    TryToVectorize(StoreSeq.getStores());

  return Changed;
}
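// The VF ladder above intentionally retries: starting from the widest
// candidate VF, every maximal run of not-yet-vectorized stores is sliced
// into VF-sized windows; profitable windows are marked vectorized, while
// unprofitable ones record their tree size in RangeSizes so that later,
// narrower VFs can skip slices whose recorded tree size makes them hopeless.
// Up to MaxAttempts passes allow wider (including non-power-of-2) VFs to be
// reconsidered after some stores have already been consumed.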
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // Visit the store and getelementptr instructions in BB and organize them in
  // Stores and GEPs according to the underlying objects of their pointer
  // operands.
  for (Instruction &I : *BB) {
    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      if (!isValidElementType(SI->getValueOperand()->getType()))
        continue;
      Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
    }

    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a pointer operand that doesn't point to a scalar
    // type.
    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (GEP->getNumIndices() != 1)
        continue;
      Value *Idx = GEP->idx_begin()->get();
      if (isa<Constant>(Idx))
        continue;
      if (!isValidElementType(Idx->getType()))
        continue;
      if (GEP->getType()->isVectorTy())
        continue;
      GEPs[GEP->getPointerOperand()].push_back(GEP);
    }
  }
}
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           bool MaxVFOnly) {
  if (VL.size() < 2)
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");

  // Check that all of the parts are instructions of the same type, we permit
  // an alternate opcode via InstructionsState.
  InstructionsState S = getSameOpcode(VL, *TLI);
  if (!S)
    return false;

  Instruction *I0 = S.getMainOp();
  // Make sure invalid types (including vector type) are rejected before
  // determining the vectorization factor for scalar instructions.
  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
      // NOTE: the following will give the user an internal llvm type name,
      // which may not be useful.
      R.getORE()->emit([&]() {
        std::string TypeStr;
        llvm::raw_string_ostream OS(TypeStr);
        Ty->print(OS);
        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
               << "Cannot SLP vectorize list: type "
               << TypeStr + " is unsupported by vectorizer";
      });
      return false;
    }
  }

  Type *ScalarTy = getValueType(VL[0]);
  unsigned Sz = R.getVectorElementSize(I0);
  unsigned MinVF = R.getMinVF(Sz);
  unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
  if (MaxVF < 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
    });
    return false;
  }

  bool Changed = false;
  bool CandidateFound = false;
  InstructionCost MinCost = SLPCostThreshold.getValue();

  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
       VF /= 2) {
    // No actual vectorization should happen, if the number of parts is the
    // same as the provided vectorization factor (i.e. the scalar type is used
    // for the vector code during codegen).
    auto *VecTy = getWidenedType(ScalarTy, VF);
    if (TTI->getNumberOfParts(VecTy) == VF)
      continue;
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned ActualVF = std::min(MaxInst - I, VF);

      if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
        continue;

      if (MaxVFOnly && ActualVF < MaxVF)
        break;
      if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
        break;

      SmallVector<Value *> Ops(ActualVF, nullptr);
      unsigned Idx = 0;
      for (Value *V : VL.drop_front(I)) {
        // Check that a previous iteration of this loop did not delete the
        // Value.
        if (auto *Inst = dyn_cast<Instruction>(V);
            !Inst || !R.isDeleted(Inst)) {
          Ops[Idx] = V;
          ++Idx;
          if (Idx == ActualVF)
            break;
        }
      }
      // Not enough candidates - exit.
      if (Idx != ActualVF)
        break;

      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                        << "\n");

      R.buildTree(Ops);
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;
      if (R.isProfitableToReorder()) {
        R.reorderTopToBottom();
        R.reorderBottomToTop();
      }
      R.transformNodes();
      R.buildExternalUses();

      R.computeMinimumValueSizes();
      InstructionCost Cost = R.getTreeCost();
      CandidateFound = true;
      MinCost = std::min(MinCost, Cost);

      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                        << " for VF=" << ActualVF << "\n");
      if (Cost < -SLPCostThreshold) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost
                          << ".\n");
        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
                                            cast<Instruction>(Ops[0]))
                         << "SLP vectorized with cost "
                         << ore::NV("Cost", Cost)
                         << " and with tree size "
                         << ore::NV("TreeSize", R.getTreeSize()));

        R.vectorizeTree();
        // Move to the next bundle.
        I += VF - 1;
        NextInst = I + 1;
        Changed = true;
      }
    }
  }

  if (!Changed && CandidateFound) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Treshold", -SLPCostThreshold);
    });
  } else if (!Changed) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
    });
  }
  return Changed;
}
23442 using ReductionOpsType = SmallVector<Value *, 16>;
23443 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
23444 ReductionOpsListType ReductionOps;
23448 SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
23449 WeakTrackingVH ReductionRoot;
23454 bool IsSupportedHorRdxIdentityOp =
false;
23461 static bool isCmpSelMinMax(Instruction *
I) {
23469 static bool isBoolLogicOp(Instruction *
I) {
23475 static bool isVectorizable(
RecurKind Kind, Instruction *
I,
23476 bool TwoElementReduction =
false) {
23477 if (Kind == RecurKind::None)
23486 if (TwoElementReduction)
23489 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
23493 return I->getFastMathFlags().noNaNs();
23496 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
23499 return I->isAssociative();
23502 static Value *getRdxOperand(Instruction *
I,
unsigned Index) {
23508 return I->getOperand(2);
23509 return I->getOperand(Index);
23514 Value *
RHS,
const Twine &Name,
bool UseSelect) {
23518 case RecurKind::Or: {
23527 case RecurKind::And: {
23536 case RecurKind::Add:
23537 case RecurKind::Mul:
23538 case RecurKind::Xor:
23539 case RecurKind::FAdd:
23540 case RecurKind::FMul: {
23545 case RecurKind::SMax:
23546 case RecurKind::SMin:
23547 case RecurKind::UMax:
23548 case RecurKind::UMin:
23555 case RecurKind::FMax:
23556 case RecurKind::FMin:
23557 case RecurKind::FMaximum:
23558 case RecurKind::FMinimum:
23559 case RecurKind::FMaximumNum:
23560 case RecurKind::FMinimumNum: {
23573 const ReductionOpsListType &ReductionOps) {
23574 bool UseSelect = ReductionOps.size() == 2 ||
23576 (ReductionOps.size() == 1 &&
23578 assert((!UseSelect || ReductionOps.size() != 2 ||
23580 "Expected cmp + select pairs for reduction");
23581 Value *
Op = createOp(Builder, RdxKind,
LHS,
RHS, Name, UseSelect);
23599 return RecurKind::None;
23601 return RecurKind::Add;
23603 return RecurKind::Mul;
23606 return RecurKind::And;
23609 return RecurKind::Or;
23611 return RecurKind::Xor;
23613 return RecurKind::FAdd;
23615 return RecurKind::FMul;
23618 return RecurKind::FMax;
23620 return RecurKind::FMin;
23623 return RecurKind::FMaximum;
23625 return RecurKind::FMinimum;
23631 return RecurKind::SMax;
23633 return RecurKind::SMin;
23635 return RecurKind::UMax;
23637 return RecurKind::UMin;
23663 return RecurKind::None;
23667 return RecurKind::None;
23670 return RecurKind::None;
23674 return RecurKind::None;
23679 return RecurKind::None;
23682 return RecurKind::SMax;
23685 return RecurKind::SMin;
23688 return RecurKind::UMax;
23691 return RecurKind::UMin;
23694 return RecurKind::None;
  /// Returns the index of the first reduced operand: for a cmp+select
  /// min/max the compare condition occupies operand 0 of the select.
  static unsigned getFirstOperandIndex(Instruction *I) {
    return isCmpSelMinMax(I) ? 1 : 0;
  }

  /// Total number of operands in the reduction operation.
  static unsigned getNumberOfOperands(Instruction *I) {
    return isCmpSelMinMax(I) ? 3 : 2;
  }

  /// Checks if the instruction is in basic block \p BB. For a cmp+select
  /// min/max (or boolean logic op) both the select and its compare must be
  /// in the same block.
  static bool hasSameParent(Instruction *I, BasicBlock *BB) {
    if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
      auto *Sel = cast<SelectInst>(I);
      auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
      return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
    }
    return I->getParent() == BB;
  }

  /// Expected number of uses for reduction operations/reduced values.
  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
    if (IsCmpSelMinMax) {
      // The select must be used twice while its condition must feed only the
      // select.
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
      return I->hasNUses(2);
    }
    // Arithmetic reduction operation must be used once only.
    return I->hasOneUse();
  }
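  // E.g. in an smax chain over {a, b, c} (illustrative IR):
  //   %c0 = icmp sgt i32 %a, %b            ; one use (%s0)
  //   %s0 = select i1 %c0, i32 %a, i32 %b  ; two uses (%c1, %s1)
  //   %c1 = icmp sgt i32 %s0, %c
  //   %s1 = select i1 %c1, i32 %s0, i32 %c
  // every interior select has exactly two uses and every compare exactly
  // one, which is the shape hasRequiredNumberOfUses() accepts.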
  /// Creates the empty reduction-ops lists: two lists (cmps and selects) for
  /// min/max reductions, one list otherwise.
  void initReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I))
      ReductionOps.assign(2, ReductionOpsType());
    else
      ReductionOps.assign(1, ReductionOpsType());
  }

  /// Records the reduction operation(s) for instruction \p I.
  void addReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I)) {
      ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
      ReductionOps[1].emplace_back(I);
    } else {
      ReductionOps[0].emplace_back(I);
    }
  }
  /// Checks if the group of reduced values is worth vectorizing on its own.
  static bool isGoodForReduction(ArrayRef<Value *> Data) {
    int Sz = Data.size();
    return Sz > 1 /* ... or a single value still worth keeping grouped */;
  }

public:
  HorizontalReduction() = default;
  /// Pre-seeded form for a two-element reduction rooted at \p I.
  HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
      : ReductionRoot(I), ReductionLimit(2) {
    RdxKind = HorizontalReduction::getRdxKind(I);
    ReductionOps.emplace_back().push_back(I);
    ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
    for (Value *V : Ops)
      ReducedValsToOps[V].push_back(I);
  }

  /// Checks if the pre-seeded root forms a two-operand reduction.
  bool matchReductionForOperands() const {
    // Analyze "regular" integer/FP types and selects.
    assert(ReductionRoot && "Reduction root is not set!");
    if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
                        /*TwoElementReduction=*/true))
      return false;
    ArrayRef<Value *> Ops = ReducedVals.front();
    return Ops.size() == 2;
  }
  /// Attempts to match the reduction tree rooted at \p Root, gathering the
  /// reduced values and the interior reduction operations.
  bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                 ScalarEvolution &SE, const DataLayout &DL,
                                 const TargetLibraryInfo &TLI) {
    RdxKind = HorizontalReduction::getRdxKind(Root);
    if (!isVectorizable(RdxKind, Root))
      return false;
    // ...
    // The compare condition of a min/max root must feed only its select.
    if (auto *Sel = dyn_cast<SelectInst>(Root))
      if (!Sel->getCondition()->hasOneUse())
        return false;

    ReductionRoot = Root;

    // Iterate through all the operands of the possible reduction tree and
    // gather all the reduced values, sorting them by their value id.
    BasicBlock *BB = Root->getParent();
    bool IsCmpSelMinMax = isCmpSelMinMax(Root);
    SmallVector<std::pair<Instruction *, unsigned>> Worklist(
        1, std::make_pair(Root, 0));
    // Checks if the operands of the \p TreeN instruction are also reduction
    // operations or should be treated as reduced values.
    auto CheckOperands = [&](Instruction *TreeN,
                             SmallVectorImpl<Value *> &PossibleReducedVals,
                             SmallVectorImpl<Instruction *> &ReductionOps,
                             unsigned Level) {
      for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
                                    getNumberOfOperands(TreeN)))) {
        Value *EdgeVal = getRdxOperand(TreeN, I);
        ReducedValsToOps[EdgeVal].push_back(TreeN);
        auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
        // An operand becomes a reduced value (a leaf) unless it is an
        // interior reduction op of the same kind, in the same block, with
        // the expected single-chain uses.
        if (!EdgeInst || Level > RecursionMaxDepth ||
            getRdxKind(EdgeInst) != RdxKind ||
            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
            !hasSameParent(EdgeInst, BB) ||
            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
            !isVectorizable(RdxKind, EdgeInst) ||
            (R.isAnalyzedReductionRoot(EdgeInst) /* ... */)) {
          PossibleReducedVals.push_back(EdgeVal);
          continue;
        }
        ReductionOps.push_back(EdgeInst);
      }
    };
    // Maps (size, subkey) pairs to candidate reduced values with occurrence
    // counts, so that compatible values end up adjacent.
    SmallMapVector<
        size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
        8>
        PossibleReducedVals;
    initReductionOps(Root);
    SmallMapVector<std::pair<size_t, Value *>, SmallVector<LoadInst *>, 4>
        LoadsMap;
    SmallSet<size_t, 2> LoadKeyUsed;

    // Hash subkey for loads: loads from (potentially) the same base pointer
    // land in the same bucket.
    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
      if (!LoadKeyUsed.insert(Key).second) {
        auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
        if (LIt != LoadsMap.end()) {
          for (LoadInst *RLI : LIt->second) {
            // Reuse the key of a load with a constant pointer difference.
            if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                                LI->getType(), LI->getPointerOperand(), DL, SE,
                                /*StrictCheck=*/true))
              return hash_value(RLI->getPointerOperand());
          }
          for (LoadInst *RLI : LIt->second) {
            // ... or of a load with a compatible pointer.
            if (arePointersCompatible(RLI->getPointerOperand(),
                                      LI->getPointerOperand(), TLI))
              return hash_value(RLI->getPointerOperand());
          }
          if (LIt->second.size() > 2) {
            hash_code SubKey =
                hash_value(LIt->second.back()->getPointerOperand());
            return SubKey;
          }
        }
      }
      LoadsMap.try_emplace(std::make_pair(Key, Ptr))
          .first->second.push_back(LI);
      return hash_value(LI->getPointerOperand());
    };

    while (!Worklist.empty()) {
      auto [TreeN, Level] = Worklist.pop_back_val();
      SmallVector<Value *> PossibleRedVals;
      SmallVector<Instruction *> PossibleReductionOps;
      CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
      addReductionOps(TreeN);
      // Add reduction values, bucketed by (key, subkey).
      for (Value *V : PossibleRedVals) {
        size_t Key, Idx;
        std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
                                               /*AllowAlternate=*/false);
        ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
      }
      for (Instruction *I : reverse(PossibleReductionOps))
        Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
    }
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Sort values by the total number of value kinds to start the reduction
    // from the longest possible reduced-values sequences.
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      SmallVector<SmallVector<Value *>> PossibleRedValsVect;
      for (auto &Slice : PossibleRedVals) {
        PossibleRedValsVect.emplace_back();
        auto RedValsVect = Slice.second.takeVector();
        stable_sort(RedValsVect, llvm::less_second());
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(Data.second, Data.first);
      }
      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
      });
      for (ArrayRef<Value *> Data : PossibleRedValsVect) {
        // Adjacent vectorizable loads extend the previous bucket; other
        // values that are not good for reduction start a new one.
        if (ReducedVals.empty()) {
          ReducedVals.emplace_back();
        } else if (!isGoodForReduction(Data)) {
          auto *LI = dyn_cast<LoadInst>(Data.front());
          auto *LastLI = dyn_cast<LoadInst>(ReducedVals.back().front());
          if (!LI || !LastLI /* ... or incompatible pointers */)
            ReducedVals.emplace_back();
        }
        ReducedVals.back().append(Data.rbegin(), Data.rend());
      }
    }
    // Sort the reduced values by the number of same/alternate instructions.
    stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
      return P1.size() > P2.size();
    });
    return true;
  }
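  // After a successful match the buckets are flattened largest-first into
  // ReducedVals, so tryToReduce() below always attempts the widest run of
  // compatible values before the smaller leftovers.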
  /// Attempts to vectorize the reduction tree rooted at ReductionRoot.
  /// \returns the final reduced value, or nullptr if vectorization failed.
  Value *tryToReduce(BoUpSLP &V, const DataLayout &DL,
                     TargetTransformInfo *TTI, const TargetLibraryInfo &TLI,
                     AssumptionCache *AC, DominatorTree &DT) {
    constexpr unsigned RegMaxNumber = 4;
    constexpr unsigned RedValsMaxNumber = 128;
    // If there are too few reduction values overall, mark the roots as
    // analyzed and give up.
    if (unsigned NumReducedVals = std::accumulate(
            ReducedVals.begin(), ReducedVals.end(), 0,
            [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
              if (!isGoodForReduction(Vals))
                return Num;
              return Num + Vals.size();
            });
        NumReducedVals < ReductionLimit /* ... */) {
      for (ReductionOpsType &RdxOps : ReductionOps)
        for (Value *RdxOp : RdxOps)
          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
      return nullptr;
    }

    IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
                                    TargetFolder(DL));
    Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));

    // Values externally used outside the reduction, kept alive as scalars.
    SmallDenseSet<Value *, 4> ExternallyUsedValues;
    // Track the reduced values: they may be replaced by extractelements
    // during vectorization.
    DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
                                                  ReducedVals.front().size());
    // The compare instruction of a min/max is the insertion point for new
    // instructions.
    auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
      assert(isa<SelectInst>(RdxRootInst) &&
             "Expected min/max reduction to have select root instruction");
      Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
      assert(isa<Instruction>(ScalarCond) &&
             "Expected min/max reduction to have compare condition");
      return cast<Instruction>(ScalarCond);
    };

    bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
      return isBoolLogicOp(cast<Instruction>(V));
    });

    // Return the new VectorizedTree, folding Res into the previous value.
    auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
      if (VectorizedTree) {
        // Update the final value in the reduction.
        Builder.SetCurrentDebugLocation(
            cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
        if (AnyBoolLogicOp) {
          // For a boolean logic op the left operand must not be poison: keep
          // VectorizedTree on the left if it is the guarded operand of such
          // an op (or known non-poison), swap if Res qualifies instead, and
          // otherwise freeze VectorizedTree.
          auto It = ReducedValsToOps.find(VectorizedTree);
          auto It1 = ReducedValsToOps.find(Res);
          if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
              isGuaranteedNotToBePoison(VectorizedTree, AC) ||
              (It != ReducedValsToOps.end() &&
               any_of(It->getSecond(), [&](Instruction *I) {
                 return isBoolLogicOp(I) &&
                        getRdxOperand(I, 0) == VectorizedTree;
               }))) {
            ; // Keep the operand order as-is.
          } else if (isGuaranteedNotToBePoison(Res, AC) ||
                     (It1 != ReducedValsToOps.end() &&
                      any_of(It1->getSecond(), [&](Instruction *I) {
                        return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
                      }))) {
            std::swap(VectorizedTree, Res);
          } else {
            VectorizedTree = Builder.CreateFreeze(VectorizedTree);
          }
        }
        return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
                        ReductionOps);
      }
      // Initialize the final value in the reduction.
      return Res;
    };
    // Track the scalar reduction ops so they are excluded from external-use
    // accounting and from re-vectorization.
    SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
                                      ReductionOps.front().size());
    for (ReductionOpsType &RdxOps : ReductionOps)
      for (Value *RdxOp : RdxOps) {
        if (!RdxOp)
          continue;
        IgnoreList.insert(RdxOp);
      }
    // Intersect the fast-math flags from all reduction operations.
    FastMathFlags RdxFMF;
    RdxFMF.set();
    for (Value *U : IgnoreList)
      if (auto *FPMO = dyn_cast<FPMathOperator>(U))
        RdxFMF &= FPMO->getFastMathFlags();
    Builder.setFastMathFlags(RdxFMF);

    // Need to track reduced vals: they may change during vectorization of
    // subvectors.
    for (ArrayRef<Value *> Candidates : ReducedVals)
      for (Value *V : Candidates)
        TrackedVals.try_emplace(V, V);

    auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
                 Value *V) -> unsigned & {
      auto *It = MV.find(V);
      assert(It != MV.end() && "Unable to find given key.");
      return It->second;
    };

    DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
    // Values reduced in other trees as part of gather nodes, requiring an
    // extract if fully vectorized elsewhere.
    SmallPtrSet<Value *, 4> RequiredExtract;
    WeakTrackingVH VectorizedTree = nullptr;
    bool CheckForReusedReductionOps = false;
    // Try to vectorize elements based on their type.
    SmallVector<InstructionsState> States;
    for (ArrayRef<Value *> RV : ReducedVals)
      States.push_back(getSameOpcode(RV, TLI));
    for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
      ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
      InstructionsState S = States[I];
      SmallVector<Value *> Candidates;
      Candidates.reserve(2 * OrigReducedVals.size());
      DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
      for (Value *ReducedVal : OrigReducedVals) {
        Value *RdxVal = TrackedVals.at(ReducedVal);
        // Skip values that do not fit the current instructions state; keep
        // the rest as candidates.
        auto *Inst = dyn_cast<Instruction>(RdxVal);
        if (Inst && isVectorLikeInstWithConstOps(Inst) &&
            (!S || !S.getMatchingMainOpOrAltOp(Inst)))
          continue;
        Candidates.push_back(RdxVal);
        TrackedToOrig.try_emplace(RdxVal, ReducedVal);
      }
      bool ShuffledExtracts = false;
      // Try to handle shuffled extractelements: extracts from the same
      // source vector in the next slice can be reduced together.
      if (S && S.getOpcode() == Instruction::ExtractElement &&
          !S.isAltShuffle() && I + 1 < E) {
        SmallVector<Value *> CommonCandidates(Candidates);
        for (Value *RV : ReducedVals[I + 1]) {
          Value *RdxVal = TrackedVals.at(RV);
          auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
          if (!Inst)
            continue;
          CommonCandidates.push_back(RdxVal);
          TrackedToOrig.try_emplace(RdxVal, RV);
        }
        SmallVector<int> Mask;
        if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
          ++I;
          Candidates.swap(CommonCandidates);
          ShuffledExtracts = true;
        }
      }

      // Emit code for constant values.
      if (allConstant(Candidates)) {
        Value *Res = Candidates.front();
        Value *OrigV = TrackedToOrig.at(Candidates.front());
        ++VectorizedVals.try_emplace(OrigV).first->getSecond();
        for (Value *VC : ArrayRef(Candidates).drop_front()) {
          Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
          Value *OrigV = TrackedToOrig.at(VC);
          ++VectorizedVals.try_emplace(OrigV).first->getSecond();
          if (auto *ResI = dyn_cast<Instruction>(Res))
            V.analyzedReductionRoot(ResI);
        }
        VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
        continue;
      }

      unsigned NumReducedVals = Candidates.size();
      if (NumReducedVals < ReductionLimit &&
          (NumReducedVals < 2 || !isSplat(Candidates)))
        continue;
      // Check if we support repeated scalar values processing (optimization
      // of reductions with repeated scalars).
      IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
                                    RdxKind != RecurKind::FMul &&
                                    RdxKind != RecurKind::FMulAdd;
      // Number of uses of the candidates in the vector of values.
      SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
      if (IsSupportedHorRdxIdentityOp)
        for (Value *V : Candidates) {
          Value *OrigV = TrackedToOrig.at(V);
          ++SameValuesCounter.try_emplace(OrigV).first->second;
        }
      // Check if the reduced values are used the same number of times: then
      // the compiler can produce better code. E.g. if the reduced values are
      // aabbccdd (8 values), the first tree node may be vectorized as
      // abcd * 2.
      bool SameScaleFactor = false;
      bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
                              SameValuesCounter.size() != Candidates.size();
      if (OptReusedScalars) {
        SameScaleFactor =
            (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
             RdxKind == RecurKind::Xor) &&
            all_of(drop_begin(SameValuesCounter),
                   [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
                     return P.second == SameValuesCounter.front().second;
                   });
        Candidates.resize(SameValuesCounter.size());
        transform(SameValuesCounter, Candidates.begin(),
                  [&](const auto &P) { return TrackedVals.at(P.first); });
        NumReducedVals = Candidates.size();
        // Have a reduction of the same element.
        if (NumReducedVals == 1) {
          Value *OrigV = TrackedToOrig.at(Candidates.front());
          unsigned Cnt = At(SameValuesCounter, OrigV);
          Value *RedVal =
              emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          VectorizedVals.try_emplace(OrigV, Cnt);
          ExternallyUsedValues.insert(OrigV);
          continue;
        }
      }
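      // E.g. (illustrative) for RdxKind == RecurKind::Add, a slice whose
      // candidates all collapsed to a single value %x occurring Cnt == 4
      // times is emitted as `%red = mul i32 %x, 4` instead of a 4-wide
      // vector add reduction.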
      unsigned MaxVecRegSize = V.getMaxVecRegSize();
      unsigned EltSize = V.getVectorElementSize(Candidates[0]);
      const unsigned MaxElts = std::clamp<unsigned>(
          llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
          RegMaxNumber * RedValsMaxNumber);

      unsigned ReduxWidth = NumReducedVals;
      // Compute the closest vector factor whose widened type still fits in
      // the number of vector registers the target provides.
      auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
        unsigned NumParts, NumRegs;
        Type *ScalarTy = Candidates.front()->getType();
        ReduxWidth = getFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
        VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
        NumParts = TTI.getNumberOfParts(Tp);
        NumRegs =
            TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
        while (NumParts > NumRegs) {
          assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
          ReduxWidth = bit_floor(ReduxWidth - 1);
          VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
          NumParts = TTI.getNumberOfParts(Tp);
          NumRegs =
              TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
        }
        if (NumParts > NumRegs / 2)
          ReduxWidth = bit_floor(ReduxWidth);
        return ReduxWidth;
      };
      // ...
      ReduxWidth = GetVectorFactor(ReduxWidth);
      ReduxWidth = std::min(ReduxWidth, MaxElts);
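      // E.g. (illustrative) 16 x i32 on a target with 128-bit vectors splits
      // into 4 parts; with only 2 such registers available, the loop above
      // shrinks ReduxWidth from 16 down to 8 before clamping to MaxElts.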
      unsigned Start = 0;
      unsigned Pos = Start;
      // Restarts vectorization attempt with lower vector factor.
      unsigned PrevReduxWidth = ReduxWidth;
      bool CheckForReusedReductionOpsLocal = false;
      auto AdjustReducedVals = [&](bool IgnoreVL = false) {
        bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
        if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
          // If any reduction op was gathered, it is worth retrying with
          // fewer reduction ops.
          CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
        }
        ++Pos;
        if (Pos < NumReducedVals - ReduxWidth + 1)
          return IsAnyRedOpGathered;
        Pos = Start;
        --ReduxWidth;
        if (ReduxWidth > 1)
          ReduxWidth = GetVectorFactor(ReduxWidth);
        return IsAnyRedOpGathered;
      };
      bool AnyVectorized = false;
      SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
      while (Pos < NumReducedVals - ReduxWidth + 1 &&
             ReduxWidth >= ReductionLimit) {
        // Dependency in the tree of the reduction ops: drop this attempt and
        // retry later.
        if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
            Start == 0) {
          CheckForReusedReductionOps = true;
          break;
        }
        PrevReduxWidth = ReduxWidth;
        ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
        // Been analyzed already - skip.
        if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
            (!has_single_bit(ReduxWidth) &&
             (IgnoredCandidates.contains(
                  std::make_pair(Pos, bit_floor(ReduxWidth))) ||
              IgnoredCandidates.contains(
                  std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
                                 bit_floor(ReduxWidth))))) ||
            V.areAnalyzedReductionVals(VL)) {
          (void)AdjustReducedVals(/*IgnoreVL=*/true);
          continue;
        }
        // Early exit if any of the reduction values were deleted during the
        // previous vectorization attempts.
        if (any_of(VL, [&V](Value *RedVal) {
              auto *RedValI = dyn_cast<Instruction>(RedVal);
              return RedValI && V.isDeleted(RedValI);
            }))
          break;
        V.buildTree(VL, IgnoreList);
        if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        if (V.isLoadCombineReductionCandidate(RdxKind)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        V.reorderTopToBottom();
        // No need to reorder the root node at all for non-extendable trees.
        V.reorderBottomToTop(/*IgnoreReorder=*/
                             !VL.front()->getType()->isIntOrIntVectorTy() ||
                             ReductionLimit > 2);
        // Keep extracted other reduction values, if they are used in other
        // (not yet vectorized) trees.
        SmallDenseSet<Value *, 4> LocalExternallyUsedValues(
            ExternallyUsedValues);
        // The reduction root is used externally by definition.
        LocalExternallyUsedValues.insert(ReductionRoot);
        for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
          if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
            continue;
          for (Value *V : ReducedVals[Cnt])
            if (isa<Instruction>(V))
              LocalExternallyUsedValues.insert(TrackedVals[V]);
        }
        if (!IsSupportedHorRdxIdentityOp) {
          // Number of uses of the candidates in the vector of values.
          assert(SameValuesCounter.empty() &&
                 "Reused values counter map is not empty");
          for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
            if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
              continue;
            Value *V = Candidates[Cnt];
            Value *OrigV = TrackedToOrig.at(V);
            ++SameValuesCounter.try_emplace(OrigV).first->second;
          }
        }
        V.transformNodes();
        SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
        // Gather externally used values.
        SmallPtrSet<Value *, 4> Visited;
        for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
          if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
            continue;
          Value *RdxVal = Candidates[Cnt];
          if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
            RdxVal = It->second;
          if (!Visited.insert(RdxVal).second)
            continue;
          // Check if the scalar was vectorized as part of the vectorization
          // tree but not as the top node.
          if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
            LocalExternallyUsedValues.insert(RdxVal);
            continue;
          }
          Value *OrigV = TrackedToOrig.at(RdxVal);
          unsigned NumOps =
              VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
          if (NumOps != ReducedValsToOps.at(OrigV).size())
            LocalExternallyUsedValues.insert(RdxVal);
        }
        // The list of reused scalars is not needed in regular mode anymore.
        if (!IsSupportedHorRdxIdentityOp)
          SameValuesCounter.clear();
        for (Value *RdxVal : VL)
          if (RequiredExtract.contains(RdxVal))
            LocalExternallyUsedValues.insert(RdxVal);
        V.buildExternalUses(LocalExternallyUsedValues);

        V.computeMinimumValueSizes();

        // Estimate cost.
        InstructionCost Cost =
            getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
        LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                          << " for reduction\n");
        if (!Cost.isValid())
          break;
        if (Cost >= -SLPCostThreshold) {
          V.getORE()->emit([&]() {
            return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
                                            ReducedValsToOps.at(VL[0]).front())
                   << "Vectorizing horizontal reduction is possible "
                   << "but not beneficial with cost " << ore::NV("Cost", Cost)
                   << " and threshold "
                   << ore::NV("Threshold", -SLPCostThreshold);
          });
          if (!AdjustReducedVals()) {
            V.analyzedReductionVals(VL);
            unsigned Offset = Pos == Start ? Pos : Pos - 1;
            if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
              // Add subvectors of VL to the list of the analyzed values.
              for (unsigned VF = getFloorFullVectorNumberOfElements(
                       *TTI, VL.front()->getType(), ReduxWidth - 1);
                   VF >= ReductionLimit;
                   VF = getFloorFullVectorNumberOfElements(
                       *TTI, VL.front()->getType(), VF - 1)) {
                if (has_single_bit(VF) &&
                    V.getCanonicalGraphSize() != V.getTreeSize())
                  continue;
                for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
                  IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
              }
            }
          }
          continue;
        }

        LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
                          << Cost << ". (HorRdx)\n");
        V.getORE()->emit([&]() {
          return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
                                    ReducedValsToOps.at(VL[0]).front())
                 << "Vectorized horizontal reduction with cost "
                 << ore::NV("Cost", Cost) << " and with tree size "
                 << ore::NV("TreeSize", V.getTreeSize());
        });
        Builder.setFastMathFlags(RdxFMF);

        // Emit a reduction. If the root is a min/max select, the insertion
        // point is the compare condition of that select.
        Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
        Instruction *InsertPt = RdxRootInst;
        if (IsCmpSelMinMax)
          InsertPt = GetCmpForMinMaxReduction(RdxRootInst);

        // Vectorize a tree.
        Value *VectorizedRoot = V.vectorizeTree(
            LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
        // Update TrackedToOrig: the tracked values might have been updated.
        for (Value *RdxVal : Candidates) {
          Value *OrigVal = TrackedToOrig.at(RdxVal);
          Value *TransformedRdxVal = TrackedVals.at(OrigVal);
          if (TransformedRdxVal != RdxVal)
            TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
        }

        Builder.SetInsertPoint(InsertPt);

        // To prevent poison from leaking across what used to be sequential,
        // safe, scalar boolean logic operations, the reduction operand must
        // be frozen.
        if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
          VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);

        // Emit code to correctly handle reused reduced values, if required.
        if (OptReusedScalars && !SameScaleFactor) {
          VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
                                         SameValuesCounter, TrackedToOrig);
        }

        Type *ScalarTy = VL.front()->getType();
        (void)ScalarTy;
        // Stash the vectorized slice; the scalar reduction emission is
        // postponed until all slices are processed.
        VectorValuesAndScales.emplace_back(
            VectorizedRoot,
            OptReusedScalars && SameScaleFactor
                ? SameValuesCounter.front().second
                : 1,
            /*IsSigned=*/V.isSignedMinBitwidthRootNode());
        // Count vectorized reduced values to exclude them from the final
        // reduction.
        for (Value *RdxVal : VL) {
          Value *OrigV = TrackedToOrig.at(RdxVal);
          if (IsSupportedHorRdxIdentityOp) {
            VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
            continue;
          }
          ++VectorizedVals.try_emplace(OrigV).first->getSecond();
          if (!V.isVectorized(RdxVal))
            RequiredExtract.insert(RdxVal);
        }
        Pos += ReduxWidth;
        Start = Pos;
        ReduxWidth = NumReducedVals - Pos;
        if (ReduxWidth > 1)
          ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
        AnyVectorized = true;
      }
      if (OptReusedScalars && !AnyVectorized) {
        // Nothing was vectorized for this slice: scale each repeated scalar
        // directly and fold it into the running reduction value.
        for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
          Value *RdxVal = TrackedVals.at(P.first);
          Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          VectorizedVals.try_emplace(P.first, P.second);
        }
      }
    }
    if (!VectorValuesAndScales.empty())
      VectorizedTree = GetNewVectorizedTree(
          VectorizedTree,
          emitReduction(Builder, *TTI, ReductionRoot->getType()));

    if (!VectorizedTree) {
      if (!CheckForReusedReductionOps) {
        // Nothing vectorized and no reused reduction ops: mark the roots as
        // analyzed to avoid repeated attempts.
        for (ReductionOpsType &RdxOps : ReductionOps)
          for (Value *RdxOp : RdxOps)
            V.analyzedReductionRoot(cast<Instruction>(RdxOp));
      }
      return nullptr;
    }
    // Reorder operands of bool logical ops into the natural order to avoid
    // possible problems with poison propagation. If reordering is not
    // possible (both operands are originally RHS), emit an extra freeze
    // instruction for the LHS operand.
    auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
                                                 Instruction *RedOp1,
                                                 Instruction *RedOp2,
                                                 bool InitStep) {
      if (!AnyBoolLogicOp)
        return;
      if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
                                    getRdxOperand(RedOp1, 0) == LHS ||
                                    isGuaranteedNotToBePoison(LHS, AC)))
        return;
      if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
                                    getRdxOperand(RedOp2, 0) == RHS ||
                                    isGuaranteedNotToBePoison(RHS, AC))) {
        std::swap(LHS, RHS);
        return;
      }
      if (LHS != VectorizedTree)
        LHS = Builder.CreateFreeze(LHS);
    };
    // Finish the reduction: fold in extra arguments and the possible
    // reduction values that were not vectorized, pairing them up to avoid
    // long dependency chains between the scalar remainders.
    auto FinalGen = [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
                        bool InitStep) {
      unsigned Sz = InstVals.size();
      SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
                                                               Sz % 2);
      for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
        Instruction *RedOp = InstVals[I + 1].first;
        Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
        Value *RdxVal1 = InstVals[I].second;
        Value *StableRdxVal1 = RdxVal1;
        auto It1 = TrackedVals.find(RdxVal1);
        if (It1 != TrackedVals.end())
          StableRdxVal1 = It1->second;
        Value *RdxVal2 = InstVals[I + 1].second;
        Value *StableRdxVal2 = RdxVal2;
        auto It2 = TrackedVals.find(RdxVal2);
        if (It2 != TrackedVals.end())
          StableRdxVal2 = It2->second;
        // To prevent poison from leaking across what used to be sequential,
        // safe, scalar boolean logic operations, the reduction operand must
        // be frozen.
        FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
                          RedOp, InitStep);
        Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
                                   StableRdxVal2, "op.rdx", ReductionOps);
        ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
      }
      if (Sz % 2 == 1)
        ExtraReds[Sz / 2] = InstVals.back();
      return ExtraReds;
    };
    SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
    ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
                                 VectorizedTree);
    SmallPtrSet<Value *, 8> Visited;
    for (ArrayRef<Value *> Candidates : ReducedVals) {
      for (Value *RdxVal : Candidates) {
        if (!Visited.insert(RdxVal).second)
          continue;
        unsigned NumOps = VectorizedVals.lookup(RdxVal);
        for (Instruction *RedOp :
             ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
          ExtraReductions.emplace_back(RedOp, RdxVal);
      }
    }
    // Iterate through all non-vectorized reduction values/extra arguments.
    bool InitStep = true;
    while (ExtraReductions.size() > 1) {
      SmallVector<std::pair<Instruction *, Value *>> NewReds =
          FinalGen(ExtraReductions, InitStep);
      ExtraReductions.swap(NewReds);
      InitStep = false;
    }
    VectorizedTree = ExtraReductions.front().second;

    ReductionRoot->replaceAllUsesWith(VectorizedTree);

    // The original scalar reduction ops must have no remaining uses outside
    // the reduction itself; erase them bottom-up.
    SmallPtrSet<Value *, 4> IgnoreSet;
    for (ReductionOpsType &RdxOps : ReductionOps)
      IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
    for (ReductionOpsType &RdxOps : ReductionOps)
      for (Value *Ignore : RdxOps) {
        if (!Ignore)
          continue;
#ifndef NDEBUG
        for (auto *U : Ignore->users()) {
          assert(IgnoreSet.count(U) &&
                 "All users must be either in the reduction ops list.");
        }
#endif
        if (!Ignore->use_empty()) {
          Value *P = PoisonValue::get(Ignore->getType());
          Ignore->replaceAllUsesWith(P);
        }
      }
    SmallVector<Value *> RdxOps;
    for (ReductionOpsType &Ops : ReductionOps)
      RdxOps.append(Ops.begin(), Ops.end());
    V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
    return VectorizedTree;
  }
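  // In outline, tryToReduce() above (a) walks ReducedVals slice by slice,
  // (b) retries each slice with progressively narrower vector factors,
  // (c) stashes every vectorized slice as a (vector, scale, signedness)
  // triple, and only then (d) emits the horizontal reduction plus a scalar
  // "op.rdx" chain for the unvectorized leftovers.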
  /// Reduces one vectorized slice \p Vec (repeated \p Scale times) to a
  /// scalar of type \p DestTy.
  Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
                        Value *Vec, unsigned Scale, bool IsSigned,
                        Type *DestTy) {
    Value *Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
    // (REVEC: when DestTy is itself a vector, each lane group is reduced
    // separately via emitReduction() and combined with createOp(); elided.)
    if (Rdx->getType() != DestTy)
      Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
    // A slice with a uniform repeat factor is reduced once and then scaled.
    if (Scale > 1)
      Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
    return Rdx;
  }
  /// Calculates the cost difference between the vectorized and the scalar
  /// form of the reduction (negative means profitable).
  InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                   ArrayRef<Value *> ReducedVals,
                                   bool IsCmpSelMinMax, FastMathFlags FMF,
                                   const BoUpSLP &R, DominatorTree &DT,
                                   const DataLayout &DL,
                                   const TargetLibraryInfo &TLI) {
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    Type *ScalarTy = ReducedVals.front()->getType();
    unsigned ReduxWidth = ReducedVals.size();
    FixedVectorType *VectorTy = R.getReductionType();
    InstructionCost VectorCost = 0, ScalarCost;
    // If all of the reduced values are constant, the vector cost is 0.
    bool AllConsts = allConstant(ReducedVals);
    auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
      InstructionCost Cost = 0;
      // Scalar cost is repeated for N-1 elements.
      int Cnt = ReducedVals.size();
      for (Value *RdxVal : ReducedVals) {
        if (Cnt == 1)
          break;
        --Cnt;
        if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
          Cost += GenCostFn();
          continue;
        }
        InstructionCost ScalarCost = 0;
        for (User *U : RdxVal->users()) {
          auto *RdxOp = cast<Instruction>(U);
          if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
            if (RdxKind == RecurKind::FAdd) {
              // A scalar fadd fed by an fmul may fold into an FMA; if so,
              // charge the fused cost minus the fmul that disappears:
              // ... (see canConvertToFMA)
              //   FMACost -= FMulCost;
              //   ScalarCost += FMACost;
            }
            ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
            continue;
          }
          ScalarCost = InstructionCost::getInvalid();
          break;
        }
        if (ScalarCost.isValid())
          Cost += ScalarCost;
        else
          Cost += GenCostFn();
      }
      return Cost;
    };
    // Slices already collected in VectorValuesAndScales carry their own
    // reduction emission, so no reduction op has to be costed here.
    bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
    switch (RdxKind) {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Or:
    case RecurKind::And:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
      if (!AllConsts) {
        if (DoesRequireReductionOp) {
          if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
            // REVEC: reduction over a vector of vectors, costed per lane
            // group over ReducedVals.size() elements.
            unsigned ScalarTyNumElements = VecTy->getNumElements();
            (void)ScalarTyNumElements;
            // ...
          } else {
            Type *RedTy = VectorTy->getElementType();
            auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
                std::make_pair(RedTy, true));
            if (RType == RedTy) {
              VectorCost += TTI->getArithmeticReductionCost(
                  RdxOpcode, VectorTy, FMF, CostKind);
            } else {
              VectorCost += TTI->getExtendedReductionCost(
                  RdxOpcode, !IsSigned, RedTy,
                  getWidenedType(RType, ReduxWidth), FMF, CostKind);
            }
          }
        } else {
          Type *RedTy = VectorTy->getElementType();
          auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
              std::make_pair(RedTy, true));
          VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
          InstructionCost FMACost = InstructionCost::getInvalid();
          if (RdxKind == RecurKind::FAdd) {
            // Check if all reduction operands can fold into vector FMAs.
            SmallVector<Value *> Ops;
            FastMathFlags FMF;
            FMF.set();
            for (Value *RdxVal : ReducedVals) {
              if (!match(RdxVal, m_FMul(m_Value(), m_Value())))
                continue;
              Ops.push_back(RdxVal);
              if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
                FMF &= FPCI->getFastMathFlags();
            }
            if (!Ops.empty()) {
              IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
                                          {RVecTy, RVecTy, RVecTy}, FMF);
              FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);
              InstructionCost FMulCost = TTI->getArithmeticInstrCost(
                  Instruction::FMul, RVecTy, CostKind);
              LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost
                                << ". Minus vector FMul cost: " << FMulCost
                                << "\n");
              FMACost -= FMulCost;
            }
          }
          VectorCost +=
              TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
          if (FMACost.isValid())
            VectorCost += FMACost;
          if (RType != RedTy) {
            unsigned Opcode = Instruction::Trunc;
            if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
              Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
            VectorCost += TTI->getCastInstrCost(
                Opcode, RedTy, RVecTy, TTI::CastContextHint::None, CostKind);
          }
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
      });
      break;
    }
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin: {
      Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(RdxKind);
      if (!AllConsts) {
        if (DoesRequireReductionOp) {
          VectorCost =
              TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
        } else {
          // Cost the vector min/max op plus the final cast, if any.
          Type *RedTy = VectorTy->getElementType();
          auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
              std::make_pair(RedTy, true));
          VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
          IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
          VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
          if (RType != RedTy) {
            unsigned Opcode = Instruction::Trunc;
            if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
              Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
            VectorCost += TTI->getCastInstrCost(
                Opcode, RedTy, RVecTy, TTI::CastContextHint::None, CostKind);
          }
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
        return TTI->getIntrinsicInstrCost(ICA, CostKind);
      });
      break;
    }
    default:
      llvm_unreachable("Expected arithmetic or min/max reduction operation");
    }

    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                      << " (It is a splitting reduction)\n");
    return VectorCost - ScalarCost;
  }
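  // Sign convention: the return value is VectorCost - ScalarCost, so a
  // negative result means the vector form wins. E.g. (illustrative)
  // VectorCost = 4 and ScalarCost = 7 give -3, which fails the
  // `Cost >= -SLPCostThreshold` bail-out in tryToReduce() with the default
  // threshold of 0, so the reduction is emitted.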
  /// Emits the final horizontal reduction over all collected
  /// (vector, scale, signedness) triples in VectorValuesAndScales.
  Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
                       Type *DestTy) {
    Value *ReducedSubTree = nullptr;
    // Reduce one vector value to scalar and fold it into the running result.
    auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
      Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
      if (ReducedSubTree)
        ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
                                  "op.rdx", ReductionOps);
      else
        ReducedSubTree = Rdx;
    };
    if (VectorValuesAndScales.size() == 1) {
      const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
      CreateSingleOp(Vec, Scale, IsSigned);
      return ReducedSubTree;
    }
    // Multiple vector values: combine them pairwise into one vector first
    // (cheaper than reducing each to scalar), then reduce once.
    Value *VecRes = nullptr;
    bool VecResSignedness = false;
    auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
      Type *ScalarTy = cast<VectorType>(Vec->getType())->getElementType();
      unsigned VF = getNumElements(Vec->getType());
      // Scale the vector value, if the kind permits it, before combining.
      switch (RdxKind) {
      case RecurKind::Add: {
        if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
          // Bool counting: replicate the i1 lanes Cnt times so the final
          // popcount-style reduction counts each bit Cnt times.
          LLVM_DEBUG(dbgs() << "SLP: Add (to-shuffle) " << Cnt << "of " << Vec
                            << ". (HorRdx)\n");
          SmallVector<int> Mask(Cnt * VF);
          for (unsigned I : seq<unsigned>(Cnt))
            std::iota(std::next(Mask.begin(), VF * I),
                      std::next(Mask.begin(), VF * (I + 1)), 0);
          ++NumVectorInstructions;
          Vec = Builder.CreateShuffleVector(Vec, Mask);
          break;
        }
        // res = mul vv, n
        if (Cnt > 1) {
          Value *Scale = ConstantVector::getSplat(
              ElementCount::getFixed(VF), ConstantInt::get(ScalarTy, Cnt));
          LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
                            << ". (HorRdx)\n");
          ++NumVectorInstructions;
          Vec = Builder.CreateMul(Vec, Scale);
        }
        break;
      }
      case RecurKind::Xor: {
        // res = n % 2 ? 0 : vv
        LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << Vec
                          << ". (HorRdx)\n");
        if (Cnt % 2 == 0)
          Vec = Constant::getNullValue(Vec->getType());
        break;
      }
      case RecurKind::FAdd: {
        // res = fmul v, n
        if (Cnt > 1) {
          Value *Scale = ConstantVector::getSplat(
              ElementCount::getFixed(VF), ConstantFP::get(ScalarTy, Cnt));
          LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
                            << ". (HorRdx)\n");
          ++NumVectorInstructions;
          Vec = Builder.CreateFMul(Vec, Scale);
        }
        break;
      }
      case RecurKind::And:
      case RecurKind::Or:
      case RecurKind::SMax:
      case RecurKind::SMin:
      case RecurKind::UMax:
      case RecurKind::UMin:
      case RecurKind::FMax:
      case RecurKind::FMin:
      case RecurKind::FMaximum:
      case RecurKind::FMinimum:
        // Idempotent under repetition: res = vv.
        break;
      case RecurKind::Sub:
      case RecurKind::AddChainWithSubs:
      case RecurKind::Mul:
      case RecurKind::FMul:
      case RecurKind::FMulAdd:
      case RecurKind::AnyOf:
      case RecurKind::FindFirstIVSMin:
      case RecurKind::FindFirstIVUMin:
      case RecurKind::FindLastIVSMax:
      case RecurKind::FindLastIVUMax:
      case RecurKind::FMaxNum:
      case RecurKind::FMinNum:
      case RecurKind::FMaximumNum:
      case RecurKind::FMinimumNum:
      case RecurKind::None:
        llvm_unreachable("Unexpected reduction kind for repeated scalar.");
      }
      if (!VecRes) {
        VecRes = Vec;
        VecResSignedness = IsSigned;
        return;
      }
      ++NumVectorInstructions;
      unsigned VecResVF = getNumElements(VecRes->getType());
      unsigned VecVF = getNumElements(Vec->getType());
      if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
          VecResVF == VecVF) {
        // Bool counting: concatenate the two i1 vectors instead of adding.
        SmallVector<int> Mask(VecResVF + VecVF);
        std::iota(Mask.begin(), Mask.end(), 0);
        // ...
      }
      if (VecResVF < VecVF) {
        // Widen VecRes to VecVF lanes with an identity-padded shuffle.
        SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecResVF),
                  0);
        // ...
      }
      Value *Op = VecRes;
      if (VecResVF != VecVF) {
        // Combine only the common prefix of lanes ...
      }
      Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
      if (VecResVF != VecVF) {
        // ... then reattach the remaining tail lanes of the wider value.
      }
      VecRes = Op;
    };
    for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
      CreateVecOp(Vec, Scale, IsSigned);
    CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);
    return ReducedSubTree;
  }
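  // E.g. (illustrative) two slices <4 x i32> %v0 (scale 1) and %v1 (scale 2)
  // combine as:
  //   %s = mul <4 x i32> %v1, splat (i32 2)
  //   %c = add <4 x i32> %v0, %s                                 ; "rdx.op"
  //   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %c)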
  /// Emits a horizontal reduction of the vectorized value.
  Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
                       const TargetTransformInfo *TTI, Type *DestTy) {
    assert(VectorizedValue && "Need to have a vectorized tree node");
    assert(RdxKind != RecurKind::FMulAdd &&
           "A call to the llvm.fmuladd intrinsic is not handled yet");

    auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
    if (FTy->getScalarType() == Builder.getInt1Ty() &&
        RdxKind == RecurKind::Add &&
        DestTy->getScalarType() != FTy->getScalarType()) {
      // Convert vector_reduce_add(zext(<n x i1>)) to
      // zext-or-trunc(ctpop(bitcast <n x i1> to i(n))).
      Value *V = Builder.CreateBitCast(
          VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
      ++NumVectorInstructions;
      return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
    }
    ++NumVectorInstructions;
    return createSimpleReduction(Builder, VectorizedValue, RdxKind);
  }
  /// Emits the code to scale \p VectorizedValue by the repeat count \p Cnt,
  /// when the same scalar participated in the reduction Cnt times.
  Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                               unsigned Cnt) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    if (Cnt == 1)
      return VectorizedValue;
    switch (RdxKind) {
    case RecurKind::Add: {
      // res = mul vv, n
      Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::Xor: {
      // res = n % 2 ? 0 : vv
      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
                        << ". (HorRdx)\n");
      if (Cnt % 2 == 0)
        return Constant::getNullValue(VectorizedValue->getType());
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // res = fmul v, n
      Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // res = vv
      return VectorizedValue;
    case RecurKind::Sub:
    case RecurKind::AddChainWithSubs:
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::AnyOf:
    case RecurKind::FindFirstIVSMin:
    case RecurKind::FindFirstIVUMin:
    case RecurKind::FindLastIVSMax:
    case RecurKind::FindLastIVUMax:
    case RecurKind::FMaxNum:
    case RecurKind::FMinNum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for repeated scalar.");
    }
    return nullptr;
  }
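  // The identities used above, for a scalar x repeated n times:
  //   add : x + ... + x == x * n          fadd : == x * (fp)n
  //   xor : x ^ ... ^ x == (n odd) ? x : 0
  //   and/or/min/max : idempotent, so the repeated value is the result.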
  /// Emits the vector-level fixup for reused scalars: every lane of
  /// \p VectorizedValue is scaled by how many times its original scalar
  /// occurred in the reduction, as recorded in \p SameValuesCounter.
  Value *
  emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
                const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
                const DenseMap<Value *, Value *> &TrackedToOrig) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    ArrayRef<Value *> VL = R.getRootNodeScalars();
    auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
    if (VTy->getElementType() != VL.front()->getType()) {
      VectorizedValue = Builder.CreateIntCast(
          VectorizedValue,
          getWidenedType(VL.front()->getType(), VTy->getNumElements()),
          R.isSignedMinBitwidthRootNode());
    }
    switch (RdxKind) {
    case RecurKind::Add: {
      // root = mul prev_root, <1, 1, n, 1>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
      }
      auto *Scale = ConstantVector::get(Vals);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul): " << *Scale << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
      // No need for multiple and/or(s).
      LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
                        << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // No need for multiple min/max(s) of the same value.
      LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
                        << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::Xor: {
      // Replace lanes with an even number of repeats by 0, since x ^ x == 0:
      // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, ...>
      // if the 4th and 6th elements have an even number of repeats.
      SmallVector<int> Mask(
          cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
          PoisonMaskElem);
      std::iota(Mask.begin(), Mask.end(), 0);
      bool NeedShuffle = false;
      for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
        Value *V = VL[I];
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        if (Cnt % 2 == 0) {
          Mask[I] = VF;
          NeedShuffle = true;
        }
      }
      LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I : Mask) dbgs() << I << " ";
                 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
      if (NeedShuffle)
        VectorizedValue = Builder.CreateShuffleVector(
            VectorizedValue,
            ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantFP::get(V->getType(), Cnt));
      }
      auto *Scale = ConstantVector::get(Vals);
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::Sub:
    case RecurKind::AddChainWithSubs:
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::AnyOf:
    case RecurKind::FindFirstIVSMin:
    case RecurKind::FindFirstIVUMin:
    case RecurKind::FindLastIVSMax:
    case RecurKind::FindLastIVUMax:
    case RecurKind::FMaxNum:
    case RecurKind::FMinNum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for reused scalars.");
    }
    return nullptr;
  }
};

/// Gets the recurrence kind from the specified value.
static RecurKind getRdxKind(Value *V) {
  return HorizontalReduction::getRdxKind(V);
}
/// Computes the total number of scalar elements in a (possibly nested)
/// homogeneous aggregate rooted at the type of \p InsertInst, or
/// std::nullopt if the aggregate is not homogeneous.
static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
  if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
    return cast<FixedVectorType>(IE->getType())->getNumElements();

  unsigned AggregateSize = 1;
  auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  do {
    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
      for (auto *Elt : ST->elements())
        if (Elt != ST->getElementType(0))
          return std::nullopt;
      AggregateSize *= ST->getNumElements();
      CurrentType = ST->getElementType(0);
    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      AggregateSize *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
      AggregateSize *= VT->getNumElements();
      return AggregateSize;
    } else if (CurrentType->isSingleValueType()) {
      return AggregateSize;
    } else {
      return std::nullopt;
    }
  } while (true);
}
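// E.g. (illustrative) for `%agg = insertvalue [2 x <2 x float>] ...` the
// walk multiplies 2 (array) by 2 (vector) and returns 4, while a struct
// mixing element types yields std::nullopt.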
static void findBuildAggregateRec(Instruction *LastInsertInst,
                                  TargetTransformInfo *TTI,
                                  SmallVectorImpl<Value *> &BuildVectorOpds,
                                  SmallVectorImpl<Value *> &InsertElts,
                                  unsigned OperandOffset, const BoUpSLP &R) {
  do {
    Value *InsertedOperand = LastInsertInst->getOperand(1);
    std::optional<unsigned> OperandIndex =
        getElementIndex(LastInsertInst, OperandOffset);
    if (!OperandIndex || R.isDeleted(LastInsertInst))
      return;
    if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
      findBuildAggregateRec(cast<Instruction>(InsertedOperand), TTI,
                            BuildVectorOpds, InsertElts, *OperandIndex, R);
    } else {
      BuildVectorOpds[*OperandIndex] = InsertedOperand;
      InsertElts[*OperandIndex] = LastInsertInst;
    }
    LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
  } while (LastInsertInst != nullptr &&
           isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
           LastInsertInst->hasOneUse());
}

static bool findBuildAggregate(Instruction *LastInsertInst,
                               TargetTransformInfo *TTI,
                               SmallVectorImpl<Value *> &BuildVectorOpds,
                               SmallVectorImpl<Value *> &InsertElts,
                               const BoUpSLP &R) {
  assert((isa<InsertElementInst>(LastInsertInst) ||
          isa<InsertValueInst>(LastInsertInst)) &&
         "Expected insertelement or insertvalue instruction!");
  assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
         "Expected empty result vectors!");

  std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
  if (!AggregateSize)
    return false;
  BuildVectorOpds.resize(*AggregateSize);
  InsertElts.resize(*AggregateSize);

  findBuildAggregateRec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0,
                        R);
  llvm::erase(BuildVectorOpds, nullptr);
  llvm::erase(InsertElts, nullptr);
  return BuildVectorOpds.size() >= 2;
}
/// Try and get a reduction instruction from a phi node \p P.
static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
                                      BasicBlock *ParentBB, LoopInfo *LI) {
  // There are situations where the reduction value is not dominated by the
  // reduction phi. Only use the reduction value if it dominates the phi.
  auto DominatedReduxValue = [&](Value *R) {
    return isa<Instruction>(R) &&
           DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
  };

  Instruction *Rdx = nullptr;

  // Return the incoming value if it comes from the same BB as the phi node.
  if (P->getIncomingBlock(0) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  // Otherwise, check whether we have a loop latch to look at.
  Loop *BBLoop = LI->getLoopFor(ParentBB);
  if (!BBLoop)
    return nullptr;
  BasicBlock *BBLatch = BBLoop->getLoopLatch();
  if (!BBLatch)
    return nullptr;

  // There is a loop latch, return the incoming value if it comes from that.
  if (P->getIncomingBlock(0) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  return nullptr;
}

/// Matches a binary operation or a min/max intrinsic rooted at \p I and
/// returns its two reduced operands in \p V0/\p V1.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
  Value *Op0 = nullptr;
  Value *Op1 = nullptr;
  if (match(I, m_BinOp(m_Value(Op0), m_Value(Op1)))) {
    V0 = Op0;
    V1 = Op1;
    return true;
  }
  // ... min/max intrinsics are matched similarly.
  return false;
}

/// We could have an initial reduction that is not an add, e.g.
///   r *= v1 + v2 + v3 + v4
/// In such a case start looking for a tree rooted in the first '+'.
/// \returns the new root if found.
static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
                                                 Instruction *Root) {
  assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
          isa<IntrinsicInst>(Root)) &&
         "Expected binop, select, or intrinsic for reduction matching");
  Value *LHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
  Value *RHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
  if (LHS == Phi)
    return dyn_cast<Instruction>(RHS);
  if (RHS == Phi)
    return dyn_cast<Instruction>(LHS);
  return nullptr;
}

/// \returns true if \p I is a candidate instruction for reduction
/// vectorization.
static bool isReductionCandidate(Instruction *I) {
  bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
  Value *B0 = nullptr, *B1 = nullptr;
  bool IsBinop = matchRdxBop(I, B0, B1);
  return IsBinop || IsSelect;
}
bool SLPVectorizerPass::vectorizeHorReduction(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
    SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
  if (!ShouldVectorizeHor)
    return false;
  bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);

  if (Root->getParent() != BB || isa<PHINode>(Root))
    return false;

  // If we can find a secondary reduction root, use that instead.
  auto SelectRoot = [&]() {
    if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
        HorizontalReduction::getRdxKind(Root) != RecurKind::None)
      if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
        return NewRoot;
    return Root;
  };

  // Start analysis starting from Root instruction. If horizontal reduction
  // is found, try to vectorize it. Otherwise, if the analyzed instruction is
  // a binary operation, try to vectorize the operands using pre-order DFS
  // traversal order.
  std::queue<std::pair<Instruction *, unsigned>> Stack;
  Stack.emplace(SelectRoot(), 0);
  SmallPtrSet<Value *, 8> VisitedInstrs;
  bool Res = false;
  auto TryToReduce = [this, &R](Instruction *Inst) -> Value * {
    if (R.isAnalyzedReductionRoot(Inst))
      return nullptr;
    if (!isReductionCandidate(Inst))
      return nullptr;
    HorizontalReduction HorRdx;
    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
      return nullptr;
    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
  };
  auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
    if (TryOperandsAsNewSeeds && FutureSeed == Root) {
      FutureSeed = getNonPhiOperand(Root, P);
      if (!FutureSeed)
        return false;
    }
    // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
    // analysis is done separately.
    if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
      PostponedInsts.push_back(FutureSeed);
    return true;
  };

  while (!Stack.empty()) {
    Instruction *Inst;
    unsigned Level;
    std::tie(Inst, Level) = Stack.front();
    Stack.pop();
    // Do not try to analyze instructions that were already vectorized.
    if (R.isDeleted(Inst))
      continue;
    if (Value *VectorizedV = TryToReduce(Inst)) {
      Res = true;
      if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
        // Try to find another reduction.
        Stack.emplace(I, Level);
        continue;
      }
      if (R.isDeleted(Inst))
        continue;
    } else {
      // We could not vectorize `Inst`, so try to use it as a future seed.
      if (!TryAppendToPostponedInsts(Inst)) {
        assert(Stack.empty() && "Expected empty stack");
        break;
      }
    }

    // Try to vectorize operands. Continue analysis only for instructions
    // from the same basic block, to save compile time.
    if (++Level < RecursionMaxDepth)
      for (auto *Op : Inst->operand_values())
        if (VisitedInstrs.insert(Op).second)
          if (auto *I = dyn_cast<Instruction>(Op))
            // Do not vectorize CmpInst operands here, this is done
            // separately.
            if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
                !R.isDeleted(I) && I->getParent() == BB)
              Stack.emplace(I, Level);
  }
  return Res;
}
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I))
    return false;
  // Skip potential FMA candidates.
  if ((I->getOpcode() == Instruction::FAdd ||
       I->getOpcode() == Instruction::FSub) /* ... see canConvertToFMA */)
    return false;

  Value *P = I->getParent();

  // Vectorize in the current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(Op0) || R.isDeleted(Op1))
    return false;

  // First collect all possible candidates.
  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  Candidates.emplace_back(Op0, Op1);

  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  // Try to skip B.
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
      Candidates.emplace_back(A, B0);
    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
      Candidates.emplace_back(A, B1);
  }
  // Try to skip A.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
      Candidates.emplace_back(A0, B);
    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
      Candidates.emplace_back(A1, B);
  }

  auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
                                             ArrayRef<Value *> Ops) {
    if (!isReductionCandidate(Inst))
      return false;
    Type *Ty = Inst->getType();
    if (!isValidElementType(Ty) || Ty->isPointerTy())
      return false;
    HorizontalReduction HorRdx(Inst, Ops);
    if (!HorRdx.matchReductionForOperands())
      return false;
    // Estimate the cost of the 2-way reduction against the scalar op:
    // extract overhead for the inputs plus the scalar instruction itself
    // versus the vector reduction.
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    FixedVectorType *VecTy = FixedVectorType::get(Ty, 2);
    InstructionCost ScalarCost =
        TTI.getScalarizationOverhead(VecTy, APInt::getAllOnes(2),
                                     /*Insert=*/false, /*Extract=*/true,
                                     CostKind) +
        TTI.getInstructionCost(Inst, CostKind);
    FastMathFlags FMF;
    if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
      FMF = FPCI->getFastMathFlags();
    InstructionCost RedCost = TTI.getArithmeticReductionCost(
        Inst->getOpcode(), VecTy, FMF, CostKind);
    if (RedCost >= ScalarCost)
      return false;
    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
  };
  if (Candidates.size() == 1)
    return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return (*BestCandidate == 0 &&
          TryToReduce(I, {Candidates[*BestCandidate].first,
                          Candidates[*BestCandidate].second})) ||
         tryToVectorizeList({Candidates[*BestCandidate].first,
                             Candidates[*BestCandidate].second},
                            R);
}
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
                                                 BasicBlock *BB, BoUpSLP &R) {
  SmallVector<WeakTrackingVH> PostponedInsts;
  bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
  Res |= tryToVectorize(PostponedInsts, R);
  return Res;
}

bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
                                       BoUpSLP &R) {
  bool Res = false;
  for (Value *V : Insts)
    if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
      Res |= tryToVectorize(Inst, R);
  return Res;
}
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 bool MaxVFOnly) {
  if (!R.canMapToVector(IVI->getType()))
    return false;

  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<Value *, 16> BuildVectorInsts;
  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
    return false;

  if (MaxVFOnly && BuildVectorOpds.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
             << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // Aggregate value is unlikely to be processed in a vector register.
  return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
}
bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R,
                                                   bool MaxVFOnly) {
  SmallVector<Value *, 16> BuildVectorInsts;
  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<int> Mask;
  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
      (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
       isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
    return false;

  if (MaxVFOnly && BuildVectorInsts.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
             << "Cannot SLP vectorize list: only 2 elements of buildvector, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
}
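// E.g. (illustrative IR) the buildvector chain
//   %v0 = insertelement <4 x float> poison, float %a, i32 0
//   %v1 = insertelement <4 x float> %v0, float %b, i32 1
//   ...
// is flattened by findBuildAggregate() into the operand list {%a, %b, ...},
// which is what tryToVectorizeList() then attempts to vectorize as a bundle.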
template <typename T>
static bool tryToVectorizeSequence(
    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
    function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
    function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
    bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, parent, operands.
  stable_sort(Incoming, Comparator);

  // Try to vectorize elements based on their type.
  SmallVector<T *> Candidates;
  SmallVector<T *> VL;
  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
       VL.clear()) {
    // Look for the next elements with the same type, parent and operand
    // kinds.
    auto *I = dyn_cast<Instruction>(*IncIt);
    if (!I || R.isDeleted(I)) {
      ++IncIt;
      continue;
    }
    auto *SameTypeIt = IncIt;
    while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
                               R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                               AreCompatible(VL, *SameTypeIt))) {
      auto *I = dyn_cast<Instruction>(*SameTypeIt);
      ++SameTypeIt;
      if (I && !R.isDeleted(I))
        VL.push_back(cast<T>(I));
    }

    // Try to vectorize them.
    unsigned NumElts = VL.size();
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      // Success: start over because instructions might have changed.
      Changed = true;
    } else {
      // Unable to vectorize: collect for later, smaller attempts.
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(2U, R.getMaxVecRegSize() / EltSize);
      };
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType())) {
        for (T *V : VL)
          if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
            Candidates.push_back(V);
      }
    }
    // Final attempt to vectorize the instructions with the same types.
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success: start over because instructions might have changed.
        Changed = true;
      } else if (MaxVFOnly) {
        // Retry with smaller vectors, grouped by compatibility again.
        SmallVector<T *> VL;
        for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
             VL.clear()) {
          auto *I = dyn_cast<Instruction>(*It);
          if (!I || R.isDeleted(I)) {
            ++It;
            continue;
          }
          auto *SameTypeIt = It;
          while (SameTypeIt != End &&
                 (!isa<Instruction>(*SameTypeIt) ||
                  R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                  AreCompatible(*SameTypeIt, *It))) {
            auto *I = dyn_cast<Instruction>(*SameTypeIt);
            ++SameTypeIt;
            if (I && !R.isDeleted(I))
              VL.push_back(cast<T>(I));
          }
          unsigned NumElts = VL.size();
          if (NumElts > 1 &&
              TryToVectorizeHelper(ArrayRef(VL), /*MaxVFOnly=*/false))
            Changed = true;
          It = SameTypeIt;
        }
      }
      Candidates.clear();
    }

    // Start over at the next instruction of a different type (or the end).
    IncIt = SameTypeIt;
  }
  return Changed;
}
/// Compare two cmp instructions. If IsCompatibility is true, returns whether
/// the two cmps are compatible (same predicate up to swapping, same operand
/// types); otherwise implements a strict weak ordering used for sorting.
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
                       const DominatorTree &DT) {
  assert(isValidElementType(V->getType()) &&
         isValidElementType(V2->getType()) &&
         "Expected valid element types only.");
  if (V == V2)
    return IsCompatibility;
  auto *CI1 = cast<CmpInst>(V);
  auto *CI2 = cast<CmpInst>(V2);
  if (CI1->getOperand(0)->getType()->getTypeID() <
      CI2->getOperand(0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getTypeID() >
      CI2->getOperand(0)->getType()->getTypeID())
    return false;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return false;
  CmpInst::Predicate Pred1 = CI1->getPredicate();
  CmpInst::Predicate Pred2 = CI2->getPredicate();
  CmpInst::Predicate BasePred1 = std::min(Pred1, CmpInst::getSwappedPredicate(Pred1));
  CmpInst::Predicate BasePred2 = std::min(Pred2, CmpInst::getSwappedPredicate(Pred2));
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  // Compare operands.
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
    auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
    if (Op1 == Op2)
      continue;
    if (Op1->getValueID() < Op2->getValueID())
      return !IsCompatibility;
    if (Op1->getValueID() > Op2->getValueID())
      return false;
    if (auto *I1 = dyn_cast<Instruction>(Op1))
      if (auto *I2 = dyn_cast<Instruction>(Op2)) {
        if (IsCompatibility) {
          if (I1->getParent() != I2->getParent())
            return false;
        } else {
          // Try to compare nodes with the same parent.
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        }
        InstructionsState S = getSameOpcode({I1, I2}, TLI);
        if (S && (IsCompatibility || !S.isAltShuffle()))
          continue;
        if (IsCompatibility)
          return false;
        if (I1->getOpcode() != I2->getOpcode())
          return I1->getOpcode() < I2->getOpcode();
      }
  }
  return IsCompatibility;
}
template <typename ItT>
bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
                                          BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // Try to find reductions first.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    for (Value *Op : I->operands())
      if (auto *RootOp = dyn_cast<Instruction>(Op)) {
        Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
        if (R.isDeleted(I))
          break;
      }
  }
  // Try to vectorize operands as vector bundles.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    Changed |= tryToVectorize(I, R);
  }
  // Try to vectorize the list of compares.
  auto CompareSorter = [&](Value *V, Value *V2) {
    return compareCmp<false>(V, V2, *TLI, *DT);
  };
  auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V2) {
    if (VL.empty() || VL.back() == V2)
      return true;
    return compareCmp<true>(VL.back(), V2, *TLI, *DT);
  };

  SmallVector<Value *> Vals;
  for (Instruction *V : CmpInsts)
    if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
      Vals.push_back(V);
  if (Vals.size() <= 1)
    return Changed;
  Changed |= tryToVectorizeSequence<Value>(
      Vals, CompareSorter, AreCompatibleCompares,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        // Exclude possible reductions from other blocks.
        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
          return any_of(V->users(), [V](User *U) {
            auto *Select = dyn_cast<SelectInst>(U);
            return Select &&
                   Select->getParent() != cast<Instruction>(V)->getParent();
          });
        });
        if (ArePossiblyReducedInOtherBlock)
          return false;
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
      },
      /*MaxVFOnly=*/true, R);
  return Changed;
}
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
  assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
         "This function only accepts Insert instructions");
  bool OpsChanged = false;
  SmallVector<WeakTrackingVH> PostponedInsts;
  for (auto *I : reverse(Instructions)) {
    // pass1 - try to match and vectorize a buildvector sequence for MaxVF
    // only.
    if (R.isDeleted(I))
      continue;
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/true);
    }
    // pass2 - try to vectorize reductions only.
    if (R.isDeleted(I))
      continue;
    OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
    if (R.isDeleted(I))
      continue;
    // pass3 - try to match and vectorize a buildvector sequence.
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/false);
    }
  }
  // Now try to vectorize the postponed instructions.
  OpsChanged |= tryToVectorize(PostponedInsts, R);

  Instructions.clear();
  return OpsChanged;
}
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallPtrSet<Value *, 16> VisitedInstrs;
  // Maps phi nodes to the non-phi nodes found in their use trees, used to
  // quickly find compatible phis.
  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
    assert(isValidElementType(V1->getType()) &&
           isValidElementType(V2->getType()) &&
           "Expected vectorizable types only.");
    if (V1 == V2)
      return false;
    if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
      return true;
    if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
      return false;
    if (V1->getType()->getScalarSizeInBits() <
        V2->getType()->getScalarSizeInBits())
      return true;
    if (V1->getType()->getScalarSizeInBits() >
        V2->getType()->getScalarSizeInBits())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() < Opcodes2.size())
      return true;
    if (Opcodes1.size() > Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      {
        // Instructions come first.
        auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
        auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
        if (I1 && I2) {
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
          if (S && !S.isAltShuffle() &&
              I1->getOpcode() == I2->getOpcode()) {
            // Same opcode in the same block: order by position.
            DomTreeNodeBase<BasicBlock> *NodeV1 =
                DT->getNode(I1->getParent());
            DomTreeNodeBase<BasicBlock> *NodeV2 =
                DT->getNode(I2->getParent());
            if (!NodeV1)
              return NodeV2 != nullptr;
            if (!NodeV2)
              return false;
            assert((NodeV1 == NodeV2) ==
                       (NodeV1->getDFSNumIn() == NodeV2->getDFSNumIn()) &&
                   "Different nodes should have different DFS numbers");
            if (NodeV1 != NodeV2)
              return NodeV1->getDFSNumIn() < NodeV2->getDFSNumIn();
            return I1->comesBefore(I2);
          }
          // Order extracts by their source lane, if both have one.
          std::optional<unsigned> Id1 = getExtractIndex(I1);
          std::optional<unsigned> Id2 = getExtractIndex(I2);
          if (Id1 && Id2 && *Id1 != *Id2)
            return *Id1 < *Id2;
          if (I1->getOpcode() == I2->getOpcode())
            continue;
          return I1->getOpcode() < I2->getOpcode();
        }
        if (I1)
          return true;
        if (I2)
          return false;
      }
      {
        // Non-undef constants come next.
        bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
        bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
        if (C1 && C2)
          continue;
        if (C1)
          return true;
        if (C2)
          return false;
      }
      bool U1 = isa<UndefValue>(Opcodes1[I]);
      bool U2 = isa<UndefValue>(Opcodes2[I]);
      {
        // Non-constant non-instructions come next.
        if (!U1 && !U2) {
          auto ValID1 = Opcodes1[I]->getValueID();
          auto ValID2 = Opcodes2[I]->getValueID();
          if (ValID1 == ValID2)
            continue;
          if (ValID1 < ValID2)
            return true;
          if (ValID1 > ValID2)
            return false;
        }
        if (!U1)
          return true;
        if (!U2)
          return false;
      }
      // Undefs come last.
      assert(U1 && U2 && "The only thing left should be undef & undef.");
    }
    return false;
  };
  auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL,
                                                     Value *V1) {
    if (VL.empty() || V1 == VL.back())
      return true;
    Value *V2 = VL.back();
    if (V1->getType() != V2->getType())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() != Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // Undefs are compatible with any other value.
      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
        continue;
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
          if (R.isDeleted(I1) || R.isDeleted(I2))
            return false;
          if (I1->getParent() != I2->getParent())
            return false;
          if (getSameOpcode({I1, I2}, *TLI))
            continue;
          return false;
        }
      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
        continue;
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
        return false;
    }
    return true;
  };

  bool HaveVectorizedPhiNodes = false;
  do {
    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (Instruction &I : *BB) {
      auto *P = dyn_cast<PHINode>(&I);
      if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
        break;
      // No need to analyze deleted, vectorized and non-vectorizable
      // instructions.
      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
          isValidElementType(P->getType()))
        Incoming.push_back(P);
    }

    if (Incoming.size() <= 1)
      break;

    // Find the corresponding non-phi nodes for better matching when trying
    // to build the tree.
    for (Value *V : Incoming) {
      SmallVectorImpl<Value *> &Opcodes =
          PHIToOpcodes.try_emplace(V).first->getSecond();
      if (!Opcodes.empty())
        continue;
      SmallVector<Value *, 4> Nodes(1, V);
      SmallPtrSet<Value *, 4> Visited;
      while (!Nodes.empty()) {
        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
        if (!Visited.insert(PHI).second)
          continue;
        for (Value *V : PHI->incoming_values()) {
          if (auto *PHI1 = dyn_cast<PHINode>(V)) {
            Nodes.push_back(PHI1);
            continue;
          }
          Opcodes.emplace_back(V);
        }
      }
    }

    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, PHICompare, AreCompatiblePHIs,
        [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
          return tryToVectorizeList(Candidates, R, MaxVFOnly);
        },
        /*MaxVFOnly=*/true, R);
    Changed |= HaveVectorizedPhiNodes;
    if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
          auto *PHI = dyn_cast<PHINode>(P.first);
          return !PHI || R.isDeleted(PHI);
        }))
      PHIToOpcodes.clear();
    VisitedInstrs.insert(Incoming.begin(), Incoming.end());
  } while (HaveVectorizedPhiNodes);
  VisitedInstrs.clear();

  InstSetVector PostProcessInserts;
  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes Inserts in PostProcessInserts and, if VectorizeCmps is true,
  // also vectorizes PostProcessCmps.
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
      PostProcessCmps.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // Returns true if I is in PostProcessInserts or PostProcessCmps.
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
    return isa<InsertElementInst, InsertValueInst>(I) &&
           PostProcessInserts.contains(I);
  };
  // Returns true if instruction I has no users, i.e. it is trivially dead.
  auto HasNoUsers = [](Instruction *I) {
    return I->use_empty() &&
           (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
  };
  for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
    // Skip instructions marked for deletion.
    if (R.isDeleted(&*It))
      continue;
    // We may go through BB multiple times, so skip what we have checked.
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // Start over: some instructions were deleted and the iterator may be
        // invalid.
        Changed = true;
        It = BB->begin();
        E = BB->end();
      }
      continue;
    }

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(It)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() == 2) {
        // Try to match and vectorize a horizontal reduction.
        Instruction *Root = getReductionInstr(DT, P, BB, LI);
        if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
          Changed = true;
          It = BB->begin();
          E = BB->end();
          continue;
        }
      }
      // Try to vectorize the incoming values of the PHI, to catch reductions
      // that feed into PHIs.
      for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
        // Skip the current BB for now; also bypass unreachable IR for
        // efficiency and to avoid crashing.
        if (BB == P->getIncomingBlock(I) ||
            !DT->isReachableFromEntry(P->getIncomingBlock(I)))
          continue;

        // Postponed instructions should not be vectorized here, delay their
        // vectorization.
        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
            PI && !IsInPostProcessInstrs(PI)) {
          bool Res =
              vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
          Changed |= Res;
          if (Res && R.isDeleted(P)) {
            It = BB->begin();
            E = BB->end();
            break;
          }
        }
      }
      continue;
    }

    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      auto *SI = dyn_cast<StoreInst>(It);
      bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
      if (SI) {
        auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
        // Try to vectorize a chain in the store only if this is the only
        // store to the address in the block.
        TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                              SI->getValueOperand()->hasOneUse();
      }
      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          // Postponed instructions should not be vectorized here, delay
          // their vectorization.
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            // Try to match and vectorize a horizontal reduction.
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
        }
      }
      // Start vectorization of the post-process list of instructions from
      // the top-tree instructions, to vectorize as much as possible.
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
      if (OpsChanged) {
        // Start over: some instructions were deleted and the iterator may be
        // invalid.
        Changed = true;
        It = BB->begin();
        E = BB->end();
        continue;
      }
    }

    if (isa<InsertElementInst, InsertValueInst>(It))
      PostProcessInserts.insert(&*It);
    else if (isa<CmpInst>(It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));
  }

  return Changed;
}
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  auto Changed = false;
  for (auto &Entry : GEPs) {
    // If the getelementptr list has fewer than two elements, there is
    // nothing to do.
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");

    // Process the GEP list in chunks suitable for the target's supported
    // vector size. We are trying to vectorize the index computations, so the
    // maximum number of elements is based on the size of the index
    // expression rather than the size of the GEP itself.
    auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
      return !R.isDeleted(GEP);
    });
    if (It == Entry.second.end())
      continue;
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);

      // Initialize a set of candidate getelementptrs. A SetVector preserves
      // program order: if the index computations begin with loads, we want
      // to minimize the chance of having to reorder them later.
      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

      // Some candidates may have been vectorized since we collected them or
      // have an index optimized to a constant; they are marked as deleted,
      // so remove them.
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
      });

      // Remove all pairs of getelementptrs with constant differences: one
      // can be computed from the other, so they are likely not good
      // candidates for a bottom-up phase. Also ensure the indices are
      // unique.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1;
           ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }

      // We break out as soon as we know there are fewer than two candidates
      // remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try to vectorize the indices. We are currently only interested in
      // gather-like cases of the form
      //   ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
      // where the loads of "a" and "b" and the subtractions can be performed
      // in parallel; it is likely not profitable in other cases.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointers and value operands. Value operands must be
  // compatible (same opcode, same parent), otherwise it is definitely not
  // profitable to try to vectorize them.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return true;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return false;
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
            DT->getNode(I1->getParent());
        DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
            DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        return I1->getOpcode() < I2->getOpcode();
      }
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  bool SameParent = true;
  auto AreCompatibleStores = [&](ArrayRef<StoreInst *> VL, StoreInst *V1) {
    if (VL.empty() || V1 == VL.back())
      return true;
    StoreInst *V2 = VL.back();
    if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
      return false;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(V1->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return true;
    if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        SameParent &= I1->getParent() == I2->getParent();
        // Build the instructions state over all stored values to check if
        // they share a (possibly copyable) opcode.
        SmallVector<Value *> NewVL(VL.size() + 1);
        for (auto [SI, V] : zip(VL, NewVL))
          V = SI->getValueOperand();
        NewVL.back() = V1->getValueOperand();
        InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
        InstructionsState S = Analysis.buildInstructionsState(
            NewVL, R /* ... copyable-elements mode if SameParent */);
        return S.valid();
      }
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
      continue;

    // Reverse the stores for bottom-to-top analysis. This matters when the
    // same address is stored several times, in which case the stores must be
    // followed in reverse order to meet the memory dependencies.
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R, Attempted);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
ReachingDefInfo InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
static bool runImpl(Function &F, const TargetLowering &TLI, AssumptionCache *AC)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
static std::optional< OperandInfo > getOperandInfo(const MachineOperand &MO)
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static cl::opt< bool > SplitAlternateInstructions("slp-split-alternate-instructions", cl::init(true), cl::Hidden, cl::desc("Improve the code quality by splitting alternate instructions"))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static bool isMaskedLoadCompress(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, const DominatorTree &DT, const TargetLibraryInfo &TLI, const function_ref< bool(Value *)> AreAllUsersVectorized, bool &IsMasked, unsigned &InterleaveFactor, SmallVectorImpl< int > &CompressMask, VectorType *&LoadVecTy)
Checks if the VL can be transformed to a (masked)load + compress or (masked) interleaved load.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< bool > VectorizeCopyableElements("slp-copyable-elements", cl::init(true), cl::Hidden, cl::desc("Try to replace values with the idempotent instructions for " "better vectorization."))
Enables vectorization of copyable elements.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool isCommutative(Instruction *I, Value *ValWithUses)
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert fadd/fsub sequence to FMAD.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static const SCEV * calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static cl::opt< bool > ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden, cl::desc("Generate strided loads even if they are not " "profitable. Used for testing only."))
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an main operation for the given MainOp and AltOp instruction...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Analyzes the provided shuffle masks and performs the requested actions on the vectors with these shuffle masks.
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns the power-of-2 number of elements in a single register (part), given the total number of elements Size and the number of parts NumParts.
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates a key/subkey pair for the given value to provide effective sorting of the values and better detection of vectorizable value sequences.
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorType, whole subvectors are inserted or extracted instead of individual scalars.
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds a compress-like mask for shuffles for the given PointerOps, ordered with Order.
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of a vectorized intrinsic call (if possible) and a vectorized function call (if possible).
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the argument types vector for the given call instruction with the given ID for the specified vector factor VF.
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try to get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms a type that TTI splits into whole vector types with the same number of elements each.
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing a set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType, a subvector is extracted instead of a scalar.
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find a subvector of loads and builds a new vector of only loads, if that can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, e.g. a sequence of extractelement instructions taken from one or two fixed vectors.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(ArrayRef< T * >, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and if not, packs them, building the ReuseShuffleIndices mask.
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned the special value (size), which is out of bounds; such entries are fixed up to form a proper ordering.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates a subvector extract using a default shuffle.
static cl::opt< bool > DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden, cl::desc("Disable tree reordering even if it is " "profitable. Used for testing only."))
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType, a subvector is inserted or extracted instead of a scalar.
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
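The STATISTIC macro and the cl::opt declarations listed above follow a fixed pattern. A minimal sketch with made-up names (slp-example-threshold is not a real SLP option); note STATISTIC requires DEBUG_TYPE to be defined first:

#include "llvm/ADT/Statistic.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;

#define DEBUG_TYPE "slp-example"
STATISTIC(NumExampleHits, "Number of example hits"); // reported under -stats

// Hidden integer knob; illustrative only, not an option this pass defines.
static cl::opt<unsigned> ExampleThreshold(
    "slp-example-threshold", cl::init(4), cl::Hidden,
    cl::desc("Illustrative threshold knob"));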
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds one more input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds one more input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds one more input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds a single input vector (in the form of a tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in the form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
~ShuffleInstructionBuilder()
void resetForSameNode()
Reset the builder to handle perfect diamond match.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
uint64_t getZExtValue() const
Get zero extended value.
void setBit(unsigned BitPosition)
Set to 1 the bit whose position is given by BitPosition.
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
unsigned getBitWidth() const
Return the number of bits in the APInt.
bool ult(const APInt &RHS) const
Unsigned less than comparison.
void clearAllBits()
Set every bit to 0.
void negate()
Negate this APInt in place.
unsigned logBase2() const
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
bool isOne() const
Determine if this is a value of 1.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
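A minimal sketch of the APInt bit-manipulation calls above, in the spirit of a demanded-lanes mask; the helper and its inputs are illustrative:

#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

APInt demandOneLane(unsigned NumElts, unsigned Lane) {
  APInt Demanded = APInt::getZero(NumElts); // all lanes clear
  Demanded.setBit(Lane);                    // demand exactly one lane
  assert(Demanded.isPowerOf2() && "exactly one bit set");
  APInt Rest = APInt::getAllOnes(NumElts);
  Rest.clearBit(Lane);                      // every lane except Lane
  assert((Demanded | Rest).isAllOnes() && "the two masks partition the lanes");
  return Demanded;
}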
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
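A small sketch of the ArrayRef views above; the int payload stands in for the Value* bundles the pass works with:

#include "llvm/ADT/ArrayRef.h"
using namespace llvm;

int sumInTwoHalves(ArrayRef<int> VL) {
  int Sum = 0;
  // take_front/drop_front return non-owning views into the same array;
  // slice(N, M) would skip N elements and keep the next M.
  for (int V : VL.take_front(VL.size() / 2))
    Sum += V;
  for (int V : VL.drop_front(VL.size() / 2))
    Sum += V;
  return Sum;
}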
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::reverse_iterator reverse_iterator
InstListType::iterator iterator
Instruction iterators...
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic, or a static alloca.
InstListType::const_reverse_iterator const_reverse_iterator
bool isEHPad() const
Return true if this basic block is an exception handling block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed, or null if it is not.
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst); holds everything related to calling a function.
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signature does not match the call signature.
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on each one.
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
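A sketch of the predicate queries above: normalizing "greater than" compares to their "less than" forms; the caller would swap the compare's operands whenever Swapped is set (the helper name is made up):

#include "llvm/IR/Instructions.h"
using namespace llvm;

CmpInst::Predicate canonicalizeToLessThan(CmpInst::Predicate P, bool &Swapped) {
  Swapped = false;
  switch (P) {
  case CmpInst::ICMP_SGT:
  case CmpInst::ICMP_SGE:
  case CmpInst::ICMP_UGT:
  case CmpInst::ICMP_UGE:
    Swapped = true;
    return CmpInst::getSwappedPredicate(P); // SGT->SLT, UGE->ULE, ...
  default:
    return P;
  }
}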
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
const APInt & getValue() const
Return the constant as an APInt value reference.
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
static bool shouldExecute(unsigned CounterName)
static DebugLoc getUnknown()
An analysis that produces DemandedBits for a function.
LLVM_ABI APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exists.
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
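A sketch of the DenseMap operations above: counting distinct keys with try_emplace (the unsigned keys stand in for Instruction opcodes):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
using namespace llvm;

unsigned countDistinctOpcodes(ArrayRef<unsigned> Opcodes) {
  DenseMap<unsigned, unsigned> Freq;
  for (unsigned Op : Opcodes)
    ++Freq.try_emplace(Op, 0).first->second; // insert 0 on first sight, bump
  return Freq.size(); // lookup(K) on a missing key would return 0
}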
Implements a dense probed hash-table based set.
Base class for the actual dominator tree node.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
bool allowReassoc() const
Flag queries.
bool allowContract() const
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
For the node iterator we just need to turn the TreeEntry iterator into a TreeEntry* iterator so that it dereferences to NodeRef.
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
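A sketch of the IRBuilder calls above: emitting a two-source lane blend at the end of a block; BB, V1 and V2 are assumed inputs, with V1/V2 being 4-element vectors:

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

Value *emitBlend(BasicBlock *BB, Value *V1, Value *V2) {
  IRBuilder<> Builder(BB->getContext());
  Builder.SetInsertPoint(BB); // append to the end of the block
  int Mask[] = {0, 5, 2, 7};  // even lanes from V1, odd lanes from V2
  return Builder.CreateShuffleVector(V1, V2, Mask, "blend");
}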
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos lives in, right after MovePos.
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instruction comes before Other.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
T & front() const
front - Get the first element.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address space zero).
A discriminated union of two or more pointer types, with the discriminator in the low bit of the pointer.
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience functions.
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
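A sketch of the ScalarEvolution calls above: computing the symbolic distance between two pointers, as a stride check might; SE, P1 and P2 are assumed inputs, and a SCEVConstant result means the distance is known at compile time:

#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;

const SCEV *pointerDistance(ScalarEvolution &SE, Value *P1, Value *P2) {
  const SCEV *S1 = SE.getSCEV(P1);
  const SCEV *S2 = SE.getSCEV(P2);
  return SE.getMinusSCEV(S2, S1); // LHS - RHS
}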
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
const value_type & front() const
Return the first element of the SetVector.
void insert_range(Range &&R)
Vector takeVector()
Clear the SetVector and return the underlying vector.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
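A sketch of the SetVector semantics above: deduplicating a worklist while keeping first-seen order (the int payload is illustrative):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

SmallVector<int> uniqueInOrder(ArrayRef<int> In) {
  SmallSetVector<int, 8> Seen;
  for (int V : In)
    Seen.insert(V); // returns false, and does nothing, for duplicates
  return SmallVector<int>(Seen.begin(), Seen.end());
}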
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exactly one source vector.
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents a "clustered" mask of size VF, i.e. each index between [0..VF) is used exactly once in each submask.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor, like: <Index, Index+Factor, ..., Index+(Size-1)*Factor>.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossings.
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
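A sketch using the static mask queries above to classify a mask (both inputs are made up):

#include "llvm/IR/Instructions.h"
using namespace llvm;

const char *classifyMask(ArrayRef<int> Mask, int NumSrcElts) {
  int Index;
  if (ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
    return "identity";          // no lane movement at all
  if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts))
    return "reverse";           // one source, lanes reversed
  if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index))
    return "extract-subvector"; // contiguous run starting at Index
  return "generic";
}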
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is small.
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
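A sketch of the SmallBitVector calls above, echoing the set/all() coverage pattern of getShufflevectorNumGroups listed earlier (the grouping rule here is invented):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallBitVector.h"
using namespace llvm;

bool coversAllGroups(ArrayRef<unsigned> Indices, unsigned NumGroups) {
  SmallBitVector Seen(NumGroups); // NumGroups bits, initially all clear
  for (unsigned I : Indices)
    Seen.set(I % NumGroups);      // mark the group this index lands in
  return Seen.all();              // true only if every group was hit
}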
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across all SmallPtrSet instances.
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better as a string.
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
The instances of the Type class are immutable: once they are created, they are never changed.
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
bool isVectorTy() const
True if this is an instance of VectorType.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given a vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
iterator_range< use_iterator > uses()
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
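A sketch of the use-list queries above: rewiring every use of a scalar to a replacement value (the names are illustrative, not this pass's code path):

#include "llvm/IR/Value.h"
using namespace llvm;

void rewriteUses(Value *Scalar, Value *Replacement) {
  if (Scalar->hasNUses(0))
    return;                                // no users, nothing to rewrite
  Scalar->replaceAllUsesWith(Replacement); // updates every use edge at once
}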
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
void insert_range(Range &&R)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator I
iterator_adaptor_base()=default
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from the same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from the same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
void reorderTopToBottom()
Reorders the current graph to the most profitable order, starting from the root node down to the leaf nodes.
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
InstructionCost getSpillCost()
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers a non-vectorizable sequence of loads.
SmallVector< StoreInst *, 8 > StoreList
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
unsigned getTreeSize() const
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given sequence of loads is already known to be non-vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the backend.
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to the list of values already checked for vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the backend.
unsigned getVectorElementSize(Value *V)
SmallVector< Instruction *, 16 > InstrList
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with a narrower bitwidth at codegen, and returns its signedness if so.
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, StridedPtrInfo &SPtrInfo, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized, scatter or just simple gather.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e. the list of vectorized scalars to be extracted, their lanes and their scalar users.
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool isStridedLoad(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, const int64_t Diff, StridedPtrInfo &SPtrInfo) const
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions, marking them for deletion.
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order?
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users in UserIgnoreLst for the purposes of scheduling and extraction.
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns the reduction type after minbitwidth analysis.
unsigned getMaxVecRegSize() const
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry plus (possibly) a permutation.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index of the pair with the highest score, deemed to have the best chance to form the root of a profitable tree to vectorize.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
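A sketch of the m_* matchers above: recognizing (X + Y) * C and binding the pieces; the shape is an arbitrary example, not one this pass matches:

#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

bool matchAddTimesConstant(Value *V, Value *&X, Value *&Y, const APInt *&C) {
  // m_Value captures operands; m_APInt also accepts splat vector constants.
  return match(V, m_Mul(m_Add(m_Value(X), m_Value(Y)), m_APInt(C)));
}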
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
DiagnosticInfoOptimizationBase::Argument NV
friend class Instruction
Iterator for Instructions in a BasicBlock.
LLVM_ABI iterator begin() const
LLVM_ABI Instruction & front() const
A private "module" namespace for types and utilities used by this pass.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
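A short sketch (Scalars is an assumed container of Value *):
// Pairs each scalar with its lane index, with no manual counter.
for (auto [Lane, V] : llvm::enumerate(Scalars))
  llvm::dbgs() << "lane " << Lane << ": " << *V << "\n";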
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
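A minimal sketch, assuming a Value *V (handleStored is a hypothetical helper):
// Unlike cast<>, dyn_cast<> returns nullptr when V is not a StoreInst.
if (auto *SI = llvm::dyn_cast<llvm::StoreInst>(V))
  handleStored(SI->getValueOperand());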
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
constexpr from_range_t from_range
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff every element of A is also in B
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
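The canonical use is erasing while iterating; a sketch assuming a BasicBlock &BB:
// The iterator is advanced before the body runs, so erasing I is safe.
for (llvm::Instruction &I : llvm::make_early_inc_range(BB))
  if (llvm::isInstructionTriviallyDead(&I))
    I.eraseFromParent();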
auto cast_or_null(const Y &Val)
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
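Worked values, checkable at compile time since alignDown is constexpr:
static_assert(llvm::alignDown(70u, 16u) == 64u);
// Skewed form: the largest value <= 70 that is congruent to 3 mod 16.
static_assert(llvm::alignDown(70u, 16u, 3u) == 67u);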
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
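A sketch, assuming VecLd is a newly built vector load replacing the scalar loads in VL:
// Intersects the listed metadata kinds of the scalars in VL and attaches
// the result to the new instruction.
llvm::propagateMetadata(VecLd, VL);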
bool isa_and_nonnull(const Y &Val)
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicit...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
DomTreeNodeBase< BasicBlock > DomTreeNode
auto dyn_cast_or_null(const Y &Val)
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container.
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
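For example, selecting every other element (a deinterleave by two):
// Yields the mask <0, 2, 4, 6>.
llvm::SmallVector<int, 16> StrideMask =
    llvm::createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);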
auto reverse(ContainerTy &&C)
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
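A worked sketch of its effect, assuming the conventional definition Mask[Indices[I]] = I:
llvm::SmallVector<int> Mask;
inversePermutation({2, 0, 1}, Mask);
// Mask is now {1, 2, 0}: applying it undoes the reordering that
// Indices describes.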
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
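For example, replicating each of two source elements three times:
// Yields the mask <0, 0, 0, 1, 1, 1>.
llvm::SmallVector<int, 16> RepMask = llvm::createReplicatedMask(3, 2);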
auto find_if_not(R &&Range, UnaryPredicate P)
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
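A sketch of the usual adjacency check (L0 and L1 are assumed LoadInst *; DL and SE the usual analysis references); a distance of exactly one element means L1 starts right where L0 ends:
std::optional<int64_t> Diff =
    llvm::getPointersDiff(L0->getType(), L0->getPointerOperand(),
                          L1->getType(), L1->getPointerOperand(), DL, SE,
                          /*StrictCheck=*/true);
bool Consecutive = Diff && *Diff == 1;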
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
@ LLVM_MARK_AS_BITMASK_ENUM
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
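Worked values, checkable at compile time since divideCeil is constexpr:
static_assert(llvm::divideCeil(10u, 4u) == 3u); // 10 / 4 rounded up
static_assert(llvm::divideCeil(8u, 4u) == 2u);  // exact division is unchanged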
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ And
Bitwise or logical AND of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns number of parts, the type VecTy will be split at the codegen phase.
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
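A sketch (VF is an assumed lane count); note the end bound is exclusive:
unsigned Sum = 0;
for (unsigned Lane : llvm::seq<unsigned>(0, VF))
  Sum += Lane; // visits lanes 0, 1, ..., VF - 1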
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
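A sketch of how a getHashValue such as the EdgeInfo DenseMapInfo listed below could be written (field names UserTE and EdgeIdx per the EdgeInfo entries; not necessarily the actual implementation):
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
  // Mix both components so distinct (UserTE, EdgeIdx) pairs spread out.
  return llvm::hash_combine(Val.UserTE, Val.EdgeIdx);
}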
template class LLVM_TEMPLATE_ABI DomTreeNodeBase< BasicBlock >
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits(bool simple=false)
DenseMapInfo< BoUpSLP::TreeEntry * > FirstInfo
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static BoUpSLP::EdgeInfo getEmptyKey()
DenseMapInfo< unsigned > SecondInfo
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given valu...
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
BoUpSLP::TreeEntry::VecTreeTy ContainerTy
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
TargetTransformInfo * TTI
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Function object to check whether the second component of a container supported by std::get (like std:...
This structure holds any data we need about the edges being traversed during buildTreeRec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const