
LLVM 22.0.0git
SLPVectorizer.cpp
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
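// Editorial illustration (not part of the upstream source): given four
// consecutive scalar stores such as
//
//   store float %a0, ptr %p
//   store float %a1, ptr %p1   ; %p1 = %p + 4 bytes
//   store float %a2, ptr %p2   ; %p2 = %p + 8 bytes
//   store float %a3, ptr %p3   ; %p3 = %p + 12 bytes
//
// the pass builds a tree rooted at the stores, follows the use-def chains of
// %a0..%a3, and, if the cost model approves, replaces the group with a single
// store of a <4 x float> value built from the vectorized operands.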
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
41#include "llvm/Analysis/Loads.h"
52#include "llvm/IR/Attributes.h"
53#include "llvm/IR/BasicBlock.h"
54#include "llvm/IR/Constant.h"
55#include "llvm/IR/Constants.h"
56#include "llvm/IR/DataLayout.h"
58#include "llvm/IR/Dominators.h"
59#include "llvm/IR/Function.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstrTypes.h"
62#include "llvm/IR/Instruction.h"
65#include "llvm/IR/Intrinsics.h"
66#include "llvm/IR/Module.h"
67#include "llvm/IR/Operator.h"
69#include "llvm/IR/Type.h"
70#include "llvm/IR/Use.h"
71#include "llvm/IR/User.h"
72#include "llvm/IR/Value.h"
73#include "llvm/IR/ValueHandle.h"
74#ifdef EXPENSIVE_CHECKS
75#include "llvm/IR/Verifier.h"
76#endif
77#include "llvm/Pass.h"
82#include "llvm/Support/Debug.h"
94#include <algorithm>
95#include <cassert>
96#include <cstdint>
97#include <iterator>
98#include <memory>
99#include <optional>
100#include <set>
101#include <string>
102#include <tuple>
103#include <utility>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107using namespace slpvectorizer;
108using namespace std::placeholders;
109
110#define SV_NAME "slp-vectorizer"
111#define DEBUG_TYPE "SLP"
112
113STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
114
115DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
116 "Controls which SLP graphs should be vectorized.");
117
118static cl::opt<bool>
119 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
120 cl::desc("Run the SLP vectorization passes"));
121
122static cl::opt<bool>
123 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
124 cl::desc("Enable vectorization for wider vector utilization"));
125
126static cl::opt<int>
128 cl::desc("Only vectorize if you gain more than this "
129 "number "));
130
132 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
133 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
134 "heuristics and makes vectorization decision via cost modeling."));
135
136static cl::opt<bool>
137ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
138 cl::desc("Attempt to vectorize horizontal reductions"));
139
141 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
142 cl::desc(
143 "Attempt to vectorize horizontal reductions feeding into a store"));
144
146 "slp-split-alternate-instructions", cl::init(true), cl::Hidden,
147 cl::desc("Improve the code quality by splitting alternate instructions"));
148
149static cl::opt<int>
151 cl::desc("Attempt to vectorize for this register size in bits"));
152
155 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
156
157/// Limits the size of scheduling regions in a block.
158/// It avoids long compile times for _very_ large blocks where vector
159/// instructions are spread over a wide range.
160/// This limit is way higher than needed by real-world functions.
161static cl::opt<int>
162ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
163 cl::desc("Limit the size of the SLP scheduling region per block"));
164
166 "slp-min-reg-size", cl::init(128), cl::Hidden,
167 cl::desc("Attempt to vectorize for this register size in bits"));
168
170 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
171 cl::desc("Limit the recursion depth when building a vectorizable tree"));
172
174 "slp-min-tree-size", cl::init(3), cl::Hidden,
175 cl::desc("Only vectorize small trees if they are fully vectorizable"));
176
177// The maximum depth that the look-ahead score heuristic will explore.
178// The higher this value, the higher the compilation time overhead.
180 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
181 cl::desc("The maximum look-ahead depth for operand reordering scores"));
182
183// The maximum depth that the look-ahead score heuristic will explore
184// when it is probing among candidates for vectorization tree roots.
185// The higher this value, the higher the compilation time overhead, but unlike
186// the similar limit for operand ordering this is less frequently used, hence
187// the impact of a higher value is less noticeable.
189 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
190 cl::desc("The maximum look-ahead depth for searching best rooting option"));
191
193 "slp-min-strided-loads", cl::init(2), cl::Hidden,
194 cl::desc("The minimum number of loads, which should be considered strided, "
195 "if the stride is > 1 or is runtime value"));
196
198 "slp-max-stride", cl::init(8), cl::Hidden,
199 cl::desc("The maximum stride, considered to be profitable."));
200
201static cl::opt<bool>
202 DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden,
203 cl::desc("Disable tree reordering even if it is "
204 "profitable. Used for testing only."));
205
206static cl::opt<bool>
207 ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden,
208 cl::desc("Generate strided loads even if they are not "
209 "profitable. Used for testing only."));
210
211static cl::opt<bool>
212 ViewSLPTree("view-slp-tree", cl::Hidden,
213 cl::desc("Display the SLP trees with Graphviz"));
214
216 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
217 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
218
219/// Enables vectorization of copyable elements.
221 "slp-copyable-elements", cl::init(true), cl::Hidden,
222 cl::desc("Try to replace values with the idempotent instructions for "
223 "better vectorization."));
224
225// Limit the number of alias checks. The limit is chosen so that
226// it has no negative effect on the llvm benchmarks.
227static const unsigned AliasedCheckLimit = 10;
228
229// Limit of the number of uses for potentially transformed instructions/values,
230// used in checks to avoid compile-time explosion.
231static constexpr int UsesLimit = 64;
232
233// Another limit for the alias checks: The maximum distance between load/store
234// instructions where alias checks are done.
235// This limit is useful for very large basic blocks.
236static const unsigned MaxMemDepDistance = 160;
237
238/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
239/// regions to be handled.
240static const int MinScheduleRegionSize = 16;
241
242/// Maximum allowed number of operands in the PHI nodes.
243static const unsigned MaxPHINumOperands = 128;
244
245/// Predicate for the element types that the SLP vectorizer supports.
246///
247/// The most important thing to filter here are types which are invalid in LLVM
248/// vectors. We also filter target specific types which have absolutely no
249/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
250/// avoids spending time checking the cost model and realizing that they will
251/// be inevitably scalarized.
252static bool isValidElementType(Type *Ty) {
253 // TODO: Support ScalableVectorType.
254 if (SLPReVec && isa<FixedVectorType>(Ty))
255 Ty = Ty->getScalarType();
256 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
257 !Ty->isPPC_FP128Ty();
258}
259
260/// Returns the type of the given value/instruction \p V. If it is a store,
261/// returns the type of its value operand, for Cmp - the types of the compare
262/// operands and for insertelement - the type of the inserted operand.
263/// Otherwise, just the type of the value is returned.
264static Type *getValueType(Value *V) {
265 if (auto *SI = dyn_cast<StoreInst>(V))
266 return SI->getValueOperand()->getType();
267 if (auto *CI = dyn_cast<CmpInst>(V))
268 return CI->getOperand(0)->getType();
269 if (auto *IE = dyn_cast<InsertElementInst>(V))
270 return IE->getOperand(1)->getType();
271 return V->getType();
272}
273
274/// \returns the number of elements for Ty.
275static unsigned getNumElements(Type *Ty) {
276 assert(!isa<ScalableVectorType>(Ty) &&
277 "ScalableVectorType is not supported.");
278 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
279 return VecTy->getNumElements();
280 return 1;
281}
282
283/// \returns the vector type of ScalarTy based on vectorization factor.
284static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
285 return FixedVectorType::get(ScalarTy->getScalarType(),
286 VF * getNumElements(ScalarTy));
287}
288
289/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
290/// which forms a type that \p TTI splits into whole vector types during
291/// legalization.
292static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
293 Type *Ty, unsigned Sz) {
294 if (!isValidElementType(Ty))
295 return bit_ceil(Sz);
296 // Find the number of elements, which forms full vectors.
297 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
298 if (NumParts == 0 || NumParts >= Sz)
299 return bit_ceil(Sz);
300 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
301}
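// Worked example (editorial, assuming a target with 128-bit vector
// registers): for Ty == i32 and Sz == 6, the widened type <6 x i32> is split
// into NumParts == 2 registers, so the result is
// bit_ceil(divideCeil(6, 2)) * 2 == 4 * 2 == 8, i.e. the smallest element
// count >= Sz that still fills whole registers.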
302
303/// Returns the number of elements of the given type \p Ty, not greater than \p
304/// Sz, which forms a type that \p TTI splits into whole vector types during
305/// legalization.
306static unsigned
307getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
308 unsigned Sz) {
309 if (!isValidElementType(Ty))
310 return bit_floor(Sz);
311 // Find the number of elements, which forms full vectors.
312 unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
313 if (NumParts == 0 || NumParts >= Sz)
314 return bit_floor(Sz);
315 unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
316 if (RegVF > Sz)
317 return bit_floor(Sz);
318 return (Sz / RegVF) * RegVF;
319}
320
321static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
322 SmallVectorImpl<int> &Mask) {
323 // The ShuffleBuilder implementation uses shufflevector to splat an "element".
324 // But the element has a different meaning for SLP (scalar) and REVEC
325 // (vector). We need to expand Mask into masks which shufflevector can use
326 // directly.
327 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
328 for (unsigned I : seq<unsigned>(Mask.size()))
329 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
330 I * VecTyNumElements, VecTyNumElements)))
331 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
332 : Mask[I] * VecTyNumElements + J;
333 Mask.swap(NewMask);
334}
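// Worked example (editorial): with VecTyNumElements == 2 and a scalar-level
// Mask of {1, 0}, every scalar index expands to the indices of the two lanes
// it covers, yielding the shufflevector-ready mask {2, 3, 0, 1}; a
// PoisonMaskElem entry expands to PoisonMaskElem in each covered lane.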
335
336/// \returns the number of groups of shufflevector
337/// A group has the following features:
338/// 1. All values in a group are shufflevector instructions.
339/// 2. The mask of each shufflevector is an isExtractSubvectorMask.
340/// 3. The masks of all shufflevectors together use all of the elements of the source.
341/// e.g., it is 1 group (%0)
342/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
343/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
344/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
345/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
346/// it is 2 groups (%3 and %4)
347/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
348/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
349/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
350/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
351/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
352/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
353/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
354/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
355/// it is 0 group
356/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
357/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
358/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
359/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
360static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
361 if (VL.empty())
362 return 0;
363 if (!all_of(VL, IsaPred<ShuffleVectorInst>))
364 return 0;
365 auto *SV = cast<ShuffleVectorInst>(VL.front());
366 unsigned SVNumElements =
367 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
368 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
369 if (SVNumElements % ShuffleMaskSize != 0)
370 return 0;
371 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
372 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
373 return 0;
374 unsigned NumGroup = 0;
375 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
376 auto *SV = cast<ShuffleVectorInst>(VL[I]);
377 Value *Src = SV->getOperand(0);
378 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
379 SmallBitVector ExpectedIndex(GroupSize);
380 if (!all_of(Group, [&](Value *V) {
381 auto *SV = cast<ShuffleVectorInst>(V);
382 // From the same source.
383 if (SV->getOperand(0) != Src)
384 return false;
385 int Index;
386 if (!SV->isExtractSubvectorMask(Index))
387 return false;
388 ExpectedIndex.set(Index / ShuffleMaskSize);
389 return true;
390 }))
391 return 0;
392 if (!ExpectedIndex.all())
393 return 0;
394 ++NumGroup;
395 }
396 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
397 return NumGroup;
398}
399
400/// \returns a shufflevector mask which is used to vectorize shufflevectors
401/// e.g.,
402/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
403/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
404/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
405/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
406/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
407/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
408/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
409/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
410/// the result is
411/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
412static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
413 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
414 auto *SV = cast<ShuffleVectorInst>(VL.front());
415 unsigned SVNumElements =
416 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
417 SmallVector<int> Mask;
418 unsigned AccumulateLength = 0;
419 for (Value *V : VL) {
420 auto *SV = cast<ShuffleVectorInst>(V);
421 for (int M : SV->getShuffleMask())
422 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
423 : AccumulateLength + M);
424 AccumulateLength += SVNumElements;
425 }
426 return Mask;
427}
428
429/// \returns True if the value is a constant (but not globals/constant
430/// expressions).
431static bool isConstant(Value *V) {
432 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
433}
434
435/// Checks if \p V is one of vector-like instructions, i.e. undef,
436/// insertelement/extractelement with constant indices for fixed vector type or
437/// extractvalue instruction.
438static bool isVectorLikeInstWithConstOps(Value *V) {
439 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
440 !isa<ExtractValueInst, UndefValue>(V))
441 return false;
442 auto *I = dyn_cast<Instruction>(V);
443 if (!I || isa<ExtractValueInst>(I))
444 return true;
445 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
446 return false;
447 if (isa<ExtractElementInst>(I))
448 return isConstant(I->getOperand(1));
449 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
450 return isConstant(I->getOperand(2));
451}
452
453/// Returns power-of-2 number of elements in a single register (part), given the
454/// total number of elements \p Size and number of registers (parts) \p
455/// NumParts.
456static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
457 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
458}
459
460/// Returns correct remaining number of elements, considering total amount \p
461/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
462/// and current register (part) \p Part.
463static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
464 unsigned Part) {
465 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
466}
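// Worked example (editorial): for Size == 10 and NumParts == 3,
// getPartNumElems returns min(10, bit_ceil(divideCeil(10, 3))) == 4, and
// getNumElems then yields 4, 4 and 2 elements for parts 0, 1 and 2, the last
// part receiving only the remainder.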
467
468#if !defined(NDEBUG)
469/// Print a short descriptor of the instruction bundle suitable for debug output.
470static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
471 std::string Result;
472 raw_string_ostream OS(Result);
473 if (Idx >= 0)
474 OS << "Idx: " << Idx << ", ";
475 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
476 return Result;
477}
478#endif
479
480/// \returns true if all of the instructions in \p VL are in the same block or
481/// false otherwise.
482static bool allSameBlock(ArrayRef<Value *> VL) {
483 auto *It = find_if(VL, IsaPred<Instruction>);
484 if (It == VL.end())
485 return false;
486 Instruction *I0 = cast<Instruction>(*It);
487 if (all_of(VL, isVectorLikeInstWithConstOps))
488 return true;
489
490 BasicBlock *BB = I0->getParent();
491 for (Value *V : iterator_range(It, VL.end())) {
492 if (isa<PoisonValue>(V))
493 continue;
494 auto *II = dyn_cast<Instruction>(V);
495 if (!II)
496 return false;
497
498 if (BB != II->getParent())
499 return false;
500 }
501 return true;
502}
503
504/// \returns True if all of the values in \p VL are constants (but not
505/// globals/constant expressions).
506static bool allConstant(ArrayRef<Value *> VL) {
507 // Constant expressions and globals can't be vectorized like normal integer/FP
508 // constants.
509 return all_of(VL, isConstant);
510}
511
512/// \returns True if all of the values in \p VL are identical or some of them
513/// are UndefValue.
514static bool isSplat(ArrayRef<Value *> VL) {
515 Value *FirstNonUndef = nullptr;
516 for (Value *V : VL) {
517 if (isa<UndefValue>(V))
518 continue;
519 if (!FirstNonUndef) {
520 FirstNonUndef = V;
521 continue;
522 }
523 if (V != FirstNonUndef)
524 return false;
525 }
526 return FirstNonUndef != nullptr;
527}
528
529/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
530/// For BinaryOperator, it also checks if \p ValWithUses is used in specific
531/// patterns that make it effectively commutative (like equality comparisons
532/// with zero).
533/// In most cases, users should not call this function directly (since \p I and
534/// \p ValWithUses are the same). However, when analyzing interchangeable
535/// instructions, we need to use the converted opcode along with the original
536/// uses.
537/// \param I The instruction to check for commutativity
538/// \param ValWithUses The value whose uses are analyzed for special
539/// patterns
540static bool isCommutative(Instruction *I, Value *ValWithUses) {
541 if (auto *Cmp = dyn_cast<CmpInst>(I))
542 return Cmp->isCommutative();
543 if (auto *BO = dyn_cast<BinaryOperator>(I))
544 return BO->isCommutative() ||
545 (BO->getOpcode() == Instruction::Sub &&
546 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
547 all_of(
548 ValWithUses->uses(),
549 [](const Use &U) {
550 // Commutative, if icmp eq/ne sub, 0
551 CmpPredicate Pred;
552 if (match(U.getUser(),
553 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
554 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
555 return true;
556 // Commutative, if abs(sub nsw, true) or abs(sub, false).
557 ConstantInt *Flag;
558 return match(U.getUser(),
559 m_Intrinsic<Intrinsic::abs>(
560 m_Specific(U.get()), m_ConstantInt(Flag))) &&
561 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
562 Flag->isOne());
563 })) ||
564 (BO->getOpcode() == Instruction::FSub &&
565 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
566 all_of(ValWithUses->uses(), [](const Use &U) {
567 return match(U.getUser(),
568 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
569 }));
570 return I->isCommutative();
571}
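// Editorial illustration: a sub is treated as effectively commutative when
// every use cancels the sign of its result, e.g.
//
//   %d = sub i32 %x, %y
//   %c = icmp eq i32 %d, 0    ; eq/ne against zero is insensitive to the
//                             ; operand order of the sub
//
// or when every use is @llvm.abs(%d, ...) and either the sub has no nsw flag
// or the abs poison flag is set, so operand reordering may swap %x and %y.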
572
573/// This is a helper function to check whether \p I is commutative.
574/// This is a convenience wrapper that calls the two-parameter version of
575/// isCommutative with the same instruction for both parameters. This is
576/// the common case where the instruction being checked for commutativity
577/// is the same as the instruction whose uses are analyzed for special
578/// patterns (see the two-parameter version above for details).
579/// \param I The instruction to check for commutativity
580/// \returns true if the instruction is commutative, false otherwise
581static bool isCommutative(Instruction *I) { return isCommutative(I, I); }
582
583/// \returns number of operands of \p I, considering commutativity. Returns 2
584/// for commutative intrinsics.
585/// \param I The instruction to check for commutativity
588 // IntrinsicInst::isCommutative returns true if swapping the first "two"
589 // arguments to the intrinsic produces the same result.
590 constexpr unsigned IntrinsicNumOperands = 2;
591 return IntrinsicNumOperands;
592 }
593 return I->getNumOperands();
594}
595
596template <typename T>
597static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
598 unsigned Offset) {
599 static_assert(std::is_same_v<T, InsertElementInst> ||
600 std::is_same_v<T, ExtractElementInst>,
601 "unsupported T");
602 int Index = Offset;
603 if (const auto *IE = dyn_cast<T>(Inst)) {
604 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
605 if (!VT)
606 return std::nullopt;
607 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
608 if (!CI)
609 return std::nullopt;
610 if (CI->getValue().uge(VT->getNumElements()))
611 return std::nullopt;
612 Index *= VT->getNumElements();
613 Index += CI->getZExtValue();
614 return Index;
615 }
616 return std::nullopt;
617}
618
619/// \returns inserting or extracting index of InsertElement, ExtractElement or
620/// InsertValue instruction, using Offset as base offset for index.
621/// \returns std::nullopt if the index is not an immediate.
622static std::optional<unsigned> getElementIndex(const Value *Inst,
623 unsigned Offset = 0) {
624 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
625 return Index;
626 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
627 return Index;
628
629 int Index = Offset;
630
631 const auto *IV = dyn_cast<InsertValueInst>(Inst);
632 if (!IV)
633 return std::nullopt;
634
635 Type *CurrentType = IV->getType();
636 for (unsigned I : IV->indices()) {
637 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
638 Index *= ST->getNumElements();
639 CurrentType = ST->getElementType(I);
640 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
641 Index *= AT->getNumElements();
642 CurrentType = AT->getElementType();
643 } else {
644 return std::nullopt;
645 }
646 Index += I;
647 }
648 return Index;
649}
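// Worked example (editorial): for
//   %agg = insertvalue [2 x { i32, i32 }] %a, i32 %v, 1, 0
// the nested indices are flattened as ((0 * 2 + 1) * 2) + 0 == 2, i.e. the
// returned index is the linearized position of field 0 of array element 1.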
650
651/// \returns true if all of the values in \p VL use the same opcode.
652/// For comparison instructions, also checks if predicates match.
653/// PoisonValues are considered matching.
654/// Interchangeable instructions are not considered.
655static bool allSameOpcode(ArrayRef<Value *> VL) {
656 auto *It = find_if(VL, IsaPred<Instruction>);
657 if (It == VL.end())
658 return true;
659 Instruction *MainOp = cast<Instruction>(*It);
660 unsigned Opcode = MainOp->getOpcode();
661 bool IsCmpOp = isa<CmpInst>(MainOp);
662 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
663 : CmpInst::BAD_ICMP_PREDICATE;
664 return std::all_of(It, VL.end(), [&](Value *V) {
665 if (auto *CI = dyn_cast<CmpInst>(V))
666 return BasePred == CI->getPredicate();
667 if (auto *I = dyn_cast<Instruction>(V))
668 return I->getOpcode() == Opcode;
669 return isa<PoisonValue>(V);
670 });
671}
672
673namespace {
674/// Specifies the way the mask should be analyzed for undefs/poisonous elements
675/// in the shuffle mask.
676enum class UseMask {
677 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
678 ///< check for the mask elements for the first argument (mask
679 ///< indices are in range [0:VF)).
680 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
681 ///< for the mask elements for the second argument (mask indices
682 ///< are in range [VF:2*VF))
683 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
684 ///< future shuffle elements and mark them as ones as being used
685 ///< in future. Non-undef elements are considered as unused since
686 ///< they're already marked as used in the mask.
687};
688} // namespace
689
690/// Prepares a use bitset for the given mask either for the first argument or
691/// for the second.
692static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
693 UseMask MaskArg) {
694 SmallBitVector UseMask(VF, true);
695 for (auto [Idx, Value] : enumerate(Mask)) {
696 if (Value == PoisonMaskElem) {
697 if (MaskArg == UseMask::UndefsAsMask)
698 UseMask.reset(Idx);
699 continue;
700 }
701 if (MaskArg == UseMask::FirstArg && Value < VF)
702 UseMask.reset(Value);
703 else if (MaskArg == UseMask::SecondArg && Value >= VF)
704 UseMask.reset(Value - VF);
705 }
706 return UseMask;
707}
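// Worked example (editorial): for VF == 4 and Mask == {0, 5, poison, 2},
// UseMask::FirstArg clears bits 0 and 2 (those lanes of the first source are
// consumed), giving the bitset {0, 1, 0, 1}; UseMask::SecondArg only clears
// bit 1, since mask value 5 addresses lane 5 - VF == 1 of the second source.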
708
709/// Checks if the given value is actually an undefined constant vector.
710/// Also, if the \p UseMask is not empty, tries to check if the non-masked
711/// elements actually mask the insertelement buildvector, if any.
712template <bool IsPoisonOnly = false>
713static SmallBitVector isUndefVector(const Value *V,
714 const SmallBitVector &UseMask = {}) {
715 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
716 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
717 if (isa<T>(V))
718 return Res;
719 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
720 if (!VecTy)
721 return Res.reset();
722 auto *C = dyn_cast<Constant>(V);
723 if (!C) {
724 if (!UseMask.empty()) {
725 const Value *Base = V;
726 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
727 Base = II->getOperand(0);
728 if (isa<T>(II->getOperand(1)))
729 continue;
730 std::optional<unsigned> Idx = getElementIndex(II);
731 if (!Idx) {
732 Res.reset();
733 return Res;
734 }
735 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
736 Res.reset(*Idx);
737 }
738 // TODO: Add analysis for shuffles here too.
739 if (V == Base) {
740 Res.reset();
741 } else {
742 SmallBitVector SubMask(UseMask.size(), false);
743 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
744 }
745 } else {
746 Res.reset();
747 }
748 return Res;
749 }
750 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
751 if (Constant *Elem = C->getAggregateElement(I))
752 if (!isa<T>(Elem) &&
753 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
754 Res.reset(I);
755 }
756 return Res;
757}
758
759/// Checks if the vector of instructions can be represented as a shuffle, like:
760/// %x0 = extractelement <4 x i8> %x, i32 0
761/// %x3 = extractelement <4 x i8> %x, i32 3
762/// %y1 = extractelement <4 x i8> %y, i32 1
763/// %y2 = extractelement <4 x i8> %y, i32 2
764/// %x0x0 = mul i8 %x0, %x0
765/// %x3x3 = mul i8 %x3, %x3
766/// %y1y1 = mul i8 %y1, %y1
767/// %y2y2 = mul i8 %y2, %y2
768/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
769/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
770/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
771/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
772/// ret <4 x i8> %ins4
773/// can be transformed into:
774/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
775/// i32 6>
776/// %2 = mul <4 x i8> %1, %1
777/// ret <4 x i8> %2
778/// Mask will return the Shuffle Mask equivalent to the extracted elements.
779/// TODO: Can we split off and reuse the shuffle mask detection from
780/// ShuffleVectorInst/getShuffleCost?
781static std::optional<TargetTransformInfo::ShuffleKind>
782isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
783 AssumptionCache *AC) {
784 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
785 if (It == VL.end())
786 return std::nullopt;
787 unsigned Size =
788 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
789 auto *EI = dyn_cast<ExtractElementInst>(V);
790 if (!EI)
791 return S;
792 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
793 if (!VTy)
794 return S;
795 return std::max(S, VTy->getNumElements());
796 });
797
798 Value *Vec1 = nullptr;
799 Value *Vec2 = nullptr;
800 bool HasNonUndefVec = any_of(VL, [&](Value *V) {
801 auto *EE = dyn_cast<ExtractElementInst>(V);
802 if (!EE)
803 return false;
804 Value *Vec = EE->getVectorOperand();
805 if (isa<UndefValue>(Vec))
806 return false;
807 return isGuaranteedNotToBePoison(Vec, AC);
808 });
809 enum ShuffleMode { Unknown, Select, Permute };
810 ShuffleMode CommonShuffleMode = Unknown;
811 Mask.assign(VL.size(), PoisonMaskElem);
812 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
813 // Undef can be represented as an undef element in a vector.
814 if (isa<UndefValue>(VL[I]))
815 continue;
816 auto *EI = cast<ExtractElementInst>(VL[I]);
817 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
818 return std::nullopt;
819 auto *Vec = EI->getVectorOperand();
820 // We can extractelement from undef or poison vector.
821 if (isUndefVector</*IsPoisonOnly=*/true>(Vec).all())
822 continue;
823 // All vector operands must have the same number of vector elements.
824 if (isa<UndefValue>(Vec)) {
825 Mask[I] = I;
826 } else {
827 if (isa<UndefValue>(EI->getIndexOperand()))
828 continue;
829 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
830 if (!Idx)
831 return std::nullopt;
832 // Undefined behavior if Idx is negative or >= Size.
833 if (Idx->getValue().uge(Size))
834 continue;
835 unsigned IntIdx = Idx->getValue().getZExtValue();
836 Mask[I] = IntIdx;
837 }
838 if (isUndefVector(Vec).all() && HasNonUndefVec)
839 continue;
840 // For correct shuffling we have to have at most 2 different vector operands
841 // in all extractelement instructions.
842 if (!Vec1 || Vec1 == Vec) {
843 Vec1 = Vec;
844 } else if (!Vec2 || Vec2 == Vec) {
845 Vec2 = Vec;
846 Mask[I] += Size;
847 } else {
848 return std::nullopt;
849 }
850 if (CommonShuffleMode == Permute)
851 continue;
852 // If the extract index is not the same as the operation number, it is a
853 // permutation.
854 if (Mask[I] % Size != I) {
855 CommonShuffleMode = Permute;
856 continue;
857 }
858 CommonShuffleMode = Select;
859 }
860 // If we're not crossing lanes in different vectors, consider it as blending.
861 if (CommonShuffleMode == Select && Vec2)
862 return TargetTransformInfo::SK_Select;
863 // If Vec2 was never used, we have a permutation of a single vector, otherwise
864 // we have permutation of 2 vectors.
865 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
866 : TargetTransformInfo::SK_PermuteSingleSrc;
867}
868
869/// \returns True if Extract{Value,Element} instruction extracts element Idx.
870static std::optional<unsigned> getExtractIndex(const Instruction *E) {
871 unsigned Opcode = E->getOpcode();
872 assert((Opcode == Instruction::ExtractElement ||
873 Opcode == Instruction::ExtractValue) &&
874 "Expected extractelement or extractvalue instruction.");
875 if (Opcode == Instruction::ExtractElement) {
876 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
877 if (!CI)
878 return std::nullopt;
879 return CI->getZExtValue();
880 }
881 auto *EI = cast<ExtractValueInst>(E);
882 if (EI->getNumIndices() != 1)
883 return std::nullopt;
884 return *EI->idx_begin();
885}
886
887namespace llvm {
888/// Checks if the provided value does not require scheduling. It does not
889/// require scheduling if this is not an instruction or it is an instruction
890/// that does not read/write memory and all operands are either not instructions
891/// or phi nodes or instructions from different blocks.
892static bool areAllOperandsNonInsts(Value *V);
893/// Checks if the provided value does not require scheduling. It does not
894/// require scheduling if this is not an instruction or it is an instruction
895/// that does not read/write memory and all users are phi nodes or instructions
896/// from the different blocks.
897static bool isUsedOutsideBlock(Value *V);
898/// Checks if the specified value does not require scheduling. It does not
899/// require scheduling if all operands and all users do not need to be scheduled
900/// in the current basic block.
901static bool doesNotNeedToBeScheduled(Value *V);
902} // namespace llvm
903
904namespace {
905/// \returns true if \p Opcode is allowed as part of the main/alternate
906/// instruction for SLP vectorization.
907///
908/// Example of unsupported opcode is SDIV that can potentially cause UB if the
909/// "shuffled out" lane would result in division by zero.
910bool isValidForAlternation(unsigned Opcode) {
911 return !Instruction::isIntDivRem(Opcode);
912}
913
914/// Helper class that determines whether VL can use the same opcode.
915/// Alternate instructions are supported. In addition, it supports interchangeable
916/// instructions. An interchangeable instruction is an instruction that can be
917/// converted to another instruction with the same semantics. For example, x << 1 is
918/// equal to x * 2. x * 1 is equal to x | 0.
919class BinOpSameOpcodeHelper {
920 using MaskType = std::uint_fast16_t;
921 /// Sort SupportedOp because it is used by binary_search.
922 constexpr static std::initializer_list<unsigned> SupportedOp = {
923 Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
924 Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
925 enum : MaskType {
926 ShlBIT = 0b1,
927 AShrBIT = 0b10,
928 MulBIT = 0b100,
929 AddBIT = 0b1000,
930 SubBIT = 0b10000,
931 AndBIT = 0b100000,
932 OrBIT = 0b1000000,
933 XorBIT = 0b10000000,
934 MainOpBIT = 0b100000000,
936 };
937 /// Return a non-nullptr if either operand of I is a ConstantInt.
938 /// The second return value represents the operand position. We check the
939 /// right-hand side first (1). If the right hand side is not a ConstantInt and
940 /// the instruction is neither Sub, Shl, nor AShr, we then check the left hand
941 /// side (0).
942 static std::pair<ConstantInt *, unsigned>
943 isBinOpWithConstantInt(const Instruction *I) {
944 unsigned Opcode = I->getOpcode();
945 assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode.");
946 (void)SupportedOp;
947 auto *BinOp = cast<BinaryOperator>(I);
948 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)))
949 return {CI, 1};
950 if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
951 Opcode == Instruction::AShr)
952 return {nullptr, 0};
953 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(0)))
954 return {CI, 0};
955 return {nullptr, 0};
956 }
957 struct InterchangeableInfo {
958 const Instruction *I = nullptr;
959 /// Each set bit represents an opcode that MainOp can be converted to.
960 MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
961 MulBIT | AShrBIT | ShlBIT;
962 /// We cannot create an interchangeable instruction that does not exist in
963 /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
964 /// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
965 /// 1]. SeenBefore is used to know what operations have been seen before.
966 MaskType SeenBefore = 0;
967 InterchangeableInfo(const Instruction *I) : I(I) {}
968 /// Returning false allows BinOpSameOpcodeHelper to find an alternate
969 /// instruction. Directly setting the mask will destroy the mask state,
970 /// preventing us from determining which instruction it should convert to.
971 bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
972 if (Mask & InterchangeableMask) {
973 SeenBefore |= OpcodeInMaskForm;
974 Mask &= InterchangeableMask;
975 return true;
976 }
977 return false;
978 }
979 bool equal(unsigned Opcode) {
980 return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
981 }
982 unsigned getOpcode() const {
983 MaskType Candidate = Mask & SeenBefore;
984 if (Candidate & MainOpBIT)
985 return I->getOpcode();
986 if (Candidate & ShlBIT)
987 return Instruction::Shl;
988 if (Candidate & AShrBIT)
989 return Instruction::AShr;
990 if (Candidate & MulBIT)
991 return Instruction::Mul;
992 if (Candidate & AddBIT)
993 return Instruction::Add;
994 if (Candidate & SubBIT)
995 return Instruction::Sub;
996 if (Candidate & AndBIT)
997 return Instruction::And;
998 if (Candidate & OrBIT)
999 return Instruction::Or;
1000 if (Candidate & XorBIT)
1001 return Instruction::Xor;
1002 llvm_unreachable("Cannot find interchangeable instruction.");
1003 }
1004
1005 /// Return true if the instruction can be converted to \p Opcode.
1006 bool hasCandidateOpcode(unsigned Opcode) const {
1007 MaskType Candidate = Mask & SeenBefore;
1008 switch (Opcode) {
1009 case Instruction::Shl:
1010 return Candidate & ShlBIT;
1011 case Instruction::AShr:
1012 return Candidate & AShrBIT;
1013 case Instruction::Mul:
1014 return Candidate & MulBIT;
1015 case Instruction::Add:
1016 return Candidate & AddBIT;
1017 case Instruction::Sub:
1018 return Candidate & SubBIT;
1019 case Instruction::And:
1020 return Candidate & AndBIT;
1021 case Instruction::Or:
1022 return Candidate & OrBIT;
1023 case Instruction::Xor:
1024 return Candidate & XorBIT;
1025 case Instruction::LShr:
1026 case Instruction::FAdd:
1027 case Instruction::FSub:
1028 case Instruction::FMul:
1029 case Instruction::SDiv:
1030 case Instruction::UDiv:
1031 case Instruction::FDiv:
1032 case Instruction::SRem:
1033 case Instruction::URem:
1034 case Instruction::FRem:
1035 return false;
1036 default:
1037 break;
1038 }
1039 llvm_unreachable("Cannot find interchangeable instruction.");
1040 }
1041
1042 SmallVector<Value *> getOperand(const Instruction *To) const {
1043 unsigned ToOpcode = To->getOpcode();
1044 unsigned FromOpcode = I->getOpcode();
1045 if (FromOpcode == ToOpcode)
1046 return SmallVector<Value *>(I->operands());
1047 assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode.");
1048 auto [CI, Pos] = isBinOpWithConstantInt(I);
1049 const APInt &FromCIValue = CI->getValue();
1050 unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
1051 APInt ToCIValue;
1052 switch (FromOpcode) {
1053 case Instruction::Shl:
1054 if (ToOpcode == Instruction::Mul) {
1055 ToCIValue = APInt::getOneBitSet(FromCIValueBitWidth,
1056 FromCIValue.getZExtValue());
1057 } else {
1058 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1059 ToCIValue = ToOpcode == Instruction::And
1060 ? APInt::getAllOnes(FromCIValueBitWidth)
1061 : APInt::getZero(FromCIValueBitWidth);
1062 }
1063 break;
1064 case Instruction::Mul:
1065 assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
1066 if (ToOpcode == Instruction::Shl) {
1067 ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
1068 } else {
1069 assert(FromCIValue.isOne() && "Cannot convert the instruction.");
1070 ToCIValue = ToOpcode == Instruction::And
1071 ? APInt::getAllOnes(FromCIValueBitWidth)
1072 : APInt::getZero(FromCIValueBitWidth);
1073 }
1074 break;
1075 case Instruction::Add:
1076 case Instruction::Sub:
1077 if (FromCIValue.isZero()) {
1078 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1079 } else {
1080 assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
1081 "Cannot convert the instruction.");
1082 ToCIValue = FromCIValue;
1083 ToCIValue.negate();
1084 }
1085 break;
1086 case Instruction::And:
1087 assert(FromCIValue.isAllOnes() && "Cannot convert the instruction.");
1088 ToCIValue = ToOpcode == Instruction::Mul
1089 ? APInt::getOneBitSet(FromCIValueBitWidth, 0)
1090 : APInt::getZero(FromCIValueBitWidth);
1091 break;
1092 default:
1093 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1094 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1095 break;
1096 }
1097 Value *LHS = I->getOperand(1 - Pos);
1098 Constant *RHS =
1099 ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
1100 // constant + x cannot be -constant - x
1101 // instead, it should be x - -constant
1102 if (Pos == 1 ||
1103 (FromOpcode == Instruction::Add && ToOpcode == Instruction::Sub))
1104 return SmallVector<Value *>({LHS, RHS});
1105 return SmallVector<Value *>({RHS, LHS});
1106 }
1107 };
1108 InterchangeableInfo MainOp;
1109 InterchangeableInfo AltOp;
1110 bool isValidForAlternation(const Instruction *I) const {
1111 return ::isValidForAlternation(MainOp.I->getOpcode()) &&
1112 ::isValidForAlternation(I->getOpcode());
1113 }
1114 bool initializeAltOp(const Instruction *I) {
1115 if (AltOp.I)
1116 return true;
1117 if (!isValidForAlternation(I))
1118 return false;
1119 AltOp.I = I;
1120 return true;
1121 }
1122
1123public:
1124 BinOpSameOpcodeHelper(const Instruction *MainOp,
1125 const Instruction *AltOp = nullptr)
1126 : MainOp(MainOp), AltOp(AltOp) {
1127 assert(is_sorted(SupportedOp) && "SupportedOp is not sorted.");
1128 }
1129 bool add(const Instruction *I) {
1131 "BinOpSameOpcodeHelper only accepts BinaryOperator.");
1132 unsigned Opcode = I->getOpcode();
1133 MaskType OpcodeInMaskForm;
1134 // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp.
1135 switch (Opcode) {
1136 case Instruction::Shl:
1137 OpcodeInMaskForm = ShlBIT;
1138 break;
1139 case Instruction::AShr:
1140 OpcodeInMaskForm = AShrBIT;
1141 break;
1142 case Instruction::Mul:
1143 OpcodeInMaskForm = MulBIT;
1144 break;
1145 case Instruction::Add:
1146 OpcodeInMaskForm = AddBIT;
1147 break;
1148 case Instruction::Sub:
1149 OpcodeInMaskForm = SubBIT;
1150 break;
1151 case Instruction::And:
1152 OpcodeInMaskForm = AndBIT;
1153 break;
1154 case Instruction::Or:
1155 OpcodeInMaskForm = OrBIT;
1156 break;
1157 case Instruction::Xor:
1158 OpcodeInMaskForm = XorBIT;
1159 break;
1160 default:
1161 return MainOp.equal(Opcode) ||
1162 (initializeAltOp(I) && AltOp.equal(Opcode));
1163 }
1164 MaskType InterchangeableMask = OpcodeInMaskForm;
1165 ConstantInt *CI = isBinOpWithConstantInt(I).first;
1166 if (CI) {
1167 constexpr MaskType CanBeAll =
1168 XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
1169 const APInt &CIValue = CI->getValue();
1170 switch (Opcode) {
1171 case Instruction::Shl:
1172 if (CIValue.ult(CIValue.getBitWidth()))
1173 InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
1174 break;
1175 case Instruction::Mul:
1176 if (CIValue.isOne()) {
1177 InterchangeableMask = CanBeAll;
1178 break;
1179 }
1180 if (CIValue.isPowerOf2())
1181 InterchangeableMask = MulBIT | ShlBIT;
1182 break;
1183 case Instruction::Add:
1184 case Instruction::Sub:
1185 InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
1186 break;
1187 case Instruction::And:
1188 if (CIValue.isAllOnes())
1189 InterchangeableMask = CanBeAll;
1190 break;
1191 default:
1192 if (CIValue.isZero())
1193 InterchangeableMask = CanBeAll;
1194 break;
1195 }
1196 }
1197 return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
1198 (initializeAltOp(I) &&
1199 AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
1200 }
1201 unsigned getMainOpcode() const { return MainOp.getOpcode(); }
1202 /// Checks if the list of potential opcodes includes \p Opcode.
1203 bool hasCandidateOpcode(unsigned Opcode) const {
1204 return MainOp.hasCandidateOpcode(Opcode);
1205 }
1206 bool hasAltOp() const { return AltOp.I; }
1207 unsigned getAltOpcode() const {
1208 return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
1209 }
1210 SmallVector<Value *> getOperand(const Instruction *I) const {
1211 return MainOp.getOperand(I);
1212 }
1213};
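// Editorial sketch of how the helper is used: for the two scalars
//   %a = shl i32 %x, 1
//   %b = mul i32 %y, 4
// add() records that %a can also act as a multiply (by 2) and that %b, being
// a multiply by a power of two, can also act as a shift, so both keep the
// Shl|Mul interchangeable mask; the helper then reports one common opcode and
// getOperand() rewrites the converted instruction's constant accordingly
// (e.g. mul %y, 4 becomes shl %y, 2).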
1214
1215/// Main data required for vectorization of instructions.
1216class InstructionsState {
1217 /// MainOp and AltOp are primarily determined by getSameOpcode. Currently,
1218 /// only BinaryOperator, CastInst, and CmpInst support alternate instructions
1219 /// (i.e., AltOp is not equal to MainOp; this can be checked using
1220 /// isAltShuffle).
1221 /// A rare exception is TrySplitNode, where the InstructionsState is derived
1222 /// from getMainAltOpsNoStateVL.
1223 /// For those InstructionsState that use alternate instructions, the resulting
1224 /// vectorized output ultimately comes from a shufflevector. For example,
1225 /// given a vector list (VL):
1226 /// VL[0] = add i32 a, e
1227 /// VL[1] = sub i32 b, f
1228 /// VL[2] = add i32 c, g
1229 /// VL[3] = sub i32 d, h
1230 /// The vectorized result would be:
1231 /// intermediated_0 = add <4 x i32> <a, b, c, d>, <e, f, g, h>
1232 /// intermediated_1 = sub <4 x i32> <a, b, c, d>, <e, f, g, h>
1233 /// result = shufflevector <4 x i32> intermediated_0,
1234 /// <4 x i32> intermediated_1,
1235 /// <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1236 /// Since shufflevector is used in the final result, when calculating the cost
1237 /// (getEntryCost), we must account for the usage of shufflevector in
1238 /// GetVectorCost.
1239 Instruction *MainOp = nullptr;
1240 Instruction *AltOp = nullptr;
1241 /// Whether the instruction state represents copyable instructions.
1242 bool HasCopyables = false;
1243
1244public:
1245 Instruction *getMainOp() const {
1246 assert(valid() && "InstructionsState is invalid.");
1247 return MainOp;
1248 }
1249
1250 Instruction *getAltOp() const {
1251 assert(valid() && "InstructionsState is invalid.");
1252 return AltOp;
1253 }
1254
1255 /// The main/alternate opcodes for the list of instructions.
1256 unsigned getOpcode() const { return getMainOp()->getOpcode(); }
1257
1258 unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
1259
1260 /// Some of the instructions in the list have alternate opcodes.
1261 bool isAltShuffle() const { return getMainOp() != getAltOp(); }
1262
1263 /// Checks if the instruction matches either the main or alternate opcode.
1264 /// \returns
1265 /// - MainOp if \param I matches MainOp's opcode directly or can be converted
1266 /// to it
1267 /// - AltOp if \param I matches AltOp's opcode directly or can be converted to
1268 /// it
1269 /// - nullptr if \param I cannot be matched or converted to either opcode
1270 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
1271 assert(MainOp && "MainOp cannot be nullptr.");
1272 if (I->getOpcode() == MainOp->getOpcode())
1273 return MainOp;
1274 // Prefer AltOp instead of interchangeable instruction of MainOp.
1275 assert(AltOp && "AltOp cannot be nullptr.");
1276 if (I->getOpcode() == AltOp->getOpcode())
1277 return AltOp;
1278 if (!I->isBinaryOp())
1279 return nullptr;
1280 BinOpSameOpcodeHelper Converter(MainOp);
1281 if (!Converter.add(I) || !Converter.add(MainOp))
1282 return nullptr;
1283 if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
1284 BinOpSameOpcodeHelper AltConverter(AltOp);
1285 if (AltConverter.add(I) && AltConverter.add(AltOp) &&
1286 AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
1287 return AltOp;
1288 }
1289 if (Converter.hasAltOp() && !isAltShuffle())
1290 return nullptr;
1291 return Converter.hasAltOp() ? AltOp : MainOp;
1292 }
1293
1294 /// Checks if main/alt instructions are shift operations.
1295 bool isShiftOp() const {
1296 return getMainOp()->isShift() && getAltOp()->isShift();
1297 }
1298
1299 /// Checks if main/alt instructions are bitwise logic operations.
1300 bool isBitwiseLogicOp() const {
1301 return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
1302 }
1303
1304 /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations.
1305 bool isMulDivLikeOp() const {
1306 constexpr std::array<unsigned, 8> MulDiv = {
1307 Instruction::Mul, Instruction::FMul, Instruction::SDiv,
1308 Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
1309 Instruction::URem, Instruction::FRem};
1310 return is_contained(MulDiv, getOpcode()) &&
1311 is_contained(MulDiv, getAltOpcode());
1312 }
1313
1314 /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
1315 bool isAddSubLikeOp() const {
1316 constexpr std::array<unsigned, 4> AddSub = {
1317 Instruction::Add, Instruction::Sub, Instruction::FAdd,
1318 Instruction::FSub};
1319 return is_contained(AddSub, getOpcode()) &&
1320 is_contained(AddSub, getAltOpcode());
1321 }
1322
1323 /// Checks if main/alt instructions are cmp operations.
1324 bool isCmpOp() const {
1325 return (getOpcode() == Instruction::ICmp ||
1326 getOpcode() == Instruction::FCmp) &&
1327 getAltOpcode() == getOpcode();
1328 }
1329
1330 /// Checks if the current state is valid, i.e. has non-null MainOp and AltOp.
1331 bool valid() const { return MainOp && AltOp; }
1332
1333 explicit operator bool() const { return valid(); }
1334
1335 InstructionsState() = delete;
1336 InstructionsState(Instruction *MainOp, Instruction *AltOp,
1337 bool HasCopyables = false)
1338 : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
1339 static InstructionsState invalid() { return {nullptr, nullptr}; }
1340
1341 /// Checks if the value is a copyable element.
1342 bool isCopyableElement(Value *V) const {
1343 assert(valid() && "InstructionsState is invalid.");
1344 if (!HasCopyables)
1345 return false;
1346 if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
1347 return false;
1348 auto *I = dyn_cast<Instruction>(V);
1349 if (!I)
1350 return !isa<PoisonValue>(V);
1351 if (I->getParent() != MainOp->getParent() &&
1352 (!isVectorLikeInstWithConstOps(I) ||
1353 !isVectorLikeInstWithConstOps(MainOp)))
1354 return true;
1355 if (I->getOpcode() == MainOp->getOpcode())
1356 return false;
1357 if (!I->isBinaryOp())
1358 return true;
1359 BinOpSameOpcodeHelper Converter(MainOp);
1360 return !Converter.add(I) || !Converter.add(MainOp) ||
1361 Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode());
1362 }
1363
1364 /// Checks if the value is non-schedulable.
1365 bool isNonSchedulable(Value *V) const {
1366 assert(valid() && "InstructionsState is invalid.");
1367 auto *I = dyn_cast<Instruction>(V);
1368 if (!HasCopyables)
1369 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1370 doesNotNeedToBeScheduled(V);
1371 // MainOp for copyables is always schedulable, to correctly identify
1372 // non-schedulable copyables.
1373 if (getMainOp() == V)
1374 return false;
1375 if (isCopyableElement(V)) {
1376 auto IsNonSchedulableCopyableElement = [this](Value *V) {
1377 auto *I = dyn_cast<Instruction>(V);
1378 return !I || isa<PHINode>(I) || I->getParent() != MainOp->getParent() ||
1380 // If the copyable instruction comes after MainOp
1381 // (non-schedulable, but used in the block) - cannot vectorize
1382 // it, will possibly generate use before def.
1383 !MainOp->comesBefore(I));
1384 };
1385
1386 return IsNonSchedulableCopyableElement(V);
1387 }
1388 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1389 doesNotNeedToBeScheduled(V);
1390 }
1391
1392 /// Checks if the state represents copyable instructions.
1393 bool areInstructionsWithCopyableElements() const {
1394 assert(valid() && "InstructionsState is invalid.");
1395 return HasCopyables;
1396 }
1397};
1398
1399std::pair<Instruction *, SmallVector<Value *>>
1400convertTo(Instruction *I, const InstructionsState &S) {
1401 Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
1402 assert(SelectedOp && "Cannot convert the instruction.");
1403 if (I->isBinaryOp()) {
1404 BinOpSameOpcodeHelper Converter(I);
1405 return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
1406 }
1407 return std::make_pair(SelectedOp, SmallVector<Value *>(I->operands()));
1408}
1409
1410} // end anonymous namespace
1411
1412static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1413 const TargetLibraryInfo &TLI);
1414
1415/// Find an instruction with a specific opcode in VL.
1416/// \param VL Array of values to search through. Must contain only Instructions
1417/// and PoisonValues.
1418/// \param Opcode The instruction opcode to search for
1419/// \returns
1420/// - The first instruction found with matching opcode
1421/// - nullptr if no matching instruction is found
1422static Instruction *findInstructionWithOpcode(ArrayRef<Value *> VL,
1423 unsigned Opcode) {
1424 for (Value *V : VL) {
1425 if (isa<PoisonValue>(V))
1426 continue;
1427 assert(isa<Instruction>(V) && "Only accepts PoisonValue and Instruction.");
1428 auto *Inst = cast<Instruction>(V);
1429 if (Inst->getOpcode() == Opcode)
1430 return Inst;
1431 }
1432 return nullptr;
1433}
1434
1435/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
1436/// compatible instructions or constants, or just some other regular values.
1437static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
1438 Value *Op1, const TargetLibraryInfo &TLI) {
1439 return (isConstant(BaseOp0) && isConstant(Op0)) ||
1440 (isConstant(BaseOp1) && isConstant(Op1)) ||
1441 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
1442 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
1443 BaseOp0 == Op0 || BaseOp1 == Op1 ||
1444 getSameOpcode({BaseOp0, Op0}, TLI) ||
1445 getSameOpcode({BaseOp1, Op1}, TLI);
1446}
1447
1448/// \returns true if a compare instruction \p CI has similar "look" and
1449/// same predicate as \p BaseCI, "as is" or with its operands and predicate
1450/// swapped, false otherwise.
1451static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
1452 const TargetLibraryInfo &TLI) {
1453 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
1454 "Assessing comparisons of different types?");
1455 CmpInst::Predicate BasePred = BaseCI->getPredicate();
1456 CmpInst::Predicate Pred = CI->getPredicate();
1457 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
1458
1459 Value *BaseOp0 = BaseCI->getOperand(0);
1460 Value *BaseOp1 = BaseCI->getOperand(1);
1461 Value *Op0 = CI->getOperand(0);
1462 Value *Op1 = CI->getOperand(1);
1463
1464 return (BasePred == Pred &&
1465 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
1466 (BasePred == SwappedPred &&
1467 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
1468}
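// Editorial illustration: with BaseCI = (icmp slt i32 %a, %b) and %a..%d
// plain non-instruction values (e.g. function arguments), the comparison
// (icmp slt i32 %c, %d) matches "as is", while (icmp sgt i32 %d, %c) matches
// with operands and predicate swapped; both cases return true here.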
1469
1470/// \returns analysis of the Instructions in \p VL described in
1471/// InstructionsState, the Opcode that we suppose the whole list
1472/// could be vectorized even if its structure is diverse.
1473static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1474 const TargetLibraryInfo &TLI) {
1475 // Make sure these are all Instructions.
1476 if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
1477 return InstructionsState::invalid();
1478
1479 auto *It = find_if(VL, IsaPred<Instruction>);
1480 if (It == VL.end())
1481 return InstructionsState::invalid();
1482
1483 Instruction *MainOp = cast<Instruction>(*It);
1484 unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
1485 if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
1486 (VL.size() == 2 && InstCnt < 2))
1487 return InstructionsState::invalid();
1488
1489 bool IsCastOp = isa<CastInst>(MainOp);
1490 bool IsBinOp = isa<BinaryOperator>(MainOp);
1491 bool IsCmpOp = isa<CmpInst>(MainOp);
1492 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
1493 : CmpInst::BAD_ICMP_PREDICATE;
1494 Instruction *AltOp = MainOp;
1495 unsigned Opcode = MainOp->getOpcode();
1496 unsigned AltOpcode = Opcode;
1497
1498 BinOpSameOpcodeHelper BinOpHelper(MainOp);
1499 bool SwappedPredsCompatible = IsCmpOp && [&]() {
1500 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
1501 UniquePreds.insert(BasePred);
1502 UniqueNonSwappedPreds.insert(BasePred);
1503 for (Value *V : VL) {
1504 auto *I = dyn_cast<CmpInst>(V);
1505 if (!I)
1506 return false;
1507 CmpInst::Predicate CurrentPred = I->getPredicate();
1508 CmpInst::Predicate SwappedCurrentPred =
1509 CmpInst::getSwappedPredicate(CurrentPred);
1510 UniqueNonSwappedPreds.insert(CurrentPred);
1511 if (!UniquePreds.contains(CurrentPred) &&
1512 !UniquePreds.contains(SwappedCurrentPred))
1513 UniquePreds.insert(CurrentPred);
1514 }
1515 // If the total number of predicates is > 2, but only 2 remain once swapped
1516 // predicates are treated as compatible, consider the swappable predicates
1517 // as compatible opcodes, not alternates.
1518 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
1519 }();
1520 // Check for one alternate opcode from another BinaryOperator.
1521 // TODO - generalize to support all operators (types, calls etc.).
1522 Intrinsic::ID BaseID = 0;
1523 SmallVector<VFInfo> BaseMappings;
1524 if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
1525 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
1526 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
1527 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
1528 return InstructionsState::invalid();
1529 }
1530 bool AnyPoison = InstCnt != VL.size();
1531 // Check MainOp too to be sure that it matches the requirements for the
1532 // instructions.
1533 for (Value *V : iterator_range(It, VL.end())) {
1534 auto *I = dyn_cast<Instruction>(V);
1535 if (!I)
1536 continue;
1537
1538 // Cannot combine poison and divisions.
1539 // TODO: do some smart analysis of the CallInsts to exclude divide-like
1540 // intrinsics/functions only.
1541 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
1542 return InstructionsState::invalid();
1543 unsigned InstOpcode = I->getOpcode();
1544 if (IsBinOp && isa<BinaryOperator>(I)) {
1545 if (BinOpHelper.add(I))
1546 continue;
1547 } else if (IsCastOp && isa<CastInst>(I)) {
1548 Value *Op0 = MainOp->getOperand(0);
1549 Type *Ty0 = Op0->getType();
1550 Value *Op1 = I->getOperand(0);
1551 Type *Ty1 = Op1->getType();
1552 if (Ty0 == Ty1) {
1553 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
1554 continue;
1555 if (Opcode == AltOpcode) {
1556 assert(isValidForAlternation(Opcode) &&
1557 isValidForAlternation(InstOpcode) &&
1558 "Cast isn't safe for alternation, logic needs to be updated!");
1559 AltOpcode = InstOpcode;
1560 AltOp = I;
1561 continue;
1562 }
1563 }
1564 } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
1565 auto *BaseInst = cast<CmpInst>(MainOp);
1566 Type *Ty0 = BaseInst->getOperand(0)->getType();
1567 Type *Ty1 = Inst->getOperand(0)->getType();
1568 if (Ty0 == Ty1) {
1569 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1570 assert(InstOpcode == AltOpcode &&
1571 "Alternate instructions are only supported by BinaryOperator "
1572 "and CastInst.");
1573 // Check for compatible operands. If the corresponding operands are not
1574 // compatible - need to perform alternate vectorization.
1575 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1576 CmpInst::Predicate SwappedCurrentPred =
1577 CmpInst::getSwappedPredicate(CurrentPred);
1578
1579 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1580 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1581 continue;
1582
1583 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1584 continue;
1585 auto *AltInst = cast<CmpInst>(AltOp);
1586 if (MainOp != AltOp) {
1587 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1588 continue;
1589 } else if (BasePred != CurrentPred) {
1590 assert(
1591 isValidForAlternation(InstOpcode) &&
1592 "CmpInst isn't safe for alternation, logic needs to be updated!");
1593 AltOp = I;
1594 continue;
1595 }
1596 CmpInst::Predicate AltPred = AltInst->getPredicate();
1597 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1598 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1599 continue;
1600 }
1601 } else if (InstOpcode == Opcode) {
1602 assert(InstOpcode == AltOpcode &&
1603 "Alternate instructions are only supported by BinaryOperator and "
1604 "CastInst.");
1605 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1606 if (Gep->getNumOperands() != 2 ||
1607 Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
1608 return InstructionsState::invalid();
1609 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1611 return InstructionsState::invalid();
1612 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
1613 auto *BaseLI = cast<LoadInst>(MainOp);
1614 if (!LI->isSimple() || !BaseLI->isSimple())
1615 return InstructionsState::invalid();
1616 } else if (auto *Call = dyn_cast<CallInst>(I)) {
1617 auto *CallBase = cast<CallInst>(MainOp);
1618 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1619 return InstructionsState::invalid();
1620 if (Call->hasOperandBundles() &&
1622 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1623 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1624 CallBase->op_begin() +
1626 return InstructionsState::invalid();
1627 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
1628 if (ID != BaseID)
1629 return InstructionsState::invalid();
1630 if (!ID) {
1631 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
1632 if (Mappings.size() != BaseMappings.size() ||
1633 Mappings.front().ISA != BaseMappings.front().ISA ||
1634 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1635 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1636 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1637 Mappings.front().Shape.Parameters !=
1638 BaseMappings.front().Shape.Parameters)
1639 return InstructionsState::invalid();
1640 }
1641 }
1642 continue;
1643 }
1644 return InstructionsState::invalid();
1645 }
1646
1647 if (IsBinOp) {
1648 MainOp = findInstructionWithOpcode(VL, BinOpHelper.getMainOpcode());
1649 assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
1650 AltOp = findInstructionWithOpcode(VL, BinOpHelper.getAltOpcode());
1651 assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
1652 }
1653 assert((MainOp == AltOp || !allSameOpcode(VL)) &&
1654 "Incorrect implementation of allSameOpcode.");
1655 InstructionsState S(MainOp, AltOp);
1656 assert(all_of(VL,
1657 [&](Value *V) {
1658 return isa<PoisonValue>(V) ||
1659 S.getMatchingMainOpOrAltOp(cast<Instruction>(V));
1660 }) &&
1661 "Invalid InstructionsState.");
1662 return S;
1663}
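// Illustrative example of the resulting state: a bundle like
// {add, sub, add, sub} typically ends up with MainOp being one of the adds
// and AltOp one of the subs (an alternate shuffle), while a uniform bundle
// such as {add, add, add, add} yields MainOp == AltOp.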
1664
1665/// \returns true if all of the values in \p VL have the same type or false
1666/// otherwise.
1668 Type *Ty = VL.consume_front()->getType();
1669 return all_of(VL, [&](Value *V) { return V->getType() == Ty; });
1670}
1671
1672/// \returns True if in-tree use also needs extract. This refers to
1673/// possible scalar operand in vectorized instruction.
1674static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1675 TargetLibraryInfo *TLI,
1676 const TargetTransformInfo *TTI) {
1677 if (!UserInst)
1678 return false;
1679 unsigned Opcode = UserInst->getOpcode();
1680 switch (Opcode) {
1681 case Instruction::Load: {
1682 LoadInst *LI = cast<LoadInst>(UserInst);
1683 return (LI->getPointerOperand() == Scalar);
1684 }
1685 case Instruction::Store: {
1686 StoreInst *SI = cast<StoreInst>(UserInst);
1687 return (SI->getPointerOperand() == Scalar);
1688 }
1689 case Instruction::Call: {
1690 CallInst *CI = cast<CallInst>(UserInst);
1692 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1693 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1694 Arg.value().get() == Scalar;
1695 });
1696 }
1697 default:
1698 return false;
1699 }
1700}
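// Illustrative example for the helper above: if a vectorized scalar is used
// as the pointer operand of a scalar load/store user, that user still needs
// the scalar value and an extract is required; if it only feeds the stored
// value (or a non-scalar call argument), no extra extract is needed here.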
1701
1702/// \returns the AA location that is being accessed by the instruction.
1703static MemoryLocation getLocation(Instruction *I) {
1704 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1705 return MemoryLocation::get(SI);
1706 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1707 return MemoryLocation::get(LI);
1708 return MemoryLocation();
1709}
1710
1711/// \returns True if the instruction is not a volatile or atomic load/store.
1712static bool isSimple(Instruction *I) {
1713 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1714 return LI->isSimple();
1715 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1716 return SI->isSimple();
1717 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1718 return !MI->isVolatile();
1719 return true;
1720}
1721
1722/// Shuffles \p Mask in accordance with the given \p SubMask.
1723/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1724/// one but two input vectors.
1725static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1726 bool ExtendingManyInputs = false) {
1727 if (SubMask.empty())
1728 return;
1729 assert(
1730 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1731 // Check if input scalars were extended to match the size of other node.
1732 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1733 "SubMask with many inputs support must be larger than the mask.");
1734 if (Mask.empty()) {
1735 Mask.append(SubMask.begin(), SubMask.end());
1736 return;
1737 }
1738 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1739 int TermValue = std::min(Mask.size(), SubMask.size());
1740 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1741 if (SubMask[I] == PoisonMaskElem ||
1742 (!ExtendingManyInputs &&
1743 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1744 continue;
1745 NewMask[I] = Mask[SubMask[I]];
1746 }
1747 Mask.swap(NewMask);
1748}
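// Usage sketch for addMask (illustrative values): the new sub-mask indexes
// into the existing mask, i.e. NewMask[I] = Mask[SubMask[I]]. For example:
//   SmallVector<int> Mask = {3, 2, 1, 0};
//   addMask(Mask, {1, 0, 3, 2}); // Mask becomes {2, 3, 0, 1}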
1749
1750/// Order may have elements assigned the special value (size), which is out of
1751/// bounds. Such indices only appear in places which correspond to undef values
1752/// (see canReuseExtract for details) and are used to keep undef values from
1753/// affecting the ordering of the operands.
1754/// The first loop below simply finds all unused indices and then the next loop
1755/// assigns these indices to the undef value positions.
1756/// As an example below Order has two undef positions and they have assigned
1757/// values 3 and 7 respectively:
1758/// before: 6 9 5 4 9 2 1 0
1759/// after: 6 3 5 4 7 2 1 0
1761 const size_t Sz = Order.size();
1762 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1763 SmallBitVector MaskedIndices(Sz);
1764 for (unsigned I = 0; I < Sz; ++I) {
1765 if (Order[I] < Sz)
1766 UnusedIndices.reset(Order[I]);
1767 else
1768 MaskedIndices.set(I);
1769 }
1770 if (MaskedIndices.none())
1771 return;
1772 assert(UnusedIndices.count() == MaskedIndices.count() &&
1773 "Non-synced masked/available indices.");
1774 int Idx = UnusedIndices.find_first();
1775 int MIdx = MaskedIndices.find_first();
1776 while (MIdx >= 0) {
1777 assert(Idx >= 0 && "Indices must be synced.");
1778 Order[MIdx] = Idx;
1779 Idx = UnusedIndices.find_next(Idx);
1780 MIdx = MaskedIndices.find_next(MIdx);
1781 }
1782}
1783
1784/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1785/// Opcode1.
1787 unsigned Opcode0, unsigned Opcode1) {
1788 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1789 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1790 for (unsigned Lane : seq<unsigned>(VL.size())) {
1791 if (isa<PoisonValue>(VL[Lane]))
1792 continue;
1793 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1794 OpcodeMask.set(Lane * ScalarTyNumElements,
1795 Lane * ScalarTyNumElements + ScalarTyNumElements);
1796 }
1797 return OpcodeMask;
1798}
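// Illustrative example: for a bundle with scalar opcodes {add, sub, add, sub},
// Opcode0 = Add, Opcode1 = Sub, and a non-vector ScalarTy, the returned bitset
// is {0, 1, 0, 1} (bits set for the Opcode1 lanes). With a revectorized
// ScalarTy such as <2 x ...>, each lane contributes two bits instead of one.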
1799
1800/// Replicates the given \p Val \p VF times.
1802 unsigned VF) {
1803 assert(none_of(Val, [](Constant *C) { return C->getType()->isVectorTy(); }) &&
1804 "Expected scalar constants.");
1805 SmallVector<Constant *> NewVal(Val.size() * VF);
1806 for (auto [I, V] : enumerate(Val))
1807 std::fill_n(NewVal.begin() + I * VF, VF, V);
1808 return NewVal;
1809}
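// Usage sketch (illustrative values): replicating Val = {C0, C1} with VF = 3
// produces {C0, C0, C0, C1, C1, C1}, i.e. each scalar constant is repeated VF
// times in place.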
1810
1811namespace llvm {
1812
1814 SmallVectorImpl<int> &Mask) {
1815 Mask.clear();
1816 const unsigned E = Indices.size();
1817 Mask.resize(E, PoisonMaskElem);
1818 for (unsigned I = 0; I < E; ++I)
1819 Mask[Indices[I]] = I;
1820}
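// Usage sketch (illustrative values): for Indices = {2, 0, 1} the resulting
// Mask is {1, 2, 0}, since Mask[Indices[I]] = I; applying Mask as a shuffle
// undoes the permutation described by Indices.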
1821
1822/// Reorders the list of scalars in accordance with the given \p Mask.
1824 ArrayRef<int> Mask) {
1825 assert(!Mask.empty() && "Expected non-empty mask.");
1826 SmallVector<Value *> Prev(Scalars.size(),
1827 PoisonValue::get(Scalars.front()->getType()));
1828 Prev.swap(Scalars);
1829 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1830 if (Mask[I] != PoisonMaskElem)
1831 Scalars[Mask[I]] = Prev[I];
1832}
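// Usage sketch (illustrative values): with Scalars = {a, b, c, d} and
// Mask = {2, 0, 1, 3}, the element at position I moves to position Mask[I],
// giving {b, c, a, d}; lanes whose mask element is PoisonMaskElem keep the
// poison placeholder.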
1833
1834/// Checks if the provided value does not require scheduling. It does not
1835/// require scheduling if this is not an instruction or it is an instruction
1836/// that does not read/write memory and all operands are either not instructions
1837/// or phi nodes or instructions from different blocks.
1839 auto *I = dyn_cast<Instruction>(V);
1840 if (!I)
1841 return true;
1842 return !mayHaveNonDefUseDependency(*I) &&
1843 all_of(I->operands(), [I](Value *V) {
1844 auto *IO = dyn_cast<Instruction>(V);
1845 if (!IO)
1846 return true;
1847 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1848 });
1849}
1850
1851/// Checks if the provided value does not require scheduling. It does not
1852/// require scheduling if this is not an instruction or it is an instruction
1853/// that does not read/write memory and all users are phi nodes or instructions
1854/// from different blocks.
1855static bool isUsedOutsideBlock(Value *V) {
1856 auto *I = dyn_cast<Instruction>(V);
1857 if (!I)
1858 return true;
1859 // Limits the number of uses to save compile time.
1860 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1861 all_of(I->users(), [I](User *U) {
1862 auto *IU = dyn_cast<Instruction>(U);
1863 if (!IU)
1864 return true;
1865 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1866 });
1867}
1868
1869/// Checks if the specified value does not require scheduling. It does not
1870/// require scheduling if all operands and all users do not need to be scheduled
1871/// in the current basic block.
1874}
1875
1876/// Checks if the specified array of instructions does not require scheduling.
1877/// It is so if either all instructions have operands that do not require
1878/// scheduling, or all their users do not require scheduling since they are
1879/// phis or in other basic blocks.
1881 return !VL.empty() &&
1883}
1884
1885/// Returns true if widened type of \p Ty elements with size \p Sz represents
1886/// full vector type, i.e. adding extra element results in extra parts upon type
1887/// legalization.
1889 unsigned Sz) {
1890 if (Sz <= 1)
1891 return false;
1893 return false;
1894 if (has_single_bit(Sz))
1895 return true;
1896 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1897 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1898 Sz % NumParts == 0;
1899}
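// Illustrative examples (the non-power-of-2 case depends on the target's
// legalization, so treat these as a sketch): Sz = 8 is accepted because it is
// a power of 2; Sz = 12 is accepted on a target where <12 x Ty> legalizes into
// 3 parts of 4 elements each; Sz = 1 is always rejected.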
1900
1901/// Returns the number of parts the type \p VecTy will be split into at the
1902/// codegen phase. If the type is going to be scalarized or does not use whole
1903/// registers, returns 1.
1904static unsigned
1906 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1907 unsigned NumParts = TTI.getNumberOfParts(VecTy);
1908 if (NumParts == 0 || NumParts >= Limit)
1909 return 1;
1910 unsigned Sz = getNumElements(VecTy);
1911 if (NumParts >= Sz || Sz % NumParts != 0 ||
1912 !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
1913 return 1;
1914 return NumParts;
1915}
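// Illustrative example (target-dependent, shown as a sketch): if TTI reports
// that <8 x i32> splits into 2 registers, this returns 2 (8 % 2 == 0 and each
// 4-element half is a full vector); if the type would be scalarized, i.e.
// NumParts >= the number of elements, it falls back to returning 1.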
1916
1917namespace slpvectorizer {
1918
1919/// Bottom Up SLP Vectorizer.
1920class BoUpSLP {
1921 class TreeEntry;
1922 class ScheduleEntity;
1923 class ScheduleData;
1924 class ScheduleCopyableData;
1925 class ScheduleBundle;
1928
1929 /// If we decide to generate strided load / store, this struct contains all
1930 /// the necessary info. Its fields are calculated by analyzeRtStrideCandidate
1931 /// and analyzeConstantStrideCandidate. Note that the stride can be given
1932 /// either as a SCEV or as a Value if it already exists. To get the stride in
1933 /// bytes, StrideVal (or the value obtained from StrideSCEV) has to be
1934 /// multiplied by the size of an element of the FixedVectorType.
1935 struct StridedPtrInfo {
1936 Value *StrideVal = nullptr;
1937 const SCEV *StrideSCEV = nullptr;
1938 FixedVectorType *Ty = nullptr;
1939 };
1940 SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
1941
1942public:
1943 /// Tracks the state we can represent the loads in the given sequence.
1951
1958
1960 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1962 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1963 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1964 AC(AC), DB(DB), DL(DL), ORE(ORE),
1965 Builder(Se->getContext(), TargetFolder(*DL)) {
1966 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1967 // Use the vector register size specified by the target unless overridden
1968 // by a command-line option.
1969 // TODO: It would be better to limit the vectorization factor based on
1970 // data type rather than just register size. For example, x86 AVX has
1971 // 256-bit registers, but it does not support integer operations
1972 // at that width (that requires AVX2).
1973 if (MaxVectorRegSizeOption.getNumOccurrences())
1974 MaxVecRegSize = MaxVectorRegSizeOption;
1975 else
1976 MaxVecRegSize =
1977 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1978 .getFixedValue();
1979
1980 if (MinVectorRegSizeOption.getNumOccurrences())
1981 MinVecRegSize = MinVectorRegSizeOption;
1982 else
1983 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1984 }
1985
1986 /// Vectorize the tree that starts with the elements in \p VL.
1987 /// Returns the vectorized root.
1989
1990 /// Vectorize the tree but with the list of externally used values \p
1991 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1992 /// generated extractvalue instructions.
1994 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1995 Instruction *ReductionRoot = nullptr,
1996 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
1997
1998 /// \returns the cost incurred by unwanted spills and fills, caused by
1999 /// holding live values over call sites.
2001
2002 /// \returns the vectorization cost of the subtree that starts at \p VL.
2003 /// A negative number means that this is profitable.
2004 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {},
2005 InstructionCost ReductionCost = TTI::TCC_Free);
2006
2007 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
2008 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
2009 void buildTree(ArrayRef<Value *> Roots,
2010 const SmallDenseSet<Value *> &UserIgnoreLst);
2011
2012 /// Construct a vectorizable tree that starts at \p Roots.
2013 void buildTree(ArrayRef<Value *> Roots);
2014
2015 /// Return the scalars of the root node.
2017 assert(!VectorizableTree.empty() && "No graph to get the first node from");
2018 return VectorizableTree.front()->Scalars;
2019 }
2020
2021 /// Returns the type/is-signed info for the root node in the graph without
2022 /// casting.
2023 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
2024 const TreeEntry &Root = *VectorizableTree.front();
2025 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
2026 !Root.Scalars.front()->getType()->isIntegerTy())
2027 return std::nullopt;
2028 auto It = MinBWs.find(&Root);
2029 if (It != MinBWs.end())
2030 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
2031 It->second.first),
2032 It->second.second);
2033 if (Root.getOpcode() == Instruction::ZExt ||
2034 Root.getOpcode() == Instruction::SExt)
2035 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
2036 Root.getOpcode() == Instruction::SExt);
2037 return std::nullopt;
2038 }
2039
2040 /// Checks if the root graph node can be emitted with narrower bitwidth at
2041 /// codegen and returns it signedness, if so.
2043 return MinBWs.at(VectorizableTree.front().get()).second;
2044 }
2045
2046 /// Returns reduction type after minbitdth analysis.
2048 if (ReductionBitWidth == 0 ||
2049 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
2050 ReductionBitWidth >=
2051 DL->getTypeSizeInBits(
2052 VectorizableTree.front()->Scalars.front()->getType()))
2053 return getWidenedType(
2054 VectorizableTree.front()->Scalars.front()->getType(),
2055 VectorizableTree.front()->getVectorFactor());
2056 return getWidenedType(
2058 VectorizableTree.front()->Scalars.front()->getContext(),
2059 ReductionBitWidth),
2060 VectorizableTree.front()->getVectorFactor());
2061 }
2062
2063 /// Builds external uses of the vectorized scalars, i.e. the list of
2064 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
2065 /// ExternallyUsedValues contains additional list of external uses to handle
2066 /// vectorization of reductions.
2067 void
2068 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
2069
2070 /// Transforms graph nodes to target specific representations, if profitable.
2071 void transformNodes();
2072
2073 /// Clear the internal data structures that are created by 'buildTree'.
2074 void deleteTree() {
2075 VectorizableTree.clear();
2076 ScalarToTreeEntries.clear();
2077 OperandsToTreeEntry.clear();
2078 ScalarsInSplitNodes.clear();
2079 MustGather.clear();
2080 NonScheduledFirst.clear();
2081 EntryToLastInstruction.clear();
2082 LoadEntriesToVectorize.clear();
2083 IsGraphTransformMode = false;
2084 GatheredLoadsEntriesFirst.reset();
2085 CompressEntryToData.clear();
2086 ExternalUses.clear();
2087 ExternalUsesAsOriginalScalar.clear();
2088 ExternalUsesWithNonUsers.clear();
2089 for (auto &Iter : BlocksSchedules) {
2090 BlockScheduling *BS = Iter.second.get();
2091 BS->clear();
2092 }
2093 MinBWs.clear();
2094 ReductionBitWidth = 0;
2095 BaseGraphSize = 1;
2096 CastMaxMinBWSizes.reset();
2097 ExtraBitWidthNodes.clear();
2098 InstrElementSize.clear();
2099 UserIgnoreList = nullptr;
2100 PostponedGathers.clear();
2101 ValueToGatherNodes.clear();
2102 }
2103
2104 unsigned getTreeSize() const { return VectorizableTree.size(); }
2105
2106 /// Returns the base graph size, before any transformations.
2107 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
2108
2109 /// Perform LICM and CSE on the newly generated gather sequences.
2111
2112 /// Does this non-empty order represent an identity order? Identity
2113 /// should be represented as an empty order, so this is used to
2114 /// decide if we can canonicalize a computed order. Undef elements
2115 /// (represented as size) are ignored.
2117 assert(!Order.empty() && "expected non-empty order");
2118 const unsigned Sz = Order.size();
2119 return all_of(enumerate(Order), [&](const auto &P) {
2120 return P.value() == P.index() || P.value() == Sz;
2121 });
2122 }
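// Illustrative example: Order = {0, 1, 4, 3} with size 4 is treated as an
// identity order (the out-of-bounds value 4 marks an undef position and is
// ignored), while Order = {1, 0, 2, 3} is not.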
2123
2124 /// Checks if the specified gather tree entry \p TE can be represented as a
2125 /// shuffled vector entry + (possibly) permutation with other gathers. It
2126 /// implements the checks only for possibly ordered scalars (Loads,
2127 /// ExtractElement, ExtractValue), which can be part of the graph.
2128 /// \param TopToBottom If true, used for the whole tree rotation, false - for
2129 /// sub-tree rotations. \param IgnoreReorder true, if the order of the root
2130 /// node might be ignored.
2131 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
2132 bool TopToBottom,
2133 bool IgnoreReorder);
2134
2135 /// Sort loads into increasing pointers offsets to allow greater clustering.
2136 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
2137
2138 /// Gets reordering data for the given tree entry. If the entry is vectorized
2139 /// - just return ReorderIndices, otherwise check if the scalars can be
2140 /// reordered and return the most optimal order.
2141 /// \return std::nullopt if ordering is not important, empty order, if
2142 /// identity order is important, or the actual order.
2143 /// \param TopToBottom If true, include the order of vectorized stores and
2144 /// insertelement nodes, otherwise skip them.
2145 /// \param IgnoreReorder true, if the root node order can be ignored.
2146 std::optional<OrdersType>
2147 getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
2148
2149 /// Checks if it is profitable to reorder the current tree.
2150 /// If the tree does not contain many profitable reorderable nodes, it is
2151 /// better to skip it to save compile time.
2152 bool isProfitableToReorder() const;
2153
2154 /// Reorders the current graph to the most profitable order starting from the
2155 /// root node to the leaf nodes. The best order is chosen only from the nodes
2156 /// of the same size (vectorization factor). Smaller nodes are considered
2157 /// parts of subgraph with smaller VF and they are reordered independently. We
2158 /// can make it because we still need to extend smaller nodes to the wider VF
2159 /// and we can merge reordering shuffles with the widening shuffles.
2160 void reorderTopToBottom();
2161
2162 /// Reorders the current graph to the most profitable order starting from
2163 /// leaves to the root. It allows rotating small subgraphs and reducing the
2164 /// number of reshuffles if the leaf nodes use the same order. In this case we
2165 /// can merge the orders and just shuffle the user node instead of shuffling
2166 /// its operands. Plus, even if the leaf nodes have different orders, it allows
2167 /// sinking the reordering in the graph closer to the root node and merging it
2168 /// later during analysis.
2169 void reorderBottomToTop(bool IgnoreReorder = false);
2170
2171 /// \return The vector element size in bits to use when vectorizing the
2172 /// expression tree ending at \p V. If V is a store, the size is the width of
2173 /// the stored value. Otherwise, the size is the width of the largest loaded
2174 /// value reaching V. This method is used by the vectorizer to calculate
2175 /// vectorization factors.
2176 unsigned getVectorElementSize(Value *V);
2177
2178 /// Compute the minimum type sizes required to represent the entries in a
2179 /// vectorizable tree.
2181
2182 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
2183 unsigned getMaxVecRegSize() const {
2184 return MaxVecRegSize;
2185 }
2186
2187 // \returns minimum vector register size as set by cl::opt.
2188 unsigned getMinVecRegSize() const {
2189 return MinVecRegSize;
2190 }
2191
2192 unsigned getMinVF(unsigned Sz) const {
2193 return std::max(2U, getMinVecRegSize() / Sz);
2194 }
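// Illustrative example (assuming the default MinVecRegSize of 128 bits): for
// 32-bit elements getMinVF(32) returns max(2, 128 / 32) = 4, and for 64-bit
// elements it returns 2.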
2195
2196 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2197 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
2198 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2199 return MaxVF ? MaxVF : UINT_MAX;
2200 }
2201
2202 /// Check if homogeneous aggregate is isomorphic to some VectorType.
2203 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
2204 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
2205 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
2206 ///
2207 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
2208 unsigned canMapToVector(Type *T) const;
2209
2210 /// \returns True if the VectorizableTree is both tiny and not fully
2211 /// vectorizable. We do not vectorize such trees.
2212 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
2213
2214 /// Checks if the graph and all its subgraphs cannot be better vectorized.
2215 /// It may happen if all gather nodes are loads and they cannot be
2216 /// "clusterized". In this case even subgraphs cannot be vectorized more
2217 /// effectively than the base graph.
2218 bool isTreeNotExtendable() const;
2219
2220 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
2221 /// can be load combined in the backend. Load combining may not be allowed in
2222 /// the IR optimizer, so we do not want to alter the pattern. For example,
2223 /// partially transforming a scalar bswap() pattern into vector code is
2224 /// effectively impossible for the backend to undo.
2225 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2226 /// may not be necessary.
2227 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
2228
2229 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
2230 /// can be load combined in the backend. Load combining may not be allowed in
2231 /// the IR optimizer, so we do not want to alter the pattern. For example,
2232 /// partially transforming a scalar bswap() pattern into vector code is
2233 /// effectively impossible for the backend to undo.
2234 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2235 /// may not be necessary.
2236 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
2239 const DataLayout &DL, ScalarEvolution &SE,
2240 const int64_t Diff, StridedPtrInfo &SPtrInfo) const;
2241
2242 /// Checks if the given array of loads can be represented as a vectorized,
2243 /// scatter or just simple gather.
2244 /// \param VL list of loads.
2245 /// \param VL0 main load value.
2246 /// \param Order returned order of load instructions.
2247 /// \param PointerOps returned list of pointer operands.
2248 /// \param BestVF return best vector factor, if recursive check found better
2249 /// vectorization sequences rather than masked gather.
2250 /// \param TryRecursiveCheck used to check if long masked gather can be
2251 /// represented as a series of loads/insert subvector, if profitable.
2254 SmallVectorImpl<Value *> &PointerOps,
2255 StridedPtrInfo &SPtrInfo,
2256 unsigned *BestVF = nullptr,
2257 bool TryRecursiveCheck = true) const;
2258
2259 /// Registers a non-vectorizable sequence of loads.
2260 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
2261 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
2262 }
2263
2264 /// Checks if the given sequence of loads is known to be non-vectorizable.
2265 template <typename T>
2267 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
2268 }
2269
2271
2272 /// This structure holds any data we need about the edges being traversed
2273 /// during buildTreeRec(). We keep track of:
2274 /// (i) the user TreeEntry index, and
2275 /// (ii) the index of the edge.
2276 struct EdgeInfo {
2277 EdgeInfo() = default;
2278 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
2280 /// The user TreeEntry.
2281 TreeEntry *UserTE = nullptr;
2282 /// The operand index of the use.
2283 unsigned EdgeIdx = UINT_MAX;
2284#ifndef NDEBUG
2286 const BoUpSLP::EdgeInfo &EI) {
2287 EI.dump(OS);
2288 return OS;
2289 }
2290 /// Debug print.
2291 void dump(raw_ostream &OS) const {
2292 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
2293 << " EdgeIdx:" << EdgeIdx << "}";
2294 }
2295 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
2296#endif
2297 bool operator == (const EdgeInfo &Other) const {
2298 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
2299 }
2300
2301 operator bool() const { return UserTE != nullptr; }
2302 };
2303 friend struct DenseMapInfo<EdgeInfo>;
2304
2305 /// A helper class used for scoring candidates for two consecutive lanes.
2307 const TargetLibraryInfo &TLI;
2308 const DataLayout &DL;
2309 ScalarEvolution &SE;
2310 const BoUpSLP &R;
2311 int NumLanes; // Total number of lanes (aka vectorization factor).
2312 int MaxLevel; // The maximum recursion depth for accumulating score.
2313
2314 public:
2316 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
2317 int MaxLevel)
2318 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
2319 MaxLevel(MaxLevel) {}
2320
2321 // The hard-coded scores listed here are not very important, though they
2322 // should be higher for better matches to improve the resulting cost. When
2323 // computing the scores of matching one sub-tree with another, we are
2324 // basically counting the number of values that are matching. So even if all
2325 // scores are set to 1, we would still get a decent matching result.
2326 // However, sometimes we have to break ties. For example we may have to
2327 // choose between matching loads vs matching opcodes. This is what these
2328 // scores are helping us with: they provide the order of preference. Also,
2329 // this is important if the scalar is externally used or used in another
2330 // tree entry node in the different lane.
2331
2332 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
2333 static const int ScoreConsecutiveLoads = 4;
2334 /// The same load multiple times. This should have a better score than
2335 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
2336 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
2337 /// a vector load and 1.0 for a broadcast.
2338 static const int ScoreSplatLoads = 3;
2339 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
2340 static const int ScoreReversedLoads = 3;
2341 /// A load candidate for masked gather.
2342 static const int ScoreMaskedGatherCandidate = 1;
2343 /// ExtractElementInst from same vector and consecutive indexes.
2344 static const int ScoreConsecutiveExtracts = 4;
2345 /// ExtractElementInst from same vector and reversed indices.
2346 static const int ScoreReversedExtracts = 3;
2347 /// Constants.
2348 static const int ScoreConstants = 2;
2349 /// Instructions with the same opcode.
2350 static const int ScoreSameOpcode = 2;
2351 /// Instructions with alt opcodes (e.g, add + sub).
2352 static const int ScoreAltOpcodes = 1;
2353 /// Identical instructions (a.k.a. splat or broadcast).
2354 static const int ScoreSplat = 1;
2355 /// Matching with an undef is preferable to failing.
2356 static const int ScoreUndef = 1;
2357 /// Score for failing to find a decent match.
2358 static const int ScoreFail = 0;
2359 /// Score if all users are vectorized.
2360 static const int ScoreAllUserVectorized = 1;
2361
2362 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
2363 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
2364 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
2365 /// MainAltOps.
2367 ArrayRef<Value *> MainAltOps) const {
2368 if (!isValidElementType(V1->getType()) ||
2371
2372 if (V1 == V2) {
2373 if (isa<LoadInst>(V1)) {
2374 // Returns true if the users of V1 and V2 won't need to be extracted.
2375 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
2376 // Bail out if we have too many uses to save compilation time.
2377 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
2378 return false;
2379
2380 auto AllUsersVectorized = [U1, U2, this](Value *V) {
2381 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
2382 return U == U1 || U == U2 || R.isVectorized(U);
2383 });
2384 };
2385 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
2386 };
2387 // A broadcast of a load can be cheaper on some targets.
2388 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
2389 ElementCount::getFixed(NumLanes)) &&
2390 ((int)V1->getNumUses() == NumLanes ||
2391 AllUsersAreInternal(V1, V2)))
2393 }
2395 }
2396
2397 auto CheckSameEntryOrFail = [&]() {
2398 if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V1); !TEs1.empty()) {
2400 if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V2);
2401 !TEs2.empty() &&
2402 any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
2404 }
2406 };
2407
2408 auto *LI1 = dyn_cast<LoadInst>(V1);
2409 auto *LI2 = dyn_cast<LoadInst>(V2);
2410 if (LI1 && LI2) {
2411 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
2412 !LI2->isSimple())
2413 return CheckSameEntryOrFail();
2414
2415 std::optional<int64_t> Dist = getPointersDiff(
2416 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
2417 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
2418 if (!Dist || *Dist == 0) {
2419 if (getUnderlyingObject(LI1->getPointerOperand()) ==
2420 getUnderlyingObject(LI2->getPointerOperand()) &&
2421 R.TTI->isLegalMaskedGather(
2422 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
2424 return CheckSameEntryOrFail();
2425 }
2426 // The distance is too large - still may be profitable to use masked
2427 // loads/gathers.
2428 if (std::abs(*Dist) > NumLanes / 2)
2430 // This will still detect consecutive loads, but we might have "holes"
2431 // in some cases. It is ok for non-power-of-2 vectorization and may produce
2432 // better results. It should not affect current vectorization.
2435 }
2436
2437 auto *C1 = dyn_cast<Constant>(V1);
2438 auto *C2 = dyn_cast<Constant>(V2);
2439 if (C1 && C2)
2441
2442 // Consider constants and buildvector compatible.
2443 if ((C1 && isa<InsertElementInst>(V2)) ||
2444 (C2 && isa<InsertElementInst>(V1)))
2446
2447 // Extracts from consecutive indexes of the same vector better score as
2448 // the extracts could be optimized away.
2449 Value *EV1;
2450 ConstantInt *Ex1Idx;
2451 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
2452 // Undefs are always profitable for extractelements.
2453 // Compiler can easily combine poison and extractelement <non-poison> or
2454 // undef and extractelement <poison>. But combining undef +
2455 // extractelement <non-poison-but-may-produce-poison> requires some
2456 // extra operations.
2457 if (isa<UndefValue>(V2))
2458 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
2461 Value *EV2 = nullptr;
2462 ConstantInt *Ex2Idx = nullptr;
2463 if (match(V2,
2465 m_Undef())))) {
2466 // Undefs are always profitable for extractelements.
2467 if (!Ex2Idx)
2469 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
2471 if (EV2 == EV1) {
2472 int Idx1 = Ex1Idx->getZExtValue();
2473 int Idx2 = Ex2Idx->getZExtValue();
2474 int Dist = Idx2 - Idx1;
2475 // The distance is too large - still may be profitable to use
2476 // shuffles.
2477 if (std::abs(Dist) == 0)
2479 if (std::abs(Dist) > NumLanes / 2)
2483 }
2485 }
2486 return CheckSameEntryOrFail();
2487 }
2488
2489 auto *I1 = dyn_cast<Instruction>(V1);
2490 auto *I2 = dyn_cast<Instruction>(V2);
2491 if (I1 && I2) {
2492 if (I1->getParent() != I2->getParent())
2493 return CheckSameEntryOrFail();
2494 SmallVector<Value *, 4> Ops(MainAltOps);
2495 Ops.push_back(I1);
2496 Ops.push_back(I2);
2497 InstructionsState S = getSameOpcode(Ops, TLI);
2498 // Note: Only consider instructions with <= 2 operands to avoid
2499 // complexity explosion.
2500 if (S &&
2501 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
2502 !S.isAltShuffle()) &&
2503 all_of(Ops, [&S](Value *V) {
2504 return isa<PoisonValue>(V) ||
2505 cast<Instruction>(V)->getNumOperands() ==
2506 S.getMainOp()->getNumOperands();
2507 }))
2508 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
2510 }
2511
2512 if (I1 && isa<PoisonValue>(V2))
2514
2515 if (isa<UndefValue>(V2))
2517
2518 return CheckSameEntryOrFail();
2519 }
2520
2521 /// Go through the operands of \p LHS and \p RHS recursively until
2522 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
2523 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
2524 /// of \p U1 and \p U2), except at the beginning of the recursion where
2525 /// these are set to nullptr.
2526 ///
2527 /// For example:
2528 /// \verbatim
2529 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
2530 /// \ / \ / \ / \ /
2531 /// + + + +
2532 /// G1 G2 G3 G4
2533 /// \endverbatim
2534 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
2535 /// each level recursively, accumulating the score. It starts from matching
2536 /// the additions at level 0, then moves on to the loads (level 1). The
2537 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
2538 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
2539 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
2540 /// Please note that the order of the operands does not matter, as we
2541 /// evaluate the score of all profitable combinations of operands. In
2542 /// other words the score of G1 and G4 is the same as G1 and G2. This
2543 /// heuristic is based on ideas described in:
2544 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
2545 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
2546 /// Luís F. W. Góes
2548 Instruction *U2, int CurrLevel,
2549 ArrayRef<Value *> MainAltOps) const {
2550
2551 // Get the shallow score of V1 and V2.
2552 int ShallowScoreAtThisLevel =
2553 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
2554
2555 // If reached MaxLevel,
2556 // or if V1 and V2 are not instructions,
2557 // or if they are SPLAT,
2558 // or if they are not consecutive,
2559 // or if profitable to vectorize loads or extractelements, early return
2560 // the current cost.
2561 auto *I1 = dyn_cast<Instruction>(LHS);
2562 auto *I2 = dyn_cast<Instruction>(RHS);
2563 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
2564 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
2565 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
2566 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
2568 ShallowScoreAtThisLevel))
2569 return ShallowScoreAtThisLevel;
2570 assert(I1 && I2 && "Should have early exited.");
2571
2572 // Contains the I2 operand indexes that got matched with I1 operands.
2573 SmallSet<unsigned, 4> Op2Used;
2574
2575 // Recursion towards the operands of I1 and I2. We are trying all possible
2576 // operand pairs, and keeping track of the best score.
2577 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
2578 OpIdx1 != NumOperands1; ++OpIdx1) {
2579 // Try to pair op1I with the best operand of I2.
2580 int MaxTmpScore = 0;
2581 unsigned MaxOpIdx2 = 0;
2582 bool FoundBest = false;
2583 // If I2 is commutative try all combinations.
2584 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
2585 unsigned ToIdx = isCommutative(I2)
2586 ? I2->getNumOperands()
2587 : std::min(I2->getNumOperands(), OpIdx1 + 1);
2588 assert(FromIdx <= ToIdx && "Bad index");
2589 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
2590 // Skip operands already paired with OpIdx1.
2591 if (Op2Used.count(OpIdx2))
2592 continue;
2593 // Recursively calculate the cost at each level
2594 int TmpScore =
2595 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
2596 I1, I2, CurrLevel + 1, {});
2597 // Look for the best score.
2598 if (TmpScore > LookAheadHeuristics::ScoreFail &&
2599 TmpScore > MaxTmpScore) {
2600 MaxTmpScore = TmpScore;
2601 MaxOpIdx2 = OpIdx2;
2602 FoundBest = true;
2603 }
2604 }
2605 if (FoundBest) {
2606 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
2607 Op2Used.insert(MaxOpIdx2);
2608 ShallowScoreAtThisLevel += MaxTmpScore;
2609 }
2610 }
2611 return ShallowScoreAtThisLevel;
2612 }
2613 };
2614 /// A helper data structure to hold the operands of a vector of instructions.
2615 /// This supports a fixed vector length for all operand vectors.
2617 /// For each operand we need (i) the value, and (ii) the opcode that it
2618 /// would be attached to if the expression was in a left-linearized form.
2619 /// This is required to avoid illegal operand reordering.
2620 /// For example:
2621 /// \verbatim
2622 /// 0 Op1
2623 /// |/
2624 /// Op1 Op2 Linearized + Op2
2625 /// \ / ----------> |/
2626 /// - -
2627 ///
2628 /// Op1 - Op2 (0 + Op1) - Op2
2629 /// \endverbatim
2630 ///
2631 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
2632 ///
2633 /// Another way to think of this is to track all the operations across the
2634 /// path from the operand all the way to the root of the tree and to
2635 /// calculate the operation that corresponds to this path. For example, the
2636 /// path from Op2 to the root crosses the RHS of the '-', therefore the
2637 /// corresponding operation is a '-' (which matches the one in the
2638 /// linearized tree, as shown above).
2639 ///
2640 /// For lack of a better term, we refer to this operation as Accumulated
2641 /// Path Operation (APO).
2642 struct OperandData {
2643 OperandData() = default;
2644 OperandData(Value *V, bool APO, bool IsUsed)
2645 : V(V), APO(APO), IsUsed(IsUsed) {}
2646 /// The operand value.
2647 Value *V = nullptr;
2648 /// TreeEntries only allow a single opcode, or an alternate sequence of
2649 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
2650 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2651 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2652 /// (e.g., Add/Mul)
2653 bool APO = false;
2654 /// Helper data for the reordering function.
2655 bool IsUsed = false;
2656 };
2657
2658 /// During operand reordering, we are trying to select the operand at lane
2659 /// that matches best with the operand at the neighboring lane. Our
2660 /// selection is based on the type of value we are looking for. For example,
2661 /// if the neighboring lane has a load, we need to look for a load that is
2662 /// accessing a consecutive address. These strategies are summarized in the
2663 /// 'ReorderingMode' enumerator.
2664 enum class ReorderingMode {
2665 Load, ///< Matching loads to consecutive memory addresses
2666 Opcode, ///< Matching instructions based on opcode (same or alternate)
2667 Constant, ///< Matching constants
2668 Splat, ///< Matching the same instruction multiple times (broadcast)
2669 Failed, ///< We failed to create a vectorizable group
2670 };
2671
2672 using OperandDataVec = SmallVector<OperandData, 2>;
2673
2674 /// A vector of operand vectors.
2676 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2677 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2678 unsigned ArgSize = 0;
2679
2680 const TargetLibraryInfo &TLI;
2681 const DataLayout &DL;
2682 ScalarEvolution &SE;
2683 const BoUpSLP &R;
2684 const Loop *L = nullptr;
2685
2686 /// \returns the operand data at \p OpIdx and \p Lane.
2687 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2688 return OpsVec[OpIdx][Lane];
2689 }
2690
2691 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2692 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2693 return OpsVec[OpIdx][Lane];
2694 }
2695
2696 /// Clears the used flag for all entries.
2697 void clearUsed() {
2698 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2699 OpIdx != NumOperands; ++OpIdx)
2700 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2701 ++Lane)
2702 OpsVec[OpIdx][Lane].IsUsed = false;
2703 }
2704
2705 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2706 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2707 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2708 }
2709
2710 /// \param Lane lane of the operands under analysis.
2711 /// \param OpIdx operand index in \p Lane lane we're looking the best
2712 /// candidate for.
2713 /// \param Idx operand index of the current candidate value.
2714 /// \returns The additional score due to possible broadcasting of the
2715 /// elements in the lane. It is more profitable to have a power-of-2 number
2716 /// of unique elements in the lane, as they will be vectorized with higher
2717 /// probability after removing duplicates. Currently the SLP vectorizer
2718 /// supports only vectorization of a power-of-2 number of unique scalars.
2719 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2720 const SmallBitVector &UsedLanes) const {
2721 Value *IdxLaneV = getData(Idx, Lane).V;
2722 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2723 isa<ExtractElementInst>(IdxLaneV))
2724 return 0;
2726 for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2727 if (Ln == Lane)
2728 continue;
2729 Value *OpIdxLnV = getData(OpIdx, Ln).V;
2730 if (!isa<Instruction>(OpIdxLnV))
2731 return 0;
2732 Uniques.try_emplace(OpIdxLnV, Ln);
2733 }
2734 unsigned UniquesCount = Uniques.size();
2735 auto IdxIt = Uniques.find(IdxLaneV);
2736 unsigned UniquesCntWithIdxLaneV =
2737 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2738 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2739 auto OpIdxIt = Uniques.find(OpIdxLaneV);
2740 unsigned UniquesCntWithOpIdxLaneV =
2741 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2742 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2743 return 0;
2744 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2745 UniquesCntWithOpIdxLaneV,
2746 UniquesCntWithOpIdxLaneV -
2747 bit_floor(UniquesCntWithOpIdxLaneV)) -
2748 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2749 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
2750 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2751 }
2752
2753 /// \param Lane lane of the operands under analysis.
2754 /// \param OpIdx operand index in \p Lane lane we're looking the best
2755 /// candidate for.
2756 /// \param Idx operand index of the current candidate value.
2757 /// \returns The additional score for the scalar which users are all
2758 /// vectorized.
2759 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2760 Value *IdxLaneV = getData(Idx, Lane).V;
2761 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2762 // Do not care about number of uses for vector-like instructions
2763 // (extractelement/extractvalue with constant indices), they are extracts
2764 // themselves and already externally used. Vectorization of such
2765 // instructions does not add extra extractelement instruction, just may
2766 // remove it.
2767 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2768 isVectorLikeInstWithConstOps(OpIdxLaneV))
2770 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2771 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2772 return 0;
2773 return R.areAllUsersVectorized(IdxLaneI)
2775 : 0;
2776 }
2777
2778 /// Score scaling factor for fully compatible instructions but with
2779 /// different number of external uses. Allows better selection of the
2780 /// instructions with less external uses.
2781 static const int ScoreScaleFactor = 10;
2782
2783 /// \Returns the look-ahead score, which tells us how much the sub-trees
2784 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2785 /// score. This helps break ties in an informed way when we cannot decide on
2786 /// the order of the operands by just considering the immediate
2787 /// predecessors.
2788 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2789 int Lane, unsigned OpIdx, unsigned Idx,
2790 bool &IsUsed, const SmallBitVector &UsedLanes) {
2791 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2793 // Keep track of the instruction stack as we recurse into the operands
2794 // during the look-ahead score exploration.
2795 int Score =
2796 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2797 /*CurrLevel=*/1, MainAltOps);
2798 if (Score) {
2799 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2800 if (Score <= -SplatScore) {
2801 // Failed score.
2802 Score = 0;
2803 } else {
2804 Score += SplatScore;
2805 // Scale score to see the difference between different operands
2806 // and similar operands but all vectorized/not all vectorized
2807 // uses. It does not affect actual selection of the best
2808 // compatible operand in general, just allows to select the
2809 // operand with all vectorized uses.
2810 Score *= ScoreScaleFactor;
2811 Score += getExternalUseScore(Lane, OpIdx, Idx);
2812 IsUsed = true;
2813 }
2814 }
2815 return Score;
2816 }
2817
2818 /// Best defined scores per lanes between the passes. Used to choose the
2819 /// best operand (with the highest score) between the passes.
2820 /// The key - {Operand Index, Lane}.
2821 /// The value - the best score between the passes for the lane and the
2822 /// operand.
2824 BestScoresPerLanes;
2825
2826 // Search all operands in Ops[*][Lane] for the one that best matches
2827 // Ops[OpIdx][LastLane] and return its operand index.
2828 // If no good match can be found, return std::nullopt.
2829 std::optional<unsigned>
2830 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2831 ArrayRef<ReorderingMode> ReorderingModes,
2832 ArrayRef<Value *> MainAltOps,
2833 const SmallBitVector &UsedLanes) {
2834 unsigned NumOperands = getNumOperands();
2835
2836 // The operand of the previous lane at OpIdx.
2837 Value *OpLastLane = getData(OpIdx, LastLane).V;
2838
2839 // Our strategy mode for OpIdx.
2840 ReorderingMode RMode = ReorderingModes[OpIdx];
2841 if (RMode == ReorderingMode::Failed)
2842 return std::nullopt;
2843
2844 // The linearized opcode of the operand at OpIdx, Lane.
2845 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2846
2847 // The best operand index and its score.
2848 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2849 // are using the score to differentiate between the two.
2850 struct BestOpData {
2851 std::optional<unsigned> Idx;
2852 unsigned Score = 0;
2853 } BestOp;
2854 BestOp.Score =
2855 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2856 .first->second;
2857
2858 // Track if the operand must be marked as used. If the operand is set to
2859 // Score 1 explicitly (because of non power-of-2 unique scalars, we may
2860 // want to reestimate the operands again on the following iterations).
2861 bool IsUsed = RMode == ReorderingMode::Splat ||
2862 RMode == ReorderingMode::Constant ||
2863 RMode == ReorderingMode::Load;
2864 // Iterate through all unused operands and look for the best.
2865 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2866 // Get the operand at Idx and Lane.
2867 OperandData &OpData = getData(Idx, Lane);
2868 Value *Op = OpData.V;
2869 bool OpAPO = OpData.APO;
2870
2871 // Skip already selected operands.
2872 if (OpData.IsUsed)
2873 continue;
2874
2875 // Skip if we are trying to move the operand to a position with a
2876 // different opcode in the linearized tree form. This would break the
2877 // semantics.
2878 if (OpAPO != OpIdxAPO)
2879 continue;
2880
2881 // Look for an operand that matches the current mode.
2882 switch (RMode) {
2883 case ReorderingMode::Load:
2884 case ReorderingMode::Opcode: {
2885 bool LeftToRight = Lane > LastLane;
2886 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2887 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2888 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2889 OpIdx, Idx, IsUsed, UsedLanes);
2890 if (Score > static_cast<int>(BestOp.Score) ||
2891 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2892 Idx == OpIdx)) {
2893 BestOp.Idx = Idx;
2894 BestOp.Score = Score;
2895 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2896 }
2897 break;
2898 }
2899 case ReorderingMode::Constant:
2900 if (isa<Constant>(Op) ||
2901 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2902 BestOp.Idx = Idx;
2903 if (isa<Constant>(Op)) {
2905 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2907 }
2909 IsUsed = false;
2910 }
2911 break;
2912 case ReorderingMode::Splat:
2913 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
2914 IsUsed = Op == OpLastLane;
2915 if (Op == OpLastLane) {
2916 BestOp.Score = LookAheadHeuristics::ScoreSplat;
2917 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2919 }
2920 BestOp.Idx = Idx;
2921 }
2922 break;
2923 case ReorderingMode::Failed:
2924 llvm_unreachable("Not expected Failed reordering mode.");
2925 }
2926 }
2927
2928 if (BestOp.Idx) {
2929 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2930 return BestOp.Idx;
2931 }
2932 // If we could not find a good match return std::nullopt.
2933 return std::nullopt;
2934 }
2935
2936 /// Helper for reorderOperandVecs.
2937 /// \returns the lane that we should start reordering from. This is the one
2938 /// which has the least number of operands that can freely move about, or is
2939 /// less profitable because it already has the most optimal set of operands.
2940 unsigned getBestLaneToStartReordering() const {
2941 unsigned Min = UINT_MAX;
2942 unsigned SameOpNumber = 0;
2943 // std::pair<unsigned, unsigned> is used to implement a simple voting
2944 // algorithm and choose the lane with the least number of operands that
2945 // can freely move about, or is less profitable because it already has the
2946 // most optimal set of operands. The first unsigned is a counter for
2947 // voting, the second unsigned is the counter of lanes with instructions
2948 // with same/alternate opcodes and same parent basic block.
2950 // Try to be closer to the original results, if we have multiple lanes
2951 // with same cost. If 2 lanes have the same cost, use the one with the
2952 // highest index.
2953 for (int I = getNumLanes(); I > 0; --I) {
2954 unsigned Lane = I - 1;
2955 OperandsOrderData NumFreeOpsHash =
2956 getMaxNumOperandsThatCanBeReordered(Lane);
2957 // Compare the number of operands that can move and choose the one with
2958 // the least number.
2959 if (NumFreeOpsHash.NumOfAPOs < Min) {
2960 Min = NumFreeOpsHash.NumOfAPOs;
2961 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2962 HashMap.clear();
2963 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2964 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2965 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2966 // Select the most optimal lane in terms of number of operands that
2967 // should be moved around.
2968 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2969 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2970 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2971 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2972 auto [It, Inserted] =
2973 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
2974 if (!Inserted)
2975 ++It->second.first;
2976 }
2977 }
2978 // Select the lane with the minimum counter.
2979 unsigned BestLane = 0;
2980 unsigned CntMin = UINT_MAX;
2981 for (const auto &Data : reverse(HashMap)) {
2982 if (Data.second.first < CntMin) {
2983 CntMin = Data.second.first;
2984 BestLane = Data.second.second;
2985 }
2986 }
2987 return BestLane;
2988 }
2989
2990 /// Data structure that helps to reorder operands.
2991 struct OperandsOrderData {
2992 /// The best number of operands with the same APOs, which can be
2993 /// reordered.
2994 unsigned NumOfAPOs = UINT_MAX;
2995 /// Number of operands with the same/alternate instruction opcode and
2996 /// parent.
2997 unsigned NumOpsWithSameOpcodeParent = 0;
2998 /// Hash for the actual operands ordering.
2999 /// Used to count operands, actually their position id and opcode
3000 /// value. It is used in the voting mechanism to find the lane with the
3001 /// least number of operands that can freely move about or is less profitable
3002 /// because it already has the most optimal set of operands. Can be
3003 /// replaced with SmallVector<unsigned> instead but hash code is faster
3004 /// and requires less memory.
3005 unsigned Hash = 0;
3006 };
3007 /// \returns the maximum number of operands that are allowed to be reordered
3008 /// for \p Lane and the number of compatible instructions (with the same
3009 /// parent/opcode). This is used as a heuristic for selecting the first lane
3010 /// to start operand reordering.
3011 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
3012 unsigned CntTrue = 0;
3013 unsigned NumOperands = getNumOperands();
3014 // Operands with the same APO can be reordered. We therefore need to count
3015 // how many of them we have for each APO, like this: Cnt[APO] = x.
3016 // Since we only have two APOs, namely true and false, we can avoid using
3017 // a map. Instead we can simply count the number of operands that
3018 // correspond to one of them (in this case the 'true' APO), and calculate
3019 // the other by subtracting it from the total number of operands.
3020 // Operands with the same instruction opcode and parent are more
3021 // profitable since we don't need to move them in many cases, with a high
3022 // probability such lane already can be vectorized effectively.
3023 bool AllUndefs = true;
3024 unsigned NumOpsWithSameOpcodeParent = 0;
3025 Instruction *OpcodeI = nullptr;
3026 BasicBlock *Parent = nullptr;
3027 unsigned Hash = 0;
3028 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3029 const OperandData &OpData = getData(OpIdx, Lane);
3030 if (OpData.APO)
3031 ++CntTrue;
3032 // Use Boyer-Moore majority voting for finding the majority opcode and
3033 // the number of times it occurs.
3034 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
3035 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
3036 I->getParent() != Parent) {
3037 if (NumOpsWithSameOpcodeParent == 0) {
3038 NumOpsWithSameOpcodeParent = 1;
3039 OpcodeI = I;
3040 Parent = I->getParent();
3041 } else {
3042 --NumOpsWithSameOpcodeParent;
3043 }
3044 } else {
3045 ++NumOpsWithSameOpcodeParent;
3046 }
3047 }
3048 Hash = hash_combine(
3049 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
3050 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
3051 }
3052 if (AllUndefs)
3053 return {};
3054 OperandsOrderData Data;
3055 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
3056 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
3057 Data.Hash = Hash;
3058 return Data;
3059 }
3060
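// Standalone sketch (illustrative only, not LLVM code) of the Boyer-Moore
// majority vote that getMaxNumOperandsThatCanBeReordered() uses above to
// track the dominating (opcode, parent) pair: keep one candidate and a
// counter, a match increments, a mismatch decrements, and a zero counter
// installs a new candidate. majorityCandidate is a made-up helper name.
#if 0
#include <vector>

template <typename T>
static T majorityCandidate(const std::vector<T> &Values) {
  T Candidate{};
  unsigned Count = 0;
  for (const T &V : Values) {
    if (Count == 0) {
      Candidate = V;
      Count = 1;
    } else if (V == Candidate) {
      ++Count;
    } else {
      --Count;
    }
  }
  // Note: a second pass is needed to confirm Candidate is a true majority;
  // the code above only needs the surviving counter, so it skips that pass.
  return Candidate;
}
#endif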
3061 /// Go through the instructions in VL and append their operands.
3062 void appendOperands(ArrayRef<Value *> VL, ArrayRef<ValueList> Operands,
3063 const InstructionsState &S) {
3064 assert(!Operands.empty() && !VL.empty() && "Bad list of operands");
3065 assert((empty() || all_of(Operands,
3066 [this](const ValueList &VL) {
3067 return VL.size() == getNumLanes();
3068 })) &&
3069 "Expected same number of lanes");
3070 assert(S.valid() && "InstructionsState is invalid.");
3071 // IntrinsicInst::isCommutative returns true if swapping the first "two"
3072 // arguments to the intrinsic produces the same result.
3073 Instruction *MainOp = S.getMainOp();
3074 unsigned NumOperands = MainOp->getNumOperands();
3076 OpsVec.resize(ArgSize);
3077 unsigned NumLanes = VL.size();
3078 for (OperandDataVec &Ops : OpsVec)
3079 Ops.resize(NumLanes);
3080 for (unsigned Lane : seq<unsigned>(NumLanes)) {
3081 // Our tree has just 3 nodes: the root and two operands.
3082 // It is therefore trivial to get the APO. We only need to check the
3083 // opcode of V and whether the operand at OpIdx is the LHS or RHS
3084 // operand. The LHS operand of both add and sub is never attached to an
3085 // inverse operation in the linearized form, therefore its APO is
3086 // false. The APO of the RHS is true only if V is an inverse operation.
3087
3088 // Since operand reordering is performed on groups of commutative
3089 // operations or alternating sequences (e.g., +, -), we can safely tell
3090 // the inverse operations by checking commutativity.
3091 auto *I = dyn_cast<Instruction>(VL[Lane]);
3092 if (!I && isa<PoisonValue>(VL[Lane])) {
3093 for (unsigned OpIdx : seq<unsigned>(NumOperands))
3094 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
3095 continue;
3096 }
3097 bool IsInverseOperation = false;
3098 if (S.isCopyableElement(VL[Lane])) {
3099 // The value is a copyable element.
3100 IsInverseOperation = !isCommutative(MainOp, VL[Lane]);
3101 } else {
3102 assert(I && "Expected instruction");
3103 auto [SelectedOp, Ops] = convertTo(I, S);
3104 // We cannot check commutativity by the converted instruction
3105 // (SelectedOp) because isCommutative also examines def-use
3106 // relationships.
3107 IsInverseOperation = !isCommutative(SelectedOp, I);
3108 }
3109 for (unsigned OpIdx : seq<unsigned>(ArgSize)) {
3110 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
3111 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
3112 }
3113 }
3114 }
3115
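// A tiny worked example (not LLVM code) of the APO rule documented above,
// for a 2-operand bundle mixing add and sub: the LHS always gets APO ==
// false, and the RHS gets APO == true only in the lanes whose operation is
// the inverse (non-commutative alternate) one. OpFlags and assignAPO are
// illustrative stand-ins for the OperandData fields used here.
#if 0
struct OpFlags {
  bool APO;
};

// Lane 0: A[0] = B[0] + C[0] -> LaneIsInverseOp == false -> APOs {false, false}
// Lane 1: A[1] = B[1] - C[1] -> LaneIsInverseOp == true  -> APOs {false, true}
static void assignAPO(bool LaneIsInverseOp, OpFlags Ops[2]) {
  Ops[0].APO = false;           // LHS of both add and sub.
  Ops[1].APO = LaneIsInverseOp; // RHS is attached to the inverse op only.
}
#endif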
3116 /// \returns the number of operands.
3117 unsigned getNumOperands() const { return ArgSize; }
3118
3119 /// \returns the number of lanes.
3120 unsigned getNumLanes() const { return OpsVec[0].size(); }
3121
3122 /// \returns the operand value at \p OpIdx and \p Lane.
3123 Value *getValue(unsigned OpIdx, unsigned Lane) const {
3124 return getData(OpIdx, Lane).V;
3125 }
3126
3127 /// \returns true if the data structure is empty.
3128 bool empty() const { return OpsVec.empty(); }
3129
3130 /// Clears the data.
3131 void clear() { OpsVec.clear(); }
3132
3133 /// \returns true if there are enough operands identical to \p Op to fill
3134 /// the whole vector (it is mixed with constants or loop invariant values).
3135 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
3136 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
3137 assert(Op == getValue(OpIdx, Lane) &&
3138 "Op is expected to be getValue(OpIdx, Lane).");
3139 // Small number of loads - try load matching.
3140 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
3141 return false;
3142 bool OpAPO = getData(OpIdx, Lane).APO;
3143 bool IsInvariant = L && L->isLoopInvariant(Op);
3144 unsigned Cnt = 0;
3145 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3146 if (Ln == Lane)
3147 continue;
3148 // This is set to true if we found a candidate for broadcast at Lane.
3149 bool FoundCandidate = false;
3150 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
3151 OperandData &Data = getData(OpI, Ln);
3152 if (Data.APO != OpAPO || Data.IsUsed)
3153 continue;
3154 Value *OpILane = getValue(OpI, Lane);
3155 bool IsConstantOp = isa<Constant>(OpILane);
3156 // Consider the broadcast candidate if:
3157 // 1. Same value is found in one of the operands.
3158 if (Data.V == Op ||
3159 // 2. The operand in the given lane is not constant but there is a
3160 // constant operand in another lane (which can be moved to the
3161 // given lane). In this case we can represent it as a simple
3162 // permutation of constant and broadcast.
3163 (!IsConstantOp &&
3164 ((Lns > 2 && isa<Constant>(Data.V)) ||
3165 // 2.1. If we have only 2 lanes, need to check that value in the
3166 // next lane does not build same opcode sequence.
3167 (Lns == 2 &&
3168 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
3169 isa<Constant>(Data.V)))) ||
3170 // 3. The operand in the current lane is loop invariant (can be
3171 // hoisted out) and another operand is also a loop invariant
3172 // (though not a constant). In this case the whole vector can be
3173 // hoisted out.
3174 // FIXME: need to teach the cost model about this case for better
3175 // estimation.
3176 (IsInvariant && !isa<Constant>(Data.V) &&
3177 !getSameOpcode({Op, Data.V}, TLI) &&
3178 L->isLoopInvariant(Data.V))) {
3179 FoundCandidate = true;
3180 Data.IsUsed = Data.V == Op;
3181 if (Data.V == Op)
3182 ++Cnt;
3183 break;
3184 }
3185 }
3186 if (!FoundCandidate)
3187 return false;
3188 }
3189 return getNumLanes() == 2 || Cnt > 1;
3190 }
3191
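// Reduced standalone model (illustrative only) of the shouldBroadcast() test
// above. Only the simplest of its three acceptance rules is kept: the same
// value must appear in some operand slot of every other lane, so a single
// splat can feed the whole vector. Ints stand in for Value pointers;
// worthBroadcasting is a made-up name, not part of this file.
#if 0
#include <cstddef>
#include <vector>

static bool worthBroadcasting(const std::vector<std::vector<int>> &Lanes,
                              int Op, std::size_t HomeLane) {
  unsigned Cnt = 0;
  for (std::size_t Ln = 0; Ln != Lanes.size(); ++Ln) {
    if (Ln == HomeLane)
      continue;
    bool Found = false;
    for (int Candidate : Lanes[Ln]) {
      if (Candidate == Op) {
        Found = true;
        ++Cnt;
        break;
      }
    }
    if (!Found)
      return false;
  }
  // Mirrors the final test above: 2-lane bundles are cheap to splat, wider
  // ones must reuse the value in more than one other lane.
  return Lanes.size() == 2 || Cnt > 1;
}
#endif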
3192 /// Checks if there is at least one operand in lanes other than \p Lane
3193 /// that is compatible with the operand \p Op.
3194 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
3195 assert(Op == getValue(OpIdx, Lane) &&
3196 "Op is expected to be getValue(OpIdx, Lane).");
3197 bool OpAPO = getData(OpIdx, Lane).APO;
3198 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3199 if (Ln == Lane)
3200 continue;
3201 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
3202 const OperandData &Data = getData(OpI, Ln);
3203 if (Data.APO != OpAPO || Data.IsUsed)
3204 return true;
3205 Value *OpILn = getValue(OpI, Ln);
3206 return (L && L->isLoopInvariant(OpILn)) ||
3207 (getSameOpcode({Op, OpILn}, TLI) &&
3208 allSameBlock({Op, OpILn}));
3209 }))
3210 return true;
3211 }
3212 return false;
3213 }
3214
3215 public:
3216 /// Initialize with all the operands of the instruction vector \p RootVL.
3218 const InstructionsState &S, const BoUpSLP &R)
3219 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3220 L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
3221 // Append all the operands of RootVL.
3222 appendOperands(RootVL, Operands, S);
3223 }
3224
3225 /// \returns a value vector with the operands across all lanes for the
3226 /// operand at \p OpIdx.
3227 ValueList getVL(unsigned OpIdx) const {
3228 ValueList OpVL(OpsVec[OpIdx].size());
3229 assert(OpsVec[OpIdx].size() == getNumLanes() &&
3230 "Expected same num of lanes across all operands");
3231 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3232 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
3233 return OpVL;
3234 }
3235
3236 // Performs operand reordering for 2 or more operands.
3237 // The original operands are in OpsVec[OpIdx][Lane]; they are reordered
3238 // in place so that matching operands line up in the same operand index.
3239 void reorder() {
3240 unsigned NumOperands = getNumOperands();
3241 unsigned NumLanes = getNumLanes();
3242 // Each operand has its own mode. We are using this mode to help us select
3243 // the instructions for each lane, so that they match best with the ones
3244 // we have selected so far.
3245 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
3246
3247 // This is a greedy single-pass algorithm. We are going over each lane
3248 // once and deciding on the best order right away with no back-tracking.
3249 // However, in order to increase its effectiveness, we start with the lane
3250 // that has operands that can move the least. For example, given the
3251 // following lanes:
3252 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
3253 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
3254 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
3255 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
3256 // we will start at Lane 1, since the operands of the subtraction cannot
3257 // be reordered. Then we will visit the rest of the lanes in a circular
3258 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
3259
3260 // Find the first lane that we will start our search from.
3261 unsigned FirstLane = getBestLaneToStartReordering();
3262
3263 // Initialize the modes.
3264 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3265 Value *OpLane0 = getValue(OpIdx, FirstLane);
3266 // Keep track if we have instructions with all the same opcode on one
3267 // side.
3268 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
3269 // Check if OpLane0 should be broadcast.
3270 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
3271 !canBeVectorized(OpILane0, OpIdx, FirstLane))
3272 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3273 else if (isa<LoadInst>(OpILane0))
3274 ReorderingModes[OpIdx] = ReorderingMode::Load;
3275 else
3276 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
3277 } else if (isa<Constant>(OpLane0)) {
3278 ReorderingModes[OpIdx] = ReorderingMode::Constant;
3279 } else if (isa<Argument>(OpLane0)) {
3280 // Our best hope is a Splat. It may save some cost in some cases.
3281 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3282 } else {
3283 llvm_unreachable("Unexpected value kind.");
3284 }
3285 }
3286
3287 // Check that we don't have the same operands in all lanes. There is no
3288 // need to reorder if the operands are just a perfect or shuffled diamond
3289 // match. The skip is not applied for possible broadcasts or for a
3290 // non-power-of-2 number of scalars (just for now).
3291 auto &&SkipReordering = [this]() {
3292 SmallPtrSet<Value *, 4> UniqueValues;
3293 ArrayRef<OperandData> Op0 = OpsVec.front();
3294 for (const OperandData &Data : Op0)
3295 UniqueValues.insert(Data.V);
3296       for (ArrayRef<OperandData> Op :
3297            ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
3298 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
3299 return !UniqueValues.contains(Data.V);
3300 }))
3301 return false;
3302 }
3303 // TODO: Check if we can remove a check for non-power-2 number of
3304 // scalars after full support of non-power-2 vectorization.
3305 return UniqueValues.size() != 2 &&
3306 hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
3307 UniqueValues.size());
3308 };
3309
3310 // If the initial strategy fails for any of the operand indexes, then we
3311 // perform reordering again in a second pass. This helps avoid assigning
3312 // high priority to the failed strategy, and should improve reordering for
3313 // the non-failed operand indexes.
3314 for (int Pass = 0; Pass != 2; ++Pass) {
3315 // Check if there is no need to reorder operands since they are perfect or
3316 // shuffled diamond match.
3317 // Need to do it to avoid extra external use cost counting for
3318 // shuffled matches, which may cause regressions.
3319 if (SkipReordering())
3320 break;
3321 // Skip the second pass if the first pass did not fail.
3322 bool StrategyFailed = false;
3323 // Mark all operand data as free to use.
3324 clearUsed();
3325 // We keep the original operand order for the FirstLane, so reorder the
3326 // rest of the lanes. We are visiting the nodes in a circular fashion,
3327 // using FirstLane as the center point and increasing the radius
3328 // distance.
3329 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
3330 for (unsigned I = 0; I < NumOperands; ++I)
3331 MainAltOps[I].push_back(getData(I, FirstLane).V);
3332
3333 SmallBitVector UsedLanes(NumLanes);
3334 UsedLanes.set(FirstLane);
3335 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3336 // Visit the lane on the right and then the lane on the left.
3337 for (int Direction : {+1, -1}) {
3338 int Lane = FirstLane + Direction * Distance;
3339 if (Lane < 0 || Lane >= (int)NumLanes)
3340 continue;
3341 UsedLanes.set(Lane);
3342 int LastLane = Lane - Direction;
3343 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
3344 "Out of bounds");
3345 // Look for a good match for each operand.
3346 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3347 // Search for the operand that best matches the one chosen for LastLane.
3348 std::optional<unsigned> BestIdx =
3349 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
3350 MainAltOps[OpIdx], UsedLanes);
3351 // By not selecting a value, we allow the operands that follow to
3352 // select a better matching value. We will get a non-null value in
3353 // the next run of getBestOperand().
3354 if (BestIdx) {
3355 // Swap the current operand with the one returned by
3356 // getBestOperand().
3357 swap(OpIdx, *BestIdx, Lane);
3358 } else {
3359 // Enable the second pass.
3360 StrategyFailed = true;
3361 }
3362 // Try to get the alternate opcode and follow it during analysis.
3363 if (MainAltOps[OpIdx].size() != 2) {
3364 OperandData &AltOp = getData(OpIdx, Lane);
3365 InstructionsState OpS =
3366 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
3367 if (OpS && OpS.isAltShuffle())
3368 MainAltOps[OpIdx].push_back(AltOp.V);
3369 }
3370 }
3371 }
3372 }
3373 // Skip second pass if the strategy did not fail.
3374 if (!StrategyFailed)
3375 break;
3376 }
3377 }
3378
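// Standalone sketch (illustrative only, not LLVM code) of the circular
// visiting order used by reorder() above: starting from the chosen center
// lane, lanes are visited with increasing distance, right neighbour first,
// then left. With 4 lanes and center 1 the order is 1, 2, 0, 3, matching the
// worked example in the comments. laneVisitOrder is a made-up helper name.
#if 0
#include <vector>

static std::vector<int> laneVisitOrder(int NumLanes, int FirstLane) {
  std::vector<int> Order{FirstLane};
  for (int Distance = 1; Distance != NumLanes; ++Distance) {
    for (int Direction : {+1, -1}) {
      int Lane = FirstLane + Direction * Distance;
      if (Lane >= 0 && Lane < NumLanes)
        Order.push_back(Lane);
    }
  }
  return Order;
}
#endif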
3379#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3380 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
3381 switch (RMode) {
3382 case ReorderingMode::Load:
3383 return "Load";
3384 case ReorderingMode::Opcode:
3385 return "Opcode";
3386 case ReorderingMode::Constant:
3387 return "Constant";
3388 case ReorderingMode::Splat:
3389 return "Splat";
3390 case ReorderingMode::Failed:
3391 return "Failed";
3392 }
3393 llvm_unreachable("Unimplemented Reordering Type");
3394 }
3395
3396 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
3397 raw_ostream &OS) {
3398 return OS << getModeStr(RMode);
3399 }
3400
3401 /// Debug print.
3402 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
3403 printMode(RMode, dbgs());
3404 }
3405
3406 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
3407 return printMode(RMode, OS);
3408 }
3409
3410   LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
3411     const unsigned Indent = 2;
3412 unsigned Cnt = 0;
3413 for (const OperandDataVec &OpDataVec : OpsVec) {
3414 OS << "Operand " << Cnt++ << "\n";
3415 for (const OperandData &OpData : OpDataVec) {
3416 OS.indent(Indent) << "{";
3417 if (Value *V = OpData.V)
3418 OS << *V;
3419 else
3420 OS << "null";
3421 OS << ", APO:" << OpData.APO << "}\n";
3422 }
3423 OS << "\n";
3424 }
3425 return OS;
3426 }
3427
3428 /// Debug print.
3429 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
3430#endif
3431 };
3432
3433 /// Evaluate each pair in \p Candidates and return the index into \p Candidates
3434 /// of the pair with the highest score, deemed to have the best chance to form
3435 /// the root of a profitable tree to vectorize. Return std::nullopt if no
3436 /// candidate scored above LookAheadHeuristics::ScoreFail. \param Limit Lower
3437 /// limit of the cost, considered to be a good enough score.
3438 std::optional<int>
3439 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
3440 int Limit = LookAheadHeuristics::ScoreFail) const {
3441 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
3443 int BestScore = Limit;
3444 std::optional<int> Index;
3445 for (int I : seq<int>(0, Candidates.size())) {
3446 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
3447 Candidates[I].second,
3448 /*U1=*/nullptr, /*U2=*/nullptr,
3449 /*CurrLevel=*/1, {});
3450 if (Score > BestScore) {
3451 BestScore = Score;
3452 Index = I;
3453 }
3454 }
3455 return Index;
3456 }
3457
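// Minimal sketch (illustrative only) of the selection done by
// findBestRootPair() above: every candidate pair gets a look-ahead score and
// the index of the highest score strictly above a failure threshold is
// returned, or nothing if all candidates fail. bestIndexAbove and Scores are
// placeholder names; the real scoring comes from LookAheadHeuristics.
#if 0
#include <optional>
#include <vector>

static std::optional<int> bestIndexAbove(const std::vector<int> &Scores,
                                         int Limit) {
  std::optional<int> Index;
  int Best = Limit;
  for (int I = 0, E = static_cast<int>(Scores.size()); I != E; ++I) {
    if (Scores[I] > Best) {
      Best = Scores[I];
      Index = I;
    }
  }
  return Index;
}
#endif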
3458 /// Checks if the instruction is marked for deletion.
3459 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
3460
3461 /// Removes an instruction from its block and eventually deletes it.
3462 /// It's like Instruction::eraseFromParent() except that the actual deletion
3463 /// is delayed until BoUpSLP is destructed.
3464   void eraseInstruction(Instruction *I) {
3465     DeletedInstructions.insert(I);
3466 }
3467
3468 /// Remove instructions from the parent function and clear the operands of \p
3469 /// DeadVals instructions, marking trivially dead operands for deletion.
3470 template <typename T>
3472 ArrayRef<T *> DeadVals,
3473 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
3475 for (T *V : DeadVals) {
3476 auto *I = cast<Instruction>(V);
3478 }
3479 DenseSet<Value *> Processed;
3480 for (T *V : DeadVals) {
3481 if (!V || !Processed.insert(V).second)
3482 continue;
3483 auto *I = cast<Instruction>(V);
3485 ArrayRef<TreeEntry *> Entries = getTreeEntries(I);
3486 for (Use &U : I->operands()) {
3487 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
3488 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
3490 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
3491 return Entry->VectorizedValue == OpI;
3492 })))
3493 DeadInsts.push_back(OpI);
3494 }
3495 I->dropAllReferences();
3496 }
3497 for (T *V : DeadVals) {
3498 auto *I = cast<Instruction>(V);
3499 if (!I->getParent())
3500 continue;
3501 assert((I->use_empty() || all_of(I->uses(),
3502 [&](Use &U) {
3503 return isDeleted(
3504 cast<Instruction>(U.getUser()));
3505 })) &&
3506 "trying to erase instruction with users.");
3507 I->removeFromParent();
3508 SE->forgetValue(I);
3509 }
3510 // Process the dead instruction list until empty.
3511 while (!DeadInsts.empty()) {
3512 Value *V = DeadInsts.pop_back_val();
3514 if (!VI || !VI->getParent())
3515 continue;
3517 "Live instruction found in dead worklist!");
3518 assert(VI->use_empty() && "Instructions with uses are not dead.");
3519
3520 // Don't lose the debug info while deleting the instructions.
3521 salvageDebugInfo(*VI);
3522
3523 // Null out all of the instruction's operands to see if any operand
3524 // becomes dead as we go.
3525 for (Use &OpU : VI->operands()) {
3526 Value *OpV = OpU.get();
3527 if (!OpV)
3528 continue;
3529 OpU.set(nullptr);
3530
3531 if (!OpV->use_empty())
3532 continue;
3533
3534 // If the operand is an instruction that became dead as we nulled out
3535 // the operand, and if it is 'trivially' dead, delete it in a future
3536 // loop iteration.
3537 if (auto *OpI = dyn_cast<Instruction>(OpV))
3538 if (!DeletedInstructions.contains(OpI) &&
3539 (!OpI->getType()->isVectorTy() ||
3540 none_of(VectorValuesAndScales,
3541 [&](const std::tuple<Value *, unsigned, bool> &V) {
3542 return std::get<0>(V) == OpI;
3543 })) &&
3545 DeadInsts.push_back(OpI);
3546 }
3547
3548 VI->removeFromParent();
3549 eraseInstruction(VI);
3550 SE->forgetValue(VI);
3551 }
3552 }
3553
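// Standalone sketch (illustrative only, not LLVM code) of the worklist
// pattern used by the deletion helper above: drop an instruction's operand
// references first and, whenever an operand loses its last use, push it onto
// the worklist so whole dead expression trees get reclaimed without
// recursion. Node and eraseDeadTree are made-up stand-ins for Instruction
// and the real helper.
#if 0
#include <vector>

struct Node {
  std::vector<Node *> Operands;
  unsigned NumUses = 0;
};

// Assumes Root is already known to be dead (no remaining uses).
static void eraseDeadTree(Node *Root, std::vector<Node *> &Deleted) {
  std::vector<Node *> Worklist{Root};
  while (!Worklist.empty()) {
    Node *N = Worklist.back();
    Worklist.pop_back();
    for (Node *Op : N->Operands) {
      if (Op && --Op->NumUses == 0)
        Worklist.push_back(Op); // Operand just became trivially dead.
    }
    N->Operands.clear(); // Analogous to dropAllReferences().
    Deleted.push_back(N);
  }
}
#endif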
3554 /// Checks if the instruction was already analyzed for being a possible
3555 /// reduction root.
3557 return AnalyzedReductionsRoots.count(I);
3558 }
3559 /// Register the given instruction as already analyzed for being a possible
3560 /// reduction root.
3562 AnalyzedReductionsRoots.insert(I);
3563 }
3564 /// Checks if the provided list of reduced values was checked already for
3565 /// vectorization.
3567 return AnalyzedReductionVals.contains(hash_value(VL));
3568 }
3569 /// Adds the list of reduced values to the list of already checked values for
3570 /// vectorization.
3572 AnalyzedReductionVals.insert(hash_value(VL));
3573 }
3574 /// Clear the list of the analyzed reduction root instructions.
3576 AnalyzedReductionsRoots.clear();
3577 AnalyzedReductionVals.clear();
3578 AnalyzedMinBWVals.clear();
3579 }
3580 /// Checks if the given value is gathered in one of the nodes.
3581 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
3582 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
3583 }
3584 /// Checks if the given value is gathered in one of the nodes.
3585 bool isGathered(const Value *V) const {
3586 return MustGather.contains(V);
3587 }
3588 /// Checks if the specified value was not scheduled.
3589 bool isNotScheduled(const Value *V) const {
3590 return NonScheduledFirst.contains(V);
3591 }
3592
3593 /// Check if the value is vectorized in the tree.
3594 bool isVectorized(const Value *V) const {
3595 assert(V && "V cannot be nullptr.");
3596 return ScalarToTreeEntries.contains(V);
3597 }
3598
3599 ~BoUpSLP();
3600
3601private:
3602 /// Determine if a node \p E can be demoted to a smaller type with a
3603 /// truncation. We collect the entries that will be demoted in ToDemote.
3604 /// \param E Node for analysis
3605 /// \param ToDemote indices of the nodes to be demoted.
3606 bool collectValuesToDemote(
3607 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
3609 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
3610 bool &IsProfitableToDemote, bool IsTruncRoot) const;
3611
3612 /// Builds the list of reorderable operands on the edges \p Edges of the \p
3613 /// UserTE, which allow reordering (i.e. the operands can be reordered because
3614 /// they have only one user and are reorderable).
3615 /// \param ReorderableGathers List of all gather nodes that require reordering
3616 /// (e.g., gather of extractelements or partially vectorizable loads).
3617 /// \param GatherOps List of gather operand nodes for \p UserTE that require
3618 /// reordering, subset of \p NonVectorized.
3619 void buildReorderableOperands(
3620 TreeEntry *UserTE,
3621 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
3622 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
3623 SmallVectorImpl<TreeEntry *> &GatherOps);
3624
3625 /// Checks if the given \p TE is a gather node with clustered reused scalars
3626 /// and reorders it per given \p Mask.
3627 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
3628
3629 /// Checks if all users of \p I are part of the vectorization tree.
3630 bool areAllUsersVectorized(
3631 Instruction *I,
3632 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3633
3634 /// Return information about the vector formed for the specified index
3635 /// of a vector of (the same) instruction.
3637
3638 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3639 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3640 TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
3641 return const_cast<TreeEntry *>(
3642 getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
3643 }
3644
3645 /// Gets the root instruction for the given node. If the node is a strided
3646 /// load/store node with the reverse order, the root instruction is the last
3647 /// one.
3648 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3649
3650 /// \returns Cast context for the given graph node.
3651   TargetTransformInfo::CastContextHint
3652   getCastContextHint(const TreeEntry &TE) const;
3653
3654 /// \returns the cost of the vectorizable entry.
3655 InstructionCost getEntryCost(const TreeEntry *E,
3656 ArrayRef<Value *> VectorizedVals,
3657 SmallPtrSetImpl<Value *> &CheckedExtracts);
3658
3659 /// Checks if it is legal and profitable to build SplitVectorize node for the
3660 /// given \p VL.
3661 /// \param Op1 first homogeneous scalars.
3662 /// \param Op2 second homogeneous scalars.
3663 /// \param ReorderIndices indices to reorder the scalars.
3664 /// \returns true if the node was successfully built.
3665 bool canBuildSplitNode(ArrayRef<Value *> VL,
3666 const InstructionsState &LocalState,
3669 OrdersType &ReorderIndices) const;
3670
3671 /// This is the recursive part of buildTree.
3672 void buildTreeRec(ArrayRef<Value *> Roots, unsigned Depth, const EdgeInfo &EI,
3673 unsigned InterleaveFactor = 0);
3674
3675 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3676 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3677 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3678 /// returns false, setting \p CurrentOrder to either an empty vector or a
3679 /// non-identity permutation that allows reusing extract instructions.
3680 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3681 /// extract order.
3682 bool canReuseExtract(ArrayRef<Value *> VL,
3683 SmallVectorImpl<unsigned> &CurrentOrder,
3684 bool ResizeAllowed = false) const;
3685
3686 /// Vectorize a single entry in the tree.
3687 Value *vectorizeTree(TreeEntry *E);
3688
3689 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3690 /// \p E.
3691 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
3692
3693 /// Create a new vector from a list of scalar values. Produces a sequence
3694 /// which exploits values reused across lanes, and arranges the inserts
3695 /// for ease of later optimization.
3696 template <typename BVTy, typename ResTy, typename... Args>
3697 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3698
3699 /// Create a new vector from a list of scalar values. Produces a sequence
3700 /// which exploits values reused across lanes, and arranges the inserts
3701 /// for ease of later optimization.
3702 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
3703
3704 /// Returns the instruction in the bundle, which can be used as a base point
3705 /// for scheduling. Usually it is the last instruction in the bundle, except
3706 /// for the case when all operands are external (in this case, it is the first
3707 /// instruction in the list).
3708 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3709
3710 /// Tries to find extractelement instructions with constant indices from fixed
3711 /// vector type and gather such instructions into a bunch, which most likely
3712 /// can be detected as a shuffle of 1 or 2 input vectors. If this attempt
3713 /// was successful, the matched scalars are replaced by poison values in \p VL
3714 /// for future analysis.
3715 std::optional<TargetTransformInfo::ShuffleKind>
3716 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3717 SmallVectorImpl<int> &Mask) const;
3718
3719 /// Tries to find extractelement instructions with constant indices from fixed
3720 /// vector type and gather such instructions into a bunch, which most likely
3721 /// can be detected as a shuffle of 1 or 2 input vectors. If this attempt
3722 /// was successful, the matched scalars are replaced by poison values in \p VL
3723 /// for future analysis.
3725 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3727 unsigned NumParts) const;
3728
3729 /// Checks if the gathered \p VL can be represented as a single register
3730 /// shuffle(s) of previous tree entries.
3731 /// \param TE Tree entry checked for permutation.
3732 /// \param VL List of scalars (a subset of the TE scalars), checked for
3733 /// permutations. Must form single-register vector.
3734 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3735 /// requests building the mask using the original vector value, without
3736 /// relying on the potential reordering.
3737 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3738 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3739 std::optional<TargetTransformInfo::ShuffleKind>
3740 isGatherShuffledSingleRegisterEntry(
3741 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3742 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3743 bool ForOrder);
3744
3745 /// Checks if the gathered \p VL can be represented as multi-register
3746 /// shuffle(s) of previous tree entries.
3747 /// \param TE Tree entry checked for permutation.
3748 /// \param VL List of scalars (a subset of the TE scalars), checked for
3749 /// permutations.
3750 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3751 /// requests building the mask using the original vector value, without
3752 /// relying on the potential reordering.
3753 /// \returns per-register series of ShuffleKind, if gathered values can be
3754 /// represented as shuffles of previous tree entries. \p Mask is filled with
3755 /// the shuffle mask (also on a per-register basis).
3757 isGatherShuffledEntry(
3758 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3760 unsigned NumParts, bool ForOrder = false);
3761
3762 /// \returns the cost of gathering (inserting) the values in \p VL into a
3763 /// vector.
3764 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3765 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3766 Type *ScalarTy) const;
3767
3768 /// Set the Builder insert point to one after the last instruction in
3769 /// the bundle
3770 void setInsertPointAfterBundle(const TreeEntry *E);
3771
3772 /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
3773 /// specified, the starting vector value is poison.
3774 Value *
3775 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3776 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3777
3778 /// \returns whether the VectorizableTree is fully vectorizable and will
3779 /// be beneficial even if the tree height is tiny.
3780 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3781
3782 /// Run through the list of all gathered loads in the graph and try to find
3783 /// vector loads/masked gathers instead of regular gathers. Later these loads
3784 /// are reshuffled to build the final gathered nodes.
3785 void tryToVectorizeGatheredLoads(
3786 const SmallMapVector<
3787 std::tuple<BasicBlock *, Value *, Type *>,
3788 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
3789 &GatheredLoads);
3790
3791 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3792 /// users of \p TE and collects the stores. It returns the map from the store
3793 /// pointers to the collected stores.
3795 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3796
3797 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3798 /// stores in \p StoresVec can form a vector instruction. If so it returns
3799 /// true and populates \p ReorderIndices with the shuffle indices of the
3800 /// stores when compared to the sorted vector.
3801 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3802 OrdersType &ReorderIndices) const;
3803
3804 /// Iterates through the users of \p TE, looking for scalar stores that can be
3805 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3806 /// their order and builds an order index vector for each store bundle. It
3807 /// returns all these order vectors found.
3808 /// We run this after the tree has formed, otherwise we may come across user
3809 /// instructions that are not yet in the tree.
3811 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3812
3813 /// Tries to reorder the gathering node for better vectorization
3814 /// opportunities.
3815 void reorderGatherNode(TreeEntry &TE);
3816
3817 class TreeEntry {
3818 public:
3819 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3820 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3821
3822 /// \returns Common mask for reorder indices and reused scalars.
3823 SmallVector<int> getCommonMask() const {
3824 if (State == TreeEntry::SplitVectorize)
3825 return {};
3826 SmallVector<int> Mask;
3827 inversePermutation(ReorderIndices, Mask);
3828 ::addMask(Mask, ReuseShuffleIndices);
3829 return Mask;
3830 }
3831
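// Sketch (illustrative only, standard C++) of what the inversePermutation()
// calls in this class compute: if ReorderIndices maps a new position to the
// old position it came from, the inverse mask maps each old position to its
// new one, which is the form a shufflevector mask wants. Assumes a full
// permutation with no poison slots; invertOrder is a made-up name.
#if 0
#include <vector>

static std::vector<int> invertOrder(const std::vector<unsigned> &Order) {
  std::vector<int> Mask(Order.size(), -1);
  for (unsigned I = 0, E = static_cast<unsigned>(Order.size()); I != E; ++I)
    Mask[Order[I]] = static_cast<int>(I);
  return Mask;
}
#endif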
3832 /// \returns The mask for split nodes.
3833 SmallVector<int> getSplitMask() const {
3834 assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
3835 "Expected only split vectorize node.");
3836 SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem);
3837 unsigned CommonVF = std::max<unsigned>(
3838 CombinedEntriesWithIndices.back().second,
3839 Scalars.size() - CombinedEntriesWithIndices.back().second);
3840 for (auto [Idx, I] : enumerate(ReorderIndices))
3841 Mask[I] =
3842 Idx + (Idx >= CombinedEntriesWithIndices.back().second
3843 ? CommonVF - CombinedEntriesWithIndices.back().second
3844 : 0);
3845 return Mask;
3846 }
3847
3848 /// Updates (reorders) SplitVectorize node according to the given mask \p
3849 /// Mask and order \p MaskOrder.
3850 void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
3851 ArrayRef<int> MaskOrder);
3852
3853 /// \returns true if the scalars in VL are equal to this entry.
3854 bool isSame(ArrayRef<Value *> VL) const {
3855 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3856 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3857 return std::equal(VL.begin(), VL.end(), Scalars.begin());
3858 return VL.size() == Mask.size() &&
3859 std::equal(VL.begin(), VL.end(), Mask.begin(),
3860 [Scalars](Value *V, int Idx) {
3861 return (isa<UndefValue>(V) &&
3862 Idx == PoisonMaskElem) ||
3863 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3864 });
3865 };
3866 if (!ReorderIndices.empty()) {
3867 // TODO: implement matching if the nodes are just reordered, still can
3868 // treat the vector as the same if the list of scalars matches VL
3869 // directly, without reordering.
3870 SmallVector<int> Mask;
3871 inversePermutation(ReorderIndices, Mask);
3872 if (VL.size() == Scalars.size())
3873 return IsSame(Scalars, Mask);
3874 if (VL.size() == ReuseShuffleIndices.size()) {
3875 ::addMask(Mask, ReuseShuffleIndices);
3876 return IsSame(Scalars, Mask);
3877 }
3878 return false;
3879 }
3880 return IsSame(Scalars, ReuseShuffleIndices);
3881 }
3882
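// Reduced sketch (illustrative only) of the masked comparison done by
// isSame() above: element I of the candidate list must equal
// Scalars[Mask[I]], and a poison mask slot only accepts an undef value.
// Plain ints stand in for Value pointers; all names below are made up and
// only the masked path of isSame() is modelled.
#if 0
#include <cstddef>
#include <vector>

static bool sameThroughMask(const std::vector<int> &VL,
                            const std::vector<int> &Scalars,
                            const std::vector<int> &Mask, int PoisonSlot,
                            int UndefVal) {
  if (VL.size() != Mask.size())
    return false;
  for (std::size_t I = 0, E = VL.size(); I != E; ++I) {
    bool Ok = Mask[I] == PoisonSlot ? VL[I] == UndefVal
                                    : VL[I] == Scalars[Mask[I]];
    if (!Ok)
      return false;
  }
  return true;
}
#endif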
3883 /// \returns true if current entry has same operands as \p TE.
3884 bool hasEqualOperands(const TreeEntry &TE) const {
3885 if (TE.getNumOperands() != getNumOperands())
3886 return false;
3887 SmallBitVector Used(getNumOperands());
3888 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3889 unsigned PrevCount = Used.count();
3890 for (unsigned K = 0; K < E; ++K) {
3891 if (Used.test(K))
3892 continue;
3893 if (getOperand(K) == TE.getOperand(I)) {
3894 Used.set(K);
3895 break;
3896 }
3897 }
3898 // Check if we actually found the matching operand.
3899 if (PrevCount == Used.count())
3900 return false;
3901 }
3902 return true;
3903 }
3904
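// Standalone sketch (illustrative only) of the unordered matching performed
// by hasEqualOperands() above: every operand list of one entry must be
// paired with a distinct, equal operand list of the other, i.e. a greedy
// multiset-equality check. Ints stand in for whole operand lists;
// equalAsMultisets is a made-up name.
#if 0
#include <cstddef>
#include <vector>

static bool equalAsMultisets(const std::vector<int> &A,
                             const std::vector<int> &B) {
  if (A.size() != B.size())
    return false;
  std::vector<bool> Used(B.size(), false);
  for (int X : A) {
    bool Matched = false;
    for (std::size_t K = 0; K != B.size(); ++K) {
      if (!Used[K] && B[K] == X) {
        Used[K] = true;
        Matched = true;
        break;
      }
    }
    if (!Matched)
      return false;
  }
  return true;
}
#endif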
3905 /// \return Final vectorization factor for the node. Defined by the total
3906 /// number of vectorized scalars, including those used several times in the
3907 /// entry and counted in the \a ReuseShuffleIndices, if any.
3908 unsigned getVectorFactor() const {
3909 if (!ReuseShuffleIndices.empty())
3910 return ReuseShuffleIndices.size();
3911 return Scalars.size();
3912 };
3913
3914 /// Checks if the current node is a gather node.
3915 bool isGather() const { return State == NeedToGather; }
3916
3917 /// A vector of scalars.
3918 ValueList Scalars;
3919
3920 /// The Scalars are vectorized into this value. It is initialized to Null.
3921 WeakTrackingVH VectorizedValue = nullptr;
3922
3923 /// Do we need to gather this sequence or vectorize it
3924 /// (either with vector instruction or with scatter/gather
3925 /// intrinsics for store/load)?
3926 enum EntryState {
3927 Vectorize, ///< The node is regularly vectorized.
3928 ScatterVectorize, ///< Masked scatter/gather node.
3929 StridedVectorize, ///< Strided loads (and stores)
3930 CompressVectorize, ///< (Masked) load with compress.
3931 NeedToGather, ///< Gather/buildvector node.
3932 CombinedVectorize, ///< Vectorized node, combined with its user into more
3933 ///< complex node like select/cmp to minmax, mul/add to
3934 ///< fma, etc. Must be used for the following nodes in
3935 ///< the pattern, not the very first one.
3936 SplitVectorize, ///< Splits the node into 2 subnodes, vectorizes them
3937 ///< independently and then combines back.
3938 };
3939 EntryState State;
3940
3941 /// List of combined opcodes supported by the vectorizer.
3942 enum CombinedOpcode {
3943 NotCombinedOp = -1,
3944 MinMax = Instruction::OtherOpsEnd + 1,
3945 FMulAdd,
3946 };
3947 CombinedOpcode CombinedOp = NotCombinedOp;
3948
3949 /// Does this sequence require some shuffling?
3950 SmallVector<int, 4> ReuseShuffleIndices;
3951
3952 /// Does this entry require reordering?
3953 SmallVector<unsigned, 4> ReorderIndices;
3954
3955 /// Points back to the VectorizableTree.
3956 ///
3957 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
3958 /// to be a pointer and needs to be able to initialize the child iterator.
3959 /// Thus we need a reference back to the container to translate the indices
3960 /// to entries.
3961 VecTreeTy &Container;
3962
3963 /// The TreeEntry index containing the user of this entry.
3964 EdgeInfo UserTreeIndex;
3965
3967 /// The index of this TreeEntry in VectorizableTree.
3967 unsigned Idx = 0;
3968
3969 /// For gather/buildvector/alt opcode nodes, which are combined from
3970 /// other nodes as a series of insertvector instructions.
3971 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
3972
3973 private:
3974 /// The operands of each instruction in each lane Operands[op_index][lane].
3975 /// Note: This helps avoid the replication of the code that performs the
3976 /// reordering of operands during buildTreeRec() and vectorizeTree().
3977     SmallVector<ValueList, 2> Operands;
3978
3979 /// Copyable elements of the entry node.
3980 SmallPtrSet<const Value *, 4> CopyableElements;
3981
3982 /// MainOp and AltOp are recorded inside. S should be obtained from
3983 /// newTreeEntry.
3984 InstructionsState S = InstructionsState::invalid();
3985
3986 /// Interleaving factor for interleaved loads Vectorize nodes.
3987 unsigned InterleaveFactor = 0;
3988
3989 /// True if the node does not require scheduling.
3990 bool DoesNotNeedToSchedule = false;
3991
3992 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
3993 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
3994 if (Operands.size() < OpIdx + 1)
3995 Operands.resize(OpIdx + 1);
3996 assert(Operands[OpIdx].empty() && "Already resized?");
3997 assert(OpVL.size() <= Scalars.size() &&
3998 "Number of operands is greater than the number of scalars.");
3999 Operands[OpIdx].resize(OpVL.size());
4000 copy(OpVL, Operands[OpIdx].begin());
4001 }
4002
4003 public:
4004 /// Returns interleave factor for interleave nodes.
4005 unsigned getInterleaveFactor() const { return InterleaveFactor; }
4006 /// Sets interleaving factor for the interleaving nodes.
4007 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
4008
4009 /// Marks the node as one that does not require scheduling.
4010 void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
4011 /// Returns true if the node is marked as one that does not require
4012 /// scheduling.
4013 bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
4014
4015 /// Set this bundle's operands from \p Operands.
4016 void setOperands(ArrayRef<ValueList> Operands) {
4017 for (unsigned I : seq<unsigned>(Operands.size()))
4018 setOperand(I, Operands[I]);
4019 }
4020
4021 /// Reorders operands of the node to the given mask \p Mask.
4022 void reorderOperands(ArrayRef<int> Mask) {
4023 for (ValueList &Operand : Operands)
4024 reorderScalars(Operand, Mask);
4025 }
4026
4027 /// \returns the \p OpIdx operand of this TreeEntry.
4028 ValueList &getOperand(unsigned OpIdx) {
4029 assert(OpIdx < Operands.size() && "Off bounds");
4030 return Operands[OpIdx];
4031 }
4032
4033 /// \returns the \p OpIdx operand of this TreeEntry.
4034 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
4035 assert(OpIdx < Operands.size() && "Off bounds");
4036 return Operands[OpIdx];
4037 }
4038
4039 /// \returns the number of operands.
4040 unsigned getNumOperands() const { return Operands.size(); }
4041
4042 /// \return the single \p OpIdx operand.
4043 Value *getSingleOperand(unsigned OpIdx) const {
4044 assert(OpIdx < Operands.size() && "Off bounds");
4045 assert(!Operands[OpIdx].empty() && "No operand available");
4046 return Operands[OpIdx][0];
4047 }
4048
4049 /// Some of the instructions in the list have alternate opcodes.
4050 bool isAltShuffle() const { return S.isAltShuffle(); }
4051
4052 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
4053 return S.getMatchingMainOpOrAltOp(I);
4054 }
4055
4056 /// Chooses the correct key for scheduling data. If \p Op has the same (or
4057 /// alternate) opcode as the main operation of this entry, the key is \p Op.
4058 /// Otherwise the key is the main operation.
4059 Value *isOneOf(Value *Op) const {
4060 auto *I = dyn_cast<Instruction>(Op);
4061 if (I && getMatchingMainOpOrAltOp(I))
4062 return Op;
4063 return S.getMainOp();
4064 }
4065
4066 void setOperations(const InstructionsState &S) {
4067 assert(S && "InstructionsState is invalid.");
4068 this->S = S;
4069 }
4070
4071 Instruction *getMainOp() const { return S.getMainOp(); }
4072
4073 Instruction *getAltOp() const { return S.getAltOp(); }
4074
4075 /// The main/alternate opcodes for the list of instructions.
4076 unsigned getOpcode() const { return S.getOpcode(); }
4077
4078 unsigned getAltOpcode() const { return S.getAltOpcode(); }
4079
4080 bool hasState() const { return S.valid(); }
4081
4082 /// Add \p V to the list of copyable elements.
4083 void addCopyableElement(Value *V) {
4084 assert(S.isCopyableElement(V) && "Not a copyable element.");
4085 CopyableElements.insert(V);
4086 }
4087
4088 /// Returns true if \p V is a copyable element.
4089 bool isCopyableElement(Value *V) const {
4090 return CopyableElements.contains(V);
4091 }
4092
4093 /// Returns true if any scalar in the list is a copyable element.
4094 bool hasCopyableElements() const { return !CopyableElements.empty(); }
4095
4096 /// Returns the state of the operations.
4097 const InstructionsState &getOperations() const { return S; }
4098
4099 /// When the reorder and reuse shuffle indices are empty it just returns the
4100 /// position of \p V within the vector of Scalars. Otherwise, \p V is remapped via those indices.
4101 unsigned findLaneForValue(Value *V) const {
4102 unsigned FoundLane = getVectorFactor();
4103 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
4104 std::advance(It, 1)) {
4105 if (*It != V)
4106 continue;
4107 FoundLane = std::distance(Scalars.begin(), It);
4108 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4109 if (!ReorderIndices.empty())
4110 FoundLane = ReorderIndices[FoundLane];
4111 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4112 if (ReuseShuffleIndices.empty())
4113 break;
4114 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
4115 RIt != ReuseShuffleIndices.end()) {
4116 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
4117 break;
4118 }
4119 }
4120 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
4121 return FoundLane;
4122 }
4123
4124 /// Build a shuffle mask for graph entry which represents a merge of main
4125 /// and alternate operations.
4126 void
4127 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
4128 SmallVectorImpl<int> &Mask,
4129 SmallVectorImpl<Value *> *OpScalars = nullptr,
4130 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
4131
4132 /// Return true if this is a non-power-of-2 node.
4133 bool isNonPowOf2Vec() const {
4134 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
4135 return IsNonPowerOf2;
4136 }
4137
4138 /// Return true if this node vectorizes a number of elements that neither
4139 /// fills whole vector registers nor is a power of 2.
4140 bool
4141 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
4142 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
4143 TTI, getValueType(Scalars.front()), Scalars.size());
4144 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
4145 "Reshuffling not supported with non-power-of-2 vectors yet.");
4146 return IsNonPowerOf2;
4147 }
4148
4149 Value *getOrdered(unsigned Idx) const {
4150 assert(isGather() && "Must be used only for buildvectors/gathers.");
4151 if (ReorderIndices.empty())
4152 return Scalars[Idx];
4153 SmallVector<int> Mask;
4154 inversePermutation(ReorderIndices, Mask);
4155 return Scalars[Mask[Idx]];
4156 }
4157
4158#ifndef NDEBUG
4159 /// Debug printer.
4160 LLVM_DUMP_METHOD void dump() const {
4161 dbgs() << Idx << ".\n";
4162 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
4163 dbgs() << "Operand " << OpI << ":\n";
4164 for (const Value *V : Operands[OpI])
4165 dbgs().indent(2) << *V << "\n";
4166 }
4167 dbgs() << "Scalars: \n";
4168 for (Value *V : Scalars)
4169 dbgs().indent(2) << *V << "\n";
4170 dbgs() << "State: ";
4171 if (S && hasCopyableElements())
4172 dbgs() << "[[Copyable]] ";
4173 switch (State) {
4174 case Vectorize:
4175 if (InterleaveFactor > 0) {
4176 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
4177 << "\n";
4178 } else {
4179 dbgs() << "Vectorize\n";
4180 }
4181 break;
4182 case ScatterVectorize:
4183 dbgs() << "ScatterVectorize\n";
4184 break;
4185 case StridedVectorize:
4186 dbgs() << "StridedVectorize\n";
4187 break;
4188 case CompressVectorize:
4189 dbgs() << "CompressVectorize\n";
4190 break;
4191 case NeedToGather:
4192 dbgs() << "NeedToGather\n";
4193 break;
4194 case CombinedVectorize:
4195 dbgs() << "CombinedVectorize\n";
4196 break;
4197 case SplitVectorize:
4198 dbgs() << "SplitVectorize\n";
4199 break;
4200 }
4201 if (S) {
4202 dbgs() << "MainOp: " << *S.getMainOp() << "\n";
4203 dbgs() << "AltOp: " << *S.getAltOp() << "\n";
4204 } else {
4205 dbgs() << "MainOp: NULL\n";
4206 dbgs() << "AltOp: NULL\n";
4207 }
4208 dbgs() << "VectorizedValue: ";
4209 if (VectorizedValue)
4210 dbgs() << *VectorizedValue << "\n";
4211 else
4212 dbgs() << "NULL\n";
4213 dbgs() << "ReuseShuffleIndices: ";
4214 if (ReuseShuffleIndices.empty())
4215 dbgs() << "Empty";
4216 else
4217 for (int ReuseIdx : ReuseShuffleIndices)
4218 dbgs() << ReuseIdx << ", ";
4219 dbgs() << "\n";
4220 dbgs() << "ReorderIndices: ";
4221 for (unsigned ReorderIdx : ReorderIndices)
4222 dbgs() << ReorderIdx << ", ";
4223 dbgs() << "\n";
4224 dbgs() << "UserTreeIndex: ";
4225 if (UserTreeIndex)
4226 dbgs() << UserTreeIndex;
4227 else
4228 dbgs() << "<invalid>";
4229 dbgs() << "\n";
4230 if (!CombinedEntriesWithIndices.empty()) {
4231 dbgs() << "Combined entries: ";
4232 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
4233 dbgs() << "Entry index " << P.first << " with offset " << P.second;
4234 });
4235 dbgs() << "\n";
4236 }
4237 }
4238#endif
4239 };
4240
4241#ifndef NDEBUG
4242 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
4243 InstructionCost VecCost, InstructionCost ScalarCost,
4244 StringRef Banner) const {
4245 dbgs() << "SLP: " << Banner << ":\n";
4246 E->dump();
4247 dbgs() << "SLP: Costs:\n";
4248 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
4249 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
4250 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
4251 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
4252 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
4253 }
4254#endif
4255
4256 /// Create a new gather TreeEntry
4257 TreeEntry *newGatherTreeEntry(ArrayRef<Value *> VL,
4258 const InstructionsState &S,
4259 const EdgeInfo &UserTreeIdx,
4260 ArrayRef<int> ReuseShuffleIndices = {}) {
4261 auto Invalid = ScheduleBundle::invalid();
4262 return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4263 }
4264
4265 /// Create a new VectorizableTree entry.
4266 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
4267 const InstructionsState &S,
4268 const EdgeInfo &UserTreeIdx,
4269 ArrayRef<int> ReuseShuffleIndices = {},
4270 ArrayRef<unsigned> ReorderIndices = {},
4271 unsigned InterleaveFactor = 0) {
4272 TreeEntry::EntryState EntryState =
4273 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4274 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4275 ReuseShuffleIndices, ReorderIndices);
4276 if (E && InterleaveFactor > 0)
4277 E->setInterleave(InterleaveFactor);
4278 return E;
4279 }
4280
4281 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
4282 TreeEntry::EntryState EntryState,
4283 ScheduleBundle &Bundle, const InstructionsState &S,
4284 const EdgeInfo &UserTreeIdx,
4285 ArrayRef<int> ReuseShuffleIndices = {},
4286 ArrayRef<unsigned> ReorderIndices = {}) {
4287 assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
4288 EntryState == TreeEntry::SplitVectorize)) ||
4289 (Bundle && EntryState != TreeEntry::NeedToGather &&
4290 EntryState != TreeEntry::SplitVectorize)) &&
4291 "Need to vectorize gather entry?");
4292 // Gathered loads still gathered? Do not create entry, use the original one.
4293 if (GatheredLoadsEntriesFirst.has_value() &&
4294 EntryState == TreeEntry::NeedToGather && S &&
4295 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
4296 !UserTreeIdx.UserTE)
4297 return nullptr;
4298 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
4299 TreeEntry *Last = VectorizableTree.back().get();
4300 Last->Idx = VectorizableTree.size() - 1;
4301 Last->State = EntryState;
4302 if (UserTreeIdx.UserTE)
4303 OperandsToTreeEntry.try_emplace(
4304 std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
4305 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
4306 // for non-power-of-two vectors.
4307 assert(
4308 (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
4309 ReuseShuffleIndices.empty()) &&
4310 "Reshuffling scalars not yet supported for nodes with padding");
4311 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
4312 ReuseShuffleIndices.end());
4313 if (ReorderIndices.empty()) {
4314 Last->Scalars.assign(VL.begin(), VL.end());
4315 if (S)
4316 Last->setOperations(S);
4317 } else {
4318 // Reorder scalars and build final mask.
4319 Last->Scalars.assign(VL.size(), nullptr);
4320 transform(ReorderIndices, Last->Scalars.begin(),
4321 [VL](unsigned Idx) -> Value * {
4322 if (Idx >= VL.size())
4323 return UndefValue::get(VL.front()->getType());
4324 return VL[Idx];
4325 });
4326 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
4327 if (S)
4328 Last->setOperations(S);
4329 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
4330 }
4331 if (EntryState == TreeEntry::SplitVectorize) {
4332 assert(S && "Split nodes must have operations.");
4333 Last->setOperations(S);
4334 SmallPtrSet<Value *, 4> Processed;
4335 for (Value *V : VL) {
4336 auto *I = dyn_cast<Instruction>(V);
4337 if (!I)
4338 continue;
4339 auto It = ScalarsInSplitNodes.find(V);
4340 if (It == ScalarsInSplitNodes.end()) {
4341 ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
4342 (void)Processed.insert(V);
4343 } else if (Processed.insert(V).second) {
4344 assert(!is_contained(It->getSecond(), Last) &&
4345 "Value already associated with the node.");
4346 It->getSecond().push_back(Last);
4347 }
4348 }
4349 } else if (!Last->isGather()) {
4350 if (isa<PHINode>(S.getMainOp()) ||
4351 isVectorLikeInstWithConstOps(S.getMainOp()) ||
4352 (!S.areInstructionsWithCopyableElements() &&
4353 doesNotNeedToSchedule(VL)) ||
4354 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
4355 Last->setDoesNotNeedToSchedule();
4356 SmallPtrSet<Value *, 4> Processed;
4357 for (Value *V : VL) {
4358 if (isa<PoisonValue>(V))
4359 continue;
4360 if (S.isCopyableElement(V)) {
4361 Last->addCopyableElement(V);
4362 continue;
4363 }
4364 auto It = ScalarToTreeEntries.find(V);
4365 if (It == ScalarToTreeEntries.end()) {
4366 ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
4367 (void)Processed.insert(V);
4368 } else if (Processed.insert(V).second) {
4369 assert(!is_contained(It->getSecond(), Last) &&
4370 "Value already associated with the node.");
4371 It->getSecond().push_back(Last);
4372 }
4373 }
4374 // Update the scheduler bundle to point to this TreeEntry.
4375 assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
4376 "Bundle and VL out of sync");
4377 if (!Bundle.getBundle().empty()) {
4378#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
4379 auto *BundleMember = Bundle.getBundle().begin();
4380 SmallPtrSet<Value *, 4> Processed;
4381 for (Value *V : VL) {
4382 if (S.isNonSchedulable(V) || !Processed.insert(V).second)
4383 continue;
4384 ++BundleMember;
4385 }
4386 assert(BundleMember == Bundle.getBundle().end() &&
4387 "Bundle and VL out of sync");
4388#endif
4389 Bundle.setTreeEntry(Last);
4390 }
4391 } else {
4392 // Build a map for gathered scalars to the nodes where they are used.
4393 bool AllConstsOrCasts = true;
4394 for (Value *V : VL) {
4395 if (S && S.areInstructionsWithCopyableElements() &&
4396 S.isCopyableElement(V))
4397 Last->addCopyableElement(V);
4398 if (!isConstant(V)) {
4399 auto *I = dyn_cast<CastInst>(V);
4400 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
4401 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
4402 !UserTreeIdx.UserTE->isGather())
4403 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
4404 }
4405 }
4406 if (AllConstsOrCasts)
4407 CastMaxMinBWSizes =
4408 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
4409 MustGather.insert_range(VL);
4410 }
4411
4412 if (UserTreeIdx.UserTE)
4413 Last->UserTreeIndex = UserTreeIdx;
4414 return Last;
4415 }
4416
4417 /// -- Vectorization State --
4418 /// Holds all of the tree entries.
4419 TreeEntry::VecTreeTy VectorizableTree;
4420
4421#ifndef NDEBUG
4422 /// Debug printer.
4423 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
4424 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4425 VectorizableTree[Id]->dump();
4426 dbgs() << "\n";
4427 }
4428 }
4429#endif
4430
4431 /// Get list of vector entries, associated with the value \p V.
4432 ArrayRef<TreeEntry *> getTreeEntries(Value *V) const {
4433 assert(V && "V cannot be nullptr.");
4434 auto It = ScalarToTreeEntries.find(V);
4435 if (It == ScalarToTreeEntries.end())
4436 return {};
4437 return It->getSecond();
4438 }
4439
4440 /// Get list of split vector entries, associated with the value \p V.
4441 ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
4442 assert(V && "V cannot be nullptr.");
4443 auto It = ScalarsInSplitNodes.find(V);
4444 if (It == ScalarsInSplitNodes.end())
4445 return {};
4446 return It->getSecond();
4447 }
4448
4449 /// Returns first vector node for value \p V, matching values \p VL.
4450 TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
4451 bool SameVF = false) const {
4452 assert(V && "V cannot be nullptr.");
4453 for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
4454 if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
4455 return TE;
4456 return nullptr;
4457 }
4458
4459 /// Check that the operand node of an alternate node does not generate a
4460 /// buildvector sequence. If it does, it is probably not worth building an
4461 /// alternate shuffle when the number of buildvector operands plus the
4462 /// alternate instruction exceeds the number of buildvector instructions.
4463 /// \param S the instructions state of the analyzed values.
4464 /// \param VL list of the instructions with alternate opcodes.
4465 bool areAltOperandsProfitable(const InstructionsState &S,
4466 ArrayRef<Value *> VL) const;
4467
4468 /// Contains all the outputs of legality analysis for a list of values to
4469 /// vectorize.
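 /// Example (an illustrative sketch; VL, Depth and UserTreeIdx stand for the
 /// usual buildTree state and are not defined here):
 /// \code
 ///   ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
 ///       VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
 ///   if (!Legality.isLegal() && Legality.trySplitVectorize()) {
 ///     // Fall back to splitting VL into independently vectorizable parts.
 ///   }
 /// \endcode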
4470 class ScalarsVectorizationLegality {
4471 InstructionsState S;
4472 bool IsLegal;
4473 bool TryToFindDuplicates;
4474 bool TrySplitVectorize;
4475
4476 public:
4477 ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
4478 bool TryToFindDuplicates = true,
4479 bool TrySplitVectorize = false)
4480 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4481 TrySplitVectorize(TrySplitVectorize) {
4482 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4483 "Inconsistent state");
4484 }
4485 const InstructionsState &getInstructionsState() const { return S; };
4486 bool isLegal() const { return IsLegal; }
4487 bool tryToFindDuplicates() const { return TryToFindDuplicates; }
4488 bool trySplitVectorize() const { return TrySplitVectorize; }
4489 };
4490
4491 /// Checks if the specified list of the instructions/values can be vectorized
4492 /// in general.
4493 ScalarsVectorizationLegality
4494 getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
4495 const EdgeInfo &UserTreeIdx,
4496 bool TryCopyableElementsVectorization) const;
4497
4498 /// Checks if the specified list of the instructions/values can be vectorized
4499 /// and fills required data before actual scheduling of the instructions.
4500 TreeEntry::EntryState getScalarsVectorizationState(
4501 const InstructionsState &S, ArrayRef<Value *> VL,
4502 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
4503 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
4504
4505 /// Maps a specific scalar to its tree entry(ies).
4506 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
4507
4508 /// Maps the operand index and entry to the corresponding tree entry.
4509 SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
4510 OperandsToTreeEntry;
4511
4512 /// Scalars, used in split vectorize nodes.
4513 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
4514
4515 /// Maps a value to the proposed vectorizable size.
4516 SmallDenseMap<Value *, unsigned> InstrElementSize;
4517
4518 /// A list of scalars that we found that we need to keep as scalars.
4519 ValueSet MustGather;
4520
4521 /// A set of first non-schedulable values.
4522 ValueSet NonScheduledFirst;
4523
 4524 /// A map between the vectorized entries and the last instructions in the
 4525 /// bundles. The bundles are built in use order, not in the def order of the
 4526 /// instructions, so we cannot rely directly on the last instruction in the
 4527 /// bundle being the last instruction in program order during the
 4528 /// vectorization process; since the basic blocks are affected, these
 4529 /// instructions need to be pre-gathered beforehand.
4530 SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
4531
 4532 /// List of gather nodes that depend on other gather/vector nodes and should
 4533 /// be emitted after the vector instruction emission process to correctly
 4534 /// handle the order of the vector instructions and shuffles.
4535 SetVector<const TreeEntry *> PostponedGathers;
4536
4537 using ValueToGatherNodesMap =
4538 DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
4539 ValueToGatherNodesMap ValueToGatherNodes;
4540
 4541 /// A list of the load entries (node indices) which could be vectorized using
 4542 /// a strided or masked-gather approach, but are first attempted to be
 4543 /// represented as contiguous loads.
4544 SetVector<unsigned> LoadEntriesToVectorize;
4545
4546 /// true if graph nodes transforming mode is on.
4547 bool IsGraphTransformMode = false;
4548
4549 /// The index of the first gathered load entry in the VectorizeTree.
4550 std::optional<unsigned> GatheredLoadsEntriesFirst;
4551
4552 /// Maps compress entries to their mask data for the final codegen.
4553 SmallDenseMap<const TreeEntry *,
4554 std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
4555 CompressEntryToData;
4556
4557 /// This POD struct describes one external user in the vectorized tree.
4558 struct ExternalUser {
4559 ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
4560 : Scalar(S), User(U), E(E), Lane(L) {}
4561
4562 /// Which scalar in our function.
4563 Value *Scalar = nullptr;
4564
4565 /// Which user that uses the scalar.
4566 llvm::User *User = nullptr;
4567
4568 /// Vector node, the value is part of.
4569 const TreeEntry &E;
4570
4571 /// Which lane does the scalar belong to.
4572 unsigned Lane;
4573 };
4574 using UserList = SmallVector<ExternalUser, 16>;
4575
4576 /// Checks if two instructions may access the same memory.
4577 ///
4578 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
4579 /// is invariant in the calling loop.
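 /// Example (an illustrative sketch, assuming \p Inst1 is a simple store):
 /// \code
 ///   MemoryLocation Loc = MemoryLocation::get(cast<StoreInst>(Inst1));
 ///   bool A1 = isAliased(Loc, Inst1, Inst2); // queries BatchAA, fills AliasCache
 ///   bool A2 = isAliased(Loc, Inst1, Inst2); // answered from AliasCache
 /// \endcode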
4580 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
4581 Instruction *Inst2) {
4582 assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction.");
4583 // First check if the result is already in the cache.
4584 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
4585 auto Res = AliasCache.try_emplace(Key);
4586 if (!Res.second)
4587 return Res.first->second;
4588 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
4589 // Store the result in the cache.
4590 Res.first->getSecond() = Aliased;
4591 return Aliased;
4592 }
4593
4594 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4595
4596 /// Cache for alias results.
4597 /// TODO: consider moving this to the AliasAnalysis itself.
4598 SmallDenseMap<AliasCacheKey, bool> AliasCache;
4599
4600 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
4601 // globally through SLP because we don't perform any action which
4602 // invalidates capture results.
4603 BatchAAResults BatchAA;
4604
4605 /// Temporary store for deleted instructions. Instructions will be deleted
4606 /// eventually when the BoUpSLP is destructed. The deferral is required to
4607 /// ensure that there are no incorrect collisions in the AliasCache, which
4608 /// can happen if a new instruction is allocated at the same address as a
4609 /// previously deleted instruction.
4610 DenseSet<Instruction *> DeletedInstructions;
4611
 4612 /// Set of the instructions already being analyzed for reductions.
4613 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4614
4615 /// Set of hashes for the list of reduction values already being analyzed.
4616 DenseSet<size_t> AnalyzedReductionVals;
4617
 4618 /// Values that have already been analyzed for minimal bitwidth and found to
 4619 /// be non-profitable.
4620 DenseSet<Value *> AnalyzedMinBWVals;
4621
 4622 /// A list of values that need to be extracted out of the tree.
4623 /// This list holds pairs of (Internal Scalar : External User). External User
4624 /// can be nullptr, it means that this Internal Scalar will be used later,
4625 /// after vectorization.
4626 UserList ExternalUses;
4627
 4628 /// A list of GEPs which can be replaced by scalar GEPs instead of
4629 /// extractelement instructions.
4630 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4631
 4632 /// A list of scalars to be extracted without a specific user because of too
 4633 /// many uses.
4634 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
4635
4636 /// Values used only by @llvm.assume calls.
4637 SmallPtrSet<const Value *, 32> EphValues;
4638
4639 /// Holds all of the instructions that we gathered, shuffle instructions and
4640 /// extractelements.
4641 SetVector<Instruction *> GatherShuffleExtractSeq;
4642
4643 /// A list of blocks that we are going to CSE.
4644 DenseSet<BasicBlock *> CSEBlocks;
4645
 4646 /// List of hashes of vectors of loads which are known to be non-vectorizable.
4647 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4648
 4649 /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData
 4650 /// or ScheduleBundle. ScheduleData is used to gather dependencies for a
 4651 /// single instruction, while ScheduleBundle represents a batch of
 4652 /// instructions that are going to be grouped together. ScheduleCopyableData
 4653 /// models an extra user for "copyable" instructions.
4654 class ScheduleEntity {
4655 friend class ScheduleBundle;
4656 friend class ScheduleData;
4657 friend class ScheduleCopyableData;
4658
4659 protected:
4660 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
4661 Kind getKind() const { return K; }
4662 ScheduleEntity(Kind K) : K(K) {}
4663
4664 private:
4665 /// Used for getting a "good" final ordering of instructions.
4666 int SchedulingPriority = 0;
4667 /// True if this instruction (or bundle) is scheduled (or considered as
4668 /// scheduled in the dry-run).
4669 bool IsScheduled = false;
4670 /// The kind of the ScheduleEntity.
4671 const Kind K = Kind::ScheduleData;
4672
4673 public:
4674 ScheduleEntity() = delete;
4675 /// Gets/sets the scheduling priority.
4676 void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
4677 int getSchedulingPriority() const { return SchedulingPriority; }
4678 bool isReady() const {
4679 if (const auto *SD = dyn_cast<ScheduleData>(this))
4680 return SD->isReady();
4681 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4682 return CD->isReady();
4683 return cast<ScheduleBundle>(this)->isReady();
4684 }
4685 /// Returns true if the dependency information has been calculated.
 4686 /// Note that dependency validity can vary between instructions within
4687 /// a single bundle.
4688 bool hasValidDependencies() const {
4689 if (const auto *SD = dyn_cast<ScheduleData>(this))
4690 return SD->hasValidDependencies();
4691 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4692 return CD->hasValidDependencies();
4693 return cast<ScheduleBundle>(this)->hasValidDependencies();
4694 }
4695 /// Gets the number of unscheduled dependencies.
4696 int getUnscheduledDeps() const {
4697 if (const auto *SD = dyn_cast<ScheduleData>(this))
4698 return SD->getUnscheduledDeps();
4699 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4700 return CD->getUnscheduledDeps();
4701 return cast<ScheduleBundle>(this)->unscheduledDepsInBundle();
4702 }
4703 /// Increments the number of unscheduled dependencies.
4704 int incrementUnscheduledDeps(int Incr) {
4705 if (auto *SD = dyn_cast<ScheduleData>(this))
4706 return SD->incrementUnscheduledDeps(Incr);
4707 return cast<ScheduleCopyableData>(this)->incrementUnscheduledDeps(Incr);
4708 }
4709 /// Gets the number of dependencies.
4710 int getDependencies() const {
4711 if (const auto *SD = dyn_cast<ScheduleData>(this))
4712 return SD->getDependencies();
4713 return cast<ScheduleCopyableData>(this)->getDependencies();
4714 }
4715 /// Gets the instruction.
4716 Instruction *getInst() const {
4717 if (const auto *SD = dyn_cast<ScheduleData>(this))
4718 return SD->getInst();
4719 return cast<ScheduleCopyableData>(this)->getInst();
4720 }
4721
4722 /// Gets/sets if the bundle is scheduled.
4723 bool isScheduled() const { return IsScheduled; }
4724 void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
4725
4726 static bool classof(const ScheduleEntity *) { return true; }
4727
4728#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4729 void dump(raw_ostream &OS) const {
4730 if (const auto *SD = dyn_cast<ScheduleData>(this))
4731 return SD->dump(OS);
4732 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4733 return CD->dump(OS);
4734 return cast<ScheduleBundle>(this)->dump(OS);
4735 }
4736
4737 LLVM_DUMP_METHOD void dump() const {
4738 dump(dbgs());
4739 dbgs() << '\n';
4740 }
4741#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4742 };
4743
4744#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 4745 friend inline raw_ostream &operator<<(raw_ostream &OS,
 4746 const BoUpSLP::ScheduleEntity &SE) {
4747 SE.dump(OS);
4748 return OS;
4749 }
4750#endif
4751
4752 /// Contains all scheduling relevant data for an instruction.
4753 /// A ScheduleData either represents a single instruction or a member of an
4754 /// instruction bundle (= a group of instructions which is combined into a
4755 /// vector instruction).
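 /// Example (an illustrative sketch) of the dependency-counter lifecycle:
 /// \code
 ///   ScheduleData SD;
 ///   // Freshly constructed: both counters are InvalidDeps.
 ///   SD.initDependencies();           // Dependencies = 0
 ///   SD.incDependencies();            // one dependency discovered
 ///   SD.resetUnscheduledDeps();       // UnscheduledDeps = Dependencies
 ///   SD.incrementUnscheduledDeps(-1); // that dependency got scheduled
 ///   bool Ready = SD.isReady();       // true: no unscheduled deps left
 /// \endcode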
4756 class ScheduleData final : public ScheduleEntity {
4757 public:
4758 // The initial value for the dependency counters. It means that the
4759 // dependencies are not calculated yet.
4760 enum { InvalidDeps = -1 };
4761
4762 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4763 static bool classof(const ScheduleEntity *Entity) {
4764 return Entity->getKind() == Kind::ScheduleData;
4765 }
4766
4767 void init(int BlockSchedulingRegionID, Instruction *I) {
4768 NextLoadStore = nullptr;
4769 IsScheduled = false;
4770 SchedulingRegionID = BlockSchedulingRegionID;
4771 clearDependencies();
4772 Inst = I;
4773 }
4774
4775 /// Verify basic self consistency properties
4776 void verify() {
4777 if (hasValidDependencies()) {
4778 assert(UnscheduledDeps <= Dependencies && "invariant");
4779 } else {
4780 assert(UnscheduledDeps == Dependencies && "invariant");
4781 }
4782
4783 if (IsScheduled) {
4784 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4785 "unexpected scheduled state");
4786 }
4787 }
4788
4789 /// Returns true if the dependency information has been calculated.
 4790 /// Note that dependency validity can vary between instructions within
4791 /// a single bundle.
4792 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
4793
4794 /// Returns true if it is ready for scheduling, i.e. it has no more
4795 /// unscheduled depending instructions/bundles.
4796 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
4797
4798 /// Modifies the number of unscheduled dependencies for this instruction,
4799 /// and returns the number of remaining dependencies for the containing
4800 /// bundle.
4801 int incrementUnscheduledDeps(int Incr) {
4802 assert(hasValidDependencies() &&
4803 "increment of unscheduled deps would be meaningless");
4804 UnscheduledDeps += Incr;
4805 return UnscheduledDeps;
4806 }
4807
4808 /// Sets the number of unscheduled dependencies to the number of
4809 /// dependencies.
4810 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4811
4812 /// Clears all dependency information.
4813 void clearDependencies() {
4814 clearDirectDependencies();
4815 MemoryDependencies.clear();
4816 ControlDependencies.clear();
4817 }
4818
4819 /// Clears all direct dependencies only, except for control and memory
4820 /// dependencies.
4821 /// Required for copyable elements to correctly handle control/memory deps
 4822 /// and avoid extra recalculation of such deps.
4823 void clearDirectDependencies() {
4824 Dependencies = InvalidDeps;
4825 resetUnscheduledDeps();
4826 IsScheduled = false;
4827 }
4828
4829 /// Gets the number of unscheduled dependencies.
4830 int getUnscheduledDeps() const { return UnscheduledDeps; }
4831 /// Gets the number of dependencies.
4832 int getDependencies() const { return Dependencies; }
4833 /// Initializes the number of dependencies.
4834 void initDependencies() { Dependencies = 0; }
4835 /// Increments the number of dependencies.
4836 void incDependencies() { Dependencies++; }
4837
4838 /// Gets scheduling region ID.
4839 int getSchedulingRegionID() const { return SchedulingRegionID; }
4840
4841 /// Gets the instruction.
4842 Instruction *getInst() const { return Inst; }
4843
4844 /// Gets the list of memory dependencies.
4845 ArrayRef<ScheduleData *> getMemoryDependencies() const {
4846 return MemoryDependencies;
4847 }
4848 /// Adds a memory dependency.
4849 void addMemoryDependency(ScheduleData *Dep) {
4850 MemoryDependencies.push_back(Dep);
4851 }
4852 /// Gets the list of control dependencies.
4853 ArrayRef<ScheduleData *> getControlDependencies() const {
4854 return ControlDependencies;
4855 }
4856 /// Adds a control dependency.
4857 void addControlDependency(ScheduleData *Dep) {
4858 ControlDependencies.push_back(Dep);
4859 }
4860 /// Gets/sets the next load/store instruction in the block.
4861 ScheduleData *getNextLoadStore() const { return NextLoadStore; }
4862 void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }
4863
4864 void dump(raw_ostream &OS) const { OS << *Inst; }
4865
4866 LLVM_DUMP_METHOD void dump() const {
4867 dump(dbgs());
4868 dbgs() << '\n';
4869 }
4870
4871 private:
4872 Instruction *Inst = nullptr;
4873
4874 /// Single linked list of all memory instructions (e.g. load, store, call)
4875 /// in the block - until the end of the scheduling region.
4876 ScheduleData *NextLoadStore = nullptr;
4877
4878 /// The dependent memory instructions.
4879 /// This list is derived on demand in calculateDependencies().
4880 SmallVector<ScheduleData *> MemoryDependencies;
4881
4882 /// List of instructions which this instruction could be control dependent
4883 /// on. Allowing such nodes to be scheduled below this one could introduce
4884 /// a runtime fault which didn't exist in the original program.
4885 /// ex: this is a load or udiv following a readonly call which inf loops
4886 SmallVector<ScheduleData *> ControlDependencies;
4887
4888 /// This ScheduleData is in the current scheduling region if this matches
4889 /// the current SchedulingRegionID of BlockScheduling.
4890 int SchedulingRegionID = 0;
4891
 4892 /// The number of dependencies. Consists of the number of users of the
4893 /// instruction plus the number of dependent memory instructions (if any).
4894 /// This value is calculated on demand.
4895 /// If InvalidDeps, the number of dependencies is not calculated yet.
4896 int Dependencies = InvalidDeps;
4897
4898 /// The number of dependencies minus the number of dependencies of scheduled
4899 /// instructions. As soon as this is zero, the instruction/bundle gets ready
4900 /// for scheduling.
4901 /// Note that this is negative as long as Dependencies is not calculated.
4902 int UnscheduledDeps = InvalidDeps;
4903 };
4904
4905#ifndef NDEBUG
 4906 friend inline raw_ostream &operator<<(raw_ostream &OS,
 4907 const BoUpSLP::ScheduleData &SD) {
4908 SD.dump(OS);
4909 return OS;
4910 }
4911#endif
4912
4913 class ScheduleBundle final : public ScheduleEntity {
4914 /// The schedule data for the instructions in the bundle.
 4915 SmallVector<ScheduleEntity *> Bundle;
 4916 /// True if this bundle is valid.
4917 bool IsValid = true;
4918 /// The TreeEntry that this instruction corresponds to.
4919 TreeEntry *TE = nullptr;
4920 ScheduleBundle(bool IsValid)
4921 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
4922
4923 public:
4924 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
4925 static bool classof(const ScheduleEntity *Entity) {
4926 return Entity->getKind() == Kind::ScheduleBundle;
4927 }
4928
4929 /// Verify basic self consistency properties
4930 void verify() const {
4931 for (const ScheduleEntity *SD : Bundle) {
4932 if (SD->hasValidDependencies()) {
4933 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
4934 "invariant");
4935 } else {
4936 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
4937 "invariant");
4938 }
4939
4940 if (isScheduled()) {
4941 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
4942 "unexpected scheduled state");
4943 }
4944 }
4945 }
4946
4947 /// Returns the number of unscheduled dependencies in the bundle.
4948 int unscheduledDepsInBundle() const {
4949 assert(*this && "bundle must not be empty");
4950 int Sum = 0;
4951 for (const ScheduleEntity *BundleMember : Bundle) {
4952 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
4953 return ScheduleData::InvalidDeps;
4954 Sum += BundleMember->getUnscheduledDeps();
4955 }
4956 return Sum;
4957 }
4958
4959 /// Returns true if the dependency information has been calculated.
 4960 /// Note that dependency validity can vary between instructions within
4961 /// a single bundle.
4962 bool hasValidDependencies() const {
4963 return all_of(Bundle, [](const ScheduleEntity *SD) {
4964 return SD->hasValidDependencies();
4965 });
4966 }
4967
4968 /// Returns true if it is ready for scheduling, i.e. it has no more
4969 /// unscheduled depending instructions/bundles.
4970 bool isReady() const {
4971 assert(*this && "bundle must not be empty");
4972 return unscheduledDepsInBundle() == 0 && !isScheduled();
4973 }
4974
4975 /// Returns the bundle of scheduling data, associated with the current
4976 /// instruction.
4977 ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
4978 ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }
4979 /// Adds an instruction to the bundle.
4980 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
4981
4982 /// Gets/sets the associated tree entry.
4983 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
4984 TreeEntry *getTreeEntry() const { return TE; }
4985
4986 static ScheduleBundle invalid() { return {false}; }
4987
4988 operator bool() const { return IsValid; }
4989
4990#ifndef NDEBUG
4991 void dump(raw_ostream &OS) const {
4992 if (!*this) {
4993 OS << "[]";
4994 return;
4995 }
4996 OS << '[';
4997 interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) {
4999 OS << "<Copyable>";
5000 OS << *SD->getInst();
5001 });
5002 OS << ']';
5003 }
5004
5005 LLVM_DUMP_METHOD void dump() const {
5006 dump(dbgs());
5007 dbgs() << '\n';
5008 }
5009#endif // NDEBUG
5010 };
5011
5012#ifndef NDEBUG
 5013 friend inline raw_ostream &operator<<(raw_ostream &OS,
 5014 const BoUpSLP::ScheduleBundle &Bundle) {
5015 Bundle.dump(OS);
5016 return OS;
5017 }
5018#endif
5019
5020 /// Contains all scheduling relevant data for the copyable instruction.
5021 /// It models the virtual instructions, supposed to replace the original
5022 /// instructions. E.g., if instruction %0 = load is a part of the bundle [%0,
5023 /// %1], where %1 = add, then the ScheduleCopyableData models virtual
5024 /// instruction %virt = add %0, 0.
5025 class ScheduleCopyableData final : public ScheduleEntity {
5026 /// The source schedule data for the instruction.
5027 Instruction *Inst = nullptr;
5028 /// The edge information for the instruction.
5029 const EdgeInfo EI;
5030 /// This ScheduleData is in the current scheduling region if this matches
5031 /// the current SchedulingRegionID of BlockScheduling.
5032 int SchedulingRegionID = 0;
5033 /// Bundle, this data is part of.
5034 ScheduleBundle &Bundle;
5035
5036 public:
5037 ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
5038 const EdgeInfo &EI, ScheduleBundle &Bundle)
5039 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
5040 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5041 static bool classof(const ScheduleEntity *Entity) {
5042 return Entity->getKind() == Kind::ScheduleCopyableData;
5043 }
5044
5045 /// Verify basic self consistency properties
5046 void verify() {
5047 if (hasValidDependencies()) {
5048 assert(UnscheduledDeps <= Dependencies && "invariant");
5049 } else {
5050 assert(UnscheduledDeps == Dependencies && "invariant");
5051 }
5052
5053 if (IsScheduled) {
5054 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5055 "unexpected scheduled state");
5056 }
5057 }
5058
5059 /// Returns true if the dependency information has been calculated.
 5060 /// Note that dependency validity can vary between instructions within
5061 /// a single bundle.
5062 bool hasValidDependencies() const {
5063 return Dependencies != ScheduleData::InvalidDeps;
5064 }
5065
5066 /// Returns true if it is ready for scheduling, i.e. it has no more
5067 /// unscheduled depending instructions/bundles.
5068 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
5069
5070 /// Modifies the number of unscheduled dependencies for this instruction,
5071 /// and returns the number of remaining dependencies for the containing
5072 /// bundle.
5073 int incrementUnscheduledDeps(int Incr) {
5074 assert(hasValidDependencies() &&
5075 "increment of unscheduled deps would be meaningless");
5076 UnscheduledDeps += Incr;
5077 assert(UnscheduledDeps >= 0 && "invariant");
5078 return UnscheduledDeps;
5079 }
5080
5081 /// Sets the number of unscheduled dependencies to the number of
5082 /// dependencies.
5083 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5084
5085 /// Gets the number of unscheduled dependencies.
5086 int getUnscheduledDeps() const { return UnscheduledDeps; }
5087 /// Gets the number of dependencies.
5088 int getDependencies() const { return Dependencies; }
5089 /// Initializes the number of dependencies.
5090 void initDependencies() { Dependencies = 0; }
5091 /// Increments the number of dependencies.
5092 void incDependencies() { Dependencies++; }
5093
5094 /// Gets scheduling region ID.
5095 int getSchedulingRegionID() const { return SchedulingRegionID; }
5096
5097 /// Gets the instruction.
5098 Instruction *getInst() const { return Inst; }
5099
5100 /// Clears all dependency information.
5101 void clearDependencies() {
5102 Dependencies = ScheduleData::InvalidDeps;
5103 UnscheduledDeps = ScheduleData::InvalidDeps;
5104 IsScheduled = false;
5105 }
5106
5107 /// Gets the edge information.
5108 const EdgeInfo &getEdgeInfo() const { return EI; }
5109
5110 /// Gets the bundle.
5111 ScheduleBundle &getBundle() { return Bundle; }
5112 const ScheduleBundle &getBundle() const { return Bundle; }
5113
5114#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5115 void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }
5116
5117 LLVM_DUMP_METHOD void dump() const {
5118 dump(dbgs());
5119 dbgs() << '\n';
5120 }
5121#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5122
5123 private:
 5124 /// The number of dependencies. These nodes always have only a single
 5125 /// dependency.
5126 int Dependencies = ScheduleData::InvalidDeps;
5127
5128 /// The number of dependencies minus the number of dependencies of scheduled
5129 /// instructions. As soon as this is zero, the instruction/bundle gets ready
5130 /// for scheduling.
5131 /// Note that this is negative as long as Dependencies is not calculated.
5132 int UnscheduledDeps = ScheduleData::InvalidDeps;
5133 };
5134
5135#ifndef NDEBUG
5136 friend inline raw_ostream &
5137 operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) {
5138 SD.dump(OS);
5139 return OS;
5140 }
5141#endif
5142
5143 friend struct GraphTraits<BoUpSLP *>;
5144 friend struct DOTGraphTraits<BoUpSLP *>;
5145
5146 /// Contains all scheduling data for a basic block.
 5147 /// It does not schedule instructions which are not memory read/write
 5148 /// instructions and whose operands are either constants, or arguments, or
 5149 /// phis, or instructions from other blocks, or whose users are phis or from
 5150 /// other blocks. The resulting vector instructions can be placed at the
 5151 /// beginning of the basic block without scheduling (if the operands do not
 5152 /// need to be scheduled) or at the end of the block (if the users are outside
 5153 /// of the block). This allows saving some compile time and memory used by the
 5154 /// compiler.
 5155 /// ScheduleData is assigned to each instruction between the boundaries of
 5156 /// the tree entry, even to those which are not part of the graph. It is
 5157 /// required to correctly follow the dependencies between the instructions and
 5158 /// to schedule them correctly. ScheduleData is not allocated for
 5159 /// instructions which do not require scheduling, like phis, nodes with only
 5160 /// extractelements/insertelements, or nodes whose instructions have
 5161 /// uses/operands outside of the block.
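 /// Example (an illustrative sketch of the typical dry-run flow while building
 /// the graph; BB, VL, S and UserTreeIdx stand for the usual buildTree state):
 /// \code
 ///   BlockScheduling &BS = *BlocksSchedules[BB];
 ///   std::optional<ScheduleBundle *> Bundle =
 ///       BS.tryScheduleBundle(VL, this, S, UserTreeIdx);
 ///   if (!Bundle) {
 ///     // Cyclic dependency or scheduling-region limit hit: gather instead.
 ///   }
 /// \endcode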
5162 struct BlockScheduling {
5163 BlockScheduling(BasicBlock *BB)
5164 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
5165
5166 void clear() {
5167 ScheduledBundles.clear();
5168 ScheduledBundlesList.clear();
5169 ScheduleCopyableDataMap.clear();
5170 ScheduleCopyableDataMapByInst.clear();
5171 ScheduleCopyableDataMapByInstUser.clear();
5172 ScheduleCopyableDataMapByUsers.clear();
5173 ReadyInsts.clear();
5174 ScheduleStart = nullptr;
5175 ScheduleEnd = nullptr;
5176 FirstLoadStoreInRegion = nullptr;
5177 LastLoadStoreInRegion = nullptr;
5178 RegionHasStackSave = false;
5179
5180 // Reduce the maximum schedule region size by the size of the
5181 // previous scheduling run.
5182 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5183 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
5184 ScheduleRegionSizeLimit = MinScheduleRegionSize;
5185 ScheduleRegionSize = 0;
5186
5187 // Make a new scheduling region, i.e. all existing ScheduleData is not
5188 // in the new region yet.
5189 ++SchedulingRegionID;
5190 }
5191
5192 ScheduleData *getScheduleData(Instruction *I) {
5193 if (!I)
5194 return nullptr;
5195 if (BB != I->getParent())
5196 // Avoid lookup if can't possibly be in map.
5197 return nullptr;
5198 ScheduleData *SD = ScheduleDataMap.lookup(I);
5199 if (SD && isInSchedulingRegion(*SD))
5200 return SD;
5201 return nullptr;
5202 }
5203
5204 ScheduleData *getScheduleData(Value *V) {
5205 return getScheduleData(dyn_cast<Instruction>(V));
5206 }
5207
5208 /// Returns the ScheduleCopyableData for the given edge (user tree entry and
5209 /// operand number) and value.
5210 ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
5211 const Value *V) const {
5212 if (ScheduleCopyableDataMap.empty())
5213 return nullptr;
5214 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5215 if (It == ScheduleCopyableDataMap.end())
5216 return nullptr;
5217 ScheduleCopyableData *SD = It->getSecond().get();
5218 if (!isInSchedulingRegion(*SD))
5219 return nullptr;
5220 return SD;
5221 }
5222
5223 /// Returns the ScheduleCopyableData for the given user \p User, operand
5224 /// number and operand \p V.
 5225 SmallVector<ScheduleCopyableData *>
 5226 getScheduleCopyableData(const Value *User, unsigned OperandIdx,
5227 const Value *V) {
5228 if (ScheduleCopyableDataMapByInstUser.empty())
5229 return {};
5230 const auto It = ScheduleCopyableDataMapByInstUser.find(
5231 std::make_pair(std::make_pair(User, OperandIdx), V));
5232 if (It == ScheduleCopyableDataMapByInstUser.end())
5233 return {};
 5234 SmallVector<ScheduleCopyableData *> Res;
 5235 for (ScheduleCopyableData *SD : It->getSecond()) {
5236 if (isInSchedulingRegion(*SD))
5237 Res.push_back(SD);
5238 }
5239 return Res;
5240 }
5241
5242 /// Returns true if all operands of the given instruction \p User are
5243 /// replaced by copyable data.
5244 /// \param User The user instruction.
5245 /// \param Op The operand, which might be replaced by the copyable data.
5246 /// \param SLP The SLP tree.
5247 /// \param NumOps The number of operands used. If the instruction uses the
5248 /// same operand several times, check for the first use, then the second,
5249 /// etc.
5250 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5251 Instruction *Op, BoUpSLP &SLP,
5252 unsigned NumOps) const {
5253 assert(NumOps > 0 && "No operands");
5254 if (ScheduleCopyableDataMap.empty())
5255 return false;
5256 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5257 SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
5258 for (const Use &U : User->operands()) {
5259 if (U.get() != Op)
5260 continue;
5261 ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User);
5262 if (Entries.empty())
5263 return false;
5264 // Check all tree entries, if they have operands replaced by copyable
5265 // data.
5266 for (TreeEntry *TE : Entries) {
5267 // Check if the user is commutative.
 5268 // The commutatives are handled later, as their operands can be
5269 // reordered.
5270 // Same applies even for non-commutative cmps, because we can invert
5271 // their predicate potentially and, thus, reorder the operands.
5272 bool IsCommutativeUser =
5273 ::isCommutative(User) ||
5274 ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User);
5275 EdgeInfo EI(TE, U.getOperandNo());
5276 if (!IsCommutativeUser && !isa<CmpInst>(User)) {
5277 unsigned &OpCnt =
5278 OrderedEntriesCount.try_emplace(TE, 0).first->getSecond();
5279 if (!getScheduleCopyableData(EI, Op) && OpCnt < NumOps)
5280 return false;
5281 // Found copyable operand - continue.
5282 ++OpCnt;
5283 continue;
5284 }
5285 ++PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
5286 .first->getSecond();
5287 }
5288 }
5289 // Check the commutative/cmp entries.
5290 if (!PotentiallyReorderedEntriesCount.empty()) {
5291 for (auto &P : PotentiallyReorderedEntriesCount) {
5292 auto *It = find(P.first->Scalars, User);
5293 assert(It != P.first->Scalars.end() &&
5294 "User is not in the tree entry");
5295 int Lane = std::distance(P.first->Scalars.begin(), It);
5296 assert(Lane >= 0 && "Lane is not found");
5297 if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty())
5298 Lane = P.first->ReorderIndices[Lane];
5299 assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
5300 "Couldn't find extract lane");
5301 SmallVector<unsigned> OpIndices;
5302 for (unsigned OpIdx :
5304 P.first->getMainOp()))) {
5305 if (P.first->getOperand(OpIdx)[Lane] == Op &&
5306 getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
5307 --P.getSecond();
5308 }
5309 }
5310 return all_of(PotentiallyReorderedEntriesCount,
5311 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5312 return P.second == NumOps - 1;
5313 });
5314 }
5315 return true;
5316 }
5317
 5318 SmallVector<ScheduleCopyableData *>
 5319 getScheduleCopyableData(const Instruction *I) const {
5320 if (ScheduleCopyableDataMapByInst.empty())
5321 return {};
5322 const auto It = ScheduleCopyableDataMapByInst.find(I);
5323 if (It == ScheduleCopyableDataMapByInst.end())
5324 return {};
 5325 SmallVector<ScheduleCopyableData *> Res;
 5326 for (ScheduleCopyableData *SD : It->getSecond()) {
5327 if (isInSchedulingRegion(*SD))
5328 Res.push_back(SD);
5329 }
5330 return Res;
5331 }
5332
 5333 SmallVector<ScheduleCopyableData *>
 5334 getScheduleCopyableDataUsers(const Instruction *User) const {
5335 if (ScheduleCopyableDataMapByUsers.empty())
5336 return {};
5337 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5338 if (It == ScheduleCopyableDataMapByUsers.end())
5339 return {};
 5340 SmallVector<ScheduleCopyableData *> Res;
 5341 for (ScheduleCopyableData *SD : It->getSecond()) {
5342 if (isInSchedulingRegion(*SD))
5343 Res.push_back(SD);
5344 }
5345 return Res;
5346 }
5347
5348 ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
5349 Instruction *I,
5350 int SchedulingRegionID,
5351 ScheduleBundle &Bundle) {
5352 assert(!getScheduleCopyableData(EI, I) && "already in the map");
5353 ScheduleCopyableData *CD =
5354 ScheduleCopyableDataMap
5355 .try_emplace(std::make_pair(EI, I),
5356 std::make_unique<ScheduleCopyableData>(
5357 SchedulingRegionID, I, EI, Bundle))
5358 .first->getSecond()
5359 .get();
5360 ScheduleCopyableDataMapByInst[I].push_back(CD);
5361 if (EI.UserTE) {
5362 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
5363 const auto *It = find(Op, I);
5364 assert(It != Op.end() && "Lane not set");
5365 SmallPtrSet<Instruction *, 4> Visited;
5366 do {
5367 int Lane = std::distance(Op.begin(), It);
5368 assert(Lane >= 0 && "Lane not set");
5369 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
5370 !EI.UserTE->ReorderIndices.empty())
5371 Lane = EI.UserTE->ReorderIndices[Lane];
5372 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
5373 "Couldn't find extract lane");
5374 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
5375 if (!Visited.insert(In).second) {
5376 It = find(make_range(std::next(It), Op.end()), I);
5377 continue;
5378 }
5379 ScheduleCopyableDataMapByInstUser
5380 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I))
5381 .first->getSecond()
5382 .push_back(CD);
5383 ScheduleCopyableDataMapByUsers.try_emplace(I)
5384 .first->getSecond()
5385 .insert(CD);
 5386 // Remove extra deps for users that become non-immediate users of the
 5387 // instruction. This may happen if a chain of the same copyable elements
 5388 // appears in the tree.
5389 if (In == I) {
5390 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5391 if (ScheduleCopyableData *UserCD =
5392 getScheduleCopyableData(UserEI, In))
5393 ScheduleCopyableDataMapByUsers[I].remove(UserCD);
5394 }
5395 It = find(make_range(std::next(It), Op.end()), I);
5396 } while (It != Op.end());
5397 } else {
5398 ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert(
5399 CD);
5400 }
5401 return *CD;
5402 }
5403
5404 ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
5405 auto *I = dyn_cast<Instruction>(V);
5406 if (!I)
5407 return {};
5408 auto It = ScheduledBundles.find(I);
5409 if (It == ScheduledBundles.end())
5410 return {};
5411 return It->getSecond();
5412 }
5413
5414 /// Returns true if the entity is in the scheduling region.
5415 bool isInSchedulingRegion(const ScheduleEntity &SD) const {
5416 if (const auto *Data = dyn_cast<ScheduleData>(&SD))
5417 return Data->getSchedulingRegionID() == SchedulingRegionID;
5418 if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SD))
5419 return CD->getSchedulingRegionID() == SchedulingRegionID;
5420 return all_of(cast<ScheduleBundle>(SD).getBundle(),
5421 [&](const ScheduleEntity *BundleMember) {
5422 return isInSchedulingRegion(*BundleMember);
5423 });
5424 }
5425
5426 /// Marks an instruction as scheduled and puts all dependent ready
5427 /// instructions into the ready-list.
5428 template <typename ReadyListType>
5429 void schedule(const BoUpSLP &R, const InstructionsState &S,
5430 const EdgeInfo &EI, ScheduleEntity *Data,
5431 ReadyListType &ReadyList) {
5432 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
 5433 ArrayRef<ScheduleBundle *> Bundles) {
 5434 // Handle the def-use chain dependencies.
5435
5436 // Decrement the unscheduled counter and insert to ready list if ready.
5437 auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
5438 if ((IsControl || Data->hasValidDependencies()) &&
5439 Data->incrementUnscheduledDeps(-1) == 0) {
5440 // There are no more unscheduled dependencies after
5441 // decrementing, so we can put the dependent instruction
5442 // into the ready list.
5443 SmallVector<ScheduleBundle *, 1> CopyableBundle;
 5444 ArrayRef<ScheduleBundle *> Bundles;
 5445 if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
5446 CopyableBundle.push_back(&CD->getBundle());
5447 Bundles = CopyableBundle;
5448 } else {
5449 Bundles = getScheduleBundles(Data->getInst());
5450 }
5451 if (!Bundles.empty()) {
5452 for (ScheduleBundle *Bundle : Bundles) {
5453 if (Bundle->unscheduledDepsInBundle() == 0) {
5454 assert(!Bundle->isScheduled() &&
5455 "already scheduled bundle gets ready");
5456 ReadyList.insert(Bundle);
5458 << "SLP: gets ready: " << *Bundle << "\n");
5459 }
5460 }
5461 return;
5462 }
5463 assert(!Data->isScheduled() &&
5464 "already scheduled bundle gets ready");
5466 "Expected non-copyable data");
5467 ReadyList.insert(Data);
5468 LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
5469 }
5470 };
5471
5472 auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx,
5473 Instruction *I) {
5474 if (!ScheduleCopyableDataMap.empty()) {
 5475 SmallVector<ScheduleCopyableData *> CopyableData =
 5476 getScheduleCopyableData(User, OpIdx, I);
5477 for (ScheduleCopyableData *CD : CopyableData)
5478 DecrUnsched(CD, /*IsControl=*/false);
5479 if (!CopyableData.empty())
5480 return;
5481 }
5482 if (ScheduleData *OpSD = getScheduleData(I))
5483 DecrUnsched(OpSD, /*IsControl=*/false);
5484 };
5485
5486 // If BundleMember is a vector bundle, its operands may have been
5487 // reordered during buildTree(). We therefore need to get its operands
5488 // through the TreeEntry.
5489 if (!Bundles.empty()) {
5490 auto *In = BundleMember->getInst();
5491 // Count uses of each instruction operand.
5492 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5493 unsigned TotalOpCount = 0;
5494 if (isa<ScheduleCopyableData>(BundleMember)) {
5495 // Copyable data is used only once (uses itself).
5496 TotalOpCount = OperandsUses[In] = 1;
5497 } else {
5498 for (const Use &U : In->operands()) {
5499 if (auto *I = dyn_cast<Instruction>(U.get())) {
5500 auto Res = OperandsUses.try_emplace(I, 0);
5501 ++Res.first->getSecond();
5502 ++TotalOpCount;
5503 }
5504 }
5505 }
5506 // Decrement the unscheduled counter and insert to ready list if
5507 // ready.
5508 auto DecrUnschedForInst = [&](Instruction *I, TreeEntry *UserTE,
5509 unsigned OpIdx) {
5510 if (!ScheduleCopyableDataMap.empty()) {
5511 const EdgeInfo EI = {UserTE, OpIdx};
5512 if (ScheduleCopyableData *CD = getScheduleCopyableData(EI, I)) {
5513 DecrUnsched(CD, /*IsControl=*/false);
5514 return;
5515 }
5516 }
5517 auto It = OperandsUses.find(I);
5518 assert(It != OperandsUses.end() && "Operand not found");
5519 if (It->second > 0) {
5520 --It->getSecond();
5521 assert(TotalOpCount > 0 && "No more operands to decrement");
5522 --TotalOpCount;
5523 if (ScheduleData *OpSD = getScheduleData(I))
5524 DecrUnsched(OpSD, /*IsControl=*/false);
5525 }
5526 };
5527
5528 for (ScheduleBundle *Bundle : Bundles) {
5529 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5530 break;
5531 // Need to search for the lane since the tree entry can be
5532 // reordered.
5533 int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(),
5534 find(Bundle->getTreeEntry()->Scalars, In));
5535 assert(Lane >= 0 && "Lane not set");
5536 if (isa<StoreInst>(In) &&
5537 !Bundle->getTreeEntry()->ReorderIndices.empty())
5538 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5539 assert(Lane < static_cast<int>(
5540 Bundle->getTreeEntry()->Scalars.size()) &&
5541 "Couldn't find extract lane");
5542
 5543 // Since the vectorization tree is built recursively, this
 5544 // assertion ensures that the tree entry has all operands set before
 5545 // reaching this code. A couple of exceptions known at the moment are
 5546 // extracts, where their second (immediate) operand is not added.
 5547 // Since immediates do not affect scheduler behavior, this is
 5548 // considered okay.
5549 assert(In &&
5551 In->getNumOperands() ==
5552 Bundle->getTreeEntry()->getNumOperands() ||
5553 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5554 "Missed TreeEntry operands?");
5555
5556 for (unsigned OpIdx :
5557 seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
5558 if (auto *I = dyn_cast<Instruction>(
5559 Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
5560 LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): " << *I
5561 << "\n");
5562 DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx);
5563 }
5564 }
5565 } else {
5566 // If BundleMember is a stand-alone instruction, no operand reordering
5567 // has taken place, so we directly access its operands.
5568 for (Use &U : BundleMember->getInst()->operands()) {
5569 if (auto *I = dyn_cast<Instruction>(U.get())) {
5571 << "SLP: check for readiness (def): " << *I << "\n");
5572 DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
5573 }
5574 }
5575 }
5576 // Handle the memory dependencies.
5577 auto *SD = dyn_cast<ScheduleData>(BundleMember);
5578 if (!SD)
5579 return;
5580 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
5581 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5582 if (!VisitedMemory.insert(MemoryDep).second)
5583 continue;
5584 // There are no more unscheduled dependencies after decrementing,
5585 // so we can put the dependent instruction into the ready list.
5586 LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): "
5587 << *MemoryDep << "\n");
5588 DecrUnsched(MemoryDep);
5589 }
5590 // Handle the control dependencies.
5591 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
5592 for (ScheduleData *Dep : SD->getControlDependencies()) {
5593 if (!VisitedControl.insert(Dep).second)
5594 continue;
5595 // There are no more unscheduled dependencies after decrementing,
5596 // so we can put the dependent instruction into the ready list.
5598 << "SLP: check for readiness (ctrl): " << *Dep << "\n");
5599 DecrUnsched(Dep, /*IsControl=*/true);
5600 }
5601 };
5602 if (auto *SD = dyn_cast<ScheduleData>(Data)) {
5603 SD->setScheduled(/*Scheduled=*/true);
5604 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
 5605 SmallVector<ScheduleBundle *> Bundles;
 5606 SmallVector<std::unique_ptr<ScheduleBundle>> PseudoBundles;
 5607 Instruction *In = SD->getInst();
5608 if (R.isVectorized(In)) {
5609 ArrayRef<TreeEntry *> Entries = R.getTreeEntries(In);
5610 for (TreeEntry *TE : Entries) {
5612 In->getNumOperands() != TE->getNumOperands())
5613 continue;
5614 auto &BundlePtr =
5615 PseudoBundles.emplace_back(std::make_unique<ScheduleBundle>());
5616 BundlePtr->setTreeEntry(TE);
5617 BundlePtr->add(SD);
5618 Bundles.push_back(BundlePtr.get());
5619 }
5620 }
5621 ProcessBundleMember(SD, Bundles);
5622 } else {
5623 ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
5624 Bundle.setScheduled(/*Scheduled=*/true);
5625 LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n");
5626 auto AreAllBundlesScheduled =
5627 [&](const ScheduleEntity *SD,
5628 ArrayRef<ScheduleBundle *> SDBundles) {
 5629 if (isa<ScheduleCopyableData>(SD))
 5630 return true;
5631 return !SDBundles.empty() &&
5632 all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
5633 return SDBundle->isScheduled();
5634 });
5635 };
5636 for (ScheduleEntity *SD : Bundle.getBundle()) {
5639 SDBundles = getScheduleBundles(SD->getInst());
5640 if (AreAllBundlesScheduled(SD, SDBundles)) {
5641 SD->setScheduled(/*Scheduled=*/true);
5642 ProcessBundleMember(SD, isa<ScheduleCopyableData>(SD) ? &Bundle
5643 : SDBundles);
5644 }
5645 }
5646 }
5647 }
5648
5649 /// Verify basic self consistency properties of the data structure.
5650 void verify() {
5651 if (!ScheduleStart)
5652 return;
5653
5654 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5655 ScheduleStart->comesBefore(ScheduleEnd) &&
5656 "Not a valid scheduling region?");
5657
5658 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5659 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5660 if (!Bundles.empty()) {
5661 for (ScheduleBundle *Bundle : Bundles) {
5662 assert(isInSchedulingRegion(*Bundle) &&
5663 "primary schedule data not in window?");
5664 Bundle->verify();
5665 }
5666 continue;
5667 }
5668 auto *SD = getScheduleData(I);
5669 if (!SD)
5670 continue;
5671 assert(isInSchedulingRegion(*SD) &&
5672 "primary schedule data not in window?");
5673 SD->verify();
5674 }
5675
5676 assert(all_of(ReadyInsts,
5677 [](const ScheduleEntity *Bundle) {
5678 return Bundle->isReady();
5679 }) &&
5680 "item in ready list not ready?");
5681 }
5682
5683 /// Put all instructions into the ReadyList which are ready for scheduling.
5684 template <typename ReadyListType>
5685 void initialFillReadyList(ReadyListType &ReadyList) {
5686 SmallPtrSet<ScheduleBundle *, 16> Visited;
5687 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5688 ScheduleData *SD = getScheduleData(I);
5689 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5690 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5691 !Bundles.empty()) {
5692 for (ScheduleBundle *Bundle : Bundles) {
5693 if (!Visited.insert(Bundle).second)
5694 continue;
5695 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5696 ReadyList.insert(Bundle);
5697 LLVM_DEBUG(dbgs() << "SLP: initially in ready list: "
5698 << *Bundle << "\n");
5699 }
5700 }
5701 continue;
5702 }
5703 ReadyList.insert(SD);
5705 << "SLP: initially in ready list: " << *SD << "\n");
5706 }
5707 }
5708 }
5709
5710 /// Build a bundle from the ScheduleData nodes corresponding to the
5711 /// scalar instruction for each lane.
5712 /// \param VL The list of scalar instructions.
5713 /// \param S The state of the instructions.
5714 /// \param EI The edge in the SLP graph or the user node/operand number.
5715 ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
5716 const InstructionsState &S, const EdgeInfo &EI);
5717
5718 /// Checks if a bundle of instructions can be scheduled, i.e. has no
5719 /// cyclic dependencies. This is only a dry-run, no instructions are
5720 /// actually moved at this stage.
5721 /// \returns the scheduling bundle. The returned Optional value is not
5722 /// std::nullopt if \p VL is allowed to be scheduled.
5723 std::optional<ScheduleBundle *>
5724 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
5725 const InstructionsState &S, const EdgeInfo &EI);
5726
5727 /// Allocates schedule data chunk.
5728 ScheduleData *allocateScheduleDataChunks();
5729
5730 /// Extends the scheduling region so that V is inside the region.
5731 /// \returns true if the region size is within the limit.
5732 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
5733
5734 /// Initialize the ScheduleData structures for new instructions in the
5735 /// scheduling region.
5736 void initScheduleData(Instruction *FromI, Instruction *ToI,
5737 ScheduleData *PrevLoadStore,
5738 ScheduleData *NextLoadStore);
5739
5740 /// Updates the dependency information of a bundle and of all instructions/
5741 /// bundles which depend on the original bundle.
5742 void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
5743 BoUpSLP *SLP,
5744 ArrayRef<ScheduleData *> ControlDeps = {});
5745
5746 /// Sets all instruction in the scheduling region to un-scheduled.
5747 void resetSchedule();
5748
5749 BasicBlock *BB;
5750
5751 /// Simple memory allocation for ScheduleData.
 5752 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
 5753
5754 /// The size of a ScheduleData array in ScheduleDataChunks.
5755 int ChunkSize;
5756
5757 /// The allocator position in the current chunk, which is the last entry
5758 /// of ScheduleDataChunks.
5759 int ChunkPos;
5760
5761 /// Attaches ScheduleData to Instruction.
5762 /// Note that the mapping survives during all vectorization iterations, i.e.
5763 /// ScheduleData structures are recycled.
5764 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
5765
5766 /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand
5767 /// number) and the operand instruction, represented as copyable element.
5768 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
5769 std::unique_ptr<ScheduleCopyableData>>
5770 ScheduleCopyableDataMap;
5771
 5772 /// Represents the mapping between an instruction and all related
 5773 /// ScheduleCopyableData (for all uses in the tree represented as copyable
 5774 /// elements). The SLP tree may contain several representations of the same
 5775 /// instruction.
5776 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
5777 ScheduleCopyableDataMapByInst;
5778
 5779 /// Represents the mapping between a user value and operand number, the
 5780 /// operand value and all related ScheduleCopyableData. The relation is 1:n,
 5781 /// because the same user may reference the same operand in different tree
 5782 /// entries and the operand may be modeled by different copyable data elements.
5783 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>,
 5784 SmallVector<ScheduleCopyableData *>>
 5785 ScheduleCopyableDataMapByInstUser;
5786
5787 /// Represents mapping between instruction and all related
5788 /// ScheduleCopyableData. It represents the mapping between the actual
5789 /// instruction and the last copyable data element in the chain. E.g., if
5790 /// the graph models the following instructions:
5791 /// %0 = non-add instruction ...
5792 /// ...
5793 /// %4 = add %3, 1
5794 /// %5 = add %4, 1
5795 /// %6 = insertelement poison, %0, 0
5796 /// %7 = insertelement %6, %5, 1
5797 /// And the graph is modeled as:
5798 /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ]
5799 /// -> [1, 0] -> [%1, 0]
5800 ///
5801 /// this map will map %0 only to the copyable element <1>, which is the last
5802 /// user (direct user of the actual instruction). <0> uses <1>, so <1> will
5803 /// keep the map to <0>, not the %0.
5804 SmallDenseMap<const Instruction *,
5805 SmallSetVector<ScheduleCopyableData *, 4>>
5806 ScheduleCopyableDataMapByUsers;
5807
5808 /// Attaches ScheduleBundle to Instruction.
5809 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
5810 ScheduledBundles;
5811 /// The list of ScheduleBundles.
5812 SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;
5813
5814 /// The ready-list for scheduling (only used for the dry-run).
5815 SetVector<ScheduleEntity *> ReadyInsts;
5816
5817 /// The first instruction of the scheduling region.
5818 Instruction *ScheduleStart = nullptr;
5819
5820 /// The first instruction _after_ the scheduling region.
5821 Instruction *ScheduleEnd = nullptr;
5822
5823 /// The first memory accessing instruction in the scheduling region
5824 /// (can be null).
5825 ScheduleData *FirstLoadStoreInRegion = nullptr;
5826
5827 /// The last memory accessing instruction in the scheduling region
5828 /// (can be null).
5829 ScheduleData *LastLoadStoreInRegion = nullptr;
5830
5831 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
5832 /// region? Used to optimize the dependence calculation for the
5833 /// common case where there isn't.
5834 bool RegionHasStackSave = false;
5835
5836 /// The current size of the scheduling region.
5837 int ScheduleRegionSize = 0;
5838
5839 /// The maximum size allowed for the scheduling region.
5840 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
5841
5842 /// The ID of the scheduling region. For a new vectorization iteration this
5843 /// is incremented which "removes" all ScheduleData from the region.
5844 /// Make sure that the initial SchedulingRegionID is greater than the
5845 /// initial SchedulingRegionID in ScheduleData (which is 0).
5846 int SchedulingRegionID = 1;
5847 };
5848
5849 /// Attaches the BlockScheduling structures to basic blocks.
5850 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
5851
5852 /// Performs the "real" scheduling. Done before vectorization is actually
5853 /// performed in a basic block.
5854 void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);
5855
5856 /// List of users to ignore during scheduling and that don't need extracting.
5857 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
5858
5859 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
5860 /// sorted SmallVectors of unsigned.
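 /// Example (an illustrative sketch): counting how often each reordering is
 /// requested, keyed by the order itself:
 /// \code
 ///   DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> OrdersUses;
 ///   OrdersType Order = {1, 0, 3, 2};
 ///   ++OrdersUses.try_emplace(Order, 0).first->second;
 /// \endcode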
5861 struct OrdersTypeDenseMapInfo {
5862 static OrdersType getEmptyKey() {
5863 OrdersType V;
5864 V.push_back(~1U);
5865 return V;
5866 }
5867
5868 static OrdersType getTombstoneKey() {
5869 OrdersType V;
5870 V.push_back(~2U);
5871 return V;
5872 }
5873
5874 static unsigned getHashValue(const OrdersType &V) {
5875 return static_cast<unsigned>(hash_combine_range(V));
5876 }
5877
5878 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
5879 return LHS == RHS;
5880 }
5881 };
5882
5883 // Analysis and block reference.
5884 Function *F;
5885 ScalarEvolution *SE;
5886 TargetTransformInfo *TTI;
5887 TargetLibraryInfo *TLI;
5888 LoopInfo *LI;
5889 DominatorTree *DT;
5890 AssumptionCache *AC;
5891 DemandedBits *DB;
5892 const DataLayout *DL;
5893 OptimizationRemarkEmitter *ORE;
5894
5895 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
5896 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
5897
5898 /// Instruction builder to construct the vectorized tree.
5899 IRBuilder<TargetFolder> Builder;
5900
 5901 /// A map of vectorizable tree entries to the smallest bit width with which
 5902 /// their scalar integer values can legally be represented. The entries map to
 5903 /// (width, signed) pairs, where "width" indicates the minimum bit width and
 5904 /// "signed" is true if the values must be sign-extended, rather than
 5905 /// zero-extended, back to their original width.
5906 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
5907
5908 /// Final size of the reduced vector, if the current graph represents the
5909 /// input for the reduction and it was possible to narrow the size of the
5910 /// reduction.
5911 unsigned ReductionBitWidth = 0;
5912
5913 /// Canonical graph size before the transformations.
5914 unsigned BaseGraphSize = 1;
5915
5916 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
5917 /// type sizes, used in the tree.
5918 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
5919
 5920 /// Indices of the vectorized nodes, which are supposed to be the roots of the new
5921 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
5922 DenseSet<unsigned> ExtraBitWidthNodes;
5923};
5924
5925} // end namespace slpvectorizer
5926
5927template <> struct DenseMapInfo<BoUpSLP::EdgeInfo> {
 5928 using FirstInfo = DenseMapInfo<BoUpSLP::TreeEntry *>;
 5929 using SecondInfo = DenseMapInfo<unsigned>;
 5930 static BoUpSLP::EdgeInfo getEmptyKey() {
 5931 return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
5932 SecondInfo::getEmptyKey());
5933 }
5934
 5935 static BoUpSLP::EdgeInfo getTombstoneKey() {
 5936 return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(),
5937 SecondInfo::getTombstoneKey());
5938 }
5939
5940 static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
5941 return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE),
5942 SecondInfo::getHashValue(Val.EdgeIdx));
5943 }
5944
5945 static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
5946 const BoUpSLP::EdgeInfo &RHS) {
5947 return LHS == RHS;
5948 }
5949};
5950
5951template <> struct GraphTraits<BoUpSLP *> {
5952 using TreeEntry = BoUpSLP::TreeEntry;
5953
5954 /// NodeRef has to be a pointer per the GraphWriter.
 5955 using NodeRef = TreeEntry *;
 5956
5957 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
5958
5959 /// Add the VectorizableTree to the index iterator to be able to return
5960 /// TreeEntry pointers.
 5961 struct ChildIteratorType
 5962 : public iterator_adaptor_base<
5963 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
5965
5969
5970 NodeRef operator*() { return I->UserTE; }
5971 };
5972
 5973 static NodeRef getEntryNode(BoUpSLP &R) {
 5974 return R.VectorizableTree[0].get();
5975 }
5976
 5977 static ChildIteratorType child_begin(NodeRef N) {
 5978 return {&N->UserTreeIndex, N->Container};
5979 }
5980
 5981 static ChildIteratorType child_end(NodeRef N) {
 5982 return {&N->UserTreeIndex + 1, N->Container};
5983 }
5984
5985 /// For the node iterator we just need to turn the TreeEntry iterator into a
5986 /// TreeEntry* iterator so that it dereferences to NodeRef.
5987 class nodes_iterator {
5988 using ItTy = ContainerTy::iterator;
5989 ItTy It;
5990
5991 public:
5992 nodes_iterator(const ItTy &It2) : It(It2) {}
5993 NodeRef operator*() { return It->get(); }
5994 nodes_iterator operator++() {
5995 ++It;
5996 return *this;
5997 }
5998 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
5999 };
6000
6001 static nodes_iterator nodes_begin(BoUpSLP *R) {
6002 return nodes_iterator(R->VectorizableTree.begin());
6003 }
6004
6005 static nodes_iterator nodes_end(BoUpSLP *R) {
6006 return nodes_iterator(R->VectorizableTree.end());
6007 }
6008
6009 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
6010};
6011
6012template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
6013 using TreeEntry = BoUpSLP::TreeEntry;
6014
6015 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
6016
6017 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
6018 std::string Str;
6019 raw_string_ostream OS(Str);
6020 OS << Entry->Idx << ".\n";
6021 if (isSplat(Entry->Scalars))
6022 OS << "<splat> ";
6023 for (auto *V : Entry->Scalars) {
6024 OS << *V;
6025 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
6026 return EU.Scalar == V;
6027 }))
6028 OS << " <extract>";
6029 OS << "\n";
6030 }
6031 return Str;
6032 }
6033
6034 static std::string getNodeAttributes(const TreeEntry *Entry,
6035 const BoUpSLP *) {
6036 if (Entry->isGather())
6037 return "color=red";
6038 if (Entry->State == TreeEntry::ScatterVectorize ||
6039 Entry->State == TreeEntry::StridedVectorize ||
6040 Entry->State == TreeEntry::CompressVectorize)
6041 return "color=blue";
6042 return "";
6043 }
6044};
6045
6046} // end namespace llvm
6047
6048BoUpSLP::~BoUpSLP() {
6049 SmallVector<WeakTrackingVH> DeadInsts;
6050 for (auto *I : DeletedInstructions) {
6051 if (!I->getParent()) {
6052 // Temporarily insert instructions back to erase them from their parent
6053 // and from memory later.
6054 if (isa<PHINode>(I))
6055 // Phi nodes must be the very first instructions in the block.
6056 I->insertBefore(F->getEntryBlock(),
6057 F->getEntryBlock().getFirstNonPHIIt());
6058 else
6059 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6060 continue;
6061 }
6062 for (Use &U : I->operands()) {
6063 auto *Op = dyn_cast<Instruction>(U.get());
6064 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
6065 wouldInstructionBeTriviallyDead(Op, TLI))
6066 DeadInsts.emplace_back(Op);
6067 }
6068 I->dropAllReferences();
6069 }
6070 for (auto *I : DeletedInstructions) {
6071 assert(I->use_empty() &&
6072 "trying to erase instruction with users.");
6073 I->eraseFromParent();
6074 }
6075
6076 // Clean up any dead scalar code feeding the vectorized instructions.
6077 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
6078
6079#ifdef EXPENSIVE_CHECKS
6080 // If we could guarantee that this call is not extremely slow, we could
6081 // remove the ifdef limitation (see PR47712).
6082 assert(!verifyFunction(*F, &dbgs()));
6083#endif
6084}
6085
6086/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
6087/// contains the original mask for the scalars reused in the node. The
6088/// procedure transforms this mask in accordance with the given \p Mask.
6089static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
6090 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
6091 "Expected non-empty mask.");
6092 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
6093 Prev.swap(Reuses);
6094 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
6095 if (Mask[I] != PoisonMaskElem)
6096 Reuses[Mask[I]] = Prev[I];
6097}
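// Illustration: with Reuses = {3, 2, 1, 0} and Mask = {1, 0, 3, 2}, each
// original element Prev[I] is moved to position Mask[I], producing
// Reuses = {2, 3, 0, 1}. Positions not targeted by any mask entry keep their
// original values.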
6098
6099/// Reorders the given \p Order according to the given \p Mask. \p Order is
6100/// the original order of the scalars. The procedure transforms the provided
6101/// order in accordance with the given \p Mask. If the resulting \p Order is
6102/// just an identity order, \p Order is cleared.
6103static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
6104 bool BottomOrder = false) {
6105 assert(!Mask.empty() && "Expected non-empty mask.");
6106 unsigned Sz = Mask.size();
6107 if (BottomOrder) {
6108 SmallVector<unsigned> PrevOrder;
6109 if (Order.empty()) {
6110 PrevOrder.resize(Sz);
6111 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
6112 } else {
6113 PrevOrder.swap(Order);
6114 }
6115 Order.assign(Sz, Sz);
6116 for (unsigned I = 0; I < Sz; ++I)
6117 if (Mask[I] != PoisonMaskElem)
6118 Order[I] = PrevOrder[Mask[I]];
6119 if (all_of(enumerate(Order), [&](const auto &Data) {
6120 return Data.value() == Sz || Data.index() == Data.value();
6121 })) {
6122 Order.clear();
6123 return;
6124 }
6125 fixupOrderingIndices(Order);
6126 return;
6127 }
6128 SmallVector<int> MaskOrder;
6129 if (Order.empty()) {
6130 MaskOrder.resize(Sz);
6131 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
6132 } else {
6133 inversePermutation(Order, MaskOrder);
6134 }
6135 reorderReuses(MaskOrder, Mask);
6136 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
6137 Order.clear();
6138 return;
6139 }
6140 Order.assign(Sz, Sz);
6141 for (unsigned I = 0; I < Sz; ++I)
6142 if (MaskOrder[I] != PoisonMaskElem)
6143 Order[MaskOrder[I]] = I;
6144 fixupOrderingIndices(Order);
6145}
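// Illustration (non-BottomOrder path): with an empty (identity) Order and
// Mask = {1, 0, 3, 2}, the identity MaskOrder is first permuted via
// reorderReuses to {1, 0, 3, 2} and then inverted, giving Order = {1, 0, 3, 2}.
// Had the combined permutation been an identity, Order would have been cleared
// to signal that no reordering is needed.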
6146
6147std::optional<BoUpSLP::OrdersType>
6148BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
6149 bool TopToBottom, bool IgnoreReorder) {
6150 assert(TE.isGather() && "Expected gather node only.");
6151 // Try to find subvector extract/insert patterns and reorder only such
6152 // patterns.
6153 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
6154 Type *ScalarTy = GatheredScalars.front()->getType();
6155 size_t NumScalars = GatheredScalars.size();
6156 if (!isValidElementType(ScalarTy))
6157 return std::nullopt;
6158 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
6159 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
6160 SmallVector<int> ExtractMask;
6161 SmallVector<int> Mask;
6162 SmallVector<SmallVector<const TreeEntry *>> Entries;
6163 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
6164 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6165 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
6166 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6167 /*ForOrder=*/true);
6168 // No shuffled operands - ignore.
6169 if (GatherShuffles.empty() && ExtractShuffles.empty())
6170 return std::nullopt;
6171 OrdersType CurrentOrder(NumScalars, NumScalars);
6172 if (GatherShuffles.size() == 1 &&
6173 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
6174 Entries.front().front()->isSame(TE.Scalars)) {
6175 // If the node fully matches another node in a whole-tree rotation, there is
6176 // no need to consider the matching order; the whole tree is rotated instead.
6177 if (TopToBottom)
6178 return std::nullopt;
6179 // No need to keep the order for the same user node.
6180 if (Entries.front().front()->UserTreeIndex.UserTE ==
6181 TE.UserTreeIndex.UserTE)
6182 return std::nullopt;
6183 // No need to keep the order for the matched root node, if it can be freely
6184 // reordered.
6185 if (!IgnoreReorder && Entries.front().front()->Idx == 0)
6186 return std::nullopt;
6187 // If only 2 elements are shuffled and the matching node has reversed reuses,
6188 // there is no need to compute an order; both orders work equally well.
6189 if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
6190 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6191 any_of(enumerate(Entries.front().front()->ReuseShuffleIndices),
6192 [](const auto &P) {
6193 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6194 }))
6195 return std::nullopt;
6196
6197 // Perfect match in the graph, will reuse the previously vectorized
6198 // node. Cost is 0.
6199 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
6200 return CurrentOrder;
6201 }
6202 auto IsSplatMask = [](ArrayRef<int> Mask) {
6203 int SingleElt = PoisonMaskElem;
6204 return all_of(Mask, [&](int I) {
6205 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
6206 SingleElt = I;
6207 return I == PoisonMaskElem || I == SingleElt;
6208 });
6209 };
6210 // Exclusive broadcast mask - ignore.
6211 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
6212 (Entries.size() != 1 ||
6213 Entries.front().front()->ReorderIndices.empty())) ||
6214 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
6215 return std::nullopt;
6216 SmallBitVector ShuffledSubMasks(NumParts);
6217 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
6218 ArrayRef<int> Mask, int PartSz, int NumParts,
6219 function_ref<unsigned(unsigned)> GetVF) {
6220 for (int I : seq<int>(0, NumParts)) {
6221 if (ShuffledSubMasks.test(I))
6222 continue;
6223 const int VF = GetVF(I);
6224 if (VF == 0)
6225 continue;
6226 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
6227 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
6228 // Shuffle of at least 2 vectors - ignore.
6229 if (any_of(Slice, [&](unsigned I) { return I != NumScalars; })) {
6230 llvm::fill(Slice, NumScalars);
6231 ShuffledSubMasks.set(I);
6232 continue;
6233 }
6234 // Try to include as many elements from the mask as possible.
6235 int FirstMin = INT_MAX;
6236 bool SecondVecFound = false;
6237 for (int K : seq<int>(Limit)) {
6238 int Idx = Mask[I * PartSz + K];
6239 if (Idx == PoisonMaskElem) {
6240 Value *V = GatheredScalars[I * PartSz + K];
6241 if (isConstant(V) && !isa<PoisonValue>(V)) {
6242 SecondVecFound = true;
6243 break;
6244 }
6245 continue;
6246 }
6247 if (Idx < VF) {
6248 if (FirstMin > Idx)
6249 FirstMin = Idx;
6250 } else {
6251 SecondVecFound = true;
6252 break;
6253 }
6254 }
6255 FirstMin = (FirstMin / PartSz) * PartSz;
6256 // Shuffle of at least 2 vectors - ignore.
6257 if (SecondVecFound) {
6258 llvm::fill(Slice, NumScalars);
6259 ShuffledSubMasks.set(I);
6260 continue;
6261 }
6262 for (int K : seq<int>(Limit)) {
6263 int Idx = Mask[I * PartSz + K];
6264 if (Idx == PoisonMaskElem)
6265 continue;
6266 Idx -= FirstMin;
6267 if (Idx >= PartSz) {
6268 SecondVecFound = true;
6269 break;
6270 }
6271 if (CurrentOrder[I * PartSz + Idx] >
6272 static_cast<unsigned>(I * PartSz + K) &&
6273 CurrentOrder[I * PartSz + Idx] !=
6274 static_cast<unsigned>(I * PartSz + Idx))
6275 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
6276 }
6277 // Shuffle of at least 2 vectors - ignore.
6278 if (SecondVecFound) {
6279 llvm::fill(Slice, NumScalars);
6280 ShuffledSubMasks.set(I);
6281 continue;
6282 }
6283 }
6284 };
6285 int PartSz = getPartNumElems(NumScalars, NumParts);
6286 if (!ExtractShuffles.empty())
6287 TransformMaskToOrder(
6288 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
6289 if (!ExtractShuffles[I])
6290 return 0U;
6291 unsigned VF = 0;
6292 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
6293 for (unsigned Idx : seq<unsigned>(Sz)) {
6294 int K = I * PartSz + Idx;
6295 if (ExtractMask[K] == PoisonMaskElem)
6296 continue;
6297 if (!TE.ReuseShuffleIndices.empty())
6298 K = TE.ReuseShuffleIndices[K];
6299 if (K == PoisonMaskElem)
6300 continue;
6301 if (!TE.ReorderIndices.empty())
6302 K = std::distance(TE.ReorderIndices.begin(),
6303 find(TE.ReorderIndices, K));
6304 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
6305 if (!EI)
6306 continue;
6307 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
6308 ->getElementCount()
6309 .getKnownMinValue());
6310 }
6311 return VF;
6312 });
6313 // Check special corner case - single shuffle of the same entry.
6314 if (GatherShuffles.size() == 1 && NumParts != 1) {
6315 if (ShuffledSubMasks.any())
6316 return std::nullopt;
6317 PartSz = NumScalars;
6318 NumParts = 1;
6319 }
6320 if (!Entries.empty())
6321 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
6322 if (!GatherShuffles[I])
6323 return 0U;
6324 return std::max(Entries[I].front()->getVectorFactor(),
6325 Entries[I].back()->getVectorFactor());
6326 });
6327 unsigned NumUndefs = count(CurrentOrder, NumScalars);
6328 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6329 return std::nullopt;
6330 return std::move(CurrentOrder);
6331}
6332
6333static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
6334 const TargetLibraryInfo &TLI,
6335 bool CompareOpcodes = true) {
6338 return false;
6339 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
6340 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
6341 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6342 (!GEP2 || GEP2->getNumOperands() == 2) &&
6343 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
6344 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
6345 !CompareOpcodes ||
6346 (GEP1 && GEP2 &&
6347 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6348}
6349
6350/// Calculates minimal alignment as a common alignment.
6351template <typename T>
6352static Align computeCommonAlignment(ArrayRef<Value *> VL) {
6353 Align CommonAlignment = cast<T>(VL.consume_front())->getAlign();
6354 for (Value *V : VL)
6355 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
6356 return CommonAlignment;
6357}
6358
6359/// Check if \p Order represents reverse order.
6360static bool isReverseOrder(ArrayRef<unsigned> Order) {
6361 assert(!Order.empty() &&
6362 "Order is empty. Please check it before using isReverseOrder.");
6363 unsigned Sz = Order.size();
6364 return all_of(enumerate(Order), [&](const auto &Pair) {
6365 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6366 });
6367}
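// E.g. for 4 scalars, Order = {3, 2, 1, 0} is a reverse order; so is
// {3, 4, 1, 0}, because the sentinel value Sz (here 4) marks a position whose
// element is unconstrained.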
6368
6369/// Checks if the provided list of pointers \p PointerOps represents strided
6370/// pointers for the type ElemTy. If they are not, nullptr is returned.
6371/// Otherwise, the SCEV of the stride value is returned.
6372static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
6373 const DataLayout &DL, ScalarEvolution &SE,
6374 SmallVectorImpl<unsigned> &SortedIndices) {
6375 SmallVector<const SCEV *> SCEVs;
6376 const SCEV *PtrSCEVLowest = nullptr;
6377 const SCEV *PtrSCEVHighest = nullptr;
6378 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
6379 // addresses).
6380 for (Value *Ptr : PointerOps) {
6381 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
6382 if (!PtrSCEV)
6383 return nullptr;
6384 SCEVs.push_back(PtrSCEV);
6385 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6386 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6387 continue;
6388 }
6389 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6390 if (isa<SCEVCouldNotCompute>(Diff))
6391 return nullptr;
6392 if (Diff->isNonConstantNegative()) {
6393 PtrSCEVLowest = PtrSCEV;
6394 continue;
6395 }
6396 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
6397 if (isa<SCEVCouldNotCompute>(Diff1))
6398 return nullptr;
6399 if (Diff1->isNonConstantNegative()) {
6400 PtrSCEVHighest = PtrSCEV;
6401 continue;
6402 }
6403 }
6404 // Dist = PtrSCEVHighest - PtrSCEVLowest;
6405 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
6406 if (isa<SCEVCouldNotCompute>(Dist))
6407 return nullptr;
6408 int Size = DL.getTypeStoreSize(ElemTy);
6409 auto TryGetStride = [&](const SCEV *Dist,
6410 const SCEV *Multiplier) -> const SCEV * {
6411 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
6412 if (M->getOperand(0) == Multiplier)
6413 return M->getOperand(1);
6414 if (M->getOperand(1) == Multiplier)
6415 return M->getOperand(0);
6416 return nullptr;
6417 }
6418 if (Multiplier == Dist)
6419 return SE.getConstant(Dist->getType(), 1);
6420 return SE.getUDivExactExpr(Dist, Multiplier);
6421 };
6422 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
6423 const SCEV *Stride = nullptr;
6424 if (Size != 1 || SCEVs.size() > 2) {
6425 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
6426 Stride = TryGetStride(Dist, Sz);
6427 if (!Stride)
6428 return nullptr;
6429 }
6430 if (!Stride || isa<SCEVConstant>(Stride))
6431 return nullptr;
6432 // Iterate through all pointers and check if all distances are
6433 // unique multiple of Stride.
6434 using DistOrdPair = std::pair<int64_t, int>;
6435 auto Compare = llvm::less_first();
6436 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
6437 int Cnt = 0;
6438 bool IsConsecutive = true;
6439 for (const SCEV *PtrSCEV : SCEVs) {
6440 unsigned Dist = 0;
6441 if (PtrSCEV != PtrSCEVLowest) {
6442 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6443 const SCEV *Coeff = TryGetStride(Diff, Stride);
6444 if (!Coeff)
6445 return nullptr;
6446 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
6447 if (!SC || isa<SCEVCouldNotCompute>(SC))
6448 return nullptr;
6449 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
6450 SE.getMulExpr(Stride, SC)))
6451 ->isZero())
6452 return nullptr;
6453 Dist = SC->getAPInt().getZExtValue();
6454 }
6455 // If the strides are not the same or repeated, we can't vectorize.
6456 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
6457 return nullptr;
6458 auto Res = Offsets.emplace(Dist, Cnt);
6459 if (!Res.second)
6460 return nullptr;
6461 // Consecutive order if the inserted element is the last one.
6462 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6463 ++Cnt;
6464 }
6465 if (Offsets.size() != SCEVs.size())
6466 return nullptr;
6467 SortedIndices.clear();
6468 if (!IsConsecutive) {
6469 // Fill SortedIndices array only if it is non-consecutive.
6470 SortedIndices.resize(PointerOps.size());
6471 Cnt = 0;
6472 for (const std::pair<int64_t, int> &Pair : Offsets) {
6473 SortedIndices[Cnt] = Pair.second;
6474 ++Cnt;
6475 }
6476 }
6477 return Stride;
6478}
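// Illustration: for i8 elements and pointers {p, p + 3*s, p + s, p + 2*s}
// with a stride s that is only known at run time, Dist is the SCEV 3*s, the
// recovered stride is s, and SortedIndices becomes {0, 2, 3, 1} so that the
// pointers are visited in increasing address order.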
6479
6480static std::pair<InstructionCost, InstructionCost>
6481getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
6482 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
6483 Type *ScalarTy, VectorType *VecTy);
6484
6485/// Returns the cost of the shuffle instructions with the given \p Kind, vector
6486/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for the
6487/// insert-subvector pattern.
6488static InstructionCost
6489getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
6490 VectorType *Tp, ArrayRef<int> Mask = {},
6492 int Index = 0, VectorType *SubTp = nullptr,
6494 VectorType *DstTy = Tp;
6495 if (!Mask.empty())
6496 DstTy = FixedVectorType::get(Tp->getScalarType(), Mask.size());
6497
6498 if (Kind != TTI::SK_PermuteTwoSrc)
6499 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6500 Args);
6501 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6502 int NumSubElts;
6504 Mask, NumSrcElts, NumSubElts, Index)) {
6505 if (Index + NumSubElts > NumSrcElts &&
6506 Index + NumSrcElts <= static_cast<int>(Mask.size()))
6507 return TTI.getShuffleCost(TTI::SK_InsertSubvector, DstTy, Tp, Mask,
6508 TTI::TCK_RecipThroughput, Index, Tp);
6509 }
6510 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6511 Args);
6512}
6513
6514/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
6515/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
6516/// instead of a scalar.
6517static InstructionCost
6519 VectorType *Ty, const APInt &DemandedElts, bool Insert,
6520 bool Extract, TTI::TargetCostKind CostKind,
6521 bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
6523 "ScalableVectorType is not supported.");
6524 assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
6525 getNumElements(Ty) &&
6526 "Incorrect usage.");
6527 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6528 assert(SLPReVec && "Only supported by REVEC.");
6529 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
6530 // of CreateInsertElement.
6531 unsigned ScalarTyNumElements = VecTy->getNumElements();
6533 for (unsigned I : seq(DemandedElts.getBitWidth())) {
6534 if (!DemandedElts[I])
6535 continue;
6536 if (Insert)
6538 I * ScalarTyNumElements, VecTy);
6539 if (Extract)
6541 I * ScalarTyNumElements, VecTy);
6542 }
6543 return Cost;
6544 }
6545 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
6546 CostKind, ForPoisonSrc, VL);
6547}
6548
6549/// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy
6550/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6552 const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val,
6553 TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
6554 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6555 if (Opcode == Instruction::ExtractElement) {
6556 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6557 assert(SLPReVec && "Only supported by REVEC.");
6558 assert(isa<VectorType>(Val) && "Val must be a vector type.");
6560 cast<VectorType>(Val), {}, CostKind,
6561 Index * VecTy->getNumElements(), VecTy);
6562 }
6563 }
6564 return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
6565 ScalarUserAndIdx);
6566}
6567
6568/// This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst
6569/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6571 const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
6572 VectorType *VecTy, unsigned Index,
6574 if (auto *ScalarTy = dyn_cast<FixedVectorType>(Dst)) {
6575 assert(SLPReVec && "Only supported by REVEC.");
6576 auto *SubTp =
6577 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
6579 Index * ScalarTy->getNumElements(), SubTp) +
6580 TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None,
6581 CostKind);
6582 }
6583 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
6584}
6585
6586/// Creates a subvector insert. Generates the shuffle using \p Generator or,
6587/// if none is provided, a default shuffle sequence.
6589 IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
6590 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
6591 if (isa<PoisonValue>(Vec) && isa<PoisonValue>(V))
6592 return Vec;
6593 const unsigned SubVecVF = getNumElements(V->getType());
6594 // Create a shuffle; insertvector requires that the index be a multiple of
6595 // the subvector length.
6596 const unsigned VecVF = getNumElements(Vec->getType());
6597 SmallVector<int> Mask(VecVF, PoisonMaskElem);
6598 if (isa<PoisonValue>(Vec)) {
6599 auto *Begin = std::next(Mask.begin(), Index);
6600 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6601 Vec = Builder.CreateShuffleVector(V, Mask);
6602 return Vec;
6603 }
6604 std::iota(Mask.begin(), Mask.end(), 0);
6605 std::iota(std::next(Mask.begin(), Index),
6606 std::next(Mask.begin(), Index + SubVecVF), VecVF);
6607 if (Generator)
6608 return Generator(Vec, V, Mask);
6609 // 1. Resize V to the size of Vec.
6610 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
6611 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6612 V = Builder.CreateShuffleVector(V, ResizeMask);
6613 // 2. Insert V into Vec.
6614 return Builder.CreateShuffleVector(Vec, V, Mask);
6615}
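// Illustration: inserting a 4-element subvector V into an 8-element vector
// Vec at Index 4 first widens V with the mask {0, 1, 2, 3, -1, -1, -1, -1}
// (poison lanes) and then blends the two with the mask
// {0, 1, 2, 3, 8, 9, 10, 11}, i.e. Vec provides lanes 0-3 and V lanes 4-7.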
6616
6617/// Generates a subvector extract using a shuffle with a consecutive mask.
6619 unsigned SubVecVF, unsigned Index) {
6620 SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
6621 std::iota(Mask.begin(), Mask.end(), Index);
6622 return Builder.CreateShuffleVector(Vec, Mask);
6623}
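// Illustration: extracting a 4-element subvector starting at Index 4 from a
// wider vector uses the single-source mask {4, 5, 6, 7}.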
6624
6625/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
6626/// with \p Order.
6627/// \return true if the mask represents strided access, false - otherwise.
6629 ArrayRef<unsigned> Order, Type *ScalarTy,
6630 const DataLayout &DL, ScalarEvolution &SE,
6631 SmallVectorImpl<int> &CompressMask) {
6632 const unsigned Sz = PointerOps.size();
6633 CompressMask.assign(Sz, PoisonMaskElem);
6634 // The first element is always set.
6635 CompressMask[0] = 0;
6636 // Check if the mask represents strided access.
6637 std::optional<unsigned> Stride = 0;
6638 Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
6639 for (unsigned I : seq<unsigned>(1, Sz)) {
6640 Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
6641 std::optional<int64_t> OptPos =
6642 getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
6643 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6644 return false;
6645 unsigned Pos = static_cast<unsigned>(*OptPos);
6646 CompressMask[I] = Pos;
6647 if (!Stride)
6648 continue;
6649 if (*Stride == 0) {
6650 *Stride = Pos;
6651 continue;
6652 }
6653 if (Pos != *Stride * I)
6654 Stride.reset();
6655 }
6656 return Stride.has_value();
6657}
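// Illustration: for pointers at element offsets {0, 2, 4, 6} from the first
// pointer, CompressMask becomes {0, 2, 4, 6} and the access is reported as
// strided with stride 2; for offsets {0, 1, 3, 4} the mask is {0, 1, 3, 4}
// and the function returns false (a plain compress, not a strided access).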
6658
6659/// Checks if the \p VL can be transformed to a (masked)load + compress or
6660/// (masked) interleaved load.
6662 ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6665 const DominatorTree &DT, const TargetLibraryInfo &TLI,
6666 const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
6667 unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
6668 VectorType *&LoadVecTy) {
6669 InterleaveFactor = 0;
6670 Type *ScalarTy = VL.front()->getType();
6671 const size_t Sz = VL.size();
6672 auto *VecTy = getWidenedType(ScalarTy, Sz);
6674 SmallVector<int> Mask;
6675 if (!Order.empty())
6676 inversePermutation(Order, Mask);
6677 // Check external uses.
6678 for (const auto [I, V] : enumerate(VL)) {
6679 if (AreAllUsersVectorized(V))
6680 continue;
6681 InstructionCost ExtractCost =
6682 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
6683 Mask.empty() ? I : Mask[I]);
6684 InstructionCost ScalarCost =
6685 TTI.getInstructionCost(cast<Instruction>(V), CostKind);
6686 if (ExtractCost <= ScalarCost)
6687 return false;
6688 }
6689 Value *Ptr0;
6690 Value *PtrN;
6691 if (Order.empty()) {
6692 Ptr0 = PointerOps.front();
6693 PtrN = PointerOps.back();
6694 } else {
6695 Ptr0 = PointerOps[Order.front()];
6696 PtrN = PointerOps[Order.back()];
6697 }
6698 std::optional<int64_t> Diff =
6699 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
6700 if (!Diff)
6701 return false;
6702 const size_t MaxRegSize =
6704 .getFixedValue();
6705 // Check for very large distances between elements.
6706 if (*Diff / Sz >= MaxRegSize / 8)
6707 return false;
6708 LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
6709 auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
6710 Align CommonAlignment = LI->getAlign();
6711 IsMasked = !isSafeToLoadUnconditionally(
6712 Ptr0, LoadVecTy, CommonAlignment, DL,
6713 cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
6714 &TLI);
6715 if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
6716 LI->getPointerAddressSpace()))
6717 return false;
6718 // TODO: perform the analysis of each scalar load for better
6719 // safe-load-unconditionally analysis.
6720 bool IsStrided =
6721 buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
6722 assert(CompressMask.size() >= 2 && "At least two elements are required");
6723 SmallVector<Value *> OrderedPointerOps(PointerOps);
6724 if (!Order.empty())
6725 reorderScalars(OrderedPointerOps, Mask);
6726 auto [ScalarGEPCost, VectorGEPCost] =
6727 getGEPCosts(TTI, OrderedPointerOps, OrderedPointerOps.front(),
6728 Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
6729 // The cost of scalar loads.
6730 InstructionCost ScalarLoadsCost =
6731 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
6732 [&](InstructionCost C, Value *V) {
6733 return C + TTI.getInstructionCost(cast<Instruction>(V),
6734 CostKind);
6735 }) +
6736 ScalarGEPCost;
6737 APInt DemandedElts = APInt::getAllOnes(Sz);
6738 InstructionCost GatherCost =
6739 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
6740 /*Insert=*/true,
6741 /*Extract=*/false, CostKind) +
6742 ScalarLoadsCost;
6743 InstructionCost LoadCost = 0;
6744 if (IsMasked) {
6745 LoadCost =
6746 TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6747 LI->getPointerAddressSpace(), CostKind);
6748 } else {
6749 LoadCost =
6750 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6751 LI->getPointerAddressSpace(), CostKind);
6752 }
6753 if (IsStrided && !IsMasked && Order.empty()) {
6754 // Check for potential segmented(interleaved) loads.
6755 VectorType *AlignedLoadVecTy = getWidenedType(
6756 ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
6757 if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
6758 DL, cast<LoadInst>(VL.back()), &AC, &DT,
6759 &TLI))
6760 AlignedLoadVecTy = LoadVecTy;
6761 if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
6762 CommonAlignment,
6763 LI->getPointerAddressSpace())) {
6764 InstructionCost InterleavedCost =
6765 VectorGEPCost + TTI.getInterleavedMemoryOpCost(
6766 Instruction::Load, AlignedLoadVecTy,
6767 CompressMask[1], {}, CommonAlignment,
6768 LI->getPointerAddressSpace(), CostKind, IsMasked);
6769 if (InterleavedCost < GatherCost) {
6770 InterleaveFactor = CompressMask[1];
6771 LoadVecTy = AlignedLoadVecTy;
6772 return true;
6773 }
6774 }
6775 }
6776 InstructionCost CompressCost = ::getShuffleCost(
6777 TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
6778 if (!Order.empty()) {
6779 SmallVector<int> NewMask(Sz, PoisonMaskElem);
6780 for (unsigned I : seq<unsigned>(Sz)) {
6781 NewMask[I] = CompressMask[Mask[I]];
6782 }
6783 CompressMask.swap(NewMask);
6784 }
6785 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
6786 return TotalVecCost < GatherCost;
6787}
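// Illustration: scalars loaded from element offsets {0, 1, 3, 4} can be
// covered by a single load of 5 consecutive elements (a masked load if the
// tail cannot be speculated safely) followed by a compress shuffle with mask
// {0, 1, 3, 4}; the function returns true only when the resulting vector cost
// beats building the vector from individual scalar loads.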
6788
6789/// Checks if the \p VL can be transformed to a (masked)load + compress or
6790/// (masked) interleaved load.
6791static bool
6794 const DataLayout &DL, ScalarEvolution &SE,
6795 AssumptionCache &AC, const DominatorTree &DT,
6796 const TargetLibraryInfo &TLI,
6797 const function_ref<bool(Value *)> AreAllUsersVectorized) {
6798 bool IsMasked;
6799 unsigned InterleaveFactor;
6800 SmallVector<int> CompressMask;
6801 VectorType *LoadVecTy;
6802 return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
6803 AreAllUsersVectorized, IsMasked, InterleaveFactor,
6804 CompressMask, LoadVecTy);
6805}
6806
6807/// Checks if strided loads can be generated out of \p VL loads with pointers \p
6808/// PointerOps:
6809/// 1. Target with strided load support is detected.
6810/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
6811/// potential stride <= MaxProfitableLoadStride and the potential stride is
6812/// power-of-2 (to avoid perf regressions for the very small number of loads)
6813/// and max distance > number of loads, or potential stride is -1.
6814/// 3. The loads are ordered, or number of unordered loads <=
6815/// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is
6816/// to avoid extra costs for very expensive shuffles).
6817/// 4. Any pointer operand is an instruction with the users outside of the
6818/// current graph (for masked gathers extra extractelement instructions
6819/// might be required).
6821 ArrayRef<unsigned> Order,
6822 const TargetTransformInfo &TTI,
6823 const DataLayout &DL, ScalarEvolution &SE,
6824 const int64_t Diff,
6825 StridedPtrInfo &SPtrInfo) const {
6826 const size_t Sz = VL.size();
6827 if (Diff % (Sz - 1) != 0)
6828 return false;
6829
6830 // Try to generate strided load node.
6831 auto IsAnyPointerUsedOutGraph = any_of(PointerOps, [&](Value *V) {
6832 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
6833 return !isVectorized(U) && !MustGather.contains(U);
6834 });
6835 });
6836
6837 const uint64_t AbsoluteDiff = std::abs(Diff);
6838 Type *ScalarTy = VL.front()->getType();
6839 auto *VecTy = getWidenedType(ScalarTy, Sz);
6840 if (IsAnyPointerUsedOutGraph ||
6841 (AbsoluteDiff > Sz &&
6843 (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
6844 AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
6845 Diff == -(static_cast<int64_t>(Sz) - 1)) {
6846 int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
6847 if (Diff != Stride * static_cast<int64_t>(Sz - 1))
6848 return false;
6849 Align Alignment =
6850 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
6851 ->getAlign();
6852 if (!TTI.isLegalStridedLoadStore(VecTy, Alignment))
6853 return false;
6854 Value *Ptr0;
6855 Value *PtrN;
6856 if (Order.empty()) {
6857 Ptr0 = PointerOps.front();
6858 PtrN = PointerOps.back();
6859 } else {
6860 Ptr0 = PointerOps[Order.front()];
6861 PtrN = PointerOps[Order.back()];
6862 }
6863 // Iterate through all pointers and check if all distances are
6864 // unique multiples of Stride.
6866 for (Value *Ptr : PointerOps) {
6867 int64_t Dist = 0;
6868 if (Ptr == PtrN)
6869 Dist = Diff;
6870 else if (Ptr != Ptr0)
6871 Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
6872 // If the strides are not the same or repeated, we can't
6873 // vectorize.
6874 if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
6875 break;
6876 }
6877 if (Dists.size() == Sz) {
6878 Type *StrideTy = DL.getIndexType(Ptr0->getType());
6879 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
6880 SPtrInfo.Ty = getWidenedType(ScalarTy, Sz);
6881 return true;
6882 }
6883 }
6884 return false;
6885}
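// Illustration: for 4 loads at element offsets {0, 7, 14, 21}, Diff = 21 is
// divisible by Sz - 1 = 3, the candidate stride is 7, and every offset is a
// unique multiple of 7, so (subject to the legality and profitability checks
// above) SPtrInfo describes a strided load with a stride of 7 elements.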
6886
6888 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
6889 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo,
6890 unsigned *BestVF, bool TryRecursiveCheck) const {
6891 // Check that a vectorized load would load the same memory as a scalar
6892 // load. For example, we don't want to vectorize loads that are smaller
6893 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6894 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6895 // from such a struct, we read/write packed bits disagreeing with the
6896 // unvectorized version.
6897 if (BestVF)
6898 *BestVF = 0;
6900 return LoadsState::Gather;
6901 Type *ScalarTy = VL0->getType();
6902
6903 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
6904 return LoadsState::Gather;
6905
6906 // Make sure all loads in the bundle are simple - we can't vectorize
6907 // atomic or volatile loads.
6908 PointerOps.clear();
6909 const size_t Sz = VL.size();
6910 PointerOps.resize(Sz);
6911 auto *POIter = PointerOps.begin();
6912 for (Value *V : VL) {
6913 auto *L = dyn_cast<LoadInst>(V);
6914 if (!L || !L->isSimple())
6915 return LoadsState::Gather;
6916 *POIter = L->getPointerOperand();
6917 ++POIter;
6918 }
6919
6920 Order.clear();
6921 // Check the order of pointer operands or that all pointers are the same.
6922 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
6923
6924 auto *VecTy = getWidenedType(ScalarTy, Sz);
6925 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
6926 if (!IsSorted) {
6927 if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) {
6928 if (const SCEV *Stride =
6929 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order);
6930 Stride && TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
6931 SPtrInfo.Ty = getWidenedType(ScalarTy, PointerOps.size());
6932 SPtrInfo.StrideSCEV = Stride;
6934 }
6935 }
6936
6937 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
6938 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
6939 return LoadsState::Gather;
6940
6941 if (!all_of(PointerOps, [&](Value *P) {
6942 return arePointersCompatible(P, PointerOps.front(), *TLI);
6943 }))
6944 return LoadsState::Gather;
6945
6946 } else {
6947 Value *Ptr0;
6948 Value *PtrN;
6949 if (Order.empty()) {
6950 Ptr0 = PointerOps.front();
6951 PtrN = PointerOps.back();
6952 } else {
6953 Ptr0 = PointerOps[Order.front()];
6954 PtrN = PointerOps[Order.back()];
6955 }
6956 std::optional<int64_t> Diff =
6957 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
6958 // Check that the sorted loads are consecutive.
6959 if (static_cast<uint64_t>(*Diff) == Sz - 1)
6960 return LoadsState::Vectorize;
6961 if (isMaskedLoadCompress(VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT,
6962 *TLI, [&](Value *V) {
6963 return areAllUsersVectorized(
6964 cast<Instruction>(V), UserIgnoreList);
6965 }))
6967 if (isStridedLoad(VL, PointerOps, Order, *TTI, *DL, *SE, *Diff, SPtrInfo))
6969 }
6970 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
6971 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
6972 return LoadsState::Gather;
6973 // Compare the cost of loads + shuffles against strided/masked gather loads.
6974 // Returns true if the vectorized-loads + shuffles representation is better
6975 // than just a gather.
6976 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
6977 unsigned *BestVF,
6978 bool ProfitableGatherPointers) {
6979 if (BestVF)
6980 *BestVF = 0;
6981 // Compare masked gather cost and loads + insert subvector costs.
6983 auto [ScalarGEPCost, VectorGEPCost] =
6984 getGEPCosts(TTI, PointerOps, PointerOps.front(),
6985 Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
6986 // Estimate the cost of masked gather GEP. If not a splat, roughly
6987 // estimate as a buildvector, otherwise estimate as splat.
6988 APInt DemandedElts = APInt::getAllOnes(Sz);
6989 Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
6990 VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
6991 if (static_cast<unsigned>(count_if(
6992 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
6993 any_of(PointerOps, [&](Value *V) {
6994 return getUnderlyingObject(V) !=
6995 getUnderlyingObject(PointerOps.front());
6996 }))
6997 VectorGEPCost += getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
6998 DemandedElts, /*Insert=*/true,
6999 /*Extract=*/false, CostKind);
7000 else
7001 VectorGEPCost +=
7003 TTI, PtrScalarTy, PtrVecTy, APInt::getOneBitSet(Sz, 0),
7004 /*Insert=*/true, /*Extract=*/false, CostKind) +
7005 ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
7006 // The cost of scalar loads.
7007 InstructionCost ScalarLoadsCost =
7008 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
7009 [&](InstructionCost C, Value *V) {
7010 return C + TTI.getInstructionCost(
7012 }) +
7013 ScalarGEPCost;
7014 // The cost of masked gather.
7015 InstructionCost MaskedGatherCost =
7016 TTI.getGatherScatterOpCost(
7017 Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
7018 /*VariableMask=*/false, CommonAlignment, CostKind) +
7019 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7020 InstructionCost GatherCost =
7021 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7022 /*Insert=*/true,
7023 /*Extract=*/false, CostKind) +
7024 ScalarLoadsCost;
7025 // If the list of loads is small or a partial check was already performed,
7026 // directly compare the masked gather cost and the gather cost.
7027 constexpr unsigned ListLimit = 4;
7028 if (!TryRecursiveCheck || VL.size() < ListLimit)
7029 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7030
7031 // FIXME: The following code has not been updated for non-power-of-2
7032 // vectors (and not whole registers). The splitting logic here does not
7033 // cover the original vector if the vector factor is not a power of two.
7034 if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
7035 return false;
7036
7037 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7038 unsigned MinVF = getMinVF(2 * Sz);
7039 DemandedElts.clearAllBits();
7040 // Iterate through possible vectorization factors and check if vectorized +
7041 // shuffles is better than just gather.
7042 for (unsigned VF =
7043 getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
7044 VF >= MinVF;
7045 VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
7047 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
7048 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
7050 SmallVector<Value *> PointerOps;
7051 LoadsState LS = canVectorizeLoads(Slice, Slice.front(), Order,
7052 PointerOps, SPtrInfo, BestVF,
7053 /*TryRecursiveCheck=*/false);
7054 // Check that the sorted loads are consecutive.
7055 if (LS == LoadsState::Gather) {
7056 if (BestVF) {
7057 DemandedElts.setAllBits();
7058 break;
7059 }
7060 DemandedElts.setBits(Cnt, Cnt + VF);
7061 continue;
7062 }
7063 // If reordering is needed, consider it as a high-cost masked gather for now.
7064 if ((LS == LoadsState::Vectorize ||
7067 !Order.empty() && !isReverseOrder(Order))
7069 States.push_back(LS);
7070 }
7071 if (DemandedElts.isAllOnes())
7072 // All loads gathered - try smaller VF.
7073 continue;
7074 // Can be vectorized later as a series of loads/insertelements.
7075 InstructionCost VecLdCost = 0;
7076 if (!DemandedElts.isZero()) {
7077 VecLdCost = getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7078 /*Insert=*/true,
7079 /*Extract=*/false, CostKind) +
7080 ScalarGEPCost;
7081 for (unsigned Idx : seq<unsigned>(VL.size()))
7082 if (DemandedElts[Idx])
7083 VecLdCost +=
7084 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
7085 }
7086 auto *SubVecTy = getWidenedType(ScalarTy, VF);
7087 for (auto [I, LS] : enumerate(States)) {
7088 auto *LI0 = cast<LoadInst>(VL[I * VF]);
7089 InstructionCost VectorGEPCost =
7090 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
7091 ? 0
7092 : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
7093 LI0->getPointerOperand(),
7094 Instruction::GetElementPtr, CostKind, ScalarTy,
7095 SubVecTy)
7096 .second;
7097 if (LS == LoadsState::ScatterVectorize) {
7098 if (static_cast<unsigned>(
7099 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
7100 PointerOps.size() - 1 ||
7101 any_of(PointerOps, [&](Value *V) {
7102 return getUnderlyingObject(V) !=
7103 getUnderlyingObject(PointerOps.front());
7104 }))
7105 VectorGEPCost += getScalarizationOverhead(
7106 TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
7107 /*Insert=*/true, /*Extract=*/false, CostKind);
7108 else
7109 VectorGEPCost +=
7111 TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(VF, 0),
7112 /*Insert=*/true, /*Extract=*/false, CostKind) +
7113 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
7114 CostKind);
7115 }
7116 switch (LS) {
7118 VecLdCost +=
7119 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7120 LI0->getPointerAddressSpace(), CostKind,
7122 VectorGEPCost;
7123 break;
7125 VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
7126 LI0->getPointerOperand(),
7127 /*VariableMask=*/false,
7128 CommonAlignment, CostKind) +
7129 VectorGEPCost;
7130 break;
7132 VecLdCost += TTI.getMaskedMemoryOpCost(
7133 Instruction::Load, SubVecTy, CommonAlignment,
7134 LI0->getPointerAddressSpace(), CostKind) +
7135 VectorGEPCost +
7137 {}, CostKind);
7138 break;
7140 VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
7141 LI0->getPointerOperand(),
7142 /*VariableMask=*/false,
7143 CommonAlignment, CostKind) +
7144 VectorGEPCost;
7145 break;
7146 case LoadsState::Gather:
7147 // Gathers are already calculated - ignore.
7148 continue;
7149 }
7150 SmallVector<int> ShuffleMask(VL.size());
7151 for (int Idx : seq<int>(0, VL.size()))
7152 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
7153 if (I > 0)
7154 VecLdCost +=
7155 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
7156 CostKind, I * VF, SubVecTy);
7157 }
7158 // If masked gather cost is higher - better to vectorize, so
7159 // consider it as a gather node. It will be better estimated
7160 // later.
7161 if (MaskedGatherCost >= VecLdCost &&
7162 VecLdCost - GatherCost < -SLPCostThreshold) {
7163 if (BestVF)
7164 *BestVF = VF;
7165 return true;
7166 }
7167 }
7168 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7169 };
7170 // TODO: improve the analysis of the pointers; if not all of them are GEPs
7171 // or they have > 2 operands, we end up with a gather node, which just
7172 // increases the cost.
7173 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
7174 bool ProfitableGatherPointers =
7175 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
7176 return L->isLoopInvariant(V);
7177 })) <= Sz / 2;
7178 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
7180 return (!GEP && doesNotNeedToBeScheduled(P)) ||
7181 (GEP && GEP->getNumOperands() == 2 &&
7182 isa<Constant, Instruction>(GEP->getOperand(1)));
7183 })) {
7184 // Check if potential masked gather can be represented as series
7185 // of loads + insertsubvectors.
7186 // If masked gather cost is higher - better to vectorize, so
7187 // consider it as a gather node. It will be better estimated
7188 // later.
7189 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7190 ProfitableGatherPointers))
7192 }
7193
7194 return LoadsState::Gather;
7195}
7196
7198 ArrayRef<BasicBlock *> BBs, Type *ElemTy,
7199 const DataLayout &DL, ScalarEvolution &SE,
7200 SmallVectorImpl<unsigned> &SortedIndices) {
7201 assert(
7202 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
7203 "Expected list of pointer operands.");
7204 // Map from bases to vectors of (Ptr, Offset, OrigIdx). Each Ptr is inserted
7205 // into the vector for its base; the vectors are then sorted and the sorted
7206 // indices are returned with related values next to one another.
7208 std::pair<BasicBlock *, Value *>,
7210 Bases;
7211 Bases
7212 .try_emplace(std::make_pair(
7214 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
7215
7216 SortedIndices.clear();
7217 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
7218 auto Key = std::make_pair(BBs[Cnt + 1],
7220 bool Found = any_of(Bases.try_emplace(Key).first->second,
7221 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
7222 std::optional<int64_t> Diff =
7223 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7224 ElemTy, Ptr, DL, SE,
7225 /*StrictCheck=*/true);
7226 if (!Diff)
7227 return false;
7228
7229 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7230 return true;
7231 });
7232
7233 if (!Found) {
7234 // If we haven't found enough to usefully cluster, return early.
7235 if (Bases.size() > VL.size() / 2 - 1)
7236 return false;
7237
7238 // Not found already - add a new Base
7239 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
7240 }
7241 }
7242
7243 if (Bases.size() == VL.size())
7244 return false;
7245
7246 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7247 Bases.front().second.size() == VL.size()))
7248 return false;
7249
7250 // For each of the bases, sort the pointers by Offset and check if the
7251 // pointers within each base become consecutively allocated.
7252 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
7253 SmallPtrSet<Value *, 13> FirstPointers;
7254 SmallPtrSet<Value *, 13> SecondPointers;
7255 Value *P1 = Ptr1;
7256 Value *P2 = Ptr2;
7257 unsigned Depth = 0;
7258 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
7259 if (P1 == P2 || Depth > RecursionMaxDepth)
7260 return false;
7261 FirstPointers.insert(P1);
7262 SecondPointers.insert(P2);
7263 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
7264 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
7265 ++Depth;
7266 }
7267 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
7268 "Unable to find matching root.");
7269 return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
7270 };
7271 for (auto &Base : Bases) {
7272 for (auto &Vec : Base.second) {
7273 if (Vec.size() > 1) {
7275 int64_t InitialOffset = std::get<1>(Vec[0]);
7276 bool AnyConsecutive =
7277 all_of(enumerate(Vec), [InitialOffset](const auto &P) {
7278 return std::get<1>(P.value()) ==
7279 int64_t(P.index()) + InitialOffset;
7280 });
7281 // Fill SortedIndices array only if it looks worth-while to sort the
7282 // ptrs.
7283 if (!AnyConsecutive)
7284 return false;
7285 }
7286 }
7287 stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
7288 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7289 });
7290 }
7291
7292 for (auto &T : Bases)
7293 for (const auto &Vec : T.second)
7294 for (const auto &P : Vec)
7295 SortedIndices.push_back(std::get<2>(P));
7296
7297 assert(SortedIndices.size() == VL.size() &&
7298 "Expected SortedIndices to be the size of VL");
7299 return true;
7300}
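// Illustration: for pointers {a, b, a+1, b+1, a+2, b+2} with two distinct
// underlying objects a and b, the function forms two bases with consecutive
// offsets and may return SortedIndices = {0, 2, 4, 1, 3, 5}, grouping the
// accesses to each object together (the relative order of the two groups
// depends on the pointer-chain comparison above).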
7301
7302std::optional<BoUpSLP::OrdersType>
7303BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
7304 assert(TE.isGather() && "Expected gather node only.");
7305 Type *ScalarTy = TE.Scalars[0]->getType();
7306
7308 Ptrs.reserve(TE.Scalars.size());
7310 BBs.reserve(TE.Scalars.size());
7311 for (Value *V : TE.Scalars) {
7312 auto *L = dyn_cast<LoadInst>(V);
7313 if (!L || !L->isSimple())
7314 return std::nullopt;
7315 Ptrs.push_back(L->getPointerOperand());
7316 BBs.push_back(L->getParent());
7317 }
7318
7319 BoUpSLP::OrdersType Order;
7320 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7321 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
7322 return std::move(Order);
7323 return std::nullopt;
7324}
7325
7326/// Check if two insertelement instructions are from the same buildvector.
7329 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
7330 // Instructions must be from the same basic block.
7331 if (VU->getParent() != V->getParent())
7332 return false;
7333 // Checks if 2 insertelements are from the same buildvector.
7334 if (VU->getType() != V->getType())
7335 return false;
7336 // Multiple used inserts are separate nodes.
7337 if (!VU->hasOneUse() && !V->hasOneUse())
7338 return false;
7339 auto *IE1 = VU;
7340 auto *IE2 = V;
7341 std::optional<unsigned> Idx1 = getElementIndex(IE1);
7342 std::optional<unsigned> Idx2 = getElementIndex(IE2);
7343 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7344 return false;
7345 // Go through the vector operand of insertelement instructions trying to find
7346 // either VU as the original vector for IE2 or V as the original vector for
7347 // IE1.
7348 SmallBitVector ReusedIdx(
7349 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
7350 bool IsReusedIdx = false;
7351 do {
7352 if (IE2 == VU && !IE1)
7353 return VU->hasOneUse();
7354 if (IE1 == V && !IE2)
7355 return V->hasOneUse();
7356 if (IE1 && IE1 != V) {
7357 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
7358 IsReusedIdx |= ReusedIdx.test(Idx1);
7359 ReusedIdx.set(Idx1);
7360 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
7361 IE1 = nullptr;
7362 else
7363 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
7364 }
7365 if (IE2 && IE2 != VU) {
7366 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
7367 IsReusedIdx |= ReusedIdx.test(Idx2);
7368 ReusedIdx.set(Idx2);
7369 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7370 IE2 = nullptr;
7371 else
7372 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
7373 }
7374 } while (!IsReusedIdx && (IE1 || IE2));
7375 return false;
7376}
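// Illustration: for the buildvector chain
//   %v0 = insertelement <2 x i32> poison, i32 %a, i32 0
//   %v1 = insertelement <2 x i32> %v0,    i32 %b, i32 1
// calling this helper with VU = %v1, V = %v0 and a GetBaseOperand that returns
// the vector operand walks the chain from %v1 down to %v0 and returns true
// (provided %v0 has a single use), i.e. the two inserts belong to the same
// buildvector.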
7377
7378/// Checks if the specified instruction \p I is an alternate operation for
7379/// the given \p MainOp and \p AltOp instructions.
7380static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
7381 Instruction *AltOp,
7382 const TargetLibraryInfo &TLI);
7383
7384std::optional<BoUpSLP::OrdersType>
7385BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
7386 bool IgnoreReorder) {
7387 // No need to reorder if we need to shuffle reuses; the node still needs to
7388 // be shuffled.
7389 if (!TE.ReuseShuffleIndices.empty()) {
7390 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
7391 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7392 "Reshuffling scalars not yet supported for nodes with padding");
7393
7394 if (isSplat(TE.Scalars))
7395 return std::nullopt;
7396 // Check if reuse shuffle indices can be improved by reordering.
7397 // For this, check that the reuse mask is "clustered", i.e. each scalar value
7398 // is used once in each submask of size <number_of_scalars>.
7399 // Example: 4 scalar values.
7400 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
7401 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
7402 // element 3 is used twice in the second submask.
7403 unsigned Sz = TE.Scalars.size();
7404 if (TE.isGather()) {
7405 if (std::optional<OrdersType> CurrentOrder =
7406 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
7407 SmallVector<int> Mask;
7408 fixupOrderingIndices(*CurrentOrder);
7409 inversePermutation(*CurrentOrder, Mask);
7410 ::addMask(Mask, TE.ReuseShuffleIndices);
7411 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7412 unsigned Sz = TE.Scalars.size();
7413 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7414 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
7415 if (Idx != PoisonMaskElem)
7416 Res[Idx + K * Sz] = I + K * Sz;
7417 }
7418 return std::move(Res);
7419 }
7420 }
7421 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7422 ::getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(),
7423 2 * TE.getVectorFactor())) == 1)
7424 return std::nullopt;
7425 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7426 return std::nullopt;
7427 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
7428 Sz)) {
7429 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7430 if (TE.ReorderIndices.empty())
7431 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7432 else
7433 inversePermutation(TE.ReorderIndices, ReorderMask);
7434 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
7435 unsigned VF = ReorderMask.size();
7436 OrdersType ResOrder(VF, VF);
7437 unsigned NumParts = divideCeil(VF, Sz);
7438 SmallBitVector UsedVals(NumParts);
7439 for (unsigned I = 0; I < VF; I += Sz) {
7440 int Val = PoisonMaskElem;
7441 unsigned UndefCnt = 0;
7442 unsigned Limit = std::min(Sz, VF - I);
7443 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
7444 [&](int Idx) {
7445 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
7446 Val = Idx;
7447 if (Idx == PoisonMaskElem)
7448 ++UndefCnt;
7449 return Idx != PoisonMaskElem && Idx != Val;
7450 }) ||
7451 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
7452 UndefCnt > Sz / 2)
7453 return std::nullopt;
7454 UsedVals.set(Val);
7455 for (unsigned K = 0; K < NumParts; ++K) {
7456 unsigned Idx = Val + Sz * K;
7457 if (Idx < VF && I + K < VF)
7458 ResOrder[Idx] = I + K;
7459 }
7460 }
7461 return std::move(ResOrder);
7462 }
7463 unsigned VF = TE.getVectorFactor();
7464 // Try to build the correct order for extractelement instructions.
7465 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
7466 TE.ReuseShuffleIndices.end());
7467 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
7468 all_of(TE.Scalars, [Sz](Value *V) {
7469 if (isa<PoisonValue>(V))
7470 return true;
7471 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
7472 return Idx && *Idx < Sz;
7473 })) {
7474 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
7475 "by BinaryOperator and CastInst.");
7476 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7477 if (TE.ReorderIndices.empty())
7478 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7479 else
7480 inversePermutation(TE.ReorderIndices, ReorderMask);
7481 for (unsigned I = 0; I < VF; ++I) {
7482 int &Idx = ReusedMask[I];
7483 if (Idx == PoisonMaskElem)
7484 continue;
7485 Value *V = TE.Scalars[ReorderMask[Idx]];
7486 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
7487 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
7488 }
7489 }
7490 // Build the order of VF size; the reuses shuffles need to be reordered, as
7491 // they are always of VF size.
7492 OrdersType ResOrder(VF);
7493 std::iota(ResOrder.begin(), ResOrder.end(), 0);
7494 auto *It = ResOrder.begin();
7495 for (unsigned K = 0; K < VF; K += Sz) {
7496 OrdersType CurrentOrder(TE.ReorderIndices);
7497 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
7498 if (SubMask.front() == PoisonMaskElem)
7499 std::iota(SubMask.begin(), SubMask.end(), 0);
7500 reorderOrder(CurrentOrder, SubMask);
7501 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
7502 std::advance(It, Sz);
7503 }
7504 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
7505 return Data.index() == Data.value();
7506 }))
7507 return std::nullopt; // No need to reorder.
7508 return std::move(ResOrder);
7509 }
7510 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
7511 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
7512 !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
7513 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
7514 return std::nullopt;
7515 if (TE.State == TreeEntry::SplitVectorize ||
7516 ((TE.State == TreeEntry::Vectorize ||
7517 TE.State == TreeEntry::StridedVectorize ||
7518 TE.State == TreeEntry::CompressVectorize) &&
7520 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
7521 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
7522 "Alternate instructions are only supported by "
7523 "BinaryOperator and CastInst.");
7524 return TE.ReorderIndices;
7525 }
7526 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
7527 TE.isAltShuffle()) {
7528 assert(TE.ReuseShuffleIndices.empty() &&
7529 "ReuseShuffleIndices should be "
7530 "empty for alternate instructions.");
7531 SmallVector<int> Mask;
7532 TE.buildAltOpShuffleMask(
7533 [&](Instruction *I) {
7534 assert(TE.getMatchingMainOpOrAltOp(I) &&
7535 "Unexpected main/alternate opcode");
7536 return isAlternateInstruction(I, TE.getMainOp(), TE.getAltOp(), *TLI);
7537 },
7538 Mask);
7539 const int VF = TE.getVectorFactor();
7540 OrdersType ResOrder(VF, VF);
7541 for (unsigned I : seq<unsigned>(VF)) {
7542 if (Mask[I] == PoisonMaskElem)
7543 continue;
7544 ResOrder[Mask[I] % VF] = I;
7545 }
7546 return std::move(ResOrder);
7547 }
7548 if (!TE.ReorderIndices.empty())
7549 return TE.ReorderIndices;
7550 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
7551 if (!TE.ReorderIndices.empty())
7552 return TE.ReorderIndices;
7553
7554 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
7555 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
7556 if (isa<Constant>(V) || !V->hasNUsesOrMore(1))
7557 continue;
7558 auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
7559 if (!II)
7560 continue;
7561 Instruction *BVHead = nullptr;
7562 BasicBlock *BB = II->getParent();
7563 while (II && II->hasOneUse() && II->getParent() == BB) {
7564 BVHead = II;
7565 II = dyn_cast<InsertElementInst>(II->getOperand(0));
7566 }
7567 I = BVHead;
7568 }
7569
7570 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
7571 assert(BB1 != BB2 && "Expected different basic blocks.");
7572 if (!DT->isReachableFromEntry(BB1))
7573 return false;
7574 if (!DT->isReachableFromEntry(BB2))
7575 return true;
7576 auto *NodeA = DT->getNode(BB1);
7577 auto *NodeB = DT->getNode(BB2);
7578 assert(NodeA && "Should only process reachable instructions");
7579 assert(NodeB && "Should only process reachable instructions");
7580 assert((NodeA == NodeB) ==
7581 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
7582 "Different nodes should have different DFS numbers");
7583 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
7584 };
7585 auto PHICompare = [&](unsigned I1, unsigned I2) {
7586 Value *V1 = TE.Scalars[I1];
7587 Value *V2 = TE.Scalars[I2];
7588 if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
7589 return false;
7590 if (isa<PoisonValue>(V1))
7591 return true;
7592 if (isa<PoisonValue>(V2))
7593 return false;
7594 if (V1->getNumUses() < V2->getNumUses())
7595 return true;
7596 if (V1->getNumUses() > V2->getNumUses())
7597 return false;
7598 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
7599 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
7600 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
7601 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
7602 FirstUserOfPhi2->getParent());
7603 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
7604 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
7605 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
7606 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
7607 if (IE1 && !IE2)
7608 return true;
7609 if (!IE1 && IE2)
7610 return false;
7611 if (IE1 && IE2) {
7612 if (UserBVHead[I1] && !UserBVHead[I2])
7613 return true;
7614 if (!UserBVHead[I1])
7615 return false;
7616 if (UserBVHead[I1] == UserBVHead[I2])
7617 return getElementIndex(IE1) < getElementIndex(IE2);
7618 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
7619 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
7620 UserBVHead[I2]->getParent());
7621 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
7622 }
7623 if (EE1 && !EE2)
7624 return true;
7625 if (!EE1 && EE2)
7626 return false;
7627 if (EE1 && EE2) {
7628 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
7629 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
7630 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
7631 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
7632 if (!Inst2 && !P2)
7633 return Inst1 || P1;
7634 if (EE1->getOperand(0) == EE2->getOperand(0))
7635 return getElementIndex(EE1) < getElementIndex(EE2);
7636 if (!Inst1 && Inst2)
7637 return false;
7638 if (Inst1 && Inst2) {
7639 if (Inst1->getParent() != Inst2->getParent())
7640 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
7641 return Inst1->comesBefore(Inst2);
7642 }
7643 if (!P1 && P2)
7644 return false;
7645 assert(P1 && P2 &&
7646 "Expected either instructions or arguments vector operands.");
7647 return P1->getArgNo() < P2->getArgNo();
7648 }
7649 return false;
7650 };
7651 OrdersType Phis(TE.Scalars.size());
7652 std::iota(Phis.begin(), Phis.end(), 0);
7653 stable_sort(Phis, PHICompare);
7654 if (isIdentityOrder(Phis))
7655 return std::nullopt; // No need to reorder.
7656 return std::move(Phis);
7657 }
7658 if (TE.isGather() &&
7659 (!TE.hasState() || !TE.isAltShuffle() ||
7660 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
7661 allSameType(TE.Scalars)) {
7662 // TODO: add analysis of other gather nodes with extractelement
7663 // instructions and other values/instructions, not only undefs.
7664 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
7665 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
7666 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
7667 all_of(TE.Scalars, [](Value *V) {
7668 auto *EE = dyn_cast<ExtractElementInst>(V);
7669 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
7670 })) {
7671 // Check that gather of extractelements can be represented as
7672 // just a shuffle of a single vector.
7673 OrdersType CurrentOrder;
7674 bool Reuse =
7675 canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
7676 if (Reuse || !CurrentOrder.empty())
7677 return std::move(CurrentOrder);
7678 }
7679 // If the gather node is <undef, v, .., poison> and
7680 // insertelement poison, v, 0 [+ permute]
7681 // is cheaper than
7682 // insertelement poison, v, n - try to reorder.
7683 // If rotating the whole graph, exclude the permute cost, the whole graph
7684 // might be transformed.
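 // Illustrative example (hypothetical lanes): for a gather {poison, poison,
 // v, poison} with Sz == 4, the code below compares the cost of
 // "insertelement poison, v, 0" plus (unless the whole graph is rotated) a
 // single-source permute against "insertelement poison, v, 2"; if the former
 // sum is cheaper, the returned order sets Order[2] == 0 and leaves the
 // remaining slots unset (== Sz).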
7685 int Sz = TE.Scalars.size();
7686 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
7687 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
7688 const auto *It = find_if_not(TE.Scalars, isConstant);
7689 if (It == TE.Scalars.begin())
7690 return OrdersType();
7691 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
7692 if (It != TE.Scalars.end()) {
7693 OrdersType Order(Sz, Sz);
7694 unsigned Idx = std::distance(TE.Scalars.begin(), It);
7695 Order[Idx] = 0;
7696 fixupOrderingIndices(Order);
7697 SmallVector<int> Mask;
7698 inversePermutation(Order, Mask);
7699 InstructionCost PermuteCost =
7700 TopToBottom
7701 ? 0
7702 : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
7703 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
7704 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
7705 PoisonValue::get(Ty), *It);
7706 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
7707 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
7708 PoisonValue::get(Ty), *It);
7709 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
7710 OrdersType Order(Sz, Sz);
7711 Order[Idx] = 0;
7712 return std::move(Order);
7713 }
7714 }
7715 }
7716 if (isSplat(TE.Scalars))
7717 return std::nullopt;
7718 if (TE.Scalars.size() >= 3)
7719 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
7720 return Order;
7721 // Check if we can include the order of vectorized loads. For masked gathers,
7722 // do the extra analysis later, so include such nodes in a special list.
7723 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
7724 SmallVector<Value *> PointerOps;
7725 StridedPtrInfo SPtrInfo;
7726 OrdersType CurrentOrder;
7727 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
7728 CurrentOrder, PointerOps, SPtrInfo);
7729 if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
7730 Res == LoadsState::CompressVectorize)
7731 return std::move(CurrentOrder);
7732 }
7733 // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
7734 // has been audited for correctness with non-power-of-two vectors.
7735 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
7736 if (std::optional<OrdersType> CurrentOrder =
7737 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
7738 return CurrentOrder;
7739 }
7740 return std::nullopt;
7741}
7742
7743/// Checks if the given mask is a "clustered" mask with the same clusters of
7744/// size \p Sz, which are not identity submasks.
7745static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
7746 unsigned Sz) {
7747 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
7748 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
7749 return false;
7750 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
7751 ArrayRef<int> Cluster = Mask.slice(I, Sz);
7752 if (Cluster != FirstCluster)
7753 return false;
7754 }
7755 return true;
7756}
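// Worked example (illustrative): with Sz == 4, the mask
// {1, 0, 3, 2, 1, 0, 3, 2} is accepted - every 4-wide cluster equals the
// first one and that cluster is not an identity submask - whereas
// {0, 1, 2, 3, 0, 1, 2, 3} is rejected because its first cluster is the
// identity.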
7757
7758void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
7759 // Reorder reuses mask.
7760 reorderReuses(TE.ReuseShuffleIndices, Mask);
7761 const unsigned Sz = TE.Scalars.size();
7762 // For vectorized nodes and non-clustered reuses, no need to do anything else.
7763 if (!TE.isGather() ||
7764 !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
7765 Sz) ||
7766 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
7767 return;
7768 SmallVector<int> NewMask;
7769 inversePermutation(TE.ReorderIndices, NewMask);
7770 addMask(NewMask, TE.ReuseShuffleIndices);
7771 // Clear reorder since it is going to be applied to the new mask.
7772 TE.ReorderIndices.clear();
7773 // Try to improve gathered nodes with clustered reuses, if possible.
7774 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
7775 SmallVector<unsigned> NewOrder(Slice);
7776 inversePermutation(NewOrder, NewMask);
7777 reorderScalars(TE.Scalars, NewMask);
7778 // Fill the reuses mask with the identity submasks.
7779 for (auto *It = TE.ReuseShuffleIndices.begin(),
7780 *End = TE.ReuseShuffleIndices.end();
7781 It != End; std::advance(It, Sz))
7782 std::iota(It, std::next(It, Sz), 0);
7783}
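// Illustrative trace (hypothetical values): assuming ReorderIndices is empty
// and the already-reordered reuse mask is {1, 0, 3, 2, 1, 0, 3, 2}, the first
// Sz elements {1, 0, 3, 2} become the new gather order, the scalars
// {a, b, c, d} are rewritten as {b, a, d, c}, and the reuse mask is refilled
// with identity clusters {0, 1, 2, 3, 0, 1, 2, 3} by the loop above.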
7784
7785static void combineOrders(MutableArrayRef<unsigned> Order,
7786 ArrayRef<unsigned> SecondaryOrder) {
7787 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
7788 "Expected same size of orders");
7789 size_t Sz = Order.size();
7790 SmallBitVector UsedIndices(Sz);
7791 for (unsigned Idx : seq<unsigned>(0, Sz)) {
7792 if (Order[Idx] != Sz)
7793 UsedIndices.set(Order[Idx]);
7794 }
7795 if (SecondaryOrder.empty()) {
7796 for (unsigned Idx : seq<unsigned>(0, Sz))
7797 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
7798 Order[Idx] = Idx;
7799 } else {
7800 for (unsigned Idx : seq<unsigned>(0, Sz))
7801 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
7802 !UsedIndices.test(SecondaryOrder[Idx]))
7803 Order[Idx] = SecondaryOrder[Idx];
7804 }
7805}
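// Worked example (hypothetical values): with Sz == 4, Order == {2, 4, 1, 4}
// (4 meaning "unset") and an empty SecondaryOrder, slot 1 stays unset because
// index 1 is already used elsewhere in Order, while slot 3 is filled with its
// own index, giving {2, 4, 1, 3}.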
7806
7809 return false;
7810
7811 constexpr unsigned TinyVF = 2;
7812 constexpr unsigned TinyTree = 10;
7813 constexpr unsigned PhiOpsLimit = 12;
7814 constexpr unsigned GatherLoadsLimit = 2;
7815 if (VectorizableTree.size() <= TinyTree)
7816 return true;
7817 if (VectorizableTree.front()->hasState() &&
7818 !VectorizableTree.front()->isGather() &&
7819 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
7820 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
7821 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
7822 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
7823 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
7824 VectorizableTree.front()->ReorderIndices.empty()) {
7825 // Check if the tree has only a single store and a single (unordered) load
7826 // node, while the other nodes are phis or geps/binops combined with phis,
7827 // and/or a single gather load node.
7828 if (VectorizableTree.front()->hasState() &&
7829 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
7830 VectorizableTree.front()->Scalars.size() == TinyVF &&
7831 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
7832 return false;
7833 // A single node which requires reordering - skip.
7834 if (VectorizableTree.front()->hasState() &&
7835 VectorizableTree.front()->getOpcode() == Instruction::Store &&
7836 VectorizableTree.front()->ReorderIndices.empty()) {
7837 const unsigned ReorderedSplitsCnt =
7838 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
7839 return TE->State == TreeEntry::SplitVectorize &&
7840 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
7841 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
7842 ::isCommutative(TE->UserTreeIndex.UserTE->getMainOp());
7843 });
7844 if (ReorderedSplitsCnt <= 1 &&
7845 static_cast<unsigned>(count_if(
7846 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
7847 return ((!TE->isGather() &&
7848 (TE->ReorderIndices.empty() ||
7849 (TE->UserTreeIndex.UserTE &&
7850 TE->UserTreeIndex.UserTE->State ==
7851 TreeEntry::Vectorize &&
7852 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
7853 .empty()))) ||
7854 (TE->isGather() && TE->ReorderIndices.empty() &&
7855 (!TE->hasState() || TE->isAltShuffle() ||
7856 TE->getOpcode() == Instruction::Load ||
7857 TE->getOpcode() == Instruction::ZExt ||
7858 TE->getOpcode() == Instruction::SExt))) &&
7859 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
7860 !TE->isGather() || none_of(TE->Scalars, [&](Value *V) {
7861 return !isConstant(V) && isVectorized(V);
7862 }));
7863 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
7864 return false;
7865 }
7866 bool HasPhis = false;
7867 bool HasLoad = true;
7868 unsigned GatherLoads = 0;
7869 for (const std::unique_ptr<TreeEntry> &TE :
7870 ArrayRef(VectorizableTree).drop_front()) {
7871 if (TE->State == TreeEntry::SplitVectorize)
7872 continue;
7873 if (!TE->hasState()) {
7874 if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
7876 continue;
7877 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7879 continue;
7880 return true;
7881 }
7882 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
7883 if (!TE->isGather()) {
7884 HasLoad = false;
7885 continue;
7886 }
7887 if (HasLoad)
7888 return true;
7889 ++GatherLoads;
7890 if (GatherLoads >= GatherLoadsLimit)
7891 return true;
7892 }
7893 if (TE->getOpcode() == Instruction::GetElementPtr ||
7894 Instruction::isBinaryOp(TE->getOpcode()))
7895 continue;
7896 if (TE->getOpcode() != Instruction::PHI &&
7897 (!TE->hasCopyableElements() ||
7898 static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) <
7899 TE->Scalars.size() / 2))
7900 return true;
7901 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7902 TE->getNumOperands() > PhiOpsLimit)
7903 return false;
7904 HasPhis = true;
7905 }
7906 return !HasPhis;
7907 }
7908 return true;
7909}
7910
7911void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
7912 ArrayRef<int> MaskOrder) {
7913 assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
7914 SmallVector<int> NewMask(getVectorFactor());
7915 SmallVector<int> NewMaskOrder(getVectorFactor());
7916 std::iota(NewMask.begin(), NewMask.end(), 0);
7917 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
7918 if (Idx == 0) {
7919 copy(Mask, NewMask.begin());
7920 copy(MaskOrder, NewMaskOrder.begin());
7921 } else {
7922 assert(Idx == 1 && "Expected either 0 or 1 index.");
7923 unsigned Offset = CombinedEntriesWithIndices.back().second;
7924 for (unsigned I : seq<unsigned>(Mask.size())) {
7925 NewMask[I + Offset] = Mask[I] + Offset;
7926 NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
7927 }
7928 }
7929 reorderScalars(Scalars, NewMask);
7930 reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true);
7931 if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(ReorderIndices))
7932 ReorderIndices.clear();
7933}
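// Illustrative example (hypothetical values): for a split node with
// getVectorFactor() == 8 whose second combined entry starts at Offset == 4,
// reordering operand Idx == 1 with Mask == {1, 0, 3, 2} only touches
// positions 4..7, producing NewMask == {0, 1, 2, 3, 5, 4, 7, 6}.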
7934
7935void BoUpSLP::reorderTopToBottom() {
7936 // Maps VF to the graph nodes.
7937 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
7938 // ExtractElement gather nodes which can be vectorized and need to handle
7939 // their ordering.
7940 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
7941
7942 // Phi nodes can have preferred ordering based on their result users
7943 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
7944
7945 // AltShuffles can also have a preferred ordering that leads to fewer
7946 // instructions, e.g., the addsub instruction in x86.
7947 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
7948
7949 // Maps a TreeEntry to the reorder indices of external users.
7950 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
7951 ExternalUserReorderMap;
7952 // Find all reorderable nodes with the given VF.
7953 // Currently these are vectorized stores, loads, extracts + some gathering
7954 // of extracts.
7955 for_each(VectorizableTree, [&, &TTIRef = *TTI](
7956 const std::unique_ptr<TreeEntry> &TE) {
7957 // Look for external users that will probably be vectorized.
7958 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
7959 findExternalStoreUsersReorderIndices(TE.get());
7960 if (!ExternalUserReorderIndices.empty()) {
7961 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
7962 ExternalUserReorderMap.try_emplace(TE.get(),
7963 std::move(ExternalUserReorderIndices));
7964 }
7965
7966 // Patterns like [fadd,fsub] can be combined into a single instruction in
7967 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
7968 // to take into account their order when looking for the most used order.
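 // For illustration: a 4-wide node with lane opcodes {fadd, fsub, fadd, fsub}
 // that the target reports as a legal alternate-instruction pattern is
 // registered below with an empty (natural) preferred order, so the order
 // voting later in this function does not rotate its lanes and break the
 // pattern.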
7969 if (TE->hasState() && TE->isAltShuffle() &&
7970 TE->State != TreeEntry::SplitVectorize) {
7971 Type *ScalarTy = TE->Scalars[0]->getType();
7972 VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size());
7973 unsigned Opcode0 = TE->getOpcode();
7974 unsigned Opcode1 = TE->getAltOpcode();
7975 SmallBitVector OpcodeMask(
7976 getAltInstrMask(TE->Scalars, ScalarTy, Opcode0, Opcode1));
7977 // If this pattern is supported by the target then we consider the order.
7978 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
7979 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
7980 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
7981 }
7982 // TODO: Check the reverse order too.
7983 }
7984
7985 bool IgnoreReorder =
7986 !UserIgnoreList && VectorizableTree.front()->hasState() &&
7987 (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
7988 VectorizableTree.front()->getOpcode() == Instruction::Store);
7989 if (std::optional<OrdersType> CurrentOrder =
7990 getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
7991 // Do not include ordering for nodes used in the alt opcode vectorization;
7992 // it is better to reorder them during the bottom-to-top stage. If we follow
7993 // the order here, it causes reordering of the whole graph, though actually
7994 // it is profitable just to reorder the subgraph that starts from the
7995 // alternate opcode vectorization node. Such nodes already end up with the
7996 // shuffle instruction, and it is enough to change this shuffle rather than
7997 // rotate the scalars for the whole graph.
7998 unsigned Cnt = 0;
7999 const TreeEntry *UserTE = TE.get();
8000 while (UserTE && Cnt < RecursionMaxDepth) {
8001 if (!UserTE->UserTreeIndex)
8002 break;
8003 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8004 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8005 UserTE->UserTreeIndex.UserTE->Idx != 0)
8006 return;
8007 UserTE = UserTE->UserTreeIndex.UserTE;
8008 ++Cnt;
8009 }
8010 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8011 if (!(TE->State == TreeEntry::Vectorize ||
8012 TE->State == TreeEntry::StridedVectorize ||
8013 TE->State == TreeEntry::SplitVectorize ||
8014 TE->State == TreeEntry::CompressVectorize) ||
8015 !TE->ReuseShuffleIndices.empty())
8016 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
8017 if (TE->State == TreeEntry::Vectorize &&
8018 TE->getOpcode() == Instruction::PHI)
8019 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
8020 }
8021 });
8022
8023 // Reorder the graph nodes according to their vectorization factor.
8024 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
8025 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
8026 auto It = VFToOrderedEntries.find(VF);
8027 if (It == VFToOrderedEntries.end())
8028 continue;
8029 // Try to find the most profitable order. We are just looking for the most
8030 // used order and reorder the scalar elements in the nodes according to this
8031 // most used order.
8032 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
8033 // Delete VF entry upon exit.
8034 auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });
8035
8036 // All operands are reordered and used only in this node - propagate the
8037 // most used order to the user node.
8038 MapVector<OrdersType, unsigned,
8039 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
8040 OrdersUses;
8041 for (const TreeEntry *OpTE : OrderedEntries) {
8042 // No need to reorder these nodes; we still need to extend and use a
8043 // shuffle, just merge the reordering shuffle and the reuse shuffle.
8044 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
8045 OpTE->State != TreeEntry::SplitVectorize)
8046 continue;
8047 // Count number of orders uses.
8048 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8049 &PhisToOrders]() -> const OrdersType & {
8050 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8051 auto It = GathersToOrders.find(OpTE);
8052 if (It != GathersToOrders.end())
8053 return It->second;
8054 }
8055 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8056 auto It = AltShufflesToOrders.find(OpTE);
8057 if (It != AltShufflesToOrders.end())
8058 return It->second;
8059 }
8060 if (OpTE->State == TreeEntry::Vectorize &&
8061 OpTE->getOpcode() == Instruction::PHI) {
8062 auto It = PhisToOrders.find(OpTE);
8063 if (It != PhisToOrders.end())
8064 return It->second;
8065 }
8066 return OpTE->ReorderIndices;
8067 }();
8068 // First consider the order of the external scalar users.
8069 auto It = ExternalUserReorderMap.find(OpTE);
8070 if (It != ExternalUserReorderMap.end()) {
8071 const auto &ExternalUserReorderIndices = It->second;
8072 // If the OpTE vector factor != number of scalars - use the natural order;
8073 // this is an attempt to reorder a node with reused scalars but with
8074 // external uses.
8075 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8076 OrdersUses.try_emplace(OrdersType(), 0).first->second +=
8077 ExternalUserReorderIndices.size();
8078 } else {
8079 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
8080 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
8081 }
8082 // No other useful reorder data in this entry.
8083 if (Order.empty())
8084 continue;
8085 }
8086 // Stores actually store the mask, not the order; we need to invert it.
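 // For illustration (hypothetical values, assuming the usual
 // inversePermutation() semantics where Mask[Order[I]] == I): a store node
 // whose recorded mask is {2, 0, 1} contributes the inverted order {1, 2, 0}
 // to the OrdersUses counting below.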
8087 if (OpTE->State == TreeEntry::Vectorize &&
8088 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8089 assert(!OpTE->isAltShuffle() &&
8090 "Alternate instructions are only supported by BinaryOperator "
8091 "and CastInst.");
8092 SmallVector<int> Mask;
8093 inversePermutation(Order, Mask);
8094 unsigned E = Order.size();
8095 OrdersType CurrentOrder(E, E);
8096 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8097 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8098 });
8099 fixupOrderingIndices(CurrentOrder);
8100 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
8101 } else {
8102 ++OrdersUses.try_emplace(Order, 0).first->second;
8103 }
8104 }
8105 if (OrdersUses.empty())
8106 continue;
8107 // Choose the most used order.
8108 unsigned IdentityCnt = 0;
8109 unsigned FilledIdentityCnt = 0;
8110 OrdersType IdentityOrder(VF, VF);
8111 for (auto &Pair : OrdersUses) {
8112 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8113 if (!Pair.first.empty())
8114 FilledIdentityCnt += Pair.second;
8115 IdentityCnt += Pair.second;
8116 combineOrders(IdentityOrder, Pair.first);
8117 }
8118 }
8119 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8120 unsigned Cnt = IdentityCnt;
8121 for (auto &Pair : OrdersUses) {
8122 // Prefer the identity order. But if a filled identity (non-empty order) is
8123 // found with the same number of uses as the new candidate order, we can
8124 // choose this candidate order.
8125 if (Cnt < Pair.second ||
8126 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
8127 Cnt == Pair.second && !BestOrder.empty() &&
8128 isIdentityOrder(BestOrder))) {
8129 combineOrders(Pair.first, BestOrder);
8130 BestOrder = Pair.first;
8131 Cnt = Pair.second;
8132 } else {
8133 combineOrders(BestOrder, Pair.first);
8134 }
8135 }
8136 // Set order of the user node.
8137 if (isIdentityOrder(BestOrder))
8138 continue;
8139 fixupOrderingIndices(BestOrder);
8140 SmallVector<int> Mask;
8141 inversePermutation(BestOrder, Mask);
8142 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8143 unsigned E = BestOrder.size();
8144 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8145 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8146 });
8147 // Do an actual reordering, if profitable.
8148 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8149 // Just do the reordering for the nodes with the given VF.
8150 if (TE->Scalars.size() != VF) {
8151 if (TE->ReuseShuffleIndices.size() == VF) {
8152 assert(TE->State != TreeEntry::SplitVectorize &&
8153 "Split vectorized not expected.");
8154 // Need to reorder the reuses masks of the operands with smaller VF to
8155 // be able to find the match between the graph nodes and scalar
8156 // operands of the given node during vectorization/cost estimation.
8157 assert(
8158 (!TE->UserTreeIndex ||
8159 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8160 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8161 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8162 "All users must be of VF size.");
8163 if (SLPReVec) {
8164 assert(SLPReVec && "Only supported by REVEC.");
8165 // ShuffleVectorInst does not do reorderOperands (and it should not
8166 // because ShuffleVectorInst supports only a limited set of
8167 // patterns). Only do reorderNodeWithReuses if the user is not
8168 // ShuffleVectorInst.
8169 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
8170 isa<ShuffleVectorInst>(TE->UserTreeIndex.UserTE->getMainOp()))
8171 continue;
8172 }
8173 // Update ordering of the operands with the smaller VF than the given
8174 // one.
8175 reorderNodeWithReuses(*TE, Mask);
8176 // Update orders in user split vectorize nodes.
8177 if (TE->UserTreeIndex &&
8178 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8179 TE->UserTreeIndex.UserTE->reorderSplitNode(
8180 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
8181 }
8182 continue;
8183 }
8184 if ((TE->State == TreeEntry::SplitVectorize &&
8185 TE->ReuseShuffleIndices.empty()) ||
8186 ((TE->State == TreeEntry::Vectorize ||
8187 TE->State == TreeEntry::StridedVectorize ||
8188 TE->State == TreeEntry::CompressVectorize) &&
8189 (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
8190 InsertElementInst>(TE->getMainOp()) ||
8191 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
8192 assert(
8193 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
8194 TE->ReuseShuffleIndices.empty())) &&
8195 "Alternate instructions are only supported by BinaryOperator "
8196 "and CastInst.");
8197 // Build correct orders for extract{element,value}, loads,
8198 // stores and alternate (split) nodes.
8199 reorderOrder(TE->ReorderIndices, Mask);
8200 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
8201 TE->reorderOperands(Mask);
8202 } else {
8203 // Reorder the node and its operands.
8204 TE->reorderOperands(Mask);
8205 assert(TE->ReorderIndices.empty() &&
8206 "Expected empty reorder sequence.");
8207 reorderScalars(TE->Scalars, Mask);
8208 }
8209 if (!TE->ReuseShuffleIndices.empty()) {
8210 // Apply reversed order to keep the original ordering of the reused
8211 // elements to avoid extra reorder indices shuffling.
8212 OrdersType CurrentOrder;
8213 reorderOrder(CurrentOrder, MaskOrder);
8214 SmallVector<int> NewReuses;
8215 inversePermutation(CurrentOrder, NewReuses);
8216 addMask(NewReuses, TE->ReuseShuffleIndices);
8217 TE->ReuseShuffleIndices.swap(NewReuses);
8218 } else if (TE->UserTreeIndex &&
8219 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8220 // Update orders in user split vectorize nodes.
8221 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
8222 Mask, MaskOrder);
8223 }
8224 }
8225}
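// By way of example (hypothetical counts): if, for VF == 4, three entries
// vote for the order {1, 0, 3, 2} and only one for the identity, BestOrder
// becomes {1, 0, 3, 2}; vectorized nodes of that VF then have their scalars
// or operands permuted by the corresponding mask, while smaller nodes feeding
// them through reuse masks are fixed up via reorderNodeWithReuses().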
8226
8227void BoUpSLP::buildReorderableOperands(
8228 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
8229 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
8230 SmallVectorImpl<TreeEntry *> &GatherOps) {
8231 for (unsigned I : seq<unsigned>(UserTE->getNumOperands())) {
8232 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
8233 return OpData.first == I &&
8234 (OpData.second->State == TreeEntry::Vectorize ||
8235 OpData.second->State == TreeEntry::StridedVectorize ||
8236 OpData.second->State == TreeEntry::CompressVectorize ||
8237 OpData.second->State == TreeEntry::SplitVectorize);
8238 }))
8239 continue;
8240 // Do not request operands if they do not exist.
8241 if (UserTE->hasState()) {
8242 if (UserTE->getOpcode() == Instruction::ExtractElement ||
8243 UserTE->getOpcode() == Instruction::ExtractValue)
8244 continue;
8245 if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
8246 continue;
8247 if (UserTE->getOpcode() == Instruction::Store &&
8248 UserTE->State == TreeEntry::Vectorize && I == 1)
8249 continue;
8250 if (UserTE->getOpcode() == Instruction::Load &&
8251 (UserTE->State == TreeEntry::Vectorize ||
8252 UserTE->State == TreeEntry::StridedVectorize ||
8253 UserTE->State == TreeEntry::CompressVectorize))
8254 continue;
8255 }
8256 TreeEntry *TE = getOperandEntry(UserTE, I);
8257 assert(TE && "Expected operand entry.");
8258 if (!TE->isGather()) {
8259 // Add the node to the list of the ordered nodes with the identity
8260 // order.
8261 Edges.emplace_back(I, TE);
8262 // Add ScatterVectorize nodes to the list of operands, where just
8263 // reordering of the scalars is required. Similar to the gathers, so
8264 // simply add to the list of gathered ops.
8265 // If there are reused scalars, process this node as a regular vectorize
8266 // node, just reorder reuses mask.
8267 if (TE->State == TreeEntry::ScatterVectorize &&
8268 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
8269 GatherOps.push_back(TE);
8270 continue;
8271 }
8272 if (ReorderableGathers.contains(TE))
8273 GatherOps.push_back(TE);
8274 }
8275}
8276
8277void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
8278 struct TreeEntryCompare {
8279 bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
8280 if (LHS->UserTreeIndex && RHS->UserTreeIndex)
8281 return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
8282 return LHS->Idx < RHS->Idx;
8283 }
8284 };
8285 PriorityQueue<TreeEntry *, SmallVector<TreeEntry *>, TreeEntryCompare> Queue;
8286 DenseSet<const TreeEntry *> GathersToOrders;
8287 // Find all reorderable leaf nodes with the given VF.
8288 // Currently these are vectorized loads, extracts without alternate operands
8289 // + some gathering of extracts.
8290 SmallPtrSet<const TreeEntry *, 4> NonVectorized;
8291 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8292 if (TE->State != TreeEntry::Vectorize &&
8293 TE->State != TreeEntry::StridedVectorize &&
8294 TE->State != TreeEntry::CompressVectorize &&
8295 TE->State != TreeEntry::SplitVectorize)
8296 NonVectorized.insert(TE.get());
8297 if (std::optional<OrdersType> CurrentOrder =
8298 getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) {
8299 Queue.push(TE.get());
8300 if (!(TE->State == TreeEntry::Vectorize ||
8301 TE->State == TreeEntry::StridedVectorize ||
8302 TE->State == TreeEntry::CompressVectorize ||
8303 TE->State == TreeEntry::SplitVectorize) ||
8304 !TE->ReuseShuffleIndices.empty())
8305 GathersToOrders.insert(TE.get());
8306 }
8307 }
8308
8309 // 1. Propagate order to the graph nodes, which use only reordered nodes.
8310 // I.e., if the node has operands that are reordered, try to make at least
8311 // one operand order in the natural order and reorder others + reorder the
8312 // user node itself.
8313 SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
8314 while (!Queue.empty()) {
8315 // 1. Filter out only reordered nodes.
8316 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
8317 TreeEntry *TE = Queue.top();
8318 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
8319 Queue.pop();
8320 SmallVector<TreeEntry *> OrderedOps(1, TE);
8321 while (!Queue.empty()) {
8322 TE = Queue.top();
8323 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
8324 break;
8325 Queue.pop();
8326 OrderedOps.push_back(TE);
8327 }
8328 for (TreeEntry *TE : OrderedOps) {
8329 if (!(TE->State == TreeEntry::Vectorize ||
8330 TE->State == TreeEntry::StridedVectorize ||
8331 TE->State == TreeEntry::CompressVectorize ||
8332 TE->State == TreeEntry::SplitVectorize ||
8333 (TE->isGather() && GathersToOrders.contains(TE))) ||
8334 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
8335 !Visited.insert(TE).second)
8336 continue;
8337 // Build a map between user nodes and their operands' order to speed up the
8338 // search. The graph currently does not provide this dependency directly.
8339 Users.first = TE->UserTreeIndex.UserTE;
8340 Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
8341 }
8342 if (Users.first) {
8343 auto &Data = Users;
8344 if (Data.first->State == TreeEntry::SplitVectorize) {
8345 assert(
8346 Data.second.size() <= 2 &&
8347 "Expected not greater than 2 operands for split vectorize node.");
8348 if (any_of(Data.second,
8349 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
8350 continue;
8351 // Update orders in user split vectorize nodes.
8352 assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
8353 "Expected exactly 2 entries.");
8354 for (const auto &P : Data.first->CombinedEntriesWithIndices) {
8355 TreeEntry &OpTE = *VectorizableTree[P.first];
8356 OrdersType Order = OpTE.ReorderIndices;
8357 if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
8358 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
8359 continue;
8360 const auto BestOrder =
8361 getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder);
8362 if (!BestOrder || BestOrder->empty() || isIdentityOrder(*BestOrder))
8363 continue;
8364 Order = *BestOrder;
8365 }
8366 fixupOrderingIndices(Order);
8367 SmallVector<int> Mask;
8368 inversePermutation(Order, Mask);
8369 const unsigned E = Order.size();
8370 SmallVector<int> MaskOrder(E, PoisonMaskElem);
8371 transform(Order, MaskOrder.begin(), [E](unsigned I) {
8372 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8373 });
8374 Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
8375 // Clear ordering of the operand.
8376 if (!OpTE.ReorderIndices.empty()) {
8377 OpTE.ReorderIndices.clear();
8378 } else if (!OpTE.ReuseShuffleIndices.empty()) {
8379 reorderReuses(OpTE.ReuseShuffleIndices, Mask);
8380 } else {
8381 assert(OpTE.isGather() && "Expected only gather/buildvector node.");
8382 reorderScalars(OpTE.Scalars, Mask);
8383 }
8384 }
8385 if (Data.first->ReuseShuffleIndices.empty() &&
8386 !Data.first->ReorderIndices.empty()) {
8387 // Insert user node to the list to try to sink reordering deeper in
8388 // the graph.
8389 Queue.push(Data.first);
8390 }
8391 continue;
8392 }
8393 // Check that operands are used only in the User node.
8394 SmallVector<TreeEntry *> GatherOps;
8395 buildReorderableOperands(Data.first, Data.second, NonVectorized,
8396 GatherOps);
8397 // All operands are reordered and used only in this node - propagate the
8398 // most used order to the user node.
8399 MapVector<OrdersType, unsigned,
8400 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
8401 OrdersUses;
8402 // Do the analysis for each tree entry only once, otherwise the order of
8403 // the same node may be considered several times, though it might not be
8404 // profitable.
8407 for (const auto &Op : Data.second) {
8408 TreeEntry *OpTE = Op.second;
8409 if (!VisitedOps.insert(OpTE).second)
8410 continue;
8411 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
8412 continue;
8413 const auto Order = [&]() -> const OrdersType {
8414 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
8415 return getReorderingData(*OpTE, /*TopToBottom=*/false,
8416 IgnoreReorder)
8417 .value_or(OrdersType(1));
8418 return OpTE->ReorderIndices;
8419 }();
8420 // The order is partially ordered, skip it in favor of fully non-ordered
8421 // orders.
8422 if (Order.size() == 1)
8423 continue;
8424
8425 // Check that the reordering does not increase number of shuffles, i.e.
8426 // same-values nodes have the same parents or their parents have the same parents.
8427 if (!Order.empty() && !isIdentityOrder(Order)) {
8428 Value *Root = OpTE->hasState()
8429 ? OpTE->getMainOp()
8430 : *find_if_not(OpTE->Scalars, isConstant);
8431 auto GetSameNodesUsers = [&](Value *Root) {
8432 SmallSetVector<TreeEntry *, 4> Res;
8433 for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
8434 if (TE != OpTE && TE->UserTreeIndex &&
8435 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8436 TE->Scalars.size() == OpTE->Scalars.size() &&
8437 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8438 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8439 Res.insert(TE->UserTreeIndex.UserTE);
8440 }
8441 for (const TreeEntry *TE : getTreeEntries(Root)) {
8442 if (TE != OpTE && TE->UserTreeIndex &&
8443 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8444 TE->Scalars.size() == OpTE->Scalars.size() &&
8445 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8446 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8447 Res.insert(TE->UserTreeIndex.UserTE);
8448 }
8449 return Res.takeVector();
8450 };
8451 auto GetNumOperands = [](const TreeEntry *TE) {
8452 if (TE->State == TreeEntry::SplitVectorize)
8453 return TE->getNumOperands();
8454 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8455 return CI->arg_size();
8456 return TE->getNumOperands();
8457 };
8458 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
8459 const TreeEntry *TE) {
8461 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8463 for (unsigned Idx : seq<unsigned>(GetNumOperands(TE))) {
8466 continue;
8467 const TreeEntry *Op = getOperandEntry(TE, Idx);
8468 if (Op->isGather() && Op->hasState()) {
8469 const TreeEntry *VecOp =
8470 getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
8471 if (VecOp)
8472 Op = VecOp;
8473 }
8474 if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
8475 return false;
8476 }
8477 return true;
8478 };
8479 SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
8480 if (!Users.empty() && !all_of(Users, [&](TreeEntry *UTE) {
8481 if (!RevisitedOps.insert(UTE).second)
8482 return false;
8483 return UTE == Data.first || !UTE->ReorderIndices.empty() ||
8484 !UTE->ReuseShuffleIndices.empty() ||
8485 (UTE->UserTreeIndex &&
8486 UTE->UserTreeIndex.UserTE == Data.first) ||
8487 (Data.first->UserTreeIndex &&
8488 Data.first->UserTreeIndex.UserTE == UTE) ||
8489 (IgnoreReorder && UTE->UserTreeIndex &&
8490 UTE->UserTreeIndex.UserTE->Idx == 0) ||
8491 NodeShouldBeReorderedWithOperands(UTE);
8492 }))
8493 continue;
8494 for (TreeEntry *UTE : Users) {
8496 if (auto *CI = dyn_cast<CallInst>(UTE->getMainOp()); CI)
8498 for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
8501 continue;
8502 const TreeEntry *Op = getOperandEntry(UTE, Idx);
8503 Visited.erase(Op);
8504 Queue.push(const_cast<TreeEntry *>(Op));
8505 }
8506 }
8507 }
8508 unsigned NumOps = count_if(
8509 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
8510 return P.second == OpTE;
8511 });
8512 // Stores actually store the mask, not the order, need to invert.
8513 if (OpTE->State == TreeEntry::Vectorize &&
8514 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8515 assert(!OpTE->isAltShuffle() &&
8516 "Alternate instructions are only supported by BinaryOperator "
8517 "and CastInst.");
8518 SmallVector<int> Mask;
8519 inversePermutation(Order, Mask);
8520 unsigned E = Order.size();
8521 OrdersType CurrentOrder(E, E);
8522 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8523 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8524 });
8525 fixupOrderingIndices(CurrentOrder);
8526 OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
8527 } else {
8528 OrdersUses.try_emplace(Order, 0).first->second += NumOps;
8529 }
8530 auto Res = OrdersUses.try_emplace(OrdersType(), 0);
8531 const auto AllowsReordering = [&](const TreeEntry *TE) {
8532 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
8533 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
8534 (IgnoreReorder && TE->Idx == 0))
8535 return true;
8536 if (TE->isGather()) {
8537 if (GathersToOrders.contains(TE))
8538 return !getReorderingData(*TE, /*TopToBottom=*/false,
8539 IgnoreReorder)
8540 .value_or(OrdersType(1))
8541 .empty();
8542 return true;
8543 }
8544 return false;
8545 };
8546 if (OpTE->UserTreeIndex) {
8547 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
8548 if (!VisitedUsers.insert(UserTE).second)
8549 continue;
8550 // May reorder user node if it requires reordering, has reused
8551 // scalars, is an alternate op vectorize node or its op nodes require
8552 // reordering.
8553 if (AllowsReordering(UserTE))
8554 continue;
8555 // Check if users allow reordering.
8556 // Currently look up just 1 level of operands to avoid increase of
8557 // the compile time.
8558 // Profitable to reorder if definitely more operands allow
8559 // reordering rather than those with natural order.
8561 if (static_cast<unsigned>(count_if(
8562 Ops, [UserTE, &AllowsReordering](
8563 const std::pair<unsigned, TreeEntry *> &Op) {
8564 return AllowsReordering(Op.second) &&
8565 Op.second->UserTreeIndex.UserTE == UserTE;
8566 })) <= Ops.size() / 2)
8567 ++Res.first->second;
8568 }
8569 }
8570 if (OrdersUses.empty()) {
8571 Visited.insert_range(llvm::make_second_range(Data.second));
8572 continue;
8573 }
8574 // Choose the most used order.
8575 unsigned IdentityCnt = 0;
8576 unsigned VF = Data.second.front().second->getVectorFactor();
8577 OrdersType IdentityOrder(VF, VF);
8578 for (auto &Pair : OrdersUses) {
8579 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8580 IdentityCnt += Pair.second;
8581 combineOrders(IdentityOrder, Pair.first);
8582 }
8583 }
8584 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8585 unsigned Cnt = IdentityCnt;
8586 for (auto &Pair : OrdersUses) {
8587 // Prefer the identity order. But if a filled identity (non-empty order) is
8588 // found with the same number of uses as the new candidate order, we can
8589 // choose this candidate order.
8590 if (Cnt < Pair.second) {
8591 combineOrders(Pair.first, BestOrder);
8592 BestOrder = Pair.first;
8593 Cnt = Pair.second;
8594 } else {
8595 combineOrders(BestOrder, Pair.first);
8596 }
8597 }
8598 // Set order of the user node.
8599 if (isIdentityOrder(BestOrder)) {
8600 Visited.insert_range(llvm::make_second_range(Data.second));
8601 continue;
8602 }
8603 fixupOrderingIndices(BestOrder);
8604 // Erase operands from OrderedEntries list and adjust their orders.
8605 VisitedOps.clear();
8606 SmallVector<int> Mask;
8607 inversePermutation(BestOrder, Mask);
8608 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8609 unsigned E = BestOrder.size();
8610 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8611 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8612 });
8613 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
8614 TreeEntry *TE = Op.second;
8615 if (!VisitedOps.insert(TE).second)
8616 continue;
8617 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
8618 reorderNodeWithReuses(*TE, Mask);
8619 continue;
8620 }
8621 // Gathers are processed separately.
8622 if (TE->State != TreeEntry::Vectorize &&
8623 TE->State != TreeEntry::StridedVectorize &&
8624 TE->State != TreeEntry::CompressVectorize &&
8625 TE->State != TreeEntry::SplitVectorize &&
8626 (TE->State != TreeEntry::ScatterVectorize ||
8627 TE->ReorderIndices.empty()))
8628 continue;
8629 assert((BestOrder.size() == TE->ReorderIndices.size() ||
8630 TE->ReorderIndices.empty()) &&
8631 "Non-matching sizes of user/operand entries.");
8632 reorderOrder(TE->ReorderIndices, Mask);
8633 if (IgnoreReorder && TE == VectorizableTree.front().get())
8634 IgnoreReorder = false;
8635 }
8636 // For gathers just need to reorder its scalars.
8637 for (TreeEntry *Gather : GatherOps) {
8638 assert(Gather->ReorderIndices.empty() &&
8639 "Unexpected reordering of gathers.");
8640 if (!Gather->ReuseShuffleIndices.empty()) {
8641 // Just reorder reuses indices.
8642 reorderReuses(Gather->ReuseShuffleIndices, Mask);
8643 continue;
8644 }
8645 reorderScalars(Gather->Scalars, Mask);
8646 Visited.insert(Gather);
8647 }
8648 // Reorder operands of the user node and set the ordering for the user
8649 // node itself.
8650 auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
8651 return TE.isAltShuffle() &&
8652 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
8653 TE.ReorderIndices.empty());
8654 };
8655 if (Data.first->State != TreeEntry::Vectorize ||
8656 !isa<InsertElementInst, StoreInst>(
8657 Data.first->getMainOp()) ||
8658 IsNotProfitableAltCodeNode(*Data.first))
8659 Data.first->reorderOperands(Mask);
8660 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
8661 IsNotProfitableAltCodeNode(*Data.first) ||
8662 Data.first->State == TreeEntry::StridedVectorize ||
8663 Data.first->State == TreeEntry::CompressVectorize) {
8664 reorderScalars(Data.first->Scalars, Mask);
8665 reorderOrder(Data.first->ReorderIndices, MaskOrder,
8666 /*BottomOrder=*/true);
8667 if (Data.first->ReuseShuffleIndices.empty() &&
8668 !Data.first->ReorderIndices.empty() &&
8669 !IsNotProfitableAltCodeNode(*Data.first)) {
8670 // Insert user node to the list to try to sink reordering deeper in
8671 // the graph.
8672 Queue.push(Data.first);
8673 }
8674 } else {
8675 reorderOrder(Data.first->ReorderIndices, Mask);
8676 }
8677 }
8678 }
8679 // If the reordering is unnecessary, just remove the reorder.
8680 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
8681 VectorizableTree.front()->ReuseShuffleIndices.empty())
8682 VectorizableTree.front()->ReorderIndices.clear();
8683}
8684
8685Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
8686 if (Entry.hasState() &&
8687 (Entry.getOpcode() == Instruction::Store ||
8688 Entry.getOpcode() == Instruction::Load) &&
8689 Entry.State == TreeEntry::StridedVectorize &&
8690 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
8691 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
8692 return dyn_cast<Instruction>(Entry.Scalars.front());
8693}
8694
8695void BoUpSLP::buildExternalUses(
8696 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
8697 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
8698 DenseMap<Value *, unsigned> ScalarToExtUses;
8699 SmallPtrSet<Value *, 4> ExternalUsers;
8700 // Collect the values that we need to extract from the tree.
8701 for (auto &TEPtr : VectorizableTree) {
8702 TreeEntry *Entry = TEPtr.get();
8703
8704 // No need to handle users of gathered values.
8705 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
8706 continue;
8707
8708 // For each lane:
8709 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
8710 Value *Scalar = Entry->Scalars[Lane];
8711 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
8712 continue;
8713
8714 // All uses must be replaced already? No need to do it again.
8715 auto It = ScalarToExtUses.find(Scalar);
8716 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
8717 continue;
8718
8719 if (Scalar->hasNUsesOrMore(NumVectScalars)) {
8720 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8721 LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
8722 << " from " << *Scalar << "for many users.\n");
8723 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
8724 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8725 ExternalUsesWithNonUsers.insert(Scalar);
8726 continue;
8727 }
8728
8729 // Check if the scalar is externally used as an extra arg.
8730 const auto ExtI = ExternallyUsedValues.find(Scalar);
8731 if (ExtI != ExternallyUsedValues.end()) {
8732 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8733 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
8734 << FoundLane << " from " << *Scalar << ".\n");
8735 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
8736 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8737 continue;
8738 }
8739 for (User *U : Scalar->users()) {
8740 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
8741
8742 Instruction *UserInst = dyn_cast<Instruction>(U);
8743 if (!UserInst || isDeleted(UserInst))
8744 continue;
8745
8746 // Ignore users in the user ignore list.
8747 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
8748 continue;
8749
8750 // Skip in-tree scalars that become vectors
8751 if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
8752 !UseEntries.empty()) {
8753 // Some in-tree scalars will remain as scalar in vectorized
8754 // instructions. If that is the case, the one in FoundLane will
8755 // be used.
8756 if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
8757 isa<LoadInst, StoreInst>(UserInst)) ||
8758 isa<CallInst>(UserInst)) ||
8759 all_of(UseEntries, [&](TreeEntry *UseEntry) {
8760 return UseEntry->State == TreeEntry::ScatterVectorize ||
8761 !doesInTreeUserNeedToExtract(
8762 Scalar, getRootEntryInstruction(*UseEntry), TLI,
8763 TTI);
8764 })) {
8765 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
8766 << ".\n");
8767 assert(none_of(UseEntries,
8768 [](TreeEntry *UseEntry) {
8769 return UseEntry->isGather();
8770 }) &&
8771 "Bad state");
8772 continue;
8773 }
8774 U = nullptr;
8775 if (It != ScalarToExtUses.end()) {
8776 ExternalUses[It->second].User = nullptr;
8777 break;
8778 }
8779 }
8780
8781 if (U && Scalar->hasNUsesOrMore(UsesLimit))
8782 U = nullptr;
8783 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8784 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
8785 << " from lane " << FoundLane << " from " << *Scalar
8786 << ".\n");
8787 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
8788 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
8789 ExternalUsesWithNonUsers.insert(Scalar);
8790 if (!U)
8791 break;
8792 }
8793 }
8794 }
8795}
8796
8797SmallVector<SmallVector<StoreInst *>>
8798BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
8799 SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
8800 SmallVector<StoreInst *>, 8>
8801 PtrToStoresMap;
8802 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
8803 Value *V = TE->Scalars[Lane];
8804 // Don't iterate over the users of constant data.
8805 if (!isa<Instruction>(V))
8806 continue;
8807 // To save compilation time we don't visit if we have too many users.
8808 if (V->hasNUsesOrMore(UsesLimit))
8809 break;
8810
8811 // Collect stores per pointer object.
8812 for (User *U : V->users()) {
8813 auto *SI = dyn_cast<StoreInst>(U);
8814 // Test whether we can handle the store. V might be a global, which could
8815 // be used in a different function.
8816 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
8817 !isValidElementType(SI->getValueOperand()->getType()))
8818 continue;
8819 // Skip entry if already vectorized.
8820 if (isVectorized(U))
8821 continue;
8822
8823 Value *Ptr =
8824 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
8825 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
8826 SI->getValueOperand()->getType(), Ptr}];
8827 // For now just keep one store per pointer object per lane.
8828 // TODO: Extend this to support multiple stores per pointer per lane
8829 if (StoresVec.size() > Lane)
8830 continue;
8831 if (!StoresVec.empty()) {
8832 std::optional<int64_t> Diff = getPointersDiff(
8833 SI->getValueOperand()->getType(), SI->getPointerOperand(),
8834 SI->getValueOperand()->getType(),
8835 StoresVec.front()->getPointerOperand(), *DL, *SE,
8836 /*StrictCheck=*/true);
8837 // We failed to compare the pointers so just abandon this store.
8838 if (!Diff)
8839 continue;
8840 }
8841 StoresVec.push_back(SI);
8842 }
8843 }
8844 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
8845 unsigned I = 0;
8846 for (auto &P : PtrToStoresMap) {
8847 Res[I].swap(P.second);
8848 ++I;
8849 }
8850 return Res;
8851}
8852
8853bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
8854 OrdersType &ReorderIndices) const {
8855 // We check whether the stores in StoresVec can form a vector by sorting them
8856 // and checking whether they are consecutive.
8857
8858 // To avoid calling getPointersDiff() while sorting we create a vector of
8859 // pairs {store, offset from first} and sort this instead.
8860 SmallVector<std::pair<int64_t, unsigned>> StoreOffsetVec;
8861 StoreInst *S0 = StoresVec[0];
8862 StoreOffsetVec.emplace_back(0, 0);
8863 Type *S0Ty = S0->getValueOperand()->getType();
8864 Value *S0Ptr = S0->getPointerOperand();
8865 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
8866 StoreInst *SI = StoresVec[Idx];
8867 std::optional<int64_t> Diff =
8868 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
8869 SI->getPointerOperand(), *DL, *SE,
8870 /*StrictCheck=*/true);
8871 StoreOffsetVec.emplace_back(*Diff, Idx);
8872 }
8873
8874 // Check if the stores are consecutive by checking if their difference is 1.
8875 if (StoreOffsetVec.size() != StoresVec.size())
8876 return false;
8877 sort(StoreOffsetVec, llvm::less_first());
8878 unsigned Idx = 0;
8879 int64_t PrevDist = 0;
8880 for (const auto &P : StoreOffsetVec) {
8881 if (Idx > 0 && P.first != PrevDist + 1)
8882 return false;
8883 PrevDist = P.first;
8884 ++Idx;
8885 }
8886
8887 // Calculate the shuffle indices according to their offset against the sorted
8888 // StoreOffsetVec.
8889 ReorderIndices.assign(StoresVec.size(), 0);
8890 bool IsIdentity = true;
8891 for (auto [I, P] : enumerate(StoreOffsetVec)) {
8892 ReorderIndices[P.second] = I;
8893 IsIdentity &= P.second == I;
8894 }
8895 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
8896 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
8897 // same convention here.
8898 if (IsIdentity)
8899 ReorderIndices.clear();
8900
8901 return true;
8902}
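// Worked example (hypothetical stores): if the stores in IR order sit at
// element offsets {0, 2, 1, 3} from the first one, they are consecutive once
// sorted and ReorderIndices becomes {0, 2, 1, 3}; offsets that are already
// {0, 1, 2, 3} yield the identity order, which is returned as an empty
// ReorderIndices per the convention above.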
8903
8904#ifndef NDEBUG
8906 for (unsigned Idx : Order)
8907 dbgs() << Idx << ", ";
8908 dbgs() << "\n";
8909}
8910#endif
8911
8912SmallVector<BoUpSLP::OrdersType, 1>
8913BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
8914 unsigned NumLanes = TE->Scalars.size();
8915
8916 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
8917
8918 // Holds the reorder indices for each candidate store vector that is a user of
8919 // the current TreeEntry.
8920 SmallVector<OrdersType, 1> ExternalReorderIndices;
8921
8922 // Now inspect the stores collected per pointer and look for vectorization
8923 // candidates. For each candidate calculate the reorder index vector and push
8924 // it into `ExternalReorderIndices`
8925 for (ArrayRef<StoreInst *> StoresVec : Stores) {
8926 // If we have fewer than NumLanes stores, then we can't form a vector.
8927 if (StoresVec.size() != NumLanes)
8928 continue;
8929
8930 // If the stores are not consecutive then abandon this StoresVec.
8931 OrdersType ReorderIndices;
8932 if (!canFormVector(StoresVec, ReorderIndices))
8933 continue;
8934
8935 // We now know that the scalars in StoresVec can form a vector instruction,
8936 // so set the reorder indices.
8937 ExternalReorderIndices.push_back(ReorderIndices);
8938 }
8939 return ExternalReorderIndices;
8940}
8941
8942void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
8943 const SmallDenseSet<Value *> &UserIgnoreLst) {
8944 deleteTree();
8945 UserIgnoreList = &UserIgnoreLst;
8946 if (!allSameType(Roots))
8947 return;
8948 buildTreeRec(Roots, 0, EdgeInfo());
8949}
8950
8951void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
8952 deleteTree();
8953 if (!allSameType(Roots))
8954 return;
8955 buildTreeRec(Roots, 0, EdgeInfo());
8956}
8957
8958/// Tries to find a subvector of loads and builds a new vector of only loads if
8959/// it can be profitable.
8960static void gatherPossiblyVectorizableLoads(
8961 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
8962 ScalarEvolution &SE, const TargetTransformInfo &TTI,
8963 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>> &GatheredLoads,
8964 bool AddNew = true) {
8965 if (VL.empty())
8966 return;
8967 Type *ScalarTy = getValueType(VL.front());
8968 if (!isValidElementType(ScalarTy))
8969 return;
8970 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>> ClusteredLoads;
8971 SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
8972 for (Value *V : VL) {
8973 auto *LI = dyn_cast<LoadInst>(V);
8974 if (!LI)
8975 continue;
8976 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
8977 continue;
8978 bool IsFound = false;
8979 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
8980 assert(LI->getParent() == Data.front().first->getParent() &&
8981 LI->getType() == Data.front().first->getType() &&
8982 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
8983 getUnderlyingObject(Data.front().first->getPointerOperand(),
8985 "Expected loads with the same type, same parent and same "
8986 "underlying pointer.");
8987 std::optional<int64_t> Dist = getPointersDiff(
8988 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
8989 Data.front().first->getPointerOperand(), DL, SE,
8990 /*StrictCheck=*/true);
8991 if (!Dist)
8992 continue;
8993 auto It = Map.find(*Dist);
8994 if (It != Map.end() && It->second != LI)
8995 continue;
8996 if (It == Map.end()) {
8997 Data.emplace_back(LI, *Dist);
8998 Map.try_emplace(*Dist, LI);
8999 }
9000 IsFound = true;
9001 break;
9002 }
9003 if (!IsFound) {
9004 ClusteredLoads.emplace_back().emplace_back(LI, 0);
9005 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
9006 }
9007 }
9008 auto FindMatchingLoads =
9009 [&](ArrayRef<std::pair<LoadInst *, int64_t>> Loads,
9010 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>>
9011 &GatheredLoads,
9012 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
9013 int64_t &Offset, unsigned &Start) {
9014 if (Loads.empty())
9015 return GatheredLoads.end();
9016 LoadInst *LI = Loads.front().first;
9017 for (auto [Idx, Data] : enumerate(GatheredLoads)) {
9018 if (Idx < Start)
9019 continue;
9020 ToAdd.clear();
9021 if (LI->getParent() != Data.front().first->getParent() ||
9022 LI->getType() != Data.front().first->getType())
9023 continue;
9024 std::optional<int64_t> Dist =
9025 getPointersDiff(LI->getType(), LI->getPointerOperand(),
9026 Data.front().first->getType(),
9027 Data.front().first->getPointerOperand(), DL, SE,
9028 /*StrictCheck=*/true);
9029 if (!Dist)
9030 continue;
9031 SmallSet<int64_t, 4> DataDists;
9032 SmallPtrSet<LoadInst *, 4> DataLoads;
9033 for (std::pair<LoadInst *, int64_t> P : Data) {
9034 DataDists.insert(P.second);
9035 DataLoads.insert(P.first);
9036 }
9037 // Found matching gathered loads - check if all loads are unique or
9038 // can be effectively vectorized.
9039 unsigned NumUniques = 0;
9040 for (auto [Cnt, Pair] : enumerate(Loads)) {
9041 bool Used = DataLoads.contains(Pair.first);
9042 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
9043 ++NumUniques;
9044 ToAdd.insert(Cnt);
9045 } else if (Used) {
9046 Repeated.insert(Cnt);
9047 }
9048 }
9049 if (NumUniques > 0 &&
9050 (Loads.size() == NumUniques ||
9051 (Loads.size() - NumUniques >= 2 &&
9052 Loads.size() - NumUniques >= Loads.size() / 2 &&
9053 (has_single_bit(Data.size() + NumUniques) ||
9054 bit_ceil(Data.size()) <
9055 bit_ceil(Data.size() + NumUniques))))) {
9056 Offset = *Dist;
9057 Start = Idx + 1;
9058 return std::next(GatheredLoads.begin(), Idx);
9059 }
9060 }
9061 ToAdd.clear();
9062 return GatheredLoads.end();
9063 };
9064 for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
9065 unsigned Start = 0;
9066 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
9067 int64_t Offset = 0;
9068 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
9069 Offset, Start);
9070 while (It != GatheredLoads.end()) {
9071 assert(!LocalToAdd.empty() && "Expected some elements to add.");
9072 for (unsigned Idx : LocalToAdd)
9073 It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
9074 ToAdd.insert_range(LocalToAdd);
9075 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
9076 Start);
9077 }
9078 if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
9079 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
9080 })) {
9081 auto AddNewLoads =
9082 [&](SmallVectorImpl<std::pair<LoadInst *, int64_t>> &Loads) {
9083 for (unsigned Idx : seq<unsigned>(Data.size())) {
9084 if (ToAdd.contains(Idx) || Repeated.contains(Idx))
9085 continue;
9086 Loads.push_back(Data[Idx]);
9087 }
9088 };
9089 if (!AddNew) {
9090 LoadInst *LI = Data.front().first;
9091 It = find_if(
9092 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9093 return PD.front().first->getParent() == LI->getParent() &&
9094 PD.front().first->getType() == LI->getType();
9095 });
9096 while (It != GatheredLoads.end()) {
9097 AddNewLoads(*It);
9098 It = std::find_if(
9099 std::next(It), GatheredLoads.end(),
9100 [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9101 return PD.front().first->getParent() == LI->getParent() &&
9102 PD.front().first->getType() == LI->getType();
9103 });
9104 }
9105 }
9106 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
9107 AddNewLoads(GatheredLoads.emplace_back());
9108 }
9109 }
9110}
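// Clustering sketch (illustrative only): four simple loads from the same
// basic block, type and underlying object at element distances 0, 2, 1 and 3
// end up in a single cluster, conceptually
//   ClusteredLoads[0] == {{L0, 0}, {L2, 2}, {L1, 1}, {L3, 3}},
// while a load with a different base object or type starts a new cluster.
// FindMatchingLoads then merges compatible clusters into existing
// GatheredLoads entries, rebasing the recorded distances by the computed
// Offset.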
9111
9112void BoUpSLP::tryToVectorizeGatheredLoads(
9113 const SmallMapVector<
9114 std::tuple<BasicBlock *, Value *, Type *>,
9115 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
9116 &GatheredLoads) {
9117 GatheredLoadsEntriesFirst = VectorizableTree.size();
9118
9119 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
9120 LoadEntriesToVectorize.size());
9121 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
9122 Set.insert_range(VectorizableTree[Idx]->Scalars);
9123
9124 // Sort loads by distance.
9125 auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
9126 const std::pair<LoadInst *, int64_t> &L2) {
9127 return L1.second > L2.second;
9128 };
9129
9130 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
9131 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
9132 Loads.size());
9133 Align Alignment = computeCommonAlignment<LoadInst>(Values);
9134 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
9135 return TTI->isLegalMaskedGather(Ty, Alignment) &&
9136 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
9137 };
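// Illustrative note (editorial): on a target where, e.g., a masked gather of
// <8 x i32> at the common alignment is illegal or forced to scalarize,
// IsMaskedGatherSupported returns false and the code below only keeps runs of
// loads with strictly consecutive distances instead of arbitrary ones.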
9138
9139 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
9140 BoUpSLP::ValueSet &VectorizedLoads,
9141 SmallVectorImpl<LoadInst *> &NonVectorized,
9142 bool Final, unsigned MaxVF) {
9143 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
9144 unsigned StartIdx = 0;
9145 SmallVector<int> CandidateVFs;
9146 if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
9147 CandidateVFs.push_back(MaxVF);
9148 for (int NumElts = getFloorFullVectorNumberOfElements(
9149 *TTI, Loads.front()->getType(), MaxVF);
9150 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
9151 *TTI, Loads.front()->getType(), NumElts - 1)) {
9152 CandidateVFs.push_back(NumElts);
9153 if (VectorizeNonPowerOf2 && NumElts > 2)
9154 CandidateVFs.push_back(NumElts - 1);
9155 }
9156
9157 if (Final && CandidateVFs.empty())
9158 return Results;
9159
9160 unsigned BestVF = Final ? CandidateVFs.back() : 0;
9161 for (unsigned NumElts : CandidateVFs) {
9162 if (Final && NumElts > BestVF)
9163 continue;
9164 SmallVector<unsigned> MaskedGatherVectorized;
9165 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
9166 ++Cnt) {
9167 ArrayRef<LoadInst *> Slice =
9168 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
9169 if (VectorizedLoads.count(Slice.front()) ||
9170 VectorizedLoads.count(Slice.back()) ||
9171 areKnownNonVectorizableLoads(Slice))
9172 continue;
9173 // Check if it is profitable to try vectorizing gathered loads. It is
9174 // profitable if we have more than 3 consecutive loads or if we have
9175 // fewer, but all users are vectorized or deleted.
9176 bool AllowToVectorize = false;
9177 // Check if it is profitable to vectorize 2-elements loads.
9178 if (NumElts == 2) {
9179 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
9180 Slice.front()->getType(), ElementCount::getFixed(NumElts));
9181 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
9182 for (LoadInst *LI : Slice) {
9183 // If single use/user - allow to vectorize.
9184 if (LI->hasOneUse())
9185 continue;
9186 // 1. Check if number of uses equals number of users.
9187 // 2. All users are deleted.
9188 // 3. The load broadcasts are not allowed or the load is not
9189 // broadcasted.
9190 if (static_cast<unsigned int>(std::distance(
9191 LI->user_begin(), LI->user_end())) != LI->getNumUses())
9192 return false;
9193 if (!IsLegalBroadcastLoad)
9194 continue;
9195 if (LI->hasNUsesOrMore(UsesLimit))
9196 return false;
9197 for (User *U : LI->users()) {
9198 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
9199 continue;
9200 for (const TreeEntry *UTE : getTreeEntries(U)) {
9201 for (int I : seq<int>(UTE->getNumOperands())) {
9202 if (all_of(UTE->getOperand(I), [LI](Value *V) {
9203 return V == LI || isa<PoisonValue>(V);
9204 }))
9205 // Found legal broadcast - do not vectorize.
9206 return false;
9207 }
9208 }
9209 }
9210 }
9211 return true;
9212 };
9213 AllowToVectorize = CheckIfAllowed(Slice);
9214 } else {
9215 AllowToVectorize =
9216 (NumElts >= 3 ||
9217 any_of(ValueToGatherNodes.at(Slice.front()),
9218 [=](const TreeEntry *TE) {
9219 return TE->Scalars.size() == 2 &&
9220 ((TE->Scalars.front() == Slice.front() &&
9221 TE->Scalars.back() == Slice.back()) ||
9222 (TE->Scalars.front() == Slice.back() &&
9223 TE->Scalars.back() == Slice.front()));
9224 })) &&
9225 hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
9226 Slice.size());
9227 }
9228 if (AllowToVectorize) {
9229 SmallVector<Value *> PointerOps;
9230 OrdersType CurrentOrder;
9231 // Try to build vector load.
9232 ArrayRef<Value *> Values(
9233 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9234 StridedPtrInfo SPtrInfo;
9235 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
9236 PointerOps, SPtrInfo, &BestVF);
9237 if (LS != LoadsState::Gather ||
9238 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
9239 if (LS == LoadsState::ScatterVectorize) {
9240 if (MaskedGatherVectorized.empty() ||
9241 Cnt >= MaskedGatherVectorized.back() + NumElts)
9242 MaskedGatherVectorized.push_back(Cnt);
9243 continue;
9244 }
9245 if (LS != LoadsState::Gather) {
9246 Results.emplace_back(Values, LS);
9247 VectorizedLoads.insert_range(Slice);
9248 // If we vectorized initial block, no need to try to vectorize it
9249 // again.
9250 if (Cnt == StartIdx)
9251 StartIdx += NumElts;
9252 }
9253 // Check if the whole array was vectorized already - exit.
9254 if (StartIdx >= Loads.size())
9255 break;
9256 // Erase last masked gather candidate, if another candidate within
9257 // the range is found to be better.
9258 if (!MaskedGatherVectorized.empty() &&
9259 Cnt < MaskedGatherVectorized.back() + NumElts)
9260 MaskedGatherVectorized.pop_back();
9261 Cnt += NumElts - 1;
9262 continue;
9263 }
9264 }
9265 if (!AllowToVectorize || BestVF == 0)
9266 registerNonVectorizableLoads(Slice);
9267 }
9268 // Mark masked gathers candidates as vectorized, if any.
9269 for (unsigned Cnt : MaskedGatherVectorized) {
9270 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
9271 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
9272 ArrayRef<Value *> Values(
9273 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9274 Results.emplace_back(Values, LoadsState::ScatterVectorize);
9275 VectorizedLoads.insert_range(Slice);
9276 // If we vectorized initial block, no need to try to vectorize it again.
9277 if (Cnt == StartIdx)
9278 StartIdx += NumElts;
9279 }
9280 }
9281 for (LoadInst *LI : Loads) {
9282 if (!VectorizedLoads.contains(LI))
9283 NonVectorized.push_back(LI);
9284 }
9285 return Results;
9286 };
9287 auto ProcessGatheredLoads =
9288 [&, &TTI = *TTI](
9289 ArrayRef<SmallVector<std::pair<LoadInst *, int64_t>>> GatheredLoads,
9290 bool Final = false) {
9291 SmallVector<LoadInst *> NonVectorized;
9292 for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
9293 GatheredLoads) {
9294 if (LoadsDists.size() <= 1) {
9295 NonVectorized.push_back(LoadsDists.back().first);
9296 continue;
9297 }
9298 SmallVector<std::pair<LoadInst *, int64_t>> LocalLoadsDists(
9299 LoadsDists);
9300 SmallVector<LoadInst *> OriginalLoads(make_first_range(LoadsDists));
9301 stable_sort(LocalLoadsDists, LoadSorter);
9302 SmallVector<LoadInst *> Loads;
9303 unsigned MaxConsecutiveDistance = 0;
9304 unsigned CurrentConsecutiveDist = 1;
9305 int64_t LastDist = LocalLoadsDists.front().second;
9306 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
9307 for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
9308 if (isVectorized(L.first))
9309 continue;
9310 assert(LastDist >= L.second &&
9311 "Expected first distance always not less than second");
9312 if (static_cast<uint64_t>(LastDist - L.second) ==
9313 CurrentConsecutiveDist) {
9314 ++CurrentConsecutiveDist;
9315 MaxConsecutiveDistance =
9316 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
9317 Loads.push_back(L.first);
9318 continue;
9319 }
9320 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
9321 !Loads.empty())
9322 Loads.pop_back();
9323 CurrentConsecutiveDist = 1;
9324 LastDist = L.second;
9325 Loads.push_back(L.first);
9326 }
9327 if (Loads.size() <= 1)
9328 continue;
9329 if (AllowMaskedGather)
9330 MaxConsecutiveDistance = Loads.size();
9331 else if (MaxConsecutiveDistance < 2)
9332 continue;
9333 BoUpSLP::ValueSet VectorizedLoads;
9334 SmallVector<LoadInst *> SortedNonVectorized;
9335 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
9336 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
9337 Final, MaxConsecutiveDistance);
9338 if (!Results.empty() && !SortedNonVectorized.empty() &&
9339 OriginalLoads.size() == Loads.size() &&
9340 MaxConsecutiveDistance == Loads.size() &&
9341 all_of(Results,
9342 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
9343 return P.second == LoadsState::ScatterVectorize;
9344 })) {
9345 VectorizedLoads.clear();
9346 SmallVector<LoadInst *> UnsortedNonVectorized;
9347 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
9348 UnsortedResults =
9349 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
9350 UnsortedNonVectorized, Final,
9351 OriginalLoads.size());
9352 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
9353 SortedNonVectorized.swap(UnsortedNonVectorized);
9354 Results.swap(UnsortedResults);
9355 }
9356 }
9357 for (auto [Slice, _] : Results) {
9358 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
9359 << Slice.size() << ")\n");
9360 if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
9361 for (Value *L : Slice)
9362 if (!isVectorized(L))
9363 SortedNonVectorized.push_back(cast<LoadInst>(L));
9364 continue;
9365 }
9366
9367 // Select maximum VF as a maximum of user gathered nodes and
9368 // distance between scalar loads in these nodes.
9369 unsigned MaxVF = Slice.size();
9370 unsigned UserMaxVF = 0;
9371 unsigned InterleaveFactor = 0;
9372 if (MaxVF == 2) {
9373 UserMaxVF = MaxVF;
9374 } else {
9375 // Found distance between segments of the interleaved loads.
9376 std::optional<unsigned> InterleavedLoadsDistance = 0;
9377 unsigned Order = 0;
9378 std::optional<unsigned> CommonVF = 0;
9379 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
9380 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
9381 for (auto [Idx, V] : enumerate(Slice)) {
9382 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
9383 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
9384 unsigned Pos =
9385 EntryToPosition.try_emplace(E, Idx).first->second;
9386 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
9387 if (CommonVF) {
9388 if (*CommonVF == 0) {
9389 CommonVF = E->Scalars.size();
9390 continue;
9391 }
9392 if (*CommonVF != E->Scalars.size())
9393 CommonVF.reset();
9394 }
9395 // Check if the load is the part of the interleaved load.
9396 if (Pos != Idx && InterleavedLoadsDistance) {
9397 if (!DeinterleavedNodes.contains(E) &&
9398 any_of(E->Scalars, [&, Slice = Slice](Value *V) {
9399 if (isa<Constant>(V))
9400 return false;
9401 if (isVectorized(V))
9402 return true;
9403 const auto &Nodes = ValueToGatherNodes.at(V);
9404 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
9405 !is_contained(Slice, V);
9406 })) {
9407 InterleavedLoadsDistance.reset();
9408 continue;
9409 }
9410 DeinterleavedNodes.insert(E);
9411 if (*InterleavedLoadsDistance == 0) {
9412 InterleavedLoadsDistance = Idx - Pos;
9413 continue;
9414 }
9415 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
9416 (Idx - Pos) / *InterleavedLoadsDistance < Order)
9417 InterleavedLoadsDistance.reset();
9418 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
9419 }
9420 }
9421 }
9422 DeinterleavedNodes.clear();
9423 // Check if the large load represents interleaved load operation.
9424 if (InterleavedLoadsDistance.value_or(0) > 1 &&
9425 CommonVF.value_or(0) != 0) {
9426 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
9427 unsigned VF = *CommonVF;
9428 OrdersType Order;
9429 SmallVector<Value *> PointerOps;
9430 StridedPtrInfo SPtrInfo;
9431 // Segmented load detected - vectorize at maximum vector factor.
9432 if (InterleaveFactor <= Slice.size() &&
9433 TTI.isLegalInterleavedAccessType(
9434 getWidenedType(Slice.front()->getType(), VF),
9435 InterleaveFactor,
9436 cast<LoadInst>(Slice.front())->getAlign(),
9437 cast<LoadInst>(Slice.front())
9438 ->getPointerAddressSpace()) &&
9439 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
9440 SPtrInfo) == LoadsState::Vectorize) {
9441 UserMaxVF = InterleaveFactor * VF;
9442 } else {
9443 InterleaveFactor = 0;
9444 }
9445 }
9446 // Cannot represent the loads as consecutive vectorizable nodes -
9447 // just exit.
9448 unsigned ConsecutiveNodesSize = 0;
9449 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
9450 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9451 [&, Slice = Slice](const auto &P) {
9452 const auto *It = find_if(Slice, [&](Value *V) {
9453 return std::get<1>(P).contains(V);
9454 });
9455 if (It == Slice.end())
9456 return false;
9457 const TreeEntry &TE =
9458 *VectorizableTree[std::get<0>(P)];
9459 ArrayRef<Value *> VL = TE.Scalars;
9460 OrdersType Order;
9461 SmallVector<Value *> PointerOps;
9462 StridedPtrInfo SPtrInfo;
9463 LoadsState State = canVectorizeLoads(
9464 VL, VL.front(), Order, PointerOps, SPtrInfo);
9465 if (State == LoadsState::ScatterVectorize ||
9466 State == LoadsState::StridedVectorize)
9467 return false;
9468 ConsecutiveNodesSize += VL.size();
9469 size_t Start = std::distance(Slice.begin(), It);
9470 size_t Sz = Slice.size() - Start;
9471 return Sz < VL.size() ||
9472 Slice.slice(Start, VL.size()) != VL;
9473 }))
9474 continue;
9475 // Try to build long masked gather loads.
9476 UserMaxVF = bit_ceil(UserMaxVF);
9477 if (InterleaveFactor == 0 &&
9478 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
9479 [&, Slice = Slice](unsigned Idx) {
9480 OrdersType Order;
9481 SmallVector<Value *> PointerOps;
9482 StridedPtrInfo SPtrInfo;
9483 return canVectorizeLoads(
9484 Slice.slice(Idx * UserMaxVF, UserMaxVF),
9485 Slice[Idx * UserMaxVF], Order, PointerOps,
9486 SPtrInfo) == LoadsState::ScatterVectorize;
9487 }))
9488 UserMaxVF = MaxVF;
9489 if (Slice.size() != ConsecutiveNodesSize)
9490 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
9491 }
9492 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
9493 bool IsVectorized = true;
9494 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
9495 ArrayRef<Value *> SubSlice =
9496 Slice.slice(I, std::min(VF, E - I));
9497 if (isVectorized(SubSlice.front()))
9498 continue;
9499 // Check if the subslice is a to-be-vectorized entry that is not
9500 // equal to this entry.
9501 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9502 [&](const auto &P) {
9503 return !SubSlice.equals(
9504 VectorizableTree[std::get<0>(P)]
9505 ->Scalars) &&
9506 set_is_subset(SubSlice, std::get<1>(P));
9507 }))
9508 continue;
9509 unsigned Sz = VectorizableTree.size();
9510 buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
9511 if (Sz == VectorizableTree.size()) {
9512 IsVectorized = false;
9513 // Try non-interleaved vectorization with smaller vector
9514 // factor.
9515 if (InterleaveFactor > 0) {
9516 VF = 2 * (MaxVF / InterleaveFactor);
9517 InterleaveFactor = 0;
9518 }
9519 continue;
9520 }
9521 }
9522 if (IsVectorized)
9523 break;
9524 }
9525 }
9526 NonVectorized.append(SortedNonVectorized);
9527 }
9528 return NonVectorized;
9529 };
9530 for (const auto &GLs : GatheredLoads) {
9531 const auto &Ref = GLs.second;
9532 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
9533 if (!Ref.empty() && !NonVectorized.empty() &&
9534 std::accumulate(
9535 Ref.begin(), Ref.end(), 0u,
9536 [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
9537 -> unsigned { return S + LoadsDists.size(); }) !=
9538 NonVectorized.size() &&
9539 IsMaskedGatherSupported(NonVectorized)) {
9540 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>
9541 FinalGatheredLoads;
9542 for (LoadInst *LI : NonVectorized) {
9543 // Reinsert non-vectorized loads to other list of loads with the same
9544 // base pointers.
9545 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
9546 FinalGatheredLoads,
9547 /*AddNew=*/false);
9548 }
9549 // Final attempt to vectorize non-vectorized loads.
9550 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
9551 }
9552 }
9553 // Try to vectorize postponed load entries, previously marked as gathered.
9554 for (unsigned Idx : LoadEntriesToVectorize) {
9555 const TreeEntry &E = *VectorizableTree[Idx];
9556 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
9557 // Avoid reordering, if possible.
9558 if (!E.ReorderIndices.empty()) {
9559 // Build a mask out of the reorder indices and reorder scalars per this
9560 // mask.
9561 SmallVector<int> ReorderMask;
9562 inversePermutation(E.ReorderIndices, ReorderMask);
9563 reorderScalars(GatheredScalars, ReorderMask);
9564 }
9565 buildTreeRec(GatheredScalars, 0, EdgeInfo());
9566 }
9567 // If no new entries created, consider it as no gathered loads entries must be
9568 // handled.
9569 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
9570 VectorizableTree.size())
9571 GatheredLoadsEntriesFirst.reset();
9572}
9573
9574/// Generates key/subkey pair for the given value to provide effective sorting
9575/// of the values and better detection of vectorizable value sequences. The
9576/// keys/subkeys can be used for better sorting of the values themselves (keys)
9577/// and within value subgroups (subkeys).
9578static std::pair<size_t, size_t> generateKeySubkey(
9579 Value *V, const TargetLibraryInfo *TLI,
9580 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
9581 bool AllowAlternate) {
9582 hash_code Key = hash_value(V->getValueID() + 2);
9583 hash_code SubKey = hash_value(0);
9584 // Sort the loads by the distance between the pointers.
9585 if (auto *LI = dyn_cast<LoadInst>(V)) {
9586 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
9587 if (LI->isSimple())
9588 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
9589 else
9590 Key = SubKey = hash_value(LI);
9591 } else if (isVectorLikeInstWithConstOps(V)) {
9592 // Sort extracts by the vector operands.
9593 if (isa<ExtractElementInst, UndefValue>(V))
9594 Key = hash_value(Value::UndefValueVal + 1);
9595 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
9596 if (!isUndefVector(EI->getVectorOperand()).all() &&
9597 !isa<UndefValue>(EI->getIndexOperand()))
9598 SubKey = hash_value(EI->getVectorOperand());
9599 }
9600 } else if (auto *I = dyn_cast<Instruction>(V)) {
9601 // Sort other instructions just by the opcodes except for CMPInst.
9602 // For CMP also sort by the predicate kind.
9603 if ((isa<BinaryOperator, CastInst>(I)) &&
9604 isValidForAlternation(I->getOpcode())) {
9605 if (AllowAlternate)
9606 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
9607 else
9608 Key = hash_combine(hash_value(I->getOpcode()), Key);
9609 SubKey = hash_combine(
9610 hash_value(I->getOpcode()), hash_value(I->getType()),
9611 hash_value(isa<BinaryOperator>(I)
9612 ? I->getType()
9613 : cast<CastInst>(I)->getOperand(0)->getType()));
9614 // For casts, look through the only operand to improve compile time.
9615 if (isa<CastInst>(I)) {
9616 std::pair<size_t, size_t> OpVals =
9617 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
9618 /*AllowAlternate=*/true);
9619 Key = hash_combine(OpVals.first, Key);
9620 SubKey = hash_combine(OpVals.first, SubKey);
9621 }
9622 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
9623 CmpInst::Predicate Pred = CI->getPredicate();
9624 if (CI->isCommutative())
9625 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
9626 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
9627 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
9628 hash_value(SwapPred),
9629 hash_value(CI->getOperand(0)->getType()));
9630 } else if (auto *Call = dyn_cast<CallInst>(I)) {
9631 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
9632 if (isTriviallyVectorizable(ID)) {
9633 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
9634 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
9635 SubKey = hash_combine(hash_value(I->getOpcode()),
9636 hash_value(Call->getCalledFunction()));
9637 } else {
9639 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
9640 }
9641 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
9642 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
9643 hash_value(Op.Tag), SubKey);
9644 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
9645 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
9646 SubKey = hash_value(Gep->getPointerOperand());
9647 else
9648 SubKey = hash_value(Gep);
9649 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
9650 !isa<ConstantInt>(I->getOperand(1))) {
9651 // Do not try to vectorize instructions with potentially high cost.
9652 SubKey = hash_value(I);
9653 } else {
9654 SubKey = hash_value(I->getOpcode());
9655 }
9656 Key = hash_combine(hash_value(I->getParent()), Key);
9657 }
9658 return std::make_pair(Key, SubKey);
9659}
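// Worked example (illustrative): two simple loads in the same block share a
// Key built from their type and Instruction::Load, and their SubKeys come
// from LoadsSubkeyGenerator (typically folding in the pointer-distance
// cluster); two 'add' instructions with the same type share Key and SubKey;
// an integer division with a non-constant divisor hashes its own instruction
// pointer into SubKey, so it is never grouped with other divisions.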
9660
9661/// Checks if the specified instruction \p I is a main operation for the given
9662/// \p MainOp and \p AltOp instructions.
9663static bool isMainInstruction(Instruction *I, Instruction *MainOp,
9664 Instruction *AltOp, const TargetLibraryInfo &TLI);
9665
9666bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
9667 ArrayRef<Value *> VL) const {
9668 Type *ScalarTy = S.getMainOp()->getType();
9669 unsigned Opcode0 = S.getOpcode();
9670 unsigned Opcode1 = S.getAltOpcode();
9671 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
9672 // If this pattern is supported by the target then consider it profitable.
9673 if (TTI->isLegalAltInstr(getWidenedType(ScalarTy, VL.size()), Opcode0,
9674 Opcode1, OpcodeMask))
9675 return true;
9676 SmallVector<SmallVector<Value *>> Operands;
9677 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
9678 Operands.emplace_back();
9679 // Prepare the operand vector.
9680 for (Value *V : VL) {
9681 if (isa<PoisonValue>(V)) {
9682 Operands.back().push_back(
9683 PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
9684 continue;
9685 }
9686 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
9687 }
9688 }
9689 if (Operands.size() == 2) {
9690 // Try find best operands candidates.
9691 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
9692 SmallVector<std::pair<Value *, Value *>> Candidates(3);
9693 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
9694 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
9695 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
9696 std::optional<int> Res = findBestRootPair(Candidates);
9697 switch (Res.value_or(0)) {
9698 case 0:
9699 break;
9700 case 1:
9701 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
9702 break;
9703 case 2:
9704 std::swap(Operands[0][I], Operands[1][I]);
9705 break;
9706 default:
9707 llvm_unreachable("Unexpected index.");
9708 }
9709 }
9710 }
9711 DenseSet<unsigned> UniqueOpcodes;
9712 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
9713 unsigned NonInstCnt = 0;
9714 // Estimate number of instructions, required for the vectorized node and for
9715 // the buildvector node.
9716 unsigned UndefCnt = 0;
9717 // Count the number of extra shuffles, required for vector nodes.
9718 unsigned ExtraShuffleInsts = 0;
9719 // Check that operands do not contain same values and create either perfect
9720 // diamond match or shuffled match.
9721 if (Operands.size() == 2) {
9722 // Do not count same operands twice.
9723 if (Operands.front() == Operands.back()) {
9724 Operands.erase(Operands.begin());
9725 } else if (!allConstant(Operands.front()) &&
9726 all_of(Operands.front(), [&](Value *V) {
9727 return is_contained(Operands.back(), V);
9728 })) {
9729 Operands.erase(Operands.begin());
9730 ++ExtraShuffleInsts;
9731 }
9732 }
9733 const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
9734 // Vectorize node, if:
9735 // 1. At least a single operand is constant or splat.
9736 // 2. Operands have many loop invariants (the instructions are not loop
9737 // invariants).
9738 // 3. At least a single unique operand is supposed to be vectorized.
9739 return none_of(Operands,
9740 [&](ArrayRef<Value *> Op) {
9741 if (allConstant(Op) ||
9742 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
9743 getSameOpcode(Op, *TLI)))
9744 return false;
9745 DenseMap<Value *, unsigned> Uniques;
9746 for (Value *V : Op) {
9747 if (isa<Constant, ExtractElementInst>(V) ||
9748 isVectorized(V) || (L && L->isLoopInvariant(V))) {
9749 if (isa<UndefValue>(V))
9750 ++UndefCnt;
9751 continue;
9752 }
9753 auto Res = Uniques.try_emplace(V, 0);
9754 // Found first duplicate - need to add shuffle.
9755 if (!Res.second && Res.first->second == 1)
9756 ++ExtraShuffleInsts;
9757 ++Res.first->getSecond();
9758 if (auto *I = dyn_cast<Instruction>(V))
9759 UniqueOpcodes.insert(I->getOpcode());
9760 else if (Res.second)
9761 ++NonInstCnt;
9762 }
9763 return none_of(Uniques, [&](const auto &P) {
9764 return P.first->hasNUsesOrMore(P.second + 1) &&
9765 none_of(P.first->users(), [&](User *U) {
9766 return isVectorized(U) || Uniques.contains(U);
9767 });
9768 });
9769 }) ||
9770 // Do not vectorize node, if estimated number of vector instructions is
9771 // more than estimated number of buildvector instructions. Number of
9772 // vector operands is number of vector instructions + number of vector
9773 // instructions for operands (buildvectors). Number of buildvector
9774 // instructions is just number_of_operands * number_of_scalars.
9775 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
9776 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
9777 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
9778}
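// Cost intuition (editorial, not a precise model): an add/sub alternating
// bundle whose operands are largely constants, splats or already-vectorized
// values is kept, since two vector ops plus one blend beat per-lane scalars.
// If nearly every lane would need its own buildvector of unique, otherwise
// unused scalars, the estimate above concludes the alternate node is not
// profitable and the bundle is gathered instead.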
9779
9780/// Builds the arguments types vector for the given call instruction with the
9781/// given \p ID for the specified vector factor.
9782static SmallVector<Type *>
9783buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
9784 const unsigned VF, unsigned MinBW,
9785 const TargetTransformInfo *TTI) {
9786 SmallVector<Type *> ArgTys;
9787 for (auto [Idx, Arg] : enumerate(CI->args())) {
9788 if (ID != Intrinsic::not_intrinsic) {
9789 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
9790 ArgTys.push_back(Arg->getType());
9791 continue;
9792 }
9793 if (MinBW > 0) {
9794 ArgTys.push_back(
9795 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
9796 continue;
9797 }
9798 }
9799 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
9800 }
9801 return ArgTys;
9802}
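// Example (illustrative): for a bundle of llvm.ctlz.i32 calls with VF == 4
// and MinBW == 0 this returns {<4 x i32>, i1}: the data operand is widened
// while the trailing 'is_zero_poison' flag stays scalar because it is
// reported as a scalar operand of the intrinsic.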
9803
9804/// Calculates the costs of vectorized intrinsic (if possible) and vectorized
9805/// function (if possible) calls. Returns invalid cost for the corresponding
9806/// calls, if they cannot be vectorized/will be scalarized.
9807static std::pair<InstructionCost, InstructionCost>
9808getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
9809 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9810 ArrayRef<Type *> ArgTys) {
9811 auto Shape = VFShape::get(CI->getFunctionType(),
9812 ElementCount::getFixed(VecTy->getNumElements()),
9813 false /*HasGlobalPred*/);
9814 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
9815 auto LibCost = InstructionCost::getInvalid();
9816 if (!CI->isNoBuiltin() && VecFunc) {
9817 // Calculate the cost of the vector library call.
9818 // If the corresponding vector call is cheaper, return its cost.
9819 LibCost =
9820 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
9821 }
9822 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9823
9824 // Calculate the cost of the vector intrinsic call.
9825 FastMathFlags FMF;
9826 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
9827 FMF = FPCI->getFastMathFlags();
9828 const InstructionCost ScalarLimit = 10000;
9829 IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
9830 LibCost.isValid() ? LibCost : ScalarLimit);
9831 auto IntrinsicCost =
9832 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
9833 if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
9834 (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
9835 IntrinsicCost = InstructionCost::getInvalid();
9836
9837 return {IntrinsicCost, LibCost};
9838}
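// Usage note (illustrative): for a bundle of sinf calls the caller compares
// the two returned costs and later emits either the vector intrinsic
// (llvm.sin.v4f32) or the vector library routine found through VFDatabase,
// whichever is cheaper; if both come back invalid the bundle is gathered (see
// the Call handling in getScalarsVectorizationState below).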
9839
9840BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
9841 const InstructionsState &S, ArrayRef<Value *> VL,
9842 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
9843 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
9844 assert(S.getMainOp() &&
9845 "Expected instructions with same/alternate opcodes only.");
9846
9847 unsigned ShuffleOrOp =
9848 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
9849 Instruction *VL0 = S.getMainOp();
9850 switch (ShuffleOrOp) {
9851 case Instruction::PHI: {
9852 // Too many operands - gather, most probably won't be vectorized.
9853 if (VL0->getNumOperands() > MaxPHINumOperands)
9854 return TreeEntry::NeedToGather;
9855 // Check for terminator values (e.g. invoke).
9856 for (Value *V : VL) {
9857 auto *PHI = dyn_cast<PHINode>(V);
9858 if (!PHI)
9859 continue;
9860 for (Value *Incoming : PHI->incoming_values()) {
9861 Instruction *Term = dyn_cast<Instruction>(Incoming);
9862 if (Term && Term->isTerminator()) {
9863 LLVM_DEBUG(dbgs()
9864 << "SLP: Need to swizzle PHINodes (terminator use).\n");
9865 return TreeEntry::NeedToGather;
9866 }
9867 }
9868 }
9869
9870 return TreeEntry::Vectorize;
9871 }
9872 case Instruction::ExtractElement:
9873 if (any_of(VL, [&](Value *V) {
9874 auto *EI = dyn_cast<ExtractElementInst>(V);
9875 if (!EI)
9876 return true;
9877 return isVectorized(EI->getOperand(0));
9878 }))
9879 return TreeEntry::NeedToGather;
9880 [[fallthrough]];
9881 case Instruction::ExtractValue: {
9882 bool Reuse = canReuseExtract(VL, CurrentOrder);
9883 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
9884 // non-full registers).
9885 if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
9886 return TreeEntry::NeedToGather;
9887 if (Reuse || !CurrentOrder.empty())
9888 return TreeEntry::Vectorize;
9889 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
9890 return TreeEntry::NeedToGather;
9891 }
9892 case Instruction::InsertElement: {
9893 // Check that we have a buildvector and not a shuffle of 2 or more
9894 // different vectors.
9895 ValueSet SourceVectors;
9896 for (Value *V : VL) {
9897 if (isa<PoisonValue>(V)) {
9898 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
9899 return TreeEntry::NeedToGather;
9900 }
9901 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
9902 assert(getElementIndex(V) != std::nullopt &&
9903 "Non-constant or undef index?");
9904 }
9905
9906 if (count_if(VL, [&SourceVectors](Value *V) {
9907 return !SourceVectors.contains(V);
9908 }) >= 2) {
9909 // Found 2nd source vector - cancel.
9910 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
9911 "different source vectors.\n");
9912 return TreeEntry::NeedToGather;
9913 }
9914
9915 if (any_of(VL, [&SourceVectors](Value *V) {
9916 // The last InsertElement can have multiple uses.
9917 return SourceVectors.contains(V) && !V->hasOneUse();
9918 })) {
9919 assert(SLPReVec && "Only supported by REVEC.");
9920 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
9921 "multiple uses.\n");
9922 return TreeEntry::NeedToGather;
9923 }
9924
9925 return TreeEntry::Vectorize;
9926 }
9927 case Instruction::Load: {
9928 // Check that a vectorized load would load the same memory as a scalar
9929 // load. For example, we don't want to vectorize loads that are smaller
9930 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
9931 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
9932 // from such a struct, we read/write packed bits disagreeing with the
9933 // unvectorized version.
9934 auto IsGatheredNode = [&]() {
9935 if (!GatheredLoadsEntriesFirst)
9936 return false;
9937 return all_of(VL, [&](Value *V) {
9938 if (isa<PoisonValue>(V))
9939 return true;
9940 return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
9941 return TE->Idx >= *GatheredLoadsEntriesFirst;
9942 });
9943 });
9944 };
9945 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
9946 case LoadsState::Vectorize:
9947 return TreeEntry::Vectorize;
9948 case LoadsState::CompressVectorize:
9949 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
9950 // Delay slow vectorized nodes for better vectorization attempts.
9951 LoadEntriesToVectorize.insert(VectorizableTree.size());
9952 return TreeEntry::NeedToGather;
9953 }
9954 return IsGatheredNode() ? TreeEntry::NeedToGather
9955 : TreeEntry::CompressVectorize;
9956 case LoadsState::ScatterVectorize:
9957 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
9958 // Delay slow vectorized nodes for better vectorization attempts.
9959 LoadEntriesToVectorize.insert(VectorizableTree.size());
9960 return TreeEntry::NeedToGather;
9961 }
9962 return IsGatheredNode() ? TreeEntry::NeedToGather
9963 : TreeEntry::ScatterVectorize;
9964 case LoadsState::StridedVectorize:
9965 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
9966 // Delay slow vectorized nodes for better vectorization attempts.
9967 LoadEntriesToVectorize.insert(VectorizableTree.size());
9968 return TreeEntry::NeedToGather;
9969 }
9970 return IsGatheredNode() ? TreeEntry::NeedToGather
9971 : TreeEntry::StridedVectorize;
9972 case LoadsState::Gather:
9973#ifndef NDEBUG
9974 Type *ScalarTy = VL0->getType();
9975 if (DL->getTypeSizeInBits(ScalarTy) !=
9976 DL->getTypeAllocSizeInBits(ScalarTy))
9977 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
9978 else if (any_of(VL, [](Value *V) {
9979 auto *LI = dyn_cast<LoadInst>(V);
9980 return !LI || !LI->isSimple();
9981 }))
9982 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
9983 else
9984 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
9985#endif // NDEBUG
9986 registerNonVectorizableLoads(VL);
9987 return TreeEntry::NeedToGather;
9988 }
9989 llvm_unreachable("Unexpected state of loads");
9990 }
9991 case Instruction::ZExt:
9992 case Instruction::SExt:
9993 case Instruction::FPToUI:
9994 case Instruction::FPToSI:
9995 case Instruction::FPExt:
9996 case Instruction::PtrToInt:
9997 case Instruction::IntToPtr:
9998 case Instruction::SIToFP:
9999 case Instruction::UIToFP:
10000 case Instruction::Trunc:
10001 case Instruction::FPTrunc:
10002 case Instruction::BitCast: {
10003 Type *SrcTy = VL0->getOperand(0)->getType();
10004 for (Value *V : VL) {
10005 if (isa<PoisonValue>(V))
10006 continue;
10007 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
10008 if (Ty != SrcTy || !isValidElementType(Ty)) {
10009 LLVM_DEBUG(
10010 dbgs() << "SLP: Gathering casts with different src types.\n");
10011 return TreeEntry::NeedToGather;
10012 }
10013 }
10014 return TreeEntry::Vectorize;
10015 }
10016 case Instruction::ICmp:
10017 case Instruction::FCmp: {
10018 // Check that all of the compares have the same predicate.
10019 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
10020 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
10021 Type *ComparedTy = VL0->getOperand(0)->getType();
10022 for (Value *V : VL) {
10023 if (isa<PoisonValue>(V))
10024 continue;
10025 auto *Cmp = cast<CmpInst>(V);
10026 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
10027 Cmp->getOperand(0)->getType() != ComparedTy) {
10028 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
10029 return TreeEntry::NeedToGather;
10030 }
10031 }
10032 return TreeEntry::Vectorize;
10033 }
10034 case Instruction::Select:
10035 case Instruction::FNeg:
10036 case Instruction::Add:
10037 case Instruction::FAdd:
10038 case Instruction::Sub:
10039 case Instruction::FSub:
10040 case Instruction::Mul:
10041 case Instruction::FMul:
10042 case Instruction::UDiv:
10043 case Instruction::SDiv:
10044 case Instruction::FDiv:
10045 case Instruction::URem:
10046 case Instruction::SRem:
10047 case Instruction::FRem:
10048 case Instruction::Shl:
10049 case Instruction::LShr:
10050 case Instruction::AShr:
10051 case Instruction::And:
10052 case Instruction::Or:
10053 case Instruction::Xor:
10054 case Instruction::Freeze:
10055 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10056 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10057 auto *I = dyn_cast<Instruction>(V);
10058 return I && I->isBinaryOp() && !I->isFast();
10059 }))
10060 return TreeEntry::NeedToGather;
10061 return TreeEntry::Vectorize;
10062 case Instruction::GetElementPtr: {
10063 // We don't combine GEPs with complicated (nested) indexing.
10064 for (Value *V : VL) {
10065 auto *I = dyn_cast<GetElementPtrInst>(V);
10066 if (!I)
10067 continue;
10068 if (I->getNumOperands() != 2) {
10069 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
10070 return TreeEntry::NeedToGather;
10071 }
10072 }
10073
10074 // We can't combine several GEPs into one vector if they operate on
10075 // different types.
10076 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
10077 for (Value *V : VL) {
10078 auto *GEP = dyn_cast<GEPOperator>(V);
10079 if (!GEP)
10080 continue;
10081 Type *CurTy = GEP->getSourceElementType();
10082 if (Ty0 != CurTy) {
10083 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
10084 return TreeEntry::NeedToGather;
10085 }
10086 }
10087
10088 // We don't combine GEPs with non-constant indexes.
10089 Type *Ty1 = VL0->getOperand(1)->getType();
10090 for (Value *V : VL) {
10091 auto *I = dyn_cast<GetElementPtrInst>(V);
10092 if (!I)
10093 continue;
10094 auto *Op = I->getOperand(1);
10095 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10096 (Op->getType() != Ty1 &&
10097 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10098 Op->getType()->getScalarSizeInBits() >
10099 DL->getIndexSizeInBits(
10100 V->getType()->getPointerAddressSpace())))) {
10101 LLVM_DEBUG(
10102 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
10103 return TreeEntry::NeedToGather;
10104 }
10105 }
10106
10107 return TreeEntry::Vectorize;
10108 }
10109 case Instruction::Store: {
10110 // Check if the stores are consecutive or if we need to swizzle them.
10111 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
10112 // Avoid types that are padded when being allocated as scalars, while
10113 // being packed together in a vector (such as i1).
10114 if (DL->getTypeSizeInBits(ScalarTy) !=
10115 DL->getTypeAllocSizeInBits(ScalarTy)) {
10116 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
10117 return TreeEntry::NeedToGather;
10118 }
10119 // Make sure all stores in the bundle are simple - we can't vectorize
10120 // atomic or volatile stores.
10121 for (Value *V : VL) {
10122 auto *SI = cast<StoreInst>(V);
10123 if (!SI->isSimple()) {
10124 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
10125 return TreeEntry::NeedToGather;
10126 }
10127 PointerOps.push_back(SI->getPointerOperand());
10128 }
10129
10130 // Check the order of pointer operands.
10131 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
10132 Value *Ptr0;
10133 Value *PtrN;
10134 if (CurrentOrder.empty()) {
10135 Ptr0 = PointerOps.front();
10136 PtrN = PointerOps.back();
10137 } else {
10138 Ptr0 = PointerOps[CurrentOrder.front()];
10139 PtrN = PointerOps[CurrentOrder.back()];
10140 }
10141 std::optional<int64_t> Dist =
10142 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
10143 // Check that the sorted pointer operands are consecutive.
10144 if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
10145 return TreeEntry::Vectorize;
10146 }
10147
10148 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
10149 return TreeEntry::NeedToGather;
10150 }
10151 case Instruction::Call: {
10152 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10153 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10154 auto *I = dyn_cast<Instruction>(V);
10155 return I && !I->isFast();
10156 }))
10157 return TreeEntry::NeedToGather;
10158 // Check if the calls are all to the same vectorizable intrinsic or
10159 // library function.
10160 CallInst *CI = cast<CallInst>(VL0);
10161 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
10162
10163 VFShape Shape = VFShape::get(
10164 CI->getFunctionType(),
10165 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
10166 false /*HasGlobalPred*/);
10167 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10168
10169 if (!VecFunc && !isTriviallyVectorizable(ID)) {
10170 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
10171 return TreeEntry::NeedToGather;
10172 }
10173 Function *F = CI->getCalledFunction();
10174 unsigned NumArgs = CI->arg_size();
10175 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
10176 for (unsigned J = 0; J != NumArgs; ++J)
10177 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI))
10178 ScalarArgs[J] = CI->getArgOperand(J);
10179 for (Value *V : VL) {
10180 CallInst *CI2 = dyn_cast<CallInst>(V);
10181 if (!CI2 || CI2->getCalledFunction() != F ||
10182 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
10183 (VecFunc &&
10184 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
10185 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
10186 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
10187 << "\n");
10188 return TreeEntry::NeedToGather;
10189 }
10190 // Some intrinsics have scalar arguments and should be same in order for
10191 // them to be vectorized.
10192 for (unsigned J = 0; J != NumArgs; ++J) {
10193 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI)) {
10194 Value *A1J = CI2->getArgOperand(J);
10195 if (ScalarArgs[J] != A1J) {
10197 << "SLP: mismatched arguments in call:" << *CI
10198 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
10199 return TreeEntry::NeedToGather;
10200 }
10201 }
10202 }
10203 // Verify that the bundle operands are identical between the two calls.
10204 if (CI->hasOperandBundles() &&
10205 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
10206 CI->op_begin() + CI->getBundleOperandsEndIndex(),
10207 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
10208 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
10209 << "!=" << *V << '\n');
10210 return TreeEntry::NeedToGather;
10211 }
10212 }
10213 SmallVector<Type *> ArgTys =
10214 buildIntrinsicArgTypes(CI, ID, VL.size(), 0, TTI);
10215 auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
10216 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
10217 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
10218 return TreeEntry::NeedToGather;
10219
10220 return TreeEntry::Vectorize;
10221 }
10222 case Instruction::ShuffleVector: {
10223 if (!S.isAltShuffle()) {
10224 // REVEC can support non-alternate shuffles.
10225 if (SLPReVec && getShufflevectorNumGroups(VL))
10226 return TreeEntry::Vectorize;
10227 // If this is not an alternate sequence of opcodes like add-sub
10228 // then do not vectorize this instruction.
10229 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
10230 return TreeEntry::NeedToGather;
10231 }
10232 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
10233 LLVM_DEBUG(
10234 dbgs()
10235 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
10236 "the whole alt sequence is not profitable.\n");
10237 return TreeEntry::NeedToGather;
10238 }
10239
10240 return TreeEntry::Vectorize;
10241 }
10242 default:
10243 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
10244 return TreeEntry::NeedToGather;
10245 }
10246}
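// Quick examples of the decision above (illustrative): four simple loads with
// consecutive pointers yield TreeEntry::Vectorize; the same loads with
// unrelated pointers on a masked-gather-capable target yield
// TreeEntry::ScatterVectorize; four stores whose pointers cannot be sorted
// into a consecutive run are rejected with TreeEntry::NeedToGather.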
10247
10248namespace {
10249/// Allows to correctly handle operands of the phi nodes based on the \p Main
10250/// PHINode order of incoming basic blocks/values.
10251class PHIHandler {
10252 DominatorTree &DT;
10253 PHINode *Main = nullptr;
10254 SmallVector<Value *> Phis;
10255 SmallVector<SmallVector<Value *>> Operands;
10256
10257public:
10258 PHIHandler() = delete;
10259 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
10260 : DT(DT), Main(Main), Phis(Phis),
10261 Operands(Main->getNumIncomingValues(),
10262 SmallVector<Value *>(Phis.size(), nullptr)) {}
10263 void buildOperands() {
10264 constexpr unsigned FastLimit = 4;
10265 if (Main->getNumIncomingValues() <= FastLimit) {
10266 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
10267 BasicBlock *InBB = Main->getIncomingBlock(I);
10268 if (!DT.isReachableFromEntry(InBB)) {
10269 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10270 continue;
10271 }
10272 // Prepare the operand vector.
10273 for (auto [Idx, V] : enumerate(Phis)) {
10274 auto *P = dyn_cast<PHINode>(V);
10275 if (!P) {
10277 "Expected isa instruction or poison value.");
10278 Operands[I][Idx] = V;
10279 continue;
10280 }
10281 if (P->getIncomingBlock(I) == InBB)
10282 Operands[I][Idx] = P->getIncomingValue(I);
10283 else
10284 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
10285 }
10286 }
10287 return;
10288 }
10289 SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
10290 Blocks;
10291 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
10292 BasicBlock *InBB = Main->getIncomingBlock(I);
10293 if (!DT.isReachableFromEntry(InBB)) {
10294 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10295 continue;
10296 }
10297 Blocks.try_emplace(InBB).first->second.push_back(I);
10298 }
10299 for (auto [Idx, V] : enumerate(Phis)) {
10300 if (isa<PoisonValue>(V)) {
10301 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
10302 Operands[I][Idx] = V;
10303 continue;
10304 }
10305 auto *P = cast<PHINode>(V);
10306 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
10307 BasicBlock *InBB = P->getIncomingBlock(I);
10308 if (InBB == Main->getIncomingBlock(I)) {
10310 continue;
10311 Operands[I][Idx] = P->getIncomingValue(I);
10312 continue;
10313 }
10314 auto *It = Blocks.find(InBB);
10315 if (It == Blocks.end())
10316 continue;
10317 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
10318 }
10319 }
10320 for (const auto &P : Blocks) {
10321 ArrayRef<unsigned> IncomingValues = P.second;
10322 if (IncomingValues.size() <= 1)
10323 continue;
10324 unsigned BasicI = IncomingValues.consume_front();
10325 for (unsigned I : IncomingValues) {
10326 assert(all_of(enumerate(Operands[I]),
10327 [&](const auto &Data) {
10328 return !Data.value() ||
10329 Data.value() == Operands[BasicI][Data.index()];
10330 }) &&
10331 "Expected empty operands list.");
10332 Operands[I] = Operands[BasicI];
10333 }
10334 }
10335 }
10336 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
10337};
10338} // namespace
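// Illustrative example for PHIHandler (editorial): given
//   %p0 = phi i32 [ %a, %bb1 ], [ %b, %bb2 ]
//   %p1 = phi i32 [ %d, %bb2 ], [ %c, %bb1 ]
// with %p0 as the Main phi, buildOperands() matches incoming values by block
// rather than by operand index, so getOperands(0) is {%a, %c} and
// getOperands(1) is {%b, %d} (assuming %bb1 is %p0's first incoming block).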
10339
10340/// Returns main/alternate instructions for the given \p VL. Unlike
10341/// getSameOpcode, it supports non-compatible instructions for better
10342/// SplitVectorize node support.
10343/// \returns the first main/alt instructions if the list contains only poisons
10344/// and instructions with exactly 2 opcodes; a pair of nullptrs otherwise.
10345static std::pair<Instruction *, Instruction *>
10346getMainAltOpsNoStateVL(ArrayRef<Value *> VL) {
10347 Instruction *MainOp = nullptr;
10348 Instruction *AltOp = nullptr;
10349 for (Value *V : VL) {
10350 if (isa<PoisonValue>(V))
10351 continue;
10352 auto *I = dyn_cast<Instruction>(V);
10353 if (!I)
10354 return {};
10355 if (!MainOp) {
10356 MainOp = I;
10357 continue;
10358 }
10359 if (MainOp->getOpcode() == I->getOpcode()) {
10360 if (I->getParent() != MainOp->getParent())
10361 return {};
10362 continue;
10363 }
10364 if (!AltOp) {
10365 AltOp = I;
10366 continue;
10367 }
10368 if (AltOp->getOpcode() == I->getOpcode()) {
10369 if (I->getParent() != AltOp->getParent())
10370 return {};
10371 continue;
10372 }
10373 return {};
10374 }
10375 if (!AltOp)
10376 return {};
10377 assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
10378 "Expected different main and alt instructions.");
10379 return std::make_pair(MainOp, AltOp);
10380}
10381
10382/// Checks that every instruction appears once in the list and if not, packs
10383/// them, building \p ReuseShuffleIndices mask and mutating \p VL. The list of
10384/// unique scalars is extended by poison values to the whole register size.
10385///
10386/// \returns false if \p VL could not be uniquified, in which case \p VL is
10387/// unchanged and \p ReuseShuffleIndices is empty.
10388static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
10389 SmallVectorImpl<int> &ReuseShuffleIndices,
10390 const TargetTransformInfo &TTI,
10391 const TargetLibraryInfo &TLI,
10392 const InstructionsState &S,
10393 const BoUpSLP::EdgeInfo &UserTreeIdx,
10394 bool TryPad = false) {
10395 // Check that every instruction appears once in this bundle.
10396 SmallVector<Value *> UniqueValues;
10397 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
10398 for (Value *V : VL) {
10399 if (isConstant(V)) {
10400 // Constants are always considered distinct, even if the same constant
10401 // appears multiple times in VL.
10402 ReuseShuffleIndices.emplace_back(
10403 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
10404 UniqueValues.emplace_back(V);
10405 continue;
10406 }
10407 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
10408 ReuseShuffleIndices.emplace_back(Res.first->second);
10409 if (Res.second)
10410 UniqueValues.emplace_back(V);
10411 }
10412
10413 // Easy case: VL has unique values and a "natural" size
10414 size_t NumUniqueScalarValues = UniqueValues.size();
10415 bool IsFullVectors = hasFullVectorsOrPowerOf2(
10416 TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
10417 if (NumUniqueScalarValues == VL.size() &&
10418 (VectorizeNonPowerOf2 || IsFullVectors)) {
10419 ReuseShuffleIndices.clear();
10420 return true;
10421 }
10422
10423 // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops.
10424 if ((UserTreeIdx.UserTE &&
10425 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
10427 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
10428 "for nodes with padding.\n");
10429 ReuseShuffleIndices.clear();
10430 return false;
10431 }
10432
10433 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
10434 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
10435 (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
10436 return isa<UndefValue>(V) || !isConstant(V);
10437 }))) {
10438 if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
10439 S.getMainOp()->isSafeToRemove() &&
10440 (S.areInstructionsWithCopyableElements() ||
10441 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>))) {
10442 // Find the number of elements, which forms full vectors.
10443 unsigned PWSz = getFullVectorNumberOfElements(
10444 TTI, UniqueValues.front()->getType(), UniqueValues.size());
10445 PWSz = std::min<unsigned>(PWSz, VL.size());
10446 if (PWSz == VL.size()) {
10447 // We ended up with the same size after removing duplicates and
10448 // upgrading the resulting vector size to a "nice size". Just keep
10449 // the initial VL then.
10450 ReuseShuffleIndices.clear();
10451 } else {
10452 // Pad unique values with poison to grow the vector to a "nice" size
10453 SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
10454 UniqueValues.end());
10455 PaddedUniqueValues.append(
10456 PWSz - UniqueValues.size(),
10457 PoisonValue::get(UniqueValues.front()->getType()));
10458 // Check that the operations, extended with poisons/copyable elements, are
10459 // still valid for vectorization (div/rem are not allowed).
10460 if (!S.areInstructionsWithCopyableElements() &&
10461 !getSameOpcode(PaddedUniqueValues, TLI).valid()) {
10462 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10463 ReuseShuffleIndices.clear();
10464 return false;
10465 }
10466 VL = std::move(PaddedUniqueValues);
10467 }
10468 return true;
10469 }
10470 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10471 ReuseShuffleIndices.clear();
10472 return false;
10473 }
10474 VL = std::move(UniqueValues);
10475 return true;
10476}
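// Worked example (illustrative): VL = {%x, %y, %x, %z, %y, %w} has four
// unique scalars, so ReuseShuffleIndices becomes {0, 1, 0, 2, 1, 3} and VL
// shrinks to {%x, %y, %z, %w}; when the unique count is not a full/power-of-
// two size and TryPad is set, the unique scalars are instead padded with
// poison up to the next full vector.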
10477
10478bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
10479 const InstructionsState &LocalState,
10480 SmallVectorImpl<Value *> &Op1,
10481 SmallVectorImpl<Value *> &Op2,
10482 OrdersType &ReorderIndices) const {
10483 constexpr unsigned SmallNodeSize = 4;
10484 if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
10485 !SplitAlternateInstructions)
10486 return false;
10487
10488 // Check if this is a duplicate of another split entry.
10489 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
10490 << ".\n");
10491 for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
10492 if (E->isSame(VL)) {
10493 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
10494 << *LocalState.getMainOp() << ".\n");
10495 return false;
10496 }
10497 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
10498 if (all_of(VL, [&](Value *V) {
10499 return isa<PoisonValue>(V) || Values.contains(V);
10500 })) {
10501 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
10502 return false;
10503 }
10504 }
10505
10506 ReorderIndices.assign(VL.size(), VL.size());
10507 SmallBitVector Op1Indices(VL.size());
10508 for (auto [Idx, V] : enumerate(VL)) {
10509 auto *I = dyn_cast<Instruction>(V);
10510 if (!I) {
10511 Op1.push_back(V);
10512 Op1Indices.set(Idx);
10513 continue;
10514 }
10515 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
10516 isMainInstruction(I, LocalState.getMainOp(), LocalState.getAltOp(),
10517 *TLI)) ||
10518 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
10519 !isAlternateInstruction(I, LocalState.getMainOp(),
10520 LocalState.getAltOp(), *TLI))) {
10521 Op1.push_back(V);
10522 Op1Indices.set(Idx);
10523 continue;
10524 }
10525 Op2.push_back(V);
10526 }
10527 Type *ScalarTy = getValueType(VL.front());
10528 VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
10529 unsigned Opcode0 = LocalState.getOpcode();
10530 unsigned Opcode1 = LocalState.getAltOpcode();
10531 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
10532 // Enable split node, only if all nodes do not form legal alternate
10533 // instruction (like X86 addsub).
10534 SmallPtrSet<Value *, 4> UOp1(llvm::from_range, Op1);
10535 SmallPtrSet<Value *, 4> UOp2(llvm::from_range, Op2);
10536 if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
10537 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
10538 !hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), Op1.size()) ||
10539 !hasFullVectorsOrPowerOf2(*TTI, Op2.front()->getType(), Op2.size()))
10540 return false;
10541 // Enable split node, only if all nodes are power-of-2/full registers.
10542 unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
10543 for (unsigned Idx : seq<unsigned>(VL.size())) {
10544 if (Op1Indices.test(Idx)) {
10545 ReorderIndices[Op1Cnt] = Idx;
10546 ++Op1Cnt;
10547 } else {
10548 ReorderIndices[Op2Cnt] = Idx;
10549 ++Op2Cnt;
10550 }
10551 }
10552 if (isIdentityOrder(ReorderIndices))
10553 ReorderIndices.clear();
10554 SmallVector<int> Mask;
10555 if (!ReorderIndices.empty())
10556 inversePermutation(ReorderIndices, Mask);
10557 unsigned NumParts = TTI->getNumberOfParts(VecTy);
10558 VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
10559 VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
10560 // Check non-profitable single register ops, which better to be represented
10561 // as alternate ops.
10562 if (NumParts >= VL.size())
10563 return false;
10564 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
10565 InstructionCost InsertCost = ::getShuffleCost(
10566 *TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
10567 FixedVectorType *SubVecTy =
10568 getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
10569 InstructionCost NewShuffleCost =
10570 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
10571 if (!LocalState.isCmpOp() && NumParts <= 1 &&
10572 (Mask.empty() || InsertCost >= NewShuffleCost))
10573 return false;
10574 if ((LocalState.getMainOp()->isBinaryOp() &&
10575 LocalState.getAltOp()->isBinaryOp() &&
10576 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
10577 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
10578 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
10579 (LocalState.getMainOp()->isUnaryOp() &&
10580 LocalState.getAltOp()->isUnaryOp())) {
10581 InstructionCost OriginalVecOpsCost =
10582 TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
10583 TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
10584 SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
10585 for (unsigned Idx : seq<unsigned>(VL.size())) {
10586 if (isa<PoisonValue>(VL[Idx]))
10587 continue;
10588 OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
10589 }
10590 InstructionCost OriginalCost =
10591 OriginalVecOpsCost + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
10592 VecTy, OriginalMask, Kind);
10593 InstructionCost NewVecOpsCost =
10594 TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
10595 TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
10596 InstructionCost NewCost =
10597 NewVecOpsCost + InsertCost +
10598 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
10599 VectorizableTree.front()->getOpcode() == Instruction::Store
10600 ? NewShuffleCost
10601 : 0);
10602 // If not profitable to split - exit.
10603 if (NewCost >= OriginalCost)
10604 return false;
10605 }
10606 return true;
10607}
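// Illustrative sketch (not part of the upstream source): how the code above
// regroups an alternating bundle. Assuming an 8-wide bundle
// VL = {add0, sub1, add2, sub3, add4, sub5, add6, sub7} with main opcode Add
// and alternate opcode Sub, the loop collects Op1 = {add0, add2, add4, add6}
// and Op2 = {sub1, sub3, sub5, sub7}, and the index loop produces
//
//   ReorderIndices = {0, 2, 4, 6, 1, 3, 5, 7}
//
// i.e. the Op1 lanes come first, followed by the Op2 lanes. The split is
// rejected if the target can lower the original mix as a single legal
// alternate instruction (e.g. X86 addsub) or if either half would not fill a
// power-of-2/full register; for binary, cast and unary mixes it is
// additionally accepted only if the two narrower operations plus the insert
// cost beat the original wide alternate node.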
10608
10609namespace {
10610/// Class accepts incoming list of values, checks if it is able to model
10611/// "copyable" values as compatible operations, and generates the list of values
10612/// for scheduling and the list of operands for the new nodes.
10613class InstructionsCompatibilityAnalysis {
10614 DominatorTree &DT;
10615 const DataLayout &DL;
10616 const TargetTransformInfo &TTI;
10617 const TargetLibraryInfo &TLI;
10618 unsigned MainOpcode = 0;
10619 Instruction *MainOp = nullptr;
10620
10621 /// Checks if the opcode is supported as the main opcode for copyable
10622 /// elements.
10623 static bool isSupportedOpcode(const unsigned Opcode) {
10624 return Opcode == Instruction::Add || Opcode == Instruction::LShr;
10625 }
10626
10627 /// Identifies the best candidate value, which represents the main opcode
10628 /// operation.
10629 /// Currently the best candidate is the Add instruction whose parent block
10630 /// has the highest DFS incoming number (the block that dominates the others).
10631 void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
10632 BasicBlock *Parent = nullptr;
10633 // Checks if the instruction has supported opcode.
10634 auto IsSupportedInstruction = [&](Instruction *I) {
10635 return I && isSupportedOpcode(I->getOpcode()) &&
10636 (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
10637 };
10638 // Exclude operand instructions immediately to improve compile time; they
10639 // cannot be scheduled anyway.
10640 SmallDenseSet<Value *, 8> Operands;
10641 SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
10642 for (Value *V : VL) {
10643 auto *I = dyn_cast<Instruction>(V);
10644 if (!I)
10645 continue;
10646 if (!DT.isReachableFromEntry(I->getParent()))
10647 continue;
10648 if (Candidates.empty()) {
10649 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10650 Parent = I->getParent();
10651 Operands.insert(I->op_begin(), I->op_end());
10652 continue;
10653 }
10654 if (Parent == I->getParent()) {
10655 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10656 Operands.insert(I->op_begin(), I->op_end());
10657 continue;
10658 }
10659 auto *NodeA = DT.getNode(Parent);
10660 auto *NodeB = DT.getNode(I->getParent());
10661 assert(NodeA && "Should only process reachable instructions");
10662 assert(NodeB && "Should only process reachable instructions");
10663 assert((NodeA == NodeB) ==
10664 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10665 "Different nodes should have different DFS numbers");
10666 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
10667 Candidates.clear();
10668 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10669 Parent = I->getParent();
10670 Operands.clear();
10671 Operands.insert(I->op_begin(), I->op_end());
10672 }
10673 }
10674 unsigned BestOpcodeNum = 0;
10675 MainOp = nullptr;
10676 for (const auto &P : Candidates) {
10677 if (P.second.size() < BestOpcodeNum)
10678 continue;
10679 for (Instruction *I : P.second) {
10680 if (IsSupportedInstruction(I) && !Operands.contains(I)) {
10681 MainOp = I;
10682 BestOpcodeNum = P.second.size();
10683 break;
10684 }
10685 }
10686 }
10687 if (MainOp) {
10688 // Do not match, if any copyable is a terminator from the same block as
10689 // the main operation.
10690 if (any_of(VL, [&](Value *V) {
10691 auto *I = dyn_cast<Instruction>(V);
10692 return I && I->getParent() == MainOp->getParent() &&
10693 I->isTerminator();
10694 })) {
10695 MainOp = nullptr;
10696 return;
10697 }
10698 MainOpcode = MainOp->getOpcode();
10699 }
10700 }
10701
10702 /// Returns the idempotent value for the \p MainOp with the detected \p
10703 /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
10704 /// the operand itself, since V or V == V.
10705 Value *selectBestIdempotentValue() const {
10706 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10707 return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
10708 !MainOp->isCommutative());
10709 }
10710
10711 /// Returns the value and operands for \p V, considering whether it is an
10712 /// original instruction whose actual operands should be returned, or a
10713 /// copyable element that should be represented as an idempotent instruction.
10714 SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const {
10715 if (isa<PoisonValue>(V))
10716 return {V, V};
10717 if (!S.isCopyableElement(V))
10718 return convertTo(cast<Instruction>(V), S).second;
10719 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10720 return {V, selectBestIdempotentValue()};
10721 }
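  // Illustrative sketch (not part of the upstream source): operand rows built
  // for a bundle with one copyable element. Assuming MainOpcode is Add over
  // i32 and VL = {%t = add i32 %a, %b ; %x}, where %x is a plain value
  // treated as copyable, getOperands() yields {%a, %b} for %t and {%x, 0} for
  // %x, because ConstantExpr::getBinOpIdentity(Instruction::Add, i32) returns
  // i32 0; %x is thus modeled as the no-op "add i32 %x, 0". buildOperands()
  // then transposes these into
  //
  //   Operands[0] = {%a, %x}   // first operands of the widened add
  //   Operands[1] = {%b,  0}   // second operands of the widened add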
10722
10723 /// Builds operands for the original instructions.
10724 void
10725 buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
10726 SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
10727
10728 unsigned ShuffleOrOp =
10729 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10730 Instruction *VL0 = S.getMainOp();
10731
10732 switch (ShuffleOrOp) {
10733 case Instruction::PHI: {
10734 auto *PH = cast<PHINode>(VL0);
10735
10736 // Keeps the reordered operands to avoid code duplication.
10737 PHIHandler Handler(DT, PH, VL);
10738 Handler.buildOperands();
10739 Operands.assign(PH->getNumOperands(), {});
10740 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
10741 Operands[I].assign(Handler.getOperands(I).begin(),
10742 Handler.getOperands(I).end());
10743 return;
10744 }
10745 case Instruction::ExtractValue:
10746 case Instruction::ExtractElement:
10747 // This is a special case, as it does not gather, but at the same time
10748 // we are not extending buildTreeRec() towards the operands.
10749 Operands.assign(1, {VL.size(), VL0->getOperand(0)});
10750 return;
10751 case Instruction::InsertElement:
10752 Operands.assign(2, {VL.size(), nullptr});
10753 for (auto [Idx, V] : enumerate(VL)) {
10754 auto *IE = cast<InsertElementInst>(V);
10755 for (auto [OpIdx, Ops] : enumerate(Operands))
10756 Ops[Idx] = IE->getOperand(OpIdx);
10757 }
10758 return;
10759 case Instruction::Load:
10760 Operands.assign(
10761 1, {VL.size(),
10762 PoisonValue::get(cast<LoadInst>(VL0)->getPointerOperandType())});
10763 for (auto [V, Op] : zip(VL, Operands.back())) {
10764 auto *LI = dyn_cast<LoadInst>(V);
10765 if (!LI)
10766 continue;
10767 Op = LI->getPointerOperand();
10768 }
10769 return;
10770 case Instruction::ZExt:
10771 case Instruction::SExt:
10772 case Instruction::FPToUI:
10773 case Instruction::FPToSI:
10774 case Instruction::FPExt:
10775 case Instruction::PtrToInt:
10776 case Instruction::IntToPtr:
10777 case Instruction::SIToFP:
10778 case Instruction::UIToFP:
10779 case Instruction::Trunc:
10780 case Instruction::FPTrunc:
10781 case Instruction::BitCast:
10782 case Instruction::ICmp:
10783 case Instruction::FCmp:
10784 case Instruction::Select:
10785 case Instruction::FNeg:
10786 case Instruction::Add:
10787 case Instruction::FAdd:
10788 case Instruction::Sub:
10789 case Instruction::FSub:
10790 case Instruction::Mul:
10791 case Instruction::FMul:
10792 case Instruction::UDiv:
10793 case Instruction::SDiv:
10794 case Instruction::FDiv:
10795 case Instruction::URem:
10796 case Instruction::SRem:
10797 case Instruction::FRem:
10798 case Instruction::Shl:
10799 case Instruction::LShr:
10800 case Instruction::AShr:
10801 case Instruction::And:
10802 case Instruction::Or:
10803 case Instruction::Xor:
10804 case Instruction::Freeze:
10805 case Instruction::Store:
10806 case Instruction::ShuffleVector:
10807 Operands.assign(VL0->getNumOperands(), {VL.size(), nullptr});
10808 for (auto [Idx, V] : enumerate(VL)) {
10809 auto *I = dyn_cast<Instruction>(V);
10810 if (!I) {
10811 for (auto [OpIdx, Ops] : enumerate(Operands))
10812 Ops[Idx] = PoisonValue::get(VL0->getOperand(OpIdx)->getType());
10813 continue;
10814 }
10815 auto [Op, ConvertedOps] = convertTo(I, S);
10816 for (auto [OpIdx, Ops] : enumerate(Operands))
10817 Ops[Idx] = ConvertedOps[OpIdx];
10818 }
10819 return;
10820 case Instruction::GetElementPtr: {
10821 Operands.assign(2, {VL.size(), nullptr});
10822 // Need to cast all indices to the same type before vectorization to
10823 // avoid crash.
10824 // Required to be able to find correct matches between different gather
10825 // nodes and reuse the vectorized values rather than trying to gather them
10826 // again.
10827 const unsigned IndexIdx = 1;
10828 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
10829 Type *Ty =
10830 all_of(VL,
10831 [&](Value *V) {
10833 return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
10834 })
10835 ? VL0Ty
10836 : DL.getIndexType(cast<GetElementPtrInst>(VL0)
10837 ->getPointerOperandType()
10838 ->getScalarType());
10839 for (auto [Idx, V] : enumerate(VL)) {
10841 if (!GEP) {
10842 Operands[0][Idx] = V;
10843 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
10844 continue;
10845 }
10846 Operands[0][Idx] = GEP->getPointerOperand();
10847 auto *Op = GEP->getOperand(IndexIdx);
10848 auto *CI = dyn_cast<ConstantInt>(Op);
10849 Operands[1][Idx] = CI ? ConstantFoldIntegerCast(
10850 CI, Ty, CI->getValue().isSignBitSet(), DL)
10851 : Op;
10852 }
10853 return;
10854 }
10855 case Instruction::Call: {
10856 auto *CI = cast<CallInst>(VL0);
10858 for (unsigned Idx : seq<unsigned>(CI->arg_size())) {
10860 continue;
10861 auto &Ops = Operands.emplace_back();
10862 for (Value *V : VL) {
10863 auto *I = dyn_cast<Instruction>(V);
10864 Ops.push_back(I ? I->getOperand(Idx)
10865 : PoisonValue::get(VL0->getOperand(Idx)->getType()));
10866 }
10867 }
10868 return;
10869 }
10870 default:
10871 break;
10872 }
10873 llvm_unreachable("Unexpected vectorization of the instructions.");
10874 }
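  // Illustrative sketch (not part of the upstream source): the generic case
  // above simply transposes the bundle into per-operand rows. For a bundle of
  // two binary operators
  //
  //   %x = sub i32 %a, %b
  //   %y = sub i32 %c, %d
  //
  // the rows become Operands[0] = {%a, %c} and Operands[1] = {%b, %d}, while
  // any non-instruction lane is padded with poison of the matching operand
  // type so every row keeps the bundle width.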
10875
10876public:
10877 InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
10878 const TargetTransformInfo &TTI,
10879 const TargetLibraryInfo &TLI)
10880 : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
10881
10882 InstructionsState
10883 buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
10884 bool TryCopyableElementsVectorization,
10885 bool WithProfitabilityCheck = false,
10886 bool SkipSameCodeCheck = false) {
10887 InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
10888 ? InstructionsState::invalid()
10889 : getSameOpcode(VL, TLI);
10890 if (S)
10891 return S;
10892 if (!VectorizeCopyableElements || !TryCopyableElementsVectorization)
10893 return S;
10894 findAndSetMainInstruction(VL, R);
10895 if (!MainOp)
10896 return InstructionsState::invalid();
10897 S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
10898 if (!WithProfitabilityCheck)
10899 return S;
10900 // Check if it is profitable to vectorize the instruction.
10901 SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
10902 auto BuildCandidates =
10903 [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
10904 Value *V2) {
10905 if (V1 != V2 && isa<PHINode>(V1))
10906 return;
10907 auto *I1 = dyn_cast<Instruction>(V1);
10908 auto *I2 = dyn_cast<Instruction>(V2);
10909 if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
10910 I1->getParent() != I2->getParent())
10911 return;
10912 Candidates.emplace_back(V1, (I1 || I2) ? V2 : V1);
10913 };
10914 if (VL.size() == 2) {
10915 // Check if the operands allow better vectorization.
10916 SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
10917 BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
10918 BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
10919 bool Res = !Candidates1.empty() && !Candidates2.empty() &&
10920 R.findBestRootPair(Candidates1) &&
10921 R.findBestRootPair(Candidates2);
10922 if (!Res && isCommutative(MainOp)) {
10923 Candidates1.clear();
10924 Candidates2.clear();
10925 BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
10926 BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
10927 Res = !Candidates1.empty() && !Candidates2.empty() &&
10928 R.findBestRootPair(Candidates1) &&
10929 R.findBestRootPair(Candidates2);
10930 }
10931 if (!Res)
10932 return InstructionsState::invalid();
10934 InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
10935 InstructionCost VectorCost;
10936 FixedVectorType *VecTy =
10937 getWidenedType(S.getMainOp()->getType(), VL.size());
10938 switch (MainOpcode) {
10939 case Instruction::Add:
10940 case Instruction::LShr:
10941 VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
10942 break;
10943 default:
10944 llvm_unreachable("Unexpected instruction.");
10945 }
10946 if (VectorCost > ScalarCost)
10947 return InstructionsState::invalid();
10948 return S;
10949 }
10950 assert(Operands.size() == 2 && "Unexpected number of operands!");
10951 unsigned CopyableNum =
10952 count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
10953 if (CopyableNum < VL.size() / 2)
10954 return S;
10955 // Too many phi copyables - exit.
10956 const unsigned Limit = VL.size() / 24;
10957 if ((CopyableNum >= VL.size() - Limit ||
10958 (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
10959 CopyableNum >= MaxPHINumOperands) &&
10960 all_of(VL, [&](Value *V) {
10961 return isa<PHINode>(V) || !S.isCopyableElement(V);
10962 }))
10963 return InstructionsState::invalid();
10964 // Check profitability if number of copyables > VL.size() / 2.
10965 // 1. Reorder operands for better matching.
10966 if (isCommutative(MainOp)) {
10967 for (auto &Ops : Operands) {
10968 // Make instructions the first operands.
10969 if (!isa<Instruction>(Ops.front()) && isa<Instruction>(Ops.back())) {
10970 std::swap(Ops.front(), Ops.back());
10971 continue;
10972 }
10973 // Make constants the second operands.
10974 if (isa<Constant>(Ops.front())) {
10975 std::swap(Ops.front(), Ops.back());
10976 continue;
10977 }
10978 }
10979 }
10980 // 2. Check, if operands can be vectorized.
10981 if (count_if(Operands.back(), IsaPred<Instruction>) > 1)
10982 return InstructionsState::invalid();
10983 auto CheckOperand = [&](ArrayRef<Value *> Ops) {
10984 if (allConstant(Ops) || isSplat(Ops))
10985 return true;
10986 // Check if it is an "almost" splat, i.e. it has >= 4 elements and only
10987 // a single one is different.
10988 constexpr unsigned Limit = 4;
10989 if (Operands.front().size() >= Limit) {
10990 SmallDenseMap<const Value *, unsigned> Counters;
10991 for (Value *V : Ops) {
10992 if (isa<UndefValue>(V))
10993 continue;
10994 ++Counters[V];
10995 }
10996 if (Counters.size() == 2 &&
10997 any_of(Counters, [&](const std::pair<const Value *, unsigned> &C) {
10998 return C.second == 1;
10999 }))
11000 return true;
11001 }
11002 // First operand not a constant or splat? Last attempt - check for
11003 // potential vectorization.
11004 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
11005 InstructionsState OpS = Analysis.buildInstructionsState(
11006 Ops, R, /*TryCopyableElementsVectorization=*/true);
11007 if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
11008 return false;
11009 unsigned CopyableNum =
11010 count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
11011 return CopyableNum <= VL.size() / 2;
11012 };
11013 if (!CheckOperand(Operands.front()))
11014 return InstructionsState::invalid();
11015
11016 return S;
11017 }
11018
11019 SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
11020 ArrayRef<Value *> VL) {
11021 assert(S && "Invalid state!");
11023 if (S.areInstructionsWithCopyableElements()) {
11024 MainOp = S.getMainOp();
11025 MainOpcode = S.getOpcode();
11026 Operands.assign(MainOp->getNumOperands(),
11027 BoUpSLP::ValueList(VL.size(), nullptr));
11028 for (auto [Idx, V] : enumerate(VL)) {
11029 SmallVector<Value *> OperandsForValue = getOperands(S, V);
11030 for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
11031 Operands[OperandIdx][Idx] = Operand;
11032 }
11033 } else {
11034 buildOriginalOperands(S, VL, Operands);
11035 }
11036 return Operands;
11037 }
11038};
11039} // namespace
11040
11041BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
11042 ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
11043 bool TryCopyableElementsVectorization) const {
11044 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
11045
11046 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11047 InstructionsState S = Analysis.buildInstructionsState(
11048 VL, *this, TryCopyableElementsVectorization,
11049 /*WithProfitabilityCheck=*/true, TryCopyableElementsVectorization);
11050
11051 // Don't go into catchswitch blocks, which can happen with PHIs.
11052 // Such blocks can only have PHIs and the catchswitch. There is no
11053 // place to insert a shuffle if we need to, so just avoid that issue.
11054 if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
11055 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
11056 // Do not try to pack to avoid extra instructions here.
11057 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11058 /*TryToFindDuplicates=*/false);
11059 }
11060
11061 // Check if this is a duplicate of another entry.
11062 if (S) {
11063 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
11064 for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
11065 if (E->isSame(VL)) {
11066 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
11067 << ".\n");
11068 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11069 }
11070 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
11071 if (all_of(VL, [&](Value *V) {
11072 return isa<PoisonValue>(V) || Values.contains(V) ||
11073 (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
11074 LI->getLoopFor(S.getMainOp()->getParent()) &&
11075 isVectorized(V));
11076 })) {
11077 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
11078 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11079 }
11080 }
11081 }
11082
11083 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
11084 // a load), in which case peek through to include it in the tree, without
11085 // ballooning over-budget.
11086 if (Depth >= RecursionMaxDepth &&
11087 !(S && !S.isAltShuffle() && VL.size() >= 4 &&
11088 (match(S.getMainOp(), m_Load(m_Value())) ||
11089 all_of(VL, [&S](const Value *I) {
11090 return match(I,
11092 cast<Instruction>(I)->getOpcode() == S.getOpcode();
11093 })))) {
11094 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
11095 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11096 }
11097
11098 // Don't handle scalable vectors
11099 if (S && S.getOpcode() == Instruction::ExtractElement &&
11101 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
11102 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
11103 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11104 }
11105
11106 // Don't handle vectors.
11107 if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
11108 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
11109 // Do not try to pack to avoid extra instructions here.
11110 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11111 /*TryToFindDuplicates=*/false);
11112 }
11113
11114 // If all of the operands are identical or constant we have a simple solution.
11115 // If we deal with insert/extract instructions, they all must have constant
11116 // indices, otherwise we should gather them, not try to vectorize.
11117 // If this is an alternate op node with 2 elements and gathered operands, do
11118 // not vectorize.
11119 auto NotProfitableForVectorization = [&S, this, Depth](ArrayRef<Value *> VL) {
11120 if (!S || !S.isAltShuffle() || VL.size() > 2)
11121 return false;
11122 if (VectorizableTree.size() < MinTreeSize)
11123 return false;
11124 if (Depth >= RecursionMaxDepth - 1)
11125 return true;
11126 // Check if all operands are extracts, are part of a vector node, or can
11127 // build a regular vectorized node.
11128 SmallVector<unsigned, 8> InstsCount;
11129 for (Value *V : VL) {
11130 auto *I = cast<Instruction>(V);
11131 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
11132 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
11133 }));
11134 }
11135 bool IsCommutative =
11136 isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
11137 if ((IsCommutative &&
11138 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
11139 (!IsCommutative &&
11140 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
11141 return true;
11142 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
11144 auto *I1 = cast<Instruction>(VL.front());
11145 auto *I2 = cast<Instruction>(VL.back());
11146 for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
11147 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11148 I2->getOperand(Op));
11149 if (static_cast<unsigned>(count_if(
11150 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11152 })) >= S.getMainOp()->getNumOperands() / 2)
11153 return false;
11154 if (S.getMainOp()->getNumOperands() > 2)
11155 return true;
11156 if (IsCommutative) {
11157 // Check permuted operands.
11158 Candidates.clear();
11159 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
11160 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11161 I2->getOperand((Op + 1) % E));
11162 if (any_of(
11163 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11165 }))
11166 return false;
11167 }
11168 return true;
11169 };
11170 SmallVector<unsigned> SortedIndices;
11171 BasicBlock *BB = nullptr;
11172 bool IsScatterVectorizeUserTE =
11173 UserTreeIdx.UserTE &&
11174 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11175 bool AreAllSameBlock = S.valid();
11176 bool AreScatterAllGEPSameBlock =
11177 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
11178 VL.size() > 2 &&
11179 all_of(VL,
11180 [&BB](Value *V) {
11181 auto *I = dyn_cast<GetElementPtrInst>(V);
11182 if (!I)
11183 return doesNotNeedToBeScheduled(V);
11184 if (!BB)
11185 BB = I->getParent();
11186 return BB == I->getParent() && I->getNumOperands() == 2;
11187 }) &&
11188 BB &&
11189 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
11190 SortedIndices));
11191 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
11192 if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
11193 (S &&
11195 S.getMainOp()) &&
11197 NotProfitableForVectorization(VL)) {
11198 if (!S) {
11199 LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
11200 "C,S,B,O, small shuffle. \n";
11201 dbgs() << "[";
11202 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11203 dbgs() << "]\n");
11204 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11205 /*TryToFindDuplicates=*/true,
11206 /*TrySplitVectorize=*/true);
11207 }
11208 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
11209 dbgs() << "[";
11210 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11211 dbgs() << "]\n");
11212 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11213 }
11214
11215 // Don't vectorize ephemeral values.
11216 if (S && !EphValues.empty()) {
11217 for (Value *V : VL) {
11218 if (EphValues.count(V)) {
11219 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
11220 << ") is ephemeral.\n");
11221 // Do not try to pack to avoid extra instructions here.
11222 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11223 /*TryToFindDuplicates=*/false);
11224 }
11225 }
11226 }
11227
11228 // We now know that this is a vector of instructions of the same type from
11229 // the same block.
11230
11231 // Check that none of the instructions in the bundle are already in the tree
11232 // and whether the node would be unprofitable to vectorize as a small
11233 // alternate node.
11234 if (S && S.isAltShuffle()) {
11235 auto GetNumVectorizedExtracted = [&]() {
11236 APInt Extracted = APInt::getZero(VL.size());
11237 APInt Vectorized = APInt::getAllOnes(VL.size());
11238 for (auto [Idx, V] : enumerate(VL)) {
11239 auto *I = dyn_cast<Instruction>(V);
11240 if (!I || doesNotNeedToBeScheduled(I) ||
11241 all_of(I->operands(), [&](const Use &U) {
11242 return isa<ExtractElementInst>(U.get());
11243 }))
11244 continue;
11245 if (isVectorized(I))
11246 Vectorized.clearBit(Idx);
11247 else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
11248 Extracted.setBit(Idx);
11249 }
11250 return std::make_pair(Vectorized, Extracted);
11251 };
11252 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
11254 bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
11255 if (!Vectorized.isAllOnes() && !PreferScalarize) {
11256 // Rough cost estimation, if the vector code (+ potential extracts) is
11257 // more profitable than the scalar + buildvector.
11258 Type *ScalarTy = VL.front()->getType();
11259 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11260 InstructionCost VectorizeCostEstimate =
11261 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, {}, Kind) +
11262 ::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Extracted,
11263 /*Insert=*/false, /*Extract=*/true, Kind);
11264 InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
11265 *TTI, ScalarTy, VecTy, Vectorized,
11266 /*Insert=*/true, /*Extract=*/false, Kind, /*ForPoisonSrc=*/false);
11267 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
11268 }
11269 if (PreferScalarize) {
11270 LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
11271 "node is not profitable.\n");
11272 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11273 }
11274 }
11275
11276 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
11277 if (UserIgnoreList && !UserIgnoreList->empty()) {
11278 for (Value *V : VL) {
11279 if (UserIgnoreList->contains(V)) {
11280 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
11281 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11282 }
11283 }
11284 }
11285
11286 // Special processing for sorted pointers for the ScatterVectorize node with
11287 // constant indices only.
11288 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
11289 assert(VL.front()->getType()->isPointerTy() &&
11291 "Expected pointers only.");
11292 // Reset S to make it GetElementPtr kind of node.
11293 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
11294 assert(It != VL.end() && "Expected at least one GEP.");
11295 S = getSameOpcode(*It, *TLI);
11296 }
11297
11298 // Check that all of the users of the scalars that we want to vectorize are
11299 // schedulable.
11300 Instruction *VL0 = S.getMainOp();
11301 BB = VL0->getParent();
11302
11303 if (S &&
11305 !DT->isReachableFromEntry(BB))) {
11306 // Don't go into unreachable blocks. They may contain instructions with
11307 // dependency cycles which confuse the final scheduling.
11308 // Do not vectorize EH and non-returning blocks, not profitable in most
11309 // cases.
11310 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
11311 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11312 }
11313 return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
11314}
11315
11316void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
11317 const EdgeInfo &UserTreeIdx,
11318 unsigned InterleaveFactor) {
11319 assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!");
11320
11321 SmallVector<int> ReuseShuffleIndices;
11322 SmallVector<Value *> VL(VLRef);
11323
11324 // Tries to build split node.
11325 auto TrySplitNode = [&](const InstructionsState &LocalState) {
11326 SmallVector<Value *> Op1, Op2;
11327 OrdersType ReorderIndices;
11328 if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
11329 return false;
11330
11331 auto Invalid = ScheduleBundle::invalid();
11332 auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
11333 UserTreeIdx, {}, ReorderIndices);
11334 LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
11335 auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
11336 InstructionsState S = getSameOpcode(Op, *TLI);
11337 if (S && (isa<LoadInst>(S.getMainOp()) ||
11338 getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
11339 // Build gather node for loads, they will be gathered later.
11340 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11341 Idx == 0 ? 0 : Op1.size());
11342 (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
11343 } else {
11344 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11345 Idx == 0 ? 0 : Op1.size());
11346 buildTreeRec(Op, Depth, {TE, Idx});
11347 }
11348 };
11349 AddNode(Op1, 0);
11350 AddNode(Op2, 1);
11351 return true;
11352 };
11353
11354 auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
11355 bool AreConsts = false;
11356 for (Value *V : VL) {
11357 if (isa<PoisonValue>(V))
11358 continue;
11359 if (isa<Constant>(V)) {
11360 AreConsts = true;
11361 continue;
11362 }
11363 if (!isa<PHINode>(V))
11364 return false;
11365 }
11366 return AreConsts;
11367 };
11368 if (AreOnlyConstsWithPHIs(VL)) {
11369 LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
11370 newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
11371 return;
11372 }
11373
11374 ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
11375 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
11376 InstructionsState S = Legality.getInstructionsState();
11377 if (!Legality.isLegal()) {
11378 if (Legality.trySplitVectorize()) {
11379 auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
11380 // Last chance to try to vectorize alternate node.
11381 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
11382 return;
11383 }
11384 if (!S)
11385 Legality = getScalarsVectorizationLegality(
11386 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
11387 if (!Legality.isLegal()) {
11388 if (Legality.tryToFindDuplicates())
11389 tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
11390 UserTreeIdx);
11391
11392 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11393 return;
11394 }
11395 S = Legality.getInstructionsState();
11396 }
11397
11398 // FIXME: investigate if there are profitable cases for VL.size() <= 4.
11399 if (S.isAltShuffle() && TrySplitNode(S))
11400 return;
11401
11402 // Check that every instruction appears once in this bundle.
11403 if (!tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx,
11404 /*TryPad=*/true)) {
11405 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11406 return;
11407 }
11408
11409 // Perform specific checks for each particular instruction kind.
11410 bool IsScatterVectorizeUserTE =
11411 UserTreeIdx.UserTE &&
11412 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11413 OrdersType CurrentOrder;
11414 SmallVector<Value *> PointerOps;
11415 StridedPtrInfo SPtrInfo;
11416 TreeEntry::EntryState State = getScalarsVectorizationState(
11417 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
11418 if (State == TreeEntry::NeedToGather) {
11419 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11420 return;
11421 }
11422
11423 Instruction *VL0 = S.getMainOp();
11424 BasicBlock *BB = VL0->getParent();
11425 auto &BSRef = BlocksSchedules[BB];
11426 if (!BSRef)
11427 BSRef = std::make_unique<BlockScheduling>(BB);
11428
11429 BlockScheduling &BS = *BSRef;
11430
11431 SetVector<Value *> UniqueValues(llvm::from_range, VL);
11432 std::optional<ScheduleBundle *> BundlePtr =
11433 BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
11434#ifdef EXPENSIVE_CHECKS
11435 // Make sure we didn't break any internal invariants
11436 BS.verify();
11437#endif
11438 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
11439 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
11440 // Last chance to try to vectorize alternate node.
11441 if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
11442 return;
11443 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11444 NonScheduledFirst.insert(VL.front());
11445 if (S.getOpcode() == Instruction::Load &&
11446 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
11448 return;
11449 }
11450 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11451 SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
11452 ScheduleBundle Empty;
11453 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
11454 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
11455
11456 unsigned ShuffleOrOp =
11457 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
11458 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
11459 // Postpone PHI nodes creation
11460 SmallVector<unsigned> PHIOps;
11461 for (unsigned I : seq<unsigned>(Operands.size())) {
11463 if (Op.empty())
11464 continue;
11465 InstructionsState S = getSameOpcode(Op, *TLI);
11466 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
11467 buildTreeRec(Op, Depth + 1, {TE, I});
11468 else
11469 PHIOps.push_back(I);
11470 }
11471 for (unsigned I : PHIOps)
11472 buildTreeRec(Operands[I], Depth + 1, {TE, I});
11473 };
11474 switch (ShuffleOrOp) {
11475 case Instruction::PHI: {
11476 TreeEntry *TE =
11477 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
11478 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
11479 TE->dump());
11480
11481 TE->setOperands(Operands);
11482 CreateOperandNodes(TE, Operands);
11483 return;
11484 }
11485 case Instruction::ExtractValue:
11486 case Instruction::ExtractElement: {
11487 if (CurrentOrder.empty()) {
11488 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
11489 } else {
11490 LLVM_DEBUG({
11491 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
11492 "with order";
11493 for (unsigned Idx : CurrentOrder)
11494 dbgs() << " " << Idx;
11495 dbgs() << "\n";
11496 });
11497 fixupOrderingIndices(CurrentOrder);
11498 }
11499 // Insert new order with initial value 0, if it does not exist,
11500 // otherwise return the iterator to the existing one.
11501 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11502 ReuseShuffleIndices, CurrentOrder);
11503 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
11504 "(ExtractValueInst/ExtractElementInst).\n";
11505 TE->dump());
11506 // This is a special case, as it does not gather, but at the same time
11507 // we are not extending buildTreeRec() towards the operands.
11508 TE->setOperands(Operands);
11509 return;
11510 }
11511 case Instruction::InsertElement: {
11512 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
11513
11514 auto OrdCompare = [](const std::pair<int, int> &P1,
11515 const std::pair<int, int> &P2) {
11516 return P1.first > P2.first;
11517 };
11518 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
11519 decltype(OrdCompare)>
11520 Indices(OrdCompare);
11521 for (int I = 0, E = VL.size(); I < E; ++I) {
11522 unsigned Idx = *getElementIndex(VL[I]);
11523 Indices.emplace(Idx, I);
11524 }
11525 OrdersType CurrentOrder(VL.size(), VL.size());
11526 bool IsIdentity = true;
11527 for (int I = 0, E = VL.size(); I < E; ++I) {
11528 CurrentOrder[Indices.top().second] = I;
11529 IsIdentity &= Indices.top().second == I;
11530 Indices.pop();
11531 }
11532 if (IsIdentity)
11533 CurrentOrder.clear();
11534 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11535 {}, CurrentOrder);
11536 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
11537 TE->dump());
11538
11539 TE->setOperands(Operands);
11540 buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
11541 return;
11542 }
11543 case Instruction::Load: {
11544 // Check that a vectorized load would load the same memory as a scalar
11545 // load. For example, we don't want to vectorize loads that are smaller
11546 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
11547 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
11548 // from such a struct, we read/write packed bits disagreeing with the
11549 // unvectorized version.
11550 TreeEntry *TE = nullptr;
11551 fixupOrderingIndices(CurrentOrder);
11552 switch (State) {
11553 case TreeEntry::Vectorize:
11554 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11555 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
11556 if (CurrentOrder.empty())
11557 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
11558 TE->dump());
11559 else
11561 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
11562 TE->dump());
11563 break;
11564 case TreeEntry::CompressVectorize:
11565 // Vectorizing non-consecutive loads with (masked)load + compress.
11566 TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
11567 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
11568 LLVM_DEBUG(
11569 dbgs()
11570 << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
11571 TE->dump());
11572 break;
11573 case TreeEntry::StridedVectorize:
11574 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
11575 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
11576 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
11577 TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
11578 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
11579 TE->dump());
11580 break;
11581 case TreeEntry::ScatterVectorize:
11582 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
11583 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
11584 UserTreeIdx, ReuseShuffleIndices);
11585 LLVM_DEBUG(
11586 dbgs()
11587 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
11588 TE->dump());
11589 break;
11590 case TreeEntry::CombinedVectorize:
11591 case TreeEntry::SplitVectorize:
11592 case TreeEntry::NeedToGather:
11593 llvm_unreachable("Unexpected loads state.");
11594 }
11595 if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
11596 assert(Operands.size() == 1 && "Expected a single operand only");
11597 SmallVector<int> Mask;
11598 inversePermutation(CurrentOrder, Mask);
11599 reorderScalars(Operands.front(), Mask);
11600 }
11601 TE->setOperands(Operands);
11602 if (State == TreeEntry::ScatterVectorize)
11603 buildTreeRec(PointerOps, Depth + 1, {TE, 0});
11604 return;
11605 }
11606 case Instruction::ZExt:
11607 case Instruction::SExt:
11608 case Instruction::FPToUI:
11609 case Instruction::FPToSI:
11610 case Instruction::FPExt:
11611 case Instruction::PtrToInt:
11612 case Instruction::IntToPtr:
11613 case Instruction::SIToFP:
11614 case Instruction::UIToFP:
11615 case Instruction::Trunc:
11616 case Instruction::FPTrunc:
11617 case Instruction::BitCast: {
11618 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
11619 std::make_pair(std::numeric_limits<unsigned>::min(),
11620 std::numeric_limits<unsigned>::max()));
11621 if (ShuffleOrOp == Instruction::ZExt ||
11622 ShuffleOrOp == Instruction::SExt) {
11623 CastMaxMinBWSizes = std::make_pair(
11624 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
11625 PrevMaxBW),
11626 std::min<unsigned>(
11627 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
11628 PrevMinBW));
11629 } else if (ShuffleOrOp == Instruction::Trunc) {
11630 CastMaxMinBWSizes = std::make_pair(
11631 std::max<unsigned>(
11632 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
11633 PrevMaxBW),
11634 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
11635 PrevMinBW));
11636 }
11637 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11638 ReuseShuffleIndices);
11639 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
11640 TE->dump());
11641
11642 TE->setOperands(Operands);
11643 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11644 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11645 if (ShuffleOrOp == Instruction::Trunc) {
11646 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11647 } else if (ShuffleOrOp == Instruction::SIToFP ||
11648 ShuffleOrOp == Instruction::UIToFP) {
11649 unsigned NumSignBits =
11650 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
11651 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
11652 APInt Mask = DB->getDemandedBits(OpI);
11653 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
11654 }
11655 if (NumSignBits * 2 >=
11656 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
11657 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11658 }
11659 return;
11660 }
11661 case Instruction::ICmp:
11662 case Instruction::FCmp: {
11663 // Check that all of the compares have the same predicate.
11664 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
11665 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11666 ReuseShuffleIndices);
11667 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
11668 TE->dump());
11669
11670 VLOperands Ops(VL, Operands, S, *this);
11671 if (cast<CmpInst>(VL0)->isCommutative()) {
11672 // Commutative predicate - collect + sort operands of the instructions
11673 // so that each side is more likely to have the same opcode.
11675 "Commutative Predicate mismatch");
11676 Ops.reorder();
11677 Operands.front() = Ops.getVL(0);
11678 Operands.back() = Ops.getVL(1);
11679 } else {
11680 // Collect operands - commute if it uses the swapped predicate.
11681 for (auto [Idx, V] : enumerate(VL)) {
11682 if (isa<PoisonValue>(V))
11683 continue;
11684 auto *Cmp = cast<CmpInst>(V);
11685 if (Cmp->getPredicate() != P0)
11686 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11687 }
11688 }
11689 TE->setOperands(Operands);
11690 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
11691 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
11692 if (ShuffleOrOp == Instruction::ICmp) {
11693 unsigned NumSignBits0 =
11694 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
11695 if (NumSignBits0 * 2 >=
11696 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
11697 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11698 unsigned NumSignBits1 =
11699 ComputeNumSignBits(VL0->getOperand(1), *DL, AC, nullptr, DT);
11700 if (NumSignBits1 * 2 >=
11701 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
11702 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
11703 }
11704 return;
11705 }
11706 case Instruction::Select:
11707 case Instruction::FNeg:
11708 case Instruction::Add:
11709 case Instruction::FAdd:
11710 case Instruction::Sub:
11711 case Instruction::FSub:
11712 case Instruction::Mul:
11713 case Instruction::FMul:
11714 case Instruction::UDiv:
11715 case Instruction::SDiv:
11716 case Instruction::FDiv:
11717 case Instruction::URem:
11718 case Instruction::SRem:
11719 case Instruction::FRem:
11720 case Instruction::Shl:
11721 case Instruction::LShr:
11722 case Instruction::AShr:
11723 case Instruction::And:
11724 case Instruction::Or:
11725 case Instruction::Xor:
11726 case Instruction::Freeze: {
11727 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11728 ReuseShuffleIndices);
11729 LLVM_DEBUG(
11730 dbgs() << "SLP: added a new TreeEntry "
11731 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
11732 TE->dump());
11733
11734 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
11735 VLOperands Ops(VL, Operands, S, *this);
11736 Ops.reorder();
11737 Operands[0] = Ops.getVL(0);
11738 Operands[1] = Ops.getVL(1);
11739 }
11740 TE->setOperands(Operands);
11741 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11742 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11743 return;
11744 }
11745 case Instruction::GetElementPtr: {
11746 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11747 ReuseShuffleIndices);
11748 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
11749 TE->dump());
11750 TE->setOperands(Operands);
11751
11752 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
11753 buildTreeRec(Operands[I], Depth + 1, {TE, I});
11754 return;
11755 }
11756 case Instruction::Store: {
11757 bool Consecutive = CurrentOrder.empty();
11758 if (!Consecutive)
11759 fixupOrderingIndices(CurrentOrder);
11760 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11761 ReuseShuffleIndices, CurrentOrder);
11762 if (Consecutive)
11763 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
11764 TE->dump());
11765 else
11766 LLVM_DEBUG(
11767 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
11768 TE->dump());
11769 TE->setOperands(Operands);
11770 buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
11771 return;
11772 }
11773 case Instruction::Call: {
11774 // Check if the calls are all to the same vectorizable intrinsic or
11775 // library function.
11776 CallInst *CI = cast<CallInst>(VL0);
11778
11779 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11780 ReuseShuffleIndices);
11781 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
11782 TE->dump());
11783 if (isCommutative(VL0)) {
11784 VLOperands Ops(VL, Operands, S, *this);
11785 Ops.reorder();
11786 Operands[0] = Ops.getVL(0);
11787 Operands[1] = Ops.getVL(1);
11788 }
11789 TE->setOperands(Operands);
11790 for (unsigned I : seq<unsigned>(CI->arg_size())) {
11791 // For scalar operands there is no need to create an entry since there is
11792 // nothing to vectorize.
11794 continue;
11795 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11796 }
11797 return;
11798 }
11799 case Instruction::ShuffleVector: {
11800 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11801 ReuseShuffleIndices);
11802 if (S.isAltShuffle()) {
11803 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
11804 TE->dump());
11805 } else {
11806 assert(SLPReVec && "Only supported by REVEC.");
11807 LLVM_DEBUG(
11808 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
11809 TE->dump());
11810 }
11811
11812 // Reorder operands if reordering would enable vectorization.
11813 auto *CI = dyn_cast<CmpInst>(VL0);
11814 if (CI && any_of(VL, [](Value *V) {
11815 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
11816 })) {
11817 auto *MainCI = cast<CmpInst>(S.getMainOp());
11818 auto *AltCI = cast<CmpInst>(S.getAltOp());
11819 CmpInst::Predicate MainP = MainCI->getPredicate();
11820 CmpInst::Predicate AltP = AltCI->getPredicate();
11821 assert(MainP != AltP &&
11822 "Expected different main/alternate predicates.");
11823 // Collect operands - commute if it uses the swapped predicate or
11824 // alternate operation.
11825 for (auto [Idx, V] : enumerate(VL)) {
11826 if (isa<PoisonValue>(V))
11827 continue;
11828 auto *Cmp = cast<CmpInst>(V);
11829
11830 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
11831 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
11832 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11833 } else {
11834 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
11835 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11836 }
11837 }
11838 TE->setOperands(Operands);
11839 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
11840 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
11841 return;
11842 }
11843
11844 if (isa<BinaryOperator>(VL0) || CI) {
11845 VLOperands Ops(VL, Operands, S, *this);
11846 Ops.reorder();
11847 Operands[0] = Ops.getVL(0);
11848 Operands[1] = Ops.getVL(1);
11849 }
11850 TE->setOperands(Operands);
11851 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11852 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11853 return;
11854 }
11855 default:
11856 break;
11857 }
11858 llvm_unreachable("Unexpected vectorization of the instructions.");
11859}
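// Illustrative sketch (not part of the upstream source): for a root bundle of
// two consecutive stores
//
//   store i32 %x0, ptr %p
//   store i32 %x1, ptr %p1   ; assuming %p1 is %p + 4 bytes
//
// the Store case above creates a vectorized tree entry and recurses only into
// the stored values {%x0, %x1} (operand 0); the pointer operands are not
// vectorized here because consecutiveness was already established by
// getScalarsVectorizationState().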
11860
11861unsigned BoUpSLP::canMapToVector(Type *T) const {
11862 unsigned N = 1;
11863 Type *EltTy = T;
11864
11865 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
11866 if (EltTy->isEmptyTy())
11867 return 0;
11868 if (auto *ST = dyn_cast<StructType>(EltTy)) {
11869 // Check that struct is homogeneous.
11870 for (const auto *Ty : ST->elements())
11871 if (Ty != *ST->element_begin())
11872 return 0;
11873 N *= ST->getNumElements();
11874 EltTy = *ST->element_begin();
11875 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
11876 N *= AT->getNumElements();
11877 EltTy = AT->getElementType();
11878 } else {
11879 auto *VT = cast<FixedVectorType>(EltTy);
11880 N *= VT->getNumElements();
11881 EltTy = VT->getElementType();
11882 }
11883 }
11884
11885 if (!isValidElementType(EltTy))
11886 return 0;
11887 size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
11888 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
11889 VTSize != DL->getTypeStoreSizeInBits(T))
11890 return 0;
11891 return N;
11892}
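// Illustrative example (not part of the upstream source): for a homogeneous
// aggregate such as
//
//   %pair4 = type { float, float, float, float }
//
// the loop above peels the struct and yields N = 4 with EltTy == float; the
// mapping is accepted only if the widened type <4 x float> (128 bits) falls
// within [MinVecRegSize, MaxVecRegSize] and has the same store size as the
// original aggregate, otherwise 0 is returned.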
11893
11894bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
11895 SmallVectorImpl<unsigned> &CurrentOrder,
11896 bool ResizeAllowed) const {
11898 assert(It != VL.end() && "Expected at least one extract instruction.");
11899 auto *E0 = cast<Instruction>(*It);
11900 assert(
11902 "Invalid opcode");
11903 // Check if all of the extracts come from the same vector and from the
11904 // correct offset.
11905 Value *Vec = E0->getOperand(0);
11906
11907 CurrentOrder.clear();
11908
11909 // We have to extract from a vector/aggregate with the same number of elements.
11910 unsigned NElts;
11911 if (E0->getOpcode() == Instruction::ExtractValue) {
11912 NElts = canMapToVector(Vec->getType());
11913 if (!NElts)
11914 return false;
11915 // Check if load can be rewritten as load of vector.
11916 LoadInst *LI = dyn_cast<LoadInst>(Vec);
11917 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
11918 return false;
11919 } else {
11920 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
11921 }
11922
11923 unsigned E = VL.size();
11924 if (!ResizeAllowed && NElts != E)
11925 return false;
11926 SmallVector<int> Indices(E, PoisonMaskElem);
11927 unsigned MinIdx = NElts, MaxIdx = 0;
11928 for (auto [I, V] : enumerate(VL)) {
11929 auto *Inst = dyn_cast<Instruction>(V);
11930 if (!Inst)
11931 continue;
11932 if (Inst->getOperand(0) != Vec)
11933 return false;
11934 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
11935 if (isa<UndefValue>(EE->getIndexOperand()))
11936 continue;
11937 std::optional<unsigned> Idx = getExtractIndex(Inst);
11938 if (!Idx)
11939 return false;
11940 const unsigned ExtIdx = *Idx;
11941 if (ExtIdx >= NElts)
11942 continue;
11943 Indices[I] = ExtIdx;
11944 if (MinIdx > ExtIdx)
11945 MinIdx = ExtIdx;
11946 if (MaxIdx < ExtIdx)
11947 MaxIdx = ExtIdx;
11948 }
11949 if (MaxIdx - MinIdx + 1 > E)
11950 return false;
11951 if (MaxIdx + 1 <= E)
11952 MinIdx = 0;
11953
11954 // Check that all of the indices extract from the correct offset.
11955 bool ShouldKeepOrder = true;
11956 // Assign to all items the initial value E so we can check if the extract
11957 // instruction index was used already.
11958 // Also, later we can check that all the indices are used and we have a
11959 // consecutive access in the extract instructions, by checking that no
11960 // element of CurrentOrder still has value E.
11961 CurrentOrder.assign(E, E);
11962 for (unsigned I = 0; I < E; ++I) {
11963 if (Indices[I] == PoisonMaskElem)
11964 continue;
11965 const unsigned ExtIdx = Indices[I] - MinIdx;
11966 if (CurrentOrder[ExtIdx] != E) {
11967 CurrentOrder.clear();
11968 return false;
11969 }
11970 ShouldKeepOrder &= ExtIdx == I;
11971 CurrentOrder[ExtIdx] = I;
11972 }
11973 if (ShouldKeepOrder)
11974 CurrentOrder.clear();
11975
11976 return ShouldKeepOrder;
11977}
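// Illustrative example (not part of the upstream source): for a bundle of
// extracts from a single source vector
//
//   %e0 = extractelement <4 x i32> %v, i32 1
//   %e1 = extractelement <4 x i32> %v, i32 0
//   %e2 = extractelement <4 x i32> %v, i32 3
//   %e3 = extractelement <4 x i32> %v, i32 2
//
// the collected indices are {1, 0, 3, 2}, so CurrentOrder becomes {1, 0, 3, 2}
// and false is returned (the extracts are not already consecutive); with
// indices {0, 1, 2, 3} CurrentOrder is cleared and true is returned.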
11978
11979bool BoUpSLP::areAllUsersVectorized(
11980 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
11981 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
11982 all_of(I->users(), [this](User *U) {
11983 return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
11984 (isa<ExtractElementInst>(U) && MustGather.contains(U));
11985 });
11986}
11987
11988void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
11989 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
11990 SmallVectorImpl<Value *> *OpScalars,
11991 SmallVectorImpl<Value *> *AltScalars) const {
11992 unsigned Sz = Scalars.size();
11993 Mask.assign(Sz, PoisonMaskElem);
11994 SmallVector<int> OrderMask;
11995 if (!ReorderIndices.empty())
11996 inversePermutation(ReorderIndices, OrderMask);
11997 for (unsigned I = 0; I < Sz; ++I) {
11998 unsigned Idx = I;
11999 if (!ReorderIndices.empty())
12000 Idx = OrderMask[I];
12001 if (isa<PoisonValue>(Scalars[Idx]))
12002 continue;
12003 auto *OpInst = cast<Instruction>(Scalars[Idx]);
12004 if (IsAltOp(OpInst)) {
12005 Mask[I] = Sz + Idx;
12006 if (AltScalars)
12007 AltScalars->push_back(OpInst);
12008 } else {
12009 Mask[I] = Idx;
12010 if (OpScalars)
12011 OpScalars->push_back(OpInst);
12012 }
12013 }
12014 if (!ReuseShuffleIndices.empty()) {
12015 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
12016 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
12017 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
12018 });
12019 Mask.swap(NewMask);
12020 }
12021}
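// Illustrative example (not part of the upstream source): for an alternate
// node with Scalars = {add0, sub1, add2, sub3}, no reorder or reuse indices,
// and IsAltOp matching the Sub instructions, the mask built above is
//
//   Mask = {0, 5, 2, 7}   // alt lanes select Sz + Idx (Sz == 4)
//
// i.e. the final shuffle takes even lanes from the vectorized main operation
// and odd lanes from the vectorized alternate operation.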
12022
12023static bool isMainInstruction(Instruction *I, Instruction *MainOp,
12024 Instruction *AltOp,
12025 const TargetLibraryInfo &TLI) {
12026 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp;
12027}
12028
12029static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
12030 Instruction *AltOp,
12031 const TargetLibraryInfo &TLI) {
12032 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
12033 auto *AltCI = cast<CmpInst>(AltOp);
12034 CmpInst::Predicate MainP = MainCI->getPredicate();
12035 [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
12036 assert(MainP != AltP && "Expected different main/alternate predicates.");
12037 auto *CI = cast<CmpInst>(I);
12038 if (isCmpSameOrSwapped(MainCI, CI, TLI))
12039 return false;
12040 if (isCmpSameOrSwapped(AltCI, CI, TLI))
12041 return true;
12042 CmpInst::Predicate P = CI->getPredicate();
12043 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
12044
12045 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
12046 "CmpInst expected to match either main or alternate predicate or "
12047 "their swap.");
12048 return MainP != P && MainP != SwappedP;
12049 }
12050 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
12051}
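// Illustrative example (not part of the upstream source): assuming
// MainOp = "icmp slt i32 %a, %b" and AltOp = "icmp eq i32 %c, %d", a compare
// whose predicate (or swapped predicate with swapped operands, as detected by
// isCmpSameOrSwapped) matches the main compare is treated as a main operation
// and the function returns false, while a compare matching the alternate
// predicate returns true; for non-compare opcodes the decision falls back to
// getMatchingMainOpOrAltOp().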
12052
12053TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
12054 assert(!Ops.empty());
12055 const auto *Op0 = Ops.front();
12056
12057 const bool IsConstant = all_of(Ops, [](Value *V) {
12058 // TODO: We should allow undef elements here
12059 return isConstant(V) && !isa<UndefValue>(V);
12060 });
12061 const bool IsUniform = all_of(Ops, [=](Value *V) {
12062 // TODO: We should allow undef elements here
12063 return V == Op0;
12064 });
12065 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
12066 // TODO: We should allow undef elements here
12067 if (auto *CI = dyn_cast<ConstantInt>(V))
12068 return CI->getValue().isPowerOf2();
12069 return false;
12070 });
12071 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
12072 // TODO: We should allow undef elements here
12073 if (auto *CI = dyn_cast<ConstantInt>(V))
12074 return CI->getValue().isNegatedPowerOf2();
12075 return false;
12076 });
12077
12078 TTI::OperandValueKind VK = TTI::OK_AnyValue;
12079 if (IsConstant && IsUniform)
12080 VK = TTI::OK_UniformConstantValue;
12081 else if (IsConstant)
12082 VK = TTI::OK_NonUniformConstantValue;
12083 else if (IsUniform)
12084 VK = TTI::OK_UniformValue;
12085
12086 TTI::OperandValueProperties VP = TTI::OP_None;
12087 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
12088 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
12089
12090 return {VK, VP};
12091}
12092
12093namespace {
12094/// The base class for shuffle instruction emission and shuffle cost estimation.
12095class BaseShuffleAnalysis {
12096protected:
12097 Type *ScalarTy = nullptr;
12098
12099 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
12100
12101 /// V is expected to be a vectorized value.
12102 /// When REVEC is disabled, there is no difference between VF and
12103 /// VNumElements.
12104 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
12105 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
12106 /// of 8.
12107 unsigned getVF(Value *V) const {
12108 assert(V && "V cannot be nullptr");
12109 assert(isa<FixedVectorType>(V->getType()) &&
12110 "V does not have FixedVectorType");
12111 assert(ScalarTy && "ScalarTy cannot be nullptr");
12112 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12113 unsigned VNumElements =
12114 cast<FixedVectorType>(V->getType())->getNumElements();
12115 assert(VNumElements > ScalarTyNumElements &&
12116 "the number of elements of V is not large enough");
12117 assert(VNumElements % ScalarTyNumElements == 0 &&
12118 "the number of elements of V is not a vectorized value");
12119 return VNumElements / ScalarTyNumElements;
12120 }
12121
12122 /// Checks if the mask is an identity mask.
12123 /// \param IsStrict if true, the function returns false if the mask size does
12124 /// not match the vector size.
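/// For example, for a 4-element vector the mask <0, 1, 2, 3> is an identity
/// mask; with \p IsStrict == false, <0, 1> (an extract of the leading
/// subvector) is also accepted, as is a concatenation of per-register identity
/// submasks such as <0, 1, 2, 3, 0, 1, 2, 3>.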
12125 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
12126 bool IsStrict) {
12127 int Limit = Mask.size();
12128 int VF = VecTy->getNumElements();
12129 int Index = -1;
12130 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
12131 return true;
12132 if (!IsStrict) {
12133 // Consider extract subvector starting from index 0.
12134 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
12135 Index == 0)
12136 return true;
12137 // All VF-size submasks are identity (e.g.
12138 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
12139 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
12140 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
12141 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
12143 }))
12144 return true;
12145 }
12146 return false;
12147 }
12148
12149 /// Tries to combine 2 different masks into a single one.
12150 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
12151 /// change the size of the vector, while \p LocalVF is the original size of
12152 /// the shuffled vector.
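/// For example (illustrative): with \p LocalVF == 2, Mask == <1, 0> and
/// ExtMask == <0, 1, 0, 1>, every ExtMask element is redirected through Mask,
/// producing the combined mask <1, 0, 1, 0>.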
12153 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
12154 ArrayRef<int> ExtMask) {
12155 unsigned VF = Mask.size();
12156 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
12157 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
12158 if (ExtMask[I] == PoisonMaskElem)
12159 continue;
12160 int MaskedIdx = Mask[ExtMask[I] % VF];
12161 NewMask[I] =
12162 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
12163 }
12164 Mask.swap(NewMask);
12165 }
12166
12167 /// Looks through shuffles, trying to reduce the final number of shuffles in
12168 /// the code. The function looks through the previously emitted shuffle
12169 /// instructions and properly marks indices in the mask as undef.
12170 /// For example, given the code
12171 /// \code
12172 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
12173 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
12174 /// \endcode
12175 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
12176 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12177 /// <0, 1, 2, 3> for the shuffle.
12178 /// If 2 operands are of different size, the smallest one will be resized and
12179 /// the mask recalculated properly.
12180 /// For example, given the code
12181 /// \code
12182 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
12183 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
12184 /// \endcode
12185 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
12186 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12187 /// <0, 1, 2, 3> for the shuffle.
12188 /// So, it tries to transform permutations into a simple vector merge, if
12189 /// possible.
12190 /// \param V The input vector which must be shuffled using the given \p Mask.
12191 /// If the better candidate is found, \p V is set to this best candidate
12192 /// vector.
12193 /// \param Mask The input mask for the shuffle. If the best candidate is found
12194 /// during looking-through-shuffles attempt, it is updated accordingly.
12195 /// \param SinglePermute true if the shuffle operation is originally a
12196 /// single-value-permutation. In this case the look-through-shuffles procedure
12197 /// may look for resizing shuffles as the best candidates.
12198 /// \return true if the shuffle results in a non-resizing identity shuffle
12199 /// (and thus can be ignored), false otherwise.
12200 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
12201 bool SinglePermute) {
12202 Value *Op = V;
12203 ShuffleVectorInst *IdentityOp = nullptr;
12204 SmallVector<int> IdentityMask;
12205 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
12206 // Exit if this is not a fixed vector type or it is a size-changing shuffle.
12207 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
12208 if (!SVTy)
12209 break;
12210 // Remember the identity or broadcast mask, if it is not a resizing
12211 // shuffle. If no better candidates are found, this Op and Mask will be
12212 // used in the final shuffle.
12213 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
12214 if (!IdentityOp || !SinglePermute ||
12215 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
12217 IdentityMask.size()))) {
12218 IdentityOp = SV;
12219 // Store the current mask in IdentityMask so that we do not lose this
12220 // info if IdentityOp is selected as the best candidate for the
12221 // permutation.
12222 IdentityMask.assign(Mask);
12223 }
12224 }
12225 // Remember the broadcast mask. If no better candidates are found, this Op
12226 // and Mask will be used in the final shuffle.
12227 // Zero splat can be used as identity too, since it might be used with
12228 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
12229 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which is
12230 // expensive, and the analysis finds out that the source vector is just a
12231 // broadcast, the original mask can be transformed to the identity mask
12232 // <0, 1, 2, 3>.
12233 // \code
12234 // %0 = shuffle %v, poison, zeroinitalizer
12235 // %res = shuffle %0, poison, <3, 1, 2, 0>
12236 // \endcode
12237 // may be transformed to
12238 // \code
12239 // %0 = shuffle %v, poison, zeroinitalizer
12240 // %res = shuffle %0, poison, <0, 1, 2, 3>
12241 // \endcode
12242 if (SV->isZeroEltSplat()) {
12243 IdentityOp = SV;
12244 IdentityMask.assign(Mask);
12245 }
12246 int LocalVF = Mask.size();
12247 if (auto *SVOpTy =
12248 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
12249 LocalVF = SVOpTy->getNumElements();
12250 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
12251 for (auto [Idx, I] : enumerate(Mask)) {
12252 if (I == PoisonMaskElem ||
12253 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
12254 continue;
12255 ExtMask[Idx] = SV->getMaskValue(I);
12256 }
12257 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
12258 SV->getOperand(0),
12259 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
12260 .all();
12261 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
12262 SV->getOperand(1),
12263 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
12264 .all();
12265 if (!IsOp1Undef && !IsOp2Undef) {
12266 // Update mask and mark undef elems.
12267 for (int &I : Mask) {
12268 if (I == PoisonMaskElem)
12269 continue;
12270 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
12271 PoisonMaskElem)
12272 I = PoisonMaskElem;
12273 }
12274 break;
12275 }
12276 SmallVector<int> ShuffleMask(SV->getShuffleMask());
12277 combineMasks(LocalVF, ShuffleMask, Mask);
12278 Mask.swap(ShuffleMask);
12279 if (IsOp2Undef)
12280 Op = SV->getOperand(0);
12281 else
12282 Op = SV->getOperand(1);
12283 }
12284 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
12285 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
12287 if (IdentityOp) {
12288 V = IdentityOp;
12289 assert(Mask.size() == IdentityMask.size() &&
12290 "Expected masks of same sizes.");
12291 // Clear known poison elements.
12292 for (auto [I, Idx] : enumerate(Mask))
12293 if (Idx == PoisonMaskElem)
12294 IdentityMask[I] = PoisonMaskElem;
12295 Mask.swap(IdentityMask);
12296 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
12297 return SinglePermute &&
12298 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
12299 /*IsStrict=*/true) ||
12300 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
12301 Shuffle->isZeroEltSplat() &&
12303 all_of(enumerate(Mask), [&](const auto &P) {
12304 return P.value() == PoisonMaskElem ||
12305 Shuffle->getShuffleMask()[P.index()] == 0;
12306 })));
12307 }
12308 V = Op;
12309 return false;
12310 }
12311 V = Op;
12312 return true;
12313 }
12314
12315 /// Smart shuffle instruction emission, walks through shuffles trees and
12316 /// tries to find the best matching vector for the actual shuffle
12317 /// instruction.
12318 template <typename T, typename ShuffleBuilderTy>
12319 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
12320 ShuffleBuilderTy &Builder, Type *ScalarTy) {
12321 assert(V1 && "Expected at least one vector value.");
12322 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12323 SmallVector<int> NewMask(Mask);
12324 if (ScalarTyNumElements != 1) {
12325 assert(SLPReVec && "FixedVectorType is not expected.");
12326 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewMask);
12327 Mask = NewMask;
12328 }
12329 if (V2)
12330 Builder.resizeToMatch(V1, V2);
12331 int VF = Mask.size();
12332 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
12333 VF = FTy->getNumElements();
12334 if (V2 && !isUndefVector</*isPoisonOnly=*/true>(
12335 V2, buildUseMask(VF, Mask, UseMask::SecondArg))
12336 .all()) {
12337 // Peek through shuffles.
12338 Value *Op1 = V1;
12339 Value *Op2 = V2;
12340 int VF =
12341 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
12342 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
12343 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
12344 for (int I = 0, E = Mask.size(); I < E; ++I) {
12345 if (Mask[I] < VF)
12346 CombinedMask1[I] = Mask[I];
12347 else
12348 CombinedMask2[I] = Mask[I] - VF;
12349 }
12350 Value *PrevOp1;
12351 Value *PrevOp2;
12352 do {
12353 PrevOp1 = Op1;
12354 PrevOp2 = Op2;
12355 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
12356 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
12357 // Check if we have 2 resizing shuffles - need to peek through operands
12358 // again.
12359 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
12360 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
12361 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
12362 for (auto [Idx, I] : enumerate(CombinedMask1)) {
12363 if (I == PoisonMaskElem)
12364 continue;
12365 ExtMask1[Idx] = SV1->getMaskValue(I);
12366 }
12367 SmallBitVector UseMask1 = buildUseMask(
12368 cast<FixedVectorType>(SV1->getOperand(1)->getType())
12369 ->getNumElements(),
12370 ExtMask1, UseMask::SecondArg);
12371 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
12372 for (auto [Idx, I] : enumerate(CombinedMask2)) {
12373 if (I == PoisonMaskElem)
12374 continue;
12375 ExtMask2[Idx] = SV2->getMaskValue(I);
12376 }
12377 SmallBitVector UseMask2 = buildUseMask(
12378 cast<FixedVectorType>(SV2->getOperand(1)->getType())
12379 ->getNumElements(),
12380 ExtMask2, UseMask::SecondArg);
12381 if (SV1->getOperand(0)->getType() ==
12382 SV2->getOperand(0)->getType() &&
12383 SV1->getOperand(0)->getType() != SV1->getType() &&
12384 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
12385 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
12386 Op1 = SV1->getOperand(0);
12387 Op2 = SV2->getOperand(0);
12388 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
12389 int LocalVF = ShuffleMask1.size();
12390 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
12391 LocalVF = FTy->getNumElements();
12392 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
12393 CombinedMask1.swap(ShuffleMask1);
12394 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
12395 LocalVF = ShuffleMask2.size();
12396 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
12397 LocalVF = FTy->getNumElements();
12398 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
12399 CombinedMask2.swap(ShuffleMask2);
12400 }
12401 }
12402 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
12403 Builder.resizeToMatch(Op1, Op2);
12404 VF = std::max(cast<VectorType>(Op1->getType())
12405 ->getElementCount()
12406 .getKnownMinValue(),
12407 cast<VectorType>(Op2->getType())
12408 ->getElementCount()
12409 .getKnownMinValue());
12410 for (int I = 0, E = Mask.size(); I < E; ++I) {
12411 if (CombinedMask2[I] != PoisonMaskElem) {
12412 assert(CombinedMask1[I] == PoisonMaskElem &&
12413 "Expected undefined mask element");
12414 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
12415 }
12416 }
12417 if (Op1 == Op2 &&
12418 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
12419 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
12420 isa<ShuffleVectorInst>(Op1) &&
12421 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
12422 ArrayRef(CombinedMask1))))
12423 return Builder.createIdentity(Op1);
12424 return Builder.createShuffleVector(
12425 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
12426 CombinedMask1);
12427 }
12428 if (isa<PoisonValue>(V1))
12429 return Builder.createPoison(
12430 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
12431 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
12432 assert(V1 && "Expected non-null value after looking through shuffles.");
12433
12434 if (!IsIdentity)
12435 return Builder.createShuffleVector(V1, NewMask);
12436 return Builder.createIdentity(V1);
12437 }
12438
12439 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
12440 /// shuffle emission.
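/// For example (illustrative): with Mask == <2, 5, poison, 1>, entries 0, 1
/// and 3 of \p CommonMask are rewritten to 0, 1 and 3 respectively, so that
/// the already-materialized lanes refer to the freshly emitted shuffle result
/// by position, while entry 2 is left untouched.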
12441 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
12442 ArrayRef<int> Mask) {
12443 for (unsigned I : seq<unsigned>(CommonMask.size()))
12444 if (Mask[I] != PoisonMaskElem)
12445 CommonMask[I] = I;
12446 }
12447};
12448} // namespace
12449
12450 /// Calculate the scalar and the vector costs from vectorizing a set of GEPs.
12451 static std::pair<InstructionCost, InstructionCost>
12452 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
12453 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
12454 Type *ScalarTy, VectorType *VecTy) {
12455 InstructionCost ScalarCost = 0;
12456 InstructionCost VecCost = 0;
12457 // Here we differentiate two cases: (1) when Ptrs represent a regular
12458 // vectorization tree node (as they are pointer arguments of scattered
12459 // loads) or (2) when Ptrs are the arguments of loads or stores being
12460 // vectorized as a plain wide unit-stride load/store since all the
12461 // loads/stores are known to be from/to adjacent locations.
12462 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
12463 // Case 2: estimate the pointer-related costs when vectorizing to
12464 // a wide load/store.
12465 // Scalar cost is estimated as a set of pointers with known relationship
12466 // between them.
12467 // For vector code we will use BasePtr as argument for the wide load/store
12468 // but we also need to account for all the instructions which are going to
12469 // stay in vectorized code due to uses outside of these scalar
12470 // loads/stores.
12471 ScalarCost = TTI.getPointersChainCost(
12472 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
12473 CostKind);
12474
12475 SmallVector<const Value *> PtrsRetainedInVecCode;
12476 for (Value *V : Ptrs) {
12477 if (V == BasePtr) {
12478 PtrsRetainedInVecCode.push_back(V);
12479 continue;
12480 }
12481 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
12482 // For simplicity, assume Ptr stays in the vectorized code if it is not a
12483 // GEP instruction. We don't care since its cost is considered free.
12484 // TODO: We should check for any uses outside of vectorizable tree
12485 // rather than just single use.
12486 if (!Ptr || !Ptr->hasOneUse())
12487 PtrsRetainedInVecCode.push_back(V);
12488 }
12489
12490 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
12491 // If all pointers stay in vectorized code then we don't have
12492 // any savings on that.
12493 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
12494 }
12495 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
12496 TTI::PointersChainInfo::getKnownStride(),
12497 VecTy, CostKind);
12498 } else {
12499 // Case 1: Ptrs are the arguments of loads that we are going to transform
12500 // into a masked gather load intrinsic.
12501 // All the scalar GEPs will be removed as a result of vectorization.
12502 // For any external uses of some lanes, extractelement instructions will
12503 // be generated (their cost is estimated separately).
12504 TTI::PointersChainInfo PtrsInfo =
12505 all_of(Ptrs,
12506 [](const Value *V) {
12507 auto *Ptr = dyn_cast<GEPOperator>(V);
12508 return Ptr && !Ptr->hasAllConstantIndices();
12509 })
12510 ? TTI::PointersChainInfo::getUnknownStride()
12511 : TTI::PointersChainInfo::getKnownStride();
12512
12513 ScalarCost =
12514 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
12515 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
12516 if (!BaseGEP) {
12517 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
12518 if (It != Ptrs.end())
12519 BaseGEP = cast<GEPOperator>(*It);
12520 }
12521 if (BaseGEP) {
12522 SmallVector<const Value *> Indices(BaseGEP->indices());
12523 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
12524 BaseGEP->getPointerOperand(), Indices, VecTy,
12525 CostKind);
12526 }
12527 }
12528
12529 return std::make_pair(ScalarCost, VecCost);
12530}
12531
12532void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
12533 assert(TE.isGather() && TE.ReorderIndices.empty() &&
12534 "Expected gather node without reordering.");
12535 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
12536 SmallSet<size_t, 2> LoadKeyUsed;
12537
12538 // Do not reorder the node if it is small (just 2 elements), all-constant, or
12539 // if all its instructions already have the same opcode.
12540 if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
12541 all_of(TE.Scalars, isConstant))
12542 return;
12543
12544 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
12545 return VectorizableTree[Idx]->isSame(TE.Scalars);
12546 }))
12547 return;
12548
12549 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
12550 Key = hash_combine(hash_value(LI->getParent()), Key);
12551 Value *Ptr =
12552 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
12553 if (LoadKeyUsed.contains(Key)) {
12554 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
12555 if (LIt != LoadsMap.end()) {
12556 for (LoadInst *RLI : LIt->second) {
12557 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
12558 LI->getType(), LI->getPointerOperand(), *DL, *SE,
12559 /*StrictCheck=*/true))
12560 return hash_value(RLI->getPointerOperand());
12561 }
12562 for (LoadInst *RLI : LIt->second) {
12563 if (arePointersCompatible(RLI->getPointerOperand(),
12564 LI->getPointerOperand(), *TLI)) {
12565 hash_code SubKey = hash_value(RLI->getPointerOperand());
12566 return SubKey;
12567 }
12568 }
12569 if (LIt->second.size() > 2) {
12570 hash_code SubKey =
12571 hash_value(LIt->second.back()->getPointerOperand());
12572 return SubKey;
12573 }
12574 }
12575 }
12576 LoadKeyUsed.insert(Key);
12577 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
12578 return hash_value(LI->getPointerOperand());
12579 };
12580 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
12581 SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
12582 bool IsOrdered = true;
12583 unsigned NumInstructions = 0;
12584 // Try to "cluster" scalar instructions, to be able to build extra vectorized
12585 // nodes.
12586 for (auto [I, V] : enumerate(TE.Scalars)) {
12587 size_t Key = 1, Idx = 1;
12588 if (auto *Inst = dyn_cast<Instruction>(V);
12590 !isDeleted(Inst) && !isVectorized(V)) {
12591 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
12592 /*AllowAlternate=*/false);
12593 ++NumInstructions;
12594 }
12595 auto &Container = SortedValues[Key];
12596 if (IsOrdered && !KeyToIndex.contains(V) &&
12599 ((Container.contains(Idx) &&
12600 KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
12601 (!Container.empty() && !Container.contains(Idx) &&
12602 KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
12603 IsOrdered = false;
12604 auto &KTI = KeyToIndex[V];
12605 if (KTI.empty())
12606 Container[Idx].push_back(V);
12607 KTI.push_back(I);
12608 }
12609 SmallVector<std::pair<unsigned, unsigned>> SubVectors;
12610 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
12611 if (!IsOrdered && NumInstructions > 1) {
12612 unsigned Cnt = 0;
12613 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
12614 for (const auto &D : SortedValues) {
12615 for (const auto &P : D.second) {
12616 unsigned Sz = 0;
12617 for (Value *V : P.second) {
12618 ArrayRef<unsigned> Indices = KeyToIndex.at(V);
12619 for (auto [K, Idx] : enumerate(Indices)) {
12620 TE.ReorderIndices[Cnt + K] = Idx;
12621 TE.Scalars[Cnt + K] = V;
12622 }
12623 Sz += Indices.size();
12624 Cnt += Indices.size();
12625 }
12626 if (Sz > 1 && isa<Instruction>(P.second.front())) {
12627 const unsigned SubVF = getFloorFullVectorNumberOfElements(
12628 *TTI, TE.Scalars.front()->getType(), Sz);
12629 SubVectors.emplace_back(Cnt - Sz, SubVF);
12630 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
12631 DemandedElts.clearBit(I);
12632 } else if (!P.second.empty() && isConstant(P.second.front())) {
12633 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
12634 DemandedElts.clearBit(I);
12635 }
12636 }
12637 }
12638 }
12639 // Reuses always require shuffles, so consider it profitable.
12640 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
12641 return;
12642 // Do simple cost estimation.
12643 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12644 InstructionCost Cost = 0;
12645 auto *ScalarTy = TE.Scalars.front()->getType();
12646 auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
12647 for (auto [Idx, Sz] : SubVectors) {
12648 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
12649 Idx, getWidenedType(ScalarTy, Sz));
12650 }
12651 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
12652 /*Insert=*/true,
12653 /*Extract=*/false, CostKind);
12654 int Sz = TE.Scalars.size();
12655 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
12656 TE.ReorderIndices.end());
12657 for (unsigned I : seq<unsigned>(Sz)) {
12658 Value *V = TE.getOrdered(I);
12659 if (isa<PoisonValue>(V)) {
12660 ReorderMask[I] = PoisonMaskElem;
12661 } else if (isConstant(V) || DemandedElts[I]) {
12662 ReorderMask[I] = I + TE.ReorderIndices.size();
12663 }
12664 }
12665 Cost += ::getShuffleCost(*TTI,
12666 any_of(ReorderMask, [&](int I) { return I >= Sz; })
12667 ? TTI::SK_PermuteTwoSrc
12668 : TTI::SK_PermuteSingleSrc,
12669 VecTy, ReorderMask);
12670 DemandedElts = APInt::getAllOnes(TE.Scalars.size());
12671 ReorderMask.assign(Sz, PoisonMaskElem);
12672 for (unsigned I : seq<unsigned>(Sz)) {
12673 Value *V = TE.getOrdered(I);
12674 if (isConstant(V)) {
12675 DemandedElts.clearBit(I);
12676 if (!isa<PoisonValue>(V))
12677 ReorderMask[I] = I;
12678 } else {
12679 ReorderMask[I] = I + Sz;
12680 }
12681 }
12682 InstructionCost BVCost =
12683 getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
12684 /*Insert=*/true, /*Extract=*/false, CostKind);
12685 if (!DemandedElts.isAllOnes())
12686 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
12687 if (Cost >= BVCost) {
12688 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
12689 reorderScalars(TE.Scalars, Mask);
12690 TE.ReorderIndices.clear();
12691 }
12692}
12693
12694 /// Check if we can convert a fadd/fsub sequence into FMA (fmuladd).
12695 /// \returns Cost of the FMA if conversion is possible, invalid cost otherwise.
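/// The scalar pattern being matched is roughly (illustrative IR, names made
/// up):
/// \code
///   %m = fmul contract float %a, %b
///   %r = fadd contract float %m, %c ; candidate for @llvm.fmuladd(%a, %b, %c)
/// \endcode
/// Both the fadd/fsub and the feeding fmul must allow contraction; fmul
/// operands with a single use are the ones counted as fusable, the rest keep
/// their scalar cost.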
12696 static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
12697 const InstructionsState &S,
12698 DominatorTree &DT, const DataLayout &DL,
12699 const TargetTransformInfo &TTI,
12700 const TargetLibraryInfo &TLI) {
12701 assert(all_of(VL,
12702 [](Value *V) {
12703 return V->getType()->getScalarType()->isFloatingPointTy();
12704 }) &&
12705 "Can only convert to FMA for floating point types");
12706 assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
12707
12708 auto CheckForContractable = [&](ArrayRef<Value *> VL) {
12709 FastMathFlags FMF;
12710 FMF.set();
12711 for (Value *V : VL) {
12712 auto *I = dyn_cast<Instruction>(V);
12713 if (!I)
12714 continue;
12715 if (S.isCopyableElement(I))
12716 continue;
12717 Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
12718 if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
12719 continue;
12720 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12721 FMF &= FPCI->getFastMathFlags();
12722 }
12723 return FMF.allowContract();
12724 };
12725 if (!CheckForContractable(VL))
12726 return InstructionCost::getInvalid();
12727 // The fmul operands also should be contractable.
12728 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
12729 SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);
12730
12731 InstructionsState OpS = getSameOpcode(Operands.front(), TLI);
12732 if (!OpS.valid())
12733 return InstructionCost::getInvalid();
12734
12735 if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
12736 return InstructionCost::getInvalid();
12737 if (!CheckForContractable(Operands.front()))
12738 return InstructionCost::getInvalid();
12739 // Compare the costs.
12740 InstructionCost FMulPlusFAddCost = 0;
12741 InstructionCost FMACost = 0;
12742 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12743 FastMathFlags FMF;
12744 FMF.set();
12745 for (Value *V : VL) {
12746 auto *I = dyn_cast<Instruction>(V);
12747 if (!I)
12748 continue;
12749 if (!S.isCopyableElement(I))
12750 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12751 FMF &= FPCI->getFastMathFlags();
12752 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
12753 }
12754 unsigned NumOps = 0;
12755 for (auto [V, Op] : zip(VL, Operands.front())) {
12756 if (S.isCopyableElement(V))
12757 continue;
12758 auto *I = dyn_cast<Instruction>(Op);
12759 if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
12760 if (auto *OpI = dyn_cast<Instruction>(V))
12761 FMACost += TTI.getInstructionCost(OpI, CostKind);
12762 if (I)
12763 FMACost += TTI.getInstructionCost(I, CostKind);
12764 continue;
12765 }
12766 ++NumOps;
12767 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12768 FMF &= FPCI->getFastMathFlags();
12769 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
12770 }
12771 Type *Ty = VL.front()->getType();
12772 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
12773 FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
12774 return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid();
12775}
12776
12777 void BoUpSLP::transformNodes() {
12778 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12779 BaseGraphSize = VectorizableTree.size();
12780 // Turn graph transforming mode on and off, when done.
12781 class GraphTransformModeRAAI {
12782 bool &SavedIsGraphTransformMode;
12783
12784 public:
12785 GraphTransformModeRAAI(bool &IsGraphTransformMode)
12786 : SavedIsGraphTransformMode(IsGraphTransformMode) {
12787 IsGraphTransformMode = true;
12788 }
12789 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
12790 } TransformContext(IsGraphTransformMode);
12791 // Operands are profitable if they are:
12792 // 1. At least one constant
12793 // or
12794 // 2. Splats
12795 // or
12796 // 3. Results in good vectorization opportunity, i.e. may generate vector
12797 // nodes and reduce cost of the graph.
12798 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
12799 const InstructionsState &S) {
12801 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
12802 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
12803 I2->getOperand(Op));
12804 return all_of(
12805 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
12806 return all_of(Cand,
12807 [](const std::pair<Value *, Value *> &P) {
12808 return isa<Constant>(P.first) ||
12809 isa<Constant>(P.second) || P.first == P.second;
12810 }) ||
12812 });
12813 };
12814
12815 // Try to reorder gather nodes for better vectorization opportunities.
12816 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
12817 TreeEntry &E = *VectorizableTree[Idx];
12818 if (E.isGather())
12819 reorderGatherNode(E);
12820 }
12821
12822 // Better to use the full gathered-loads analysis if there are only 2 gathered
12823 // load nodes, each having fewer than 16 elements.
12824 constexpr unsigned VFLimit = 16;
12825 bool ForceLoadGather =
12826 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12827 return TE->isGather() && TE->hasState() &&
12828 TE->getOpcode() == Instruction::Load &&
12829 TE->getVectorFactor() < VFLimit;
12830 }) == 2;
12831
12832 // Checks if the scalars are used in another node.
12833 auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
12834 function_ref<bool(Value *)> CheckContainer) {
12835 return TE->isSame(VL) || all_of(VL, [&](Value *V) {
12836 if (isa<PoisonValue>(V))
12837 return true;
12838 auto *I = dyn_cast<Instruction>(V);
12839 if (!I)
12840 return false;
12841 return is_contained(TE->Scalars, I) || CheckContainer(I);
12842 });
12843 };
12844 auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
12845 if (E.hasState()) {
12846 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
12847 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
12848 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
12849 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
12850 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12851 return is_contained(TEs, TE);
12852 });
12853 });
12854 }))
12855 return true;
12856
12857 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
12858 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
12859 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
12860 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
12861 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12862 return is_contained(TEs, TE);
12863 });
12864 });
12865 }))
12866 return true;
12867 } else {
12868 // Check if the gather node is a full copy of a split node.
12869 auto *It = find_if(E.Scalars, IsaPred<Instruction>);
12870 if (It != E.Scalars.end()) {
12871 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
12872 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
12873 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
12874 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
12875 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12876 return is_contained(TEs, TE);
12877 });
12878 });
12879 }))
12880 return true;
12881 }
12882 }
12883 return false;
12884 };
12885 // The tree may grow here, so iterate over nodes, built before.
12886 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
12887 TreeEntry &E = *VectorizableTree[Idx];
12888 if (E.isGather()) {
12889 ArrayRef<Value *> VL = E.Scalars;
12890 const unsigned Sz = getVectorElementSize(VL.front());
12891 unsigned MinVF = getMinVF(2 * Sz);
12892 // Do not try partial vectorization for small nodes (<= 2 elements), nodes
12893 // with the same opcode and same parent block, or all-constant nodes.
12894 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
12895 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
12896 // We use allSameOpcode instead of isAltShuffle because we don't
12897 // want to use interchangeable instructions here.
12898 !allSameOpcode(VL) || !allSameBlock(VL)) ||
12899 allConstant(VL) || isSplat(VL))
12900 continue;
12901 if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
12902 continue;
12903 // Check if the node is a copy of other vector nodes.
12904 if (CheckForSameVectorNodes(E))
12905 continue;
12906 // Try to find vectorizable sequences and transform them into a series of
12907 // insertvector instructions.
12908 unsigned StartIdx = 0;
12909 unsigned End = VL.size();
12910 for (unsigned VF = getFloorFullVectorNumberOfElements(
12911 *TTI, VL.front()->getType(), VL.size() - 1);
12912 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
12913 *TTI, VL.front()->getType(), VF - 1)) {
12914 if (StartIdx + VF > End)
12915 continue;
12916 SmallVector<std::pair<unsigned, unsigned>> Slices;
12917 bool AllStrided = true;
12918 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
12919 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
12920 // If any instruction is vectorized already - do not try again.
12921 // Reuse the existing node, if it fully matches the slice.
12922 if (isVectorized(Slice.front()) &&
12923 !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
12924 continue;
12925 // Constants are already handled effectively - skip.
12926 if (allConstant(Slice))
12927 continue;
12928 // Do not try to vectorize small splats (smaller than a vector register and
12929 // with only a single non-undef element).
12930 bool IsSplat = isSplat(Slice);
12931 bool IsTwoRegisterSplat = true;
12932 if (IsSplat && VF == 2) {
12933 unsigned NumRegs2VF = ::getNumberOfParts(
12934 *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
12935 IsTwoRegisterSplat = NumRegs2VF == 2;
12936 }
12937 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
12938 count(Slice, Slice.front()) ==
12939 static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
12940 : 1)) {
12941 if (IsSplat)
12942 continue;
12943 InstructionsState S = getSameOpcode(Slice, *TLI);
12944 if (!S || !allSameOpcode(Slice) || !allSameBlock(Slice) ||
12945 (S.getOpcode() == Instruction::Load &&
12947 (S.getOpcode() != Instruction::Load &&
12948 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
12949 continue;
12950 if (VF == 2) {
12951 // Try to vectorize reduced values or if all users are vectorized.
12952 // For expensive instructions extra extracts might be profitable.
12953 if ((!UserIgnoreList || E.Idx != 0) &&
12954 TTI->getInstructionCost(S.getMainOp(), CostKind) <
12956 !all_of(Slice, [&](Value *V) {
12957 if (isa<PoisonValue>(V))
12958 return true;
12959 return areAllUsersVectorized(cast<Instruction>(V),
12960 UserIgnoreList);
12961 }))
12962 continue;
12963 if (S.getOpcode() == Instruction::Load) {
12964 OrdersType Order;
12965 SmallVector<Value *> PointerOps;
12966 StridedPtrInfo SPtrInfo;
12967 LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order,
12968 PointerOps, SPtrInfo);
12969 AllStrided &= Res == LoadsState::StridedVectorize ||
12971 Res == LoadsState::Gather;
12972 // Do not vectorize gathers.
12973 if (Res == LoadsState::ScatterVectorize ||
12974 Res == LoadsState::Gather) {
12975 if (Res == LoadsState::Gather) {
12977 // If vectorizing reductions and the scalars come from the root node -
12978 // mark them as a non-vectorizable reduction.
12979 if (UserIgnoreList && E.Idx == 0)
12980 analyzedReductionVals(Slice);
12981 }
12982 continue;
12983 }
12984 } else if (S.getOpcode() == Instruction::ExtractElement ||
12985 (TTI->getInstructionCost(S.getMainOp(), CostKind) <
12987 !CheckOperandsProfitability(
12988 S.getMainOp(),
12991 S))) {
12992 // Do not vectorize extractelements (handled effectively
12993 // already). Do not vectorize non-profitable instructions (with
12994 // low cost and non-vectorizable operands).
12995 continue;
12996 }
12997 }
12998 }
12999 Slices.emplace_back(Cnt, Slice.size());
13000 }
13001 // Do not try to vectorize if all slices are strided or gathered with
13002 // vector factor 2 and there are more than 2 slices. Better to handle
13003 // them in the gathered-loads analysis, which may result in better vectorization.
13004 if (VF == 2 && AllStrided && Slices.size() > 2)
13005 continue;
13006 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
13007 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
13008 if (StartIdx == Cnt)
13009 StartIdx = Cnt + Sz;
13010 if (End == Cnt + Sz)
13011 End = Cnt;
13012 };
13013 for (auto [Cnt, Sz] : Slices) {
13014 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
13015 const TreeEntry *SameTE = nullptr;
13016 if (const auto *It = find_if(Slice, IsaPred<Instruction>);
13017 It != Slice.end()) {
13018 // If any instruction is vectorized already - do not try again.
13019 SameTE = getSameValuesTreeEntry(*It, Slice);
13020 }
13021 unsigned PrevSize = VectorizableTree.size();
13022 [[maybe_unused]] unsigned PrevEntriesSize =
13023 LoadEntriesToVectorize.size();
13024 buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
13025 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
13026 VectorizableTree[PrevSize]->isGather() &&
13027 VectorizableTree[PrevSize]->hasState() &&
13028 VectorizableTree[PrevSize]->getOpcode() !=
13029 Instruction::ExtractElement &&
13030 !isSplat(Slice)) {
13031 if (UserIgnoreList && E.Idx == 0 && VF == 2)
13032 analyzedReductionVals(Slice);
13033 VectorizableTree.pop_back();
13034 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
13035 "LoadEntriesToVectorize expected to remain the same");
13036 continue;
13037 }
13038 AddCombinedNode(PrevSize, Cnt, Sz);
13039 }
13040 }
13041 // Restore ordering, if no extra vectorization happened.
13042 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
13043 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13044 reorderScalars(E.Scalars, Mask);
13045 E.ReorderIndices.clear();
13046 }
13047 }
13048 if (!E.hasState())
13049 continue;
13050 switch (E.getOpcode()) {
13051 case Instruction::Load: {
13052 // No need to reorder masked gather loads, just reorder the scalar
13053 // operands.
13054 if (E.State != TreeEntry::Vectorize)
13055 break;
13056 Type *ScalarTy = E.getMainOp()->getType();
13057 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13058 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
13059 // Check if profitable to represent consecutive load + reverse as strided
13060 // load with stride -1.
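// For example (illustrative): scalar loads of p[0], p[1], p[2], p[3] used in
// reversed order would otherwise be emitted as one wide load plus a reverse
// shuffle; a strided access can produce the reversed element order directly,
// so the two costs are compared below.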
13061 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13062 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13063 SmallVector<int> Mask;
13064 inversePermutation(E.ReorderIndices, Mask);
13065 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
13066 InstructionCost OriginalVecCost =
13067 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
13068 BaseLI->getPointerAddressSpace(), CostKind,
13069 TTI::OperandValueInfo()) +
13070 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13071 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
13072 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
13073 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
13074 if (StridedCost < OriginalVecCost || ForceStridedLoads) {
13075 // Strided load is more profitable than consecutive load + reverse -
13076 // transform the node to strided load.
13077 Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
13078 ->getPointerOperand()
13079 ->getType());
13080 StridedPtrInfo SPtrInfo;
13081 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
13082 SPtrInfo.Ty = VecTy;
13083 TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
13084 E.State = TreeEntry::StridedVectorize;
13085 }
13086 }
13087 break;
13088 }
13089 case Instruction::Store: {
13090 Type *ScalarTy =
13091 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
13092 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13093 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
13094 // Check if profitable to represent consecutive store + reverse as strided
13095 // store with stride -1.
13096 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13097 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13098 SmallVector<int> Mask;
13099 inversePermutation(E.ReorderIndices, Mask);
13100 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
13101 InstructionCost OriginalVecCost =
13102 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
13103 BaseSI->getPointerAddressSpace(), CostKind,
13104 TTI::OperandValueInfo()) +
13105 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13106 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
13107 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
13108 /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
13109 if (StridedCost < OriginalVecCost)
13110 // Strided store is more profitable than reverse + consecutive store -
13111 // transform the node to strided store.
13112 E.State = TreeEntry::StridedVectorize;
13113 } else if (!E.ReorderIndices.empty()) {
13114 // Check for interleaved stores.
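// For example (illustrative): a reorder mask such as <0, 4, 1, 5, 2, 6, 3, 7>
// interleaves two groups of four scalars; if the target reports such an
// interleaved access as legal for this vector type and alignment, the node is
// marked with the matching interleave factor.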
13115 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
13116 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
13117 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
13118 if (Mask.size() < 4)
13119 return 0u;
13120 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
13121 if (ShuffleVectorInst::isInterleaveMask(
13122 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
13123 TTI.isLegalInterleavedAccessType(
13124 VecTy, Factor, BaseSI->getAlign(),
13125 BaseSI->getPointerAddressSpace()))
13126 return Factor;
13127 }
13128
13129 return 0u;
13130 };
13131 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13132 unsigned InterleaveFactor = IsInterleaveMask(Mask);
13133 if (InterleaveFactor != 0)
13134 E.setInterleave(InterleaveFactor);
13135 }
13136 break;
13137 }
13138 case Instruction::Select: {
13139 if (E.State != TreeEntry::Vectorize)
13140 break;
13141 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
13142 if (MinMaxID == Intrinsic::not_intrinsic)
13143 break;
13144 // This node is a minmax node.
13145 E.CombinedOp = TreeEntry::MinMax;
13146 TreeEntry *CondEntry = getOperandEntry(&E, 0);
13147 if (SelectOnly && CondEntry->UserTreeIndex &&
13148 CondEntry->State == TreeEntry::Vectorize) {
13149 // The condition node is part of the combined minmax node.
13150 CondEntry->State = TreeEntry::CombinedVectorize;
13151 }
13152 break;
13153 }
13154 case Instruction::FSub:
13155 case Instruction::FAdd: {
13156 // Check if possible to convert (a*b)+c to fma.
13157 if (E.State != TreeEntry::Vectorize ||
13158 !E.getOperations().isAddSubLikeOp())
13159 break;
13160 if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
13161 .isValid())
13162 break;
13163 // This node is a fmuladd node.
13164 E.CombinedOp = TreeEntry::FMulAdd;
13165 TreeEntry *FMulEntry = getOperandEntry(&E, 0);
13166 if (FMulEntry->UserTreeIndex &&
13167 FMulEntry->State == TreeEntry::Vectorize) {
13168 // The FMul node is part of the combined fmuladd node.
13169 FMulEntry->State = TreeEntry::CombinedVectorize;
13170 }
13171 break;
13172 }
13173 default:
13174 break;
13175 }
13176 }
13177
13178 if (LoadEntriesToVectorize.empty()) {
13179 // Single load node - exit.
13180 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
13181 VectorizableTree.front()->getOpcode() == Instruction::Load)
13182 return;
13183 // Small graph with small VF - exit.
13184 constexpr unsigned SmallTree = 3;
13185 constexpr unsigned SmallVF = 2;
13186 if ((VectorizableTree.size() <= SmallTree &&
13187 VectorizableTree.front()->Scalars.size() == SmallVF) ||
13188 (VectorizableTree.size() <= 2 && UserIgnoreList))
13189 return;
13190
13191 if (VectorizableTree.front()->isNonPowOf2Vec() &&
13192 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
13193 getCanonicalGraphSize() <= SmallTree &&
13194 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
13195 [](const std::unique_ptr<TreeEntry> &TE) {
13196 return TE->isGather() && TE->hasState() &&
13197 TE->getOpcode() == Instruction::Load &&
13198 !allSameBlock(TE->Scalars);
13199 }) == 1)
13200 return;
13201 }
13202
13203 // A list of loads to be gathered during the vectorization process. We can
13204 // try to vectorize them at the end, if profitable.
13205 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
13207 GatheredLoads;
13208
13209 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
13210 TreeEntry &E = *TE;
13211 if (E.isGather() &&
13212 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
13213 (!E.hasState() && any_of(E.Scalars,
13214 [&](Value *V) {
13215 return isa<LoadInst>(V) &&
13216 !isVectorized(V) &&
13217 !isDeleted(cast<Instruction>(V));
13218 }))) &&
13219 !isSplat(E.Scalars)) {
13220 for (Value *V : E.Scalars) {
13221 auto *LI = dyn_cast<LoadInst>(V);
13222 if (!LI)
13223 continue;
13224 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
13225 continue;
13226 gatherPossiblyVectorizableLoads(
13227 *this, V, *DL, *SE, *TTI,
13228 GatheredLoads[std::make_tuple(
13229 LI->getParent(),
13230 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
13231 LI->getType())]);
13232 }
13233 }
13234 }
13235 // Try to vectorize gathered loads if this is not just a gather of loads.
13236 if (!GatheredLoads.empty())
13237 tryToVectorizeGatheredLoads(GatheredLoads);
13238}
13239
13240 /// Merges shuffle masks and emits the final shuffle instruction, if required.
13241 /// It supports shuffling of 2 input vectors. It implements lazy shuffle
13242 /// emission: the actual shuffle instruction is generated only if it is really
13243 /// required. Otherwise, the shuffle instruction emission is delayed till the
13244 /// end of the process, to reduce the number of emitted instructions and the
13245 /// amount of further analysis/transformations.
13246class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
13247 bool IsFinalized = false;
13248 SmallVector<int> CommonMask;
13249 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
13250 const TargetTransformInfo &TTI;
13251 InstructionCost Cost = 0;
13252 SmallDenseSet<Value *> VectorizedVals;
13253 BoUpSLP &R;
13254 SmallPtrSetImpl<Value *> &CheckedExtracts;
13255 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13256 /// While set, we are still trying to estimate the cost for the same nodes and
13257 /// can delay the actual cost estimation (virtual shuffle instruction emission).
13258 /// This may help to better estimate the cost if the same nodes must be
13259 /// permuted and allows moving most of the long-shuffle cost estimation to TTI.
13260 bool SameNodesEstimated = true;
13261
13262 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
13263 if (Ty->getScalarType()->isPointerTy()) {
13264 Constant *Res = ConstantExpr::getIntToPtr(
13265 Constant::getAllOnesValue(
13266 IntegerType::get(Ty->getContext(),
13267 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
13268 Ty->getScalarType());
13269 if (auto *VTy = dyn_cast<VectorType>(Ty))
13270 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
13271 return Res;
13272 }
13273 return Constant::getAllOnesValue(Ty);
13274 }
13275
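// Rough shape of the estimate below (informal summary): an all-constant or
// all-undef gather is free; a splat costs one insertelement plus, when the
// splatted scalar must land in more than one lane in a non-identity position,
// a broadcast shuffle; everything else falls back to the generic gather cost.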
13276 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
13277 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
13278 return TTI::TCC_Free;
13279 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13280 InstructionCost GatherCost = 0;
13281 SmallVector<Value *> Gathers(VL);
13282 if (!Root && isSplat(VL)) {
13283 // Found a broadcast of a single scalar - calculate the cost as
13284 // the broadcast.
13285 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
13286 assert(It != VL.end() && "Expected at least one non-undef value.");
13287 // Add broadcast for non-identity shuffle only.
13288 bool NeedShuffle =
13289 count(VL, *It) > 1 &&
13290 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
13291 if (!NeedShuffle) {
13292 if (isa<FixedVectorType>(ScalarTy)) {
13293 assert(SLPReVec && "FixedVectorType is not expected.");
13294 return TTI.getShuffleCost(
13295 TTI::SK_InsertSubvector, VecTy, VecTy, {}, CostKind,
13296 std::distance(VL.begin(), It) * getNumElements(ScalarTy),
13297 cast<FixedVectorType>(ScalarTy));
13298 }
13299 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
13300 CostKind, std::distance(VL.begin(), It),
13301 PoisonValue::get(VecTy), *It);
13302 }
13303
13304 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
13305 transform(VL, ShuffleMask.begin(), [](Value *V) {
13306 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
13307 });
13308 InstructionCost InsertCost =
13309 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
13310 PoisonValue::get(VecTy), *It);
13311 return InsertCost + ::getShuffleCost(TTI,
13312 TTI::SK_Broadcast,
13313 VecTy, ShuffleMask, CostKind,
13314 /*Index=*/0, /*SubTp=*/nullptr,
13315 /*Args=*/*It);
13316 }
13317 return GatherCost +
13318 (all_of(Gathers, IsaPred<UndefValue>)
13319 ? TTI::TCC_Free
13320 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
13321 ScalarTy));
13322 };
13323
13324 /// Compute the cost of creating a vector containing the extracted values from
13325 /// \p VL.
13326 InstructionCost
13327 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
13328 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13329 unsigned NumParts) {
13330 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
13331 unsigned NumElts =
13332 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
13333 auto *EE = dyn_cast<ExtractElementInst>(V);
13334 if (!EE)
13335 return Sz;
13336 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
13337 if (!VecTy)
13338 return Sz;
13339 return std::max(Sz, VecTy->getNumElements());
13340 });
13341 // FIXME: this must be moved to TTI for better estimation.
13342 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
13343 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
13344 SmallVectorImpl<unsigned> &Indices,
13345 SmallVectorImpl<unsigned> &SubVecSizes)
13346 -> std::optional<TTI::ShuffleKind> {
13347 if (NumElts <= EltsPerVector)
13348 return std::nullopt;
13349 int OffsetReg0 =
13350 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
13351 [](int S, int I) {
13352 if (I == PoisonMaskElem)
13353 return S;
13354 return std::min(S, I);
13355 }),
13356 EltsPerVector);
13357 int OffsetReg1 = OffsetReg0;
13358 DenseSet<int> RegIndices;
13359 // Check if we are trying to permute the same single/two input vectors.
13360 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
13361 int FirstRegId = -1;
13362 Indices.assign(1, OffsetReg0);
13363 for (auto [Pos, I] : enumerate(Mask)) {
13364 if (I == PoisonMaskElem)
13365 continue;
13366 int Idx = I - OffsetReg0;
13367 int RegId =
13368 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
13369 if (FirstRegId < 0)
13370 FirstRegId = RegId;
13371 RegIndices.insert(RegId);
13372 if (RegIndices.size() > 2)
13373 return std::nullopt;
13374 if (RegIndices.size() == 2) {
13375 ShuffleKind = TTI::SK_PermuteTwoSrc;
13376 if (Indices.size() == 1) {
13377 OffsetReg1 = alignDown(
13378 std::accumulate(
13379 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
13380 [&](int S, int I) {
13381 if (I == PoisonMaskElem)
13382 return S;
13383 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
13384 ((I - OffsetReg0) % NumElts) / EltsPerVector;
13385 if (RegId == FirstRegId)
13386 return S;
13387 return std::min(S, I);
13388 }),
13389 EltsPerVector);
13390 unsigned Index = OffsetReg1 % NumElts;
13391 Indices.push_back(Index);
13392 SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
13393 }
13394 Idx = I - OffsetReg1;
13395 }
13396 I = (Idx % NumElts) % EltsPerVector +
13397 (RegId == FirstRegId ? 0 : EltsPerVector);
13398 }
13399 return ShuffleKind;
13400 };
13401 InstructionCost Cost = 0;
13402
13403 // Process extracts in blocks of EltsPerVector to check if the source vector
13404 // operand can be re-used directly. If not, add the cost of creating a
13405 // shuffle to extract the values into a vector register.
13406 for (unsigned Part : seq<unsigned>(NumParts)) {
13407 if (!ShuffleKinds[Part])
13408 continue;
13409 ArrayRef<int> MaskSlice = Mask.slice(
13410 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
13411 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
13412 copy(MaskSlice, SubMask.begin());
13413 SmallVector<unsigned, 2> Indices;
13414 SmallVector<unsigned, 2> SubVecSizes;
13415 std::optional<TTI::ShuffleKind> RegShuffleKind =
13416 CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
13417 if (!RegShuffleKind) {
13418 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
13419 !ShuffleVectorInst::isIdentityMask(
13420 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
13421 Cost +=
13422 ::getShuffleCost(TTI, *ShuffleKinds[Part],
13423 getWidenedType(ScalarTy, NumElts), MaskSlice);
13424 continue;
13425 }
13426 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
13427 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
13428 Cost +=
13429 ::getShuffleCost(TTI, *RegShuffleKind,
13430 getWidenedType(ScalarTy, EltsPerVector), SubMask);
13431 }
13432 const unsigned BaseVF = getFullVectorNumberOfElements(
13433 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
13434 for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
13435 assert((Idx + SubVecSize) <= BaseVF &&
13436 "SK_ExtractSubvector index out of range");
13437 Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
13438 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
13439 Idx, getWidenedType(ScalarTy, SubVecSize));
13440 }
13441 // Second attempt to check if just a permute is estimated to be cheaper
13442 // than a subvector extract.
13443 SubMask.assign(NumElts, PoisonMaskElem);
13444 copy(MaskSlice, SubMask.begin());
13445 InstructionCost OriginalCost = ::getShuffleCost(
13446 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
13447 if (OriginalCost < Cost)
13448 Cost = OriginalCost;
13449 }
13450 return Cost;
13451 }
13452 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using the given
13453 /// mask \p Mask and register number \p Part, which includes \p SliceSize
13454 /// elements.
13455 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
13456 ArrayRef<int> Mask, unsigned Part,
13457 unsigned SliceSize) {
13458 if (SameNodesEstimated) {
13459 // Delay the cost estimation if the same nodes are being reshuffled.
13460 // If we already requested the cost of reshuffling of E1 and E2 before, no
13461 // need to estimate another cost with the sub-Mask, instead include this
13462 // sub-Mask into the CommonMask to estimate it later and avoid double cost
13463 // estimation.
13464 if ((InVectors.size() == 2 &&
13465 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
13466 cast<const TreeEntry *>(InVectors.back()) == E2) ||
13467 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
13468 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
13469 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
13470 [](int Idx) { return Idx == PoisonMaskElem; }) &&
13471 "Expected all poisoned elements.");
13472 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
13473 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
13474 return;
13475 }
13476 // Found non-matching nodes - need to estimate the cost for the matched
13477 // nodes and transform the mask.
13478 Cost += createShuffle(InVectors.front(),
13479 InVectors.size() == 1 ? nullptr : InVectors.back(),
13480 CommonMask);
13481 transformMaskAfterShuffle(CommonMask, CommonMask);
13482 } else if (InVectors.size() == 2) {
13483 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
13484 transformMaskAfterShuffle(CommonMask, CommonMask);
13485 }
13486 SameNodesEstimated = false;
13487 if (!E2 && InVectors.size() == 1) {
13488 unsigned VF = E1.getVectorFactor();
13489 if (Value *V1 = dyn_cast<Value *>(InVectors.front())) {
13490 VF = std::max(VF, getVF(V1));
13491 } else {
13492 const auto *E = cast<const TreeEntry *>(InVectors.front());
13493 VF = std::max(VF, E->getVectorFactor());
13494 }
13495 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13496 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
13497 CommonMask[Idx] = Mask[Idx] + VF;
13498 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
13499 transformMaskAfterShuffle(CommonMask, CommonMask);
13500 } else {
13501 auto P = InVectors.front();
13502 Cost += createShuffle(&E1, E2, Mask);
13503 unsigned VF = Mask.size();
13504 if (Value *V1 = dyn_cast<Value *>(P)) {
13505 VF = std::max(VF,
13506 getNumElements(V1->getType()));
13507 } else {
13508 const auto *E = cast<const TreeEntry *>(P);
13509 VF = std::max(VF, E->getVectorFactor());
13510 }
13511 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13512 if (Mask[Idx] != PoisonMaskElem)
13513 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
13514 Cost += createShuffle(P, InVectors.front(), CommonMask);
13515 transformMaskAfterShuffle(CommonMask, CommonMask);
13516 }
13517 }
13518
13519 class ShuffleCostBuilder {
13520 const TargetTransformInfo &TTI;
13521
13522 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
13523 int Index = -1;
13524 return Mask.empty() ||
13525 (VF == Mask.size() &&
13526 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
13527 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
13528 Index == 0);
13529 }
13530
13531 public:
13532 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
13533 ~ShuffleCostBuilder() = default;
13534 InstructionCost createShuffleVector(Value *V1, Value *,
13535 ArrayRef<int> Mask) const {
13536 // An empty mask or an identity mask is free.
13537 unsigned VF =
13538 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
13539 if (isEmptyOrIdentity(Mask, VF))
13540 return TTI::TCC_Free;
13541 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
13542 cast<VectorType>(V1->getType()), Mask);
13543 }
13544 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
13545 // An empty mask or an identity mask is free.
13546 unsigned VF =
13547 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
13548 if (isEmptyOrIdentity(Mask, VF))
13549 return TTI::TCC_Free;
13550 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
13551 cast<VectorType>(V1->getType()), Mask);
13552 }
13553 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
13554 InstructionCost createPoison(Type *Ty, unsigned VF) const {
13555 return TTI::TCC_Free;
13556 }
13557 void resizeToMatch(Value *&, Value *&) const {}
13558 };
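// Worked example for ShuffleCostBuilder (assuming a 4-element source
// vector): the masks {} and {0, 1, 2, 3} are identity-like and the leading
// extract {0, 1} has Index == 0, so all three are reported as
// TTI::TCC_Free; a real permutation such as {1, 0, 3, 2} falls through to
// ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, ...).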
13559
13560 /// Smart shuffle instruction emission: walks through the shuffle trees and
13561 /// tries to find the best matching vector for the actual shuffle
13562 /// instruction.
13563 InstructionCost
13564 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
13565 const PointerUnion<Value *, const TreeEntry *> &P2,
13566 ArrayRef<int> Mask) {
13567 ShuffleCostBuilder Builder(TTI);
13568 SmallVector<int> CommonMask(Mask);
13569 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
13570 unsigned CommonVF = Mask.size();
13571 InstructionCost ExtraCost = 0;
13572 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
13573 unsigned VF) -> InstructionCost {
13574 if (E.isGather() && allConstant(E.Scalars))
13575 return TTI::TCC_Free;
13576 Type *EScalarTy = E.Scalars.front()->getType();
13577 bool IsSigned = true;
13578 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
13579 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
13580 IsSigned = It->second.second;
13581 }
13582 if (EScalarTy != ScalarTy) {
13583 unsigned CastOpcode = Instruction::Trunc;
13584 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
13585 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
13586 if (DstSz > SrcSz)
13587 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
13588 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
13589 getWidenedType(EScalarTy, VF),
13590 TTI::CastContextHint::None, CostKind);
13591 }
13592 return TTI::TCC_Free;
13593 };
13594 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
13595 if (isa<Constant>(V))
13596 return TTI::TCC_Free;
13597 auto *VecTy = cast<VectorType>(V->getType());
13598 Type *EScalarTy = VecTy->getElementType();
13599 if (EScalarTy != ScalarTy) {
13600 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
13601 unsigned CastOpcode = Instruction::Trunc;
13602 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
13603 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
13604 if (DstSz > SrcSz)
13605 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
13606 return TTI.getCastInstrCost(
13607 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
13608 VecTy, TTI::CastContextHint::None, CostKind);
13609 }
13610 return TTI::TCC_Free;
13611 };
13612 if (!V1 && !V2 && !P2.isNull()) {
13613 // Shuffle 2 entry nodes.
13614 const TreeEntry *E = cast<const TreeEntry *>(P1);
13615 unsigned VF = E->getVectorFactor();
13616 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
13617 CommonVF = std::max(VF, E2->getVectorFactor());
13618 assert(all_of(Mask,
13619 [=](int Idx) {
13620 return Idx < 2 * static_cast<int>(CommonVF);
13621 }) &&
13622 "All elements in mask must be less than 2 * CommonVF.");
13623 if (E->Scalars.size() == E2->Scalars.size()) {
13624 SmallVector<int> EMask = E->getCommonMask();
13625 SmallVector<int> E2Mask = E2->getCommonMask();
13626 if (!EMask.empty() || !E2Mask.empty()) {
13627 for (int &Idx : CommonMask) {
13628 if (Idx == PoisonMaskElem)
13629 continue;
13630 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
13631 Idx = EMask[Idx];
13632 else if (Idx >= static_cast<int>(CommonVF))
13633 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
13634 E->Scalars.size();
13635 }
13636 }
13637 CommonVF = E->Scalars.size();
13638 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
13639 GetNodeMinBWAffectedCost(*E2, CommonVF);
13640 } else {
13641 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
13642 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
13643 }
13644 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13645 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13646 } else if (!V1 && P2.isNull()) {
13647 // Shuffle single entry node.
13648 const TreeEntry *E = cast<const TreeEntry *>(P1);
13649 unsigned VF = E->getVectorFactor();
13650 CommonVF = VF;
13651 assert(
13652 all_of(Mask,
13653 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
13654 "All elements in mask must be less than CommonVF.");
13655 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
13656 SmallVector<int> EMask = E->getCommonMask();
13657 assert(!EMask.empty() && "Expected non-empty common mask.");
13658 for (int &Idx : CommonMask) {
13659 if (Idx != PoisonMaskElem)
13660 Idx = EMask[Idx];
13661 }
13662 CommonVF = E->Scalars.size();
13663 } else if (unsigned Factor = E->getInterleaveFactor();
13664 Factor > 0 && E->Scalars.size() != Mask.size() &&
13665 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
13666 Factor)) {
13667 // Deinterleaved nodes are free.
13668 std::iota(CommonMask.begin(), CommonMask.end(), 0);
13669 }
13670 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
13671 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13672 // Not identity/broadcast? Try to see if the original vector is better.
13673 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
13674 CommonVF == CommonMask.size() &&
13675 any_of(enumerate(CommonMask),
13676 [](const auto &&P) {
13677 return P.value() != PoisonMaskElem &&
13678 static_cast<unsigned>(P.value()) != P.index();
13679 }) &&
13680 any_of(CommonMask,
13681 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
13682 SmallVector<int> ReorderMask;
13683 inversePermutation(E->ReorderIndices, ReorderMask);
13684 ::addMask(CommonMask, ReorderMask);
13685 }
13686 } else if (V1 && P2.isNull()) {
13687 // Shuffle single vector.
13688 ExtraCost += GetValueMinBWAffectedCost(V1);
13689 CommonVF = getVF(V1);
13690 assert(
13691 all_of(Mask,
13692 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
13693 "All elements in mask must be less than CommonVF.");
13694 } else if (V1 && !V2) {
13695 // Shuffle vector and tree node.
13696 unsigned VF = getVF(V1);
13697 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
13698 CommonVF = std::max(VF, E2->getVectorFactor());
13699 assert(all_of(Mask,
13700 [=](int Idx) {
13701 return Idx < 2 * static_cast<int>(CommonVF);
13702 }) &&
13703 "All elements in mask must be less than 2 * CommonVF.");
13704 if (E2->Scalars.size() == VF && VF != CommonVF) {
13705 SmallVector<int> E2Mask = E2->getCommonMask();
13706 assert(!E2Mask.empty() && "Expected non-empty common mask.");
13707 for (int &Idx : CommonMask) {
13708 if (Idx == PoisonMaskElem)
13709 continue;
13710 if (Idx >= static_cast<int>(CommonVF))
13711 Idx = E2Mask[Idx - CommonVF] + VF;
13712 }
13713 CommonVF = VF;
13714 }
13715 ExtraCost += GetValueMinBWAffectedCost(V1);
13716 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13717 ExtraCost += GetNodeMinBWAffectedCost(
13718 *E2, std::min(CommonVF, E2->getVectorFactor()));
13719 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13720 } else if (!V1 && V2) {
13721 // Shuffle vector and tree node.
13722 unsigned VF = getVF(V2);
13723 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
13724 CommonVF = std::max(VF, E1->getVectorFactor());
13725 assert(all_of(Mask,
13726 [=](int Idx) {
13727 return Idx < 2 * static_cast<int>(CommonVF);
13728 }) &&
13729 "All elements in mask must be less than 2 * CommonVF.");
13730 if (E1->Scalars.size() == VF && VF != CommonVF) {
13731 SmallVector<int> E1Mask = E1->getCommonMask();
13732 assert(!E1Mask.empty() && "Expected non-empty common mask.");
13733 for (int &Idx : CommonMask) {
13734 if (Idx == PoisonMaskElem)
13735 continue;
13736 if (Idx >= static_cast<int>(CommonVF))
13737 Idx = E1Mask[Idx - CommonVF] + VF;
13738 else
13739 Idx = E1Mask[Idx];
13740 }
13741 CommonVF = VF;
13742 }
13743 ExtraCost += GetNodeMinBWAffectedCost(
13744 *E1, std::min(CommonVF, E1->getVectorFactor()));
13745 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13746 ExtraCost += GetValueMinBWAffectedCost(V2);
13747 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13748 } else {
13749 assert(V1 && V2 && "Expected both vectors.");
13750 unsigned VF = getVF(V1);
13751 CommonVF = std::max(VF, getVF(V2));
13752 assert(all_of(Mask,
13753 [=](int Idx) {
13754 return Idx < 2 * static_cast<int>(CommonVF);
13755 }) &&
13756 "All elements in mask must be less than 2 * CommonVF.");
13757 ExtraCost +=
13758 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
13759 if (V1->getType() != V2->getType()) {
13760 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13761 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13762 } else {
13763 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
13764 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13765 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
13766 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13767 }
13768 }
13769 InVectors.front() =
13770 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
13771 if (InVectors.size() == 2)
13772 InVectors.pop_back();
13773 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
13774 V1, V2, CommonMask, Builder, ScalarTy);
13775 }
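// Example of the extra min-bitwidth cost above (hypothetical numbers): if a
// tree entry was demoted to i16 in R.MinBWs while ScalarTy is i32 and the
// common VF is 8, GetNodeMinBWAffectedCost adds the cost of a sext/zext
// from <8 x i16> to <8 x i32> (or a trunc in the opposite direction) as
// ExtraCost on top of the shuffle cost returned by
// BaseShuffleAnalysis::createShuffle.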
13776
13777 public:
13778 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
13779 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
13780 SmallPtrSetImpl<Value *> &CheckedExtracts)
13781 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
13782 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
13783 CheckedExtracts(CheckedExtracts) {}
13784 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
13785 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13786 unsigned NumParts, bool &UseVecBaseAsInput) {
13787 UseVecBaseAsInput = false;
13788 if (Mask.empty())
13789 return nullptr;
13790 Value *VecBase = nullptr;
13791 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
13792 if (!E->ReorderIndices.empty()) {
13793 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
13794 E->ReorderIndices.end());
13795 reorderScalars(VL, ReorderMask);
13796 }
13797 // Check if the extracts can be considered reused if the same
13798 // extractelements were already vectorized.
13799 bool PrevNodeFound = any_of(
13800 ArrayRef(R.VectorizableTree).take_front(E->Idx),
13801 [&](const std::unique_ptr<TreeEntry> &TE) {
13802 return ((TE->hasState() && !TE->isAltShuffle() &&
13803 TE->getOpcode() == Instruction::ExtractElement) ||
13804 TE->isGather()) &&
13805 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
13806 return VL.size() > Data.index() &&
13807 (Mask[Data.index()] == PoisonMaskElem ||
13808 isa<UndefValue>(VL[Data.index()]) ||
13809 Data.value() == VL[Data.index()]);
13810 });
13811 });
13812 SmallPtrSet<Value *, 4> UniqueBases;
13813 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
13814 SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
13815 for (unsigned Part : seq<unsigned>(NumParts)) {
13816 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
13817 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
13818 for (auto [I, V] :
13819 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
13820 // Ignore non-extractelement scalars.
13821 if (isa<UndefValue>(V) ||
13822 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
13823 continue;
13824 // If all users of the instruction are going to be vectorized and this
13825 // instruction itself is not going to be vectorized, consider this
13826 // instruction as dead and remove its cost from the final cost of the
13827 // vectorized tree.
13828 // Also, avoid adjusting the cost for extractelements with multiple uses
13829 // in different graph entries.
13830 auto *EE = cast<ExtractElementInst>(V);
13831 VecBase = EE->getVectorOperand();
13832 UniqueBases.insert(VecBase);
13833 ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
13834 if (!CheckedExtracts.insert(V).second ||
13835 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
13836 any_of(EE->users(),
13837 [&](User *U) {
13838 return isa<GetElementPtrInst>(U) &&
13839 !R.areAllUsersVectorized(cast<Instruction>(U),
13840 &VectorizedVals);
13841 }) ||
13842 (!VEs.empty() && !is_contained(VEs, E)))
13843 continue;
13844 std::optional<unsigned> EEIdx = getExtractIndex(EE);
13845 if (!EEIdx)
13846 continue;
13847 unsigned Idx = *EEIdx;
13848 // Take credit for the instruction that will become dead.
13849 if (EE->hasOneUse() || !PrevNodeFound) {
13850 Instruction *Ext = EE->user_back();
13851 if (isa<SExtInst, ZExtInst>(Ext) &&
13852 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
13853 // Use getExtractWithExtendCost() to calculate the cost of
13854 // extractelement/ext pair.
13855 Cost -= TTI.getExtractWithExtendCost(
13856 Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
13857 Idx, CostKind);
13858 // Add back the cost of s|zext which is subtracted separately.
13859 Cost += TTI.getCastInstrCost(
13860 Ext->getOpcode(), Ext->getType(), EE->getType(),
13861 TTI::CastContextHint::None, CostKind);
13862 continue;
13863 }
13864 }
13865 APInt &DemandedElts =
13866 VectorOpsToExtracts
13867 .try_emplace(VecBase,
13868 APInt::getZero(getNumElements(VecBase->getType())))
13869 .first->getSecond();
13870 DemandedElts.setBit(Idx);
13871 }
13872 }
13873 for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
13874 Cost -= TTI.getScalarizationOverhead(cast<VectorType>(Vec->getType()),
13875 DemandedElts, /*Insert=*/false,
13876 /*Extract=*/true, CostKind);
13877 // Check that the gather of extractelements can be represented as just a
13878 // shuffle of the one or two vectors the scalars are extracted from.
13879 // We have found the bunch of extractelement instructions that must be
13880 // gathered into a vector and can be represented as a permutation of the
13881 // elements of a single input vector or of 2 input vectors.
13882 // Skipped for the reused case, if the same extractelements were already vectorized.
13883 if (!PrevNodeFound)
13884 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
13885 InVectors.assign(1, E);
13886 CommonMask.assign(Mask.begin(), Mask.end());
13887 transformMaskAfterShuffle(CommonMask, CommonMask);
13888 SameNodesEstimated = false;
13889 if (NumParts != 1 && UniqueBases.size() != 1) {
13890 UseVecBaseAsInput = true;
13891 VecBase =
13892 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
13893 }
13894 return VecBase;
13895 }
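// Worked example of the extract "credit" above (hypothetical IR): for
// %x0 = extractelement <4 x i32> %v, i32 0 and
// %x1 = extractelement <4 x i32> %v, i32 1 whose users are all vectorized,
// both extracts become dead, so the scalarization overhead of demanded
// elements {0, 1} of %v is subtracted from Cost and the gather itself is
// costed as a single shuffle of %v via computeExtractCost.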
13896 /// Checks if the specified entry \p E needs to be delayed because of its
13897 /// dependency nodes.
13898 std::optional<InstructionCost>
13899 needToDelay(const TreeEntry *,
13900 ArrayRef<SmallVector<const TreeEntry *>>) const {
13901 // No need to delay the cost estimation during analysis.
13902 return std::nullopt;
13903 }
13904 /// Reset the builder to handle perfect diamond match.
13905 void resetForSameNode() {
13906 IsFinalized = false;
13907 CommonMask.clear();
13908 InVectors.clear();
13909 Cost = 0;
13910 VectorizedVals.clear();
13911 SameNodesEstimated = true;
13912 }
13913 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
13914 if (&E1 == &E2) {
13915 assert(all_of(Mask,
13916 [&](int Idx) {
13917 return Idx < static_cast<int>(E1.getVectorFactor());
13918 }) &&
13919 "Expected single vector shuffle mask.");
13920 add(E1, Mask);
13921 return;
13922 }
13923 if (InVectors.empty()) {
13924 CommonMask.assign(Mask.begin(), Mask.end());
13925 InVectors.assign({&E1, &E2});
13926 return;
13927 }
13928 assert(!CommonMask.empty() && "Expected non-empty common mask.");
13929 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
13930 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
13931 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
13932 const auto *It =
13933 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
13934 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
13935 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
13936 }
13937 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
13938 if (InVectors.empty()) {
13939 CommonMask.assign(Mask.begin(), Mask.end());
13940 InVectors.assign(1, &E1);
13941 return;
13942 }
13943 assert(!CommonMask.empty() && "Expected non-empty common mask.");
13944 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
13945 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
13946 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
13947 const auto *It =
13948 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
13949 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
13950 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
13951 if (!SameNodesEstimated && InVectors.size() == 1)
13952 InVectors.emplace_back(&E1);
13953 }
13954 /// Adds 2 input vectors and the mask for their shuffling.
13955 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
13956 // This may be reached only when shuffling 2 vectors of extractelements,
13957 // which is already handled in adjustExtracts.
13958 assert(InVectors.size() == 1 &&
13959 all_of(enumerate(CommonMask),
13960 [&](auto P) {
13961 if (P.value() == PoisonMaskElem)
13962 return Mask[P.index()] == PoisonMaskElem;
13963 auto *EI = cast<ExtractElementInst>(
13964 cast<const TreeEntry *>(InVectors.front())
13965 ->getOrdered(P.index()));
13966 return EI->getVectorOperand() == V1 ||
13967 EI->getVectorOperand() == V2;
13968 }) &&
13969 "Expected extractelement vectors.");
13970 }
13971 /// Adds one more input vector and the mask for the shuffling.
13972 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
13973 if (InVectors.empty()) {
13974 assert(CommonMask.empty() && !ForExtracts &&
13975 "Expected empty input mask/vectors.");
13976 CommonMask.assign(Mask.begin(), Mask.end());
13977 InVectors.assign(1, V1);
13978 return;
13979 }
13980 if (ForExtracts) {
13981 // No need to add vectors here, they were already handled in adjustExtracts.
13982 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
13983 !CommonMask.empty() &&
13984 all_of(enumerate(CommonMask),
13985 [&](auto P) {
13986 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
13987 ->getOrdered(P.index());
13988 if (P.value() == PoisonMaskElem)
13989 return P.value() == Mask[P.index()] ||
13990 isa<UndefValue>(Scalar);
13991 if (isa<Constant>(V1))
13992 return true;
13993 auto *EI = cast<ExtractElementInst>(Scalar);
13994 return EI->getVectorOperand() == V1;
13995 }) &&
13996 "Expected only tree entry for extractelement vectors.");
13997 return;
13998 }
13999 assert(!InVectors.empty() && !CommonMask.empty() &&
14000 "Expected only tree entries from extracts/reused buildvectors.");
14001 unsigned VF = getVF(V1);
14002 if (InVectors.size() == 2) {
14003 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14004 transformMaskAfterShuffle(CommonMask, CommonMask);
14005 VF = std::max<unsigned>(VF, CommonMask.size());
14006 } else if (const auto *InTE =
14007 InVectors.front().dyn_cast<const TreeEntry *>()) {
14008 VF = std::max(VF, InTE->getVectorFactor());
14009 } else {
14010 VF = std::max(
14011 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
14012 ->getNumElements());
14013 }
14014 InVectors.push_back(V1);
14015 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14016 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14017 CommonMask[Idx] = Mask[Idx] + VF;
14018 }
14019 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
14020 Value *Root = nullptr) {
14021 Cost += getBuildVectorCost(VL, Root);
14022 if (!Root) {
14023 // FIXME: Need to find a way to avoid use of getNullValue here.
14024 SmallVector<Constant *> Vals;
14025 unsigned VF = VL.size();
14026 if (MaskVF != 0)
14027 VF = std::min(VF, MaskVF);
14028 Type *VLScalarTy = VL.front()->getType();
14029 for (Value *V : VL.take_front(VF)) {
14030 Type *ScalarTy = VLScalarTy->getScalarType();
14031 if (isa<PoisonValue>(V)) {
14032 Vals.push_back(PoisonValue::get(ScalarTy));
14033 continue;
14034 }
14035 if (isa<UndefValue>(V)) {
14036 Vals.push_back(UndefValue::get(ScalarTy));
14037 continue;
14038 }
14039 Vals.push_back(Constant::getNullValue(ScalarTy));
14040 }
14041 if (auto *VecTy = dyn_cast<FixedVectorType>(VLScalarTy)) {
14042 assert(SLPReVec && "FixedVectorType is not expected.");
14043 // When REVEC is enabled, we need to expand vector types into scalar
14044 // types.
14045 Vals = replicateMask(Vals, VecTy->getNumElements());
14046 }
14047 return ConstantVector::get(Vals);
14048 }
14049 return ConstantVector::getSplat(
14050 ElementCount::getFixed(
14051 cast<FixedVectorType>(Root->getType())->getNumElements()),
14052 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
14053 }
14054 InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
14055 /// Finalize emission of the shuffles.
14056 InstructionCost
14057 finalize(ArrayRef<int> ExtMask,
14058 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14059 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
14060 function_ref<void(Value *&, SmallVectorImpl<int> &,
14061 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
14062 Action = {}) {
14063 IsFinalized = true;
14064 if (Action) {
14065 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14066 if (InVectors.size() == 2)
14067 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14068 else
14069 Cost += createShuffle(Vec, nullptr, CommonMask);
14070 transformMaskAfterShuffle(CommonMask, CommonMask);
14071 assert(VF > 0 &&
14072 "Expected vector length for the final value before action.");
14073 Value *V = cast<Value *>(Vec);
14074 Action(V, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
14075 Cost += createShuffle(V1, V2, Mask);
14076 return V1;
14077 });
14078 InVectors.front() = V;
14079 }
14080 if (!SubVectors.empty()) {
14081 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14082 if (InVectors.size() == 2)
14083 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14084 else
14085 Cost += createShuffle(Vec, nullptr, CommonMask);
14086 transformMaskAfterShuffle(CommonMask, CommonMask);
14087 // Add subvectors permutation cost.
14088 if (!SubVectorsMask.empty()) {
14089 assert(SubVectorsMask.size() <= CommonMask.size() &&
14090 "Expected same size of masks for subvectors and common mask.");
14091 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
14092 copy(SubVectorsMask, SVMask.begin());
14093 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
14094 if (I2 != PoisonMaskElem) {
14095 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
14096 I1 = I2 + CommonMask.size();
14097 }
14098 }
14099 Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
14100 getWidenedType(ScalarTy, CommonMask.size()),
14101 SVMask, CostKind);
14102 }
14103 for (auto [E, Idx] : SubVectors) {
14104 Type *EScalarTy = E->Scalars.front()->getType();
14105 bool IsSigned = true;
14106 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
14107 EScalarTy =
14108 IntegerType::get(EScalarTy->getContext(), It->second.first);
14109 IsSigned = It->second.second;
14110 }
14111 if (ScalarTy != EScalarTy) {
14112 unsigned CastOpcode = Instruction::Trunc;
14113 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
14114 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
14115 if (DstSz > SrcSz)
14116 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14117 Cost += TTI.getCastInstrCost(
14118 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
14119 getWidenedType(EScalarTy, E->getVectorFactor()),
14120 TTI::CastContextHint::None, CostKind);
14121 }
14122 Cost += ::getShuffleCost(
14123 TTI, TTI::SK_InsertSubvector,
14124 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
14125 getWidenedType(ScalarTy, E->getVectorFactor()));
14126 if (!CommonMask.empty()) {
14127 std::iota(std::next(CommonMask.begin(), Idx),
14128 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
14129 Idx);
14130 }
14131 }
14132 }
14133
14134 if (!ExtMask.empty()) {
14135 if (CommonMask.empty()) {
14136 CommonMask.assign(ExtMask.begin(), ExtMask.end());
14137 } else {
14138 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
14139 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
14140 if (ExtMask[I] == PoisonMaskElem)
14141 continue;
14142 NewMask[I] = CommonMask[ExtMask[I]];
14143 }
14144 CommonMask.swap(NewMask);
14145 }
14146 }
14147 if (CommonMask.empty()) {
14148 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
14149 return Cost;
14150 }
14151 return Cost +
14152 createShuffle(InVectors.front(),
14153 InVectors.size() == 2 ? InVectors.back() : nullptr,
14154 CommonMask);
14155 }
14156
14157 ~ShuffleCostEstimator() {
14158 assert((IsFinalized || CommonMask.empty()) &&
14159 "Shuffle construction must be finalized.");
14160 }
14161};
14162
14163const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
14164 unsigned Idx) const {
14165 TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
14166 assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
14167 return Op;
14168}
14169
14170TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
14171 if (TE.State == TreeEntry::ScatterVectorize ||
14172 TE.State == TreeEntry::StridedVectorize)
14173 return TTI::CastContextHint::GatherScatter;
14174 if (TE.State == TreeEntry::CompressVectorize)
14175 return TTI::CastContextHint::Masked;
14176 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
14177 !TE.isAltShuffle()) {
14178 if (TE.ReorderIndices.empty())
14179 return TTI::CastContextHint::Normal;
14180 SmallVector<int> Mask;
14181 inversePermutation(TE.ReorderIndices, Mask);
14182 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
14183 return TTI::CastContextHint::Reversed;
14184 }
14185 return TTI::CastContextHint::None;
14186 }
14187
14188 InstructionCost
14189 BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
14190 SmallPtrSetImpl<Value *> &CheckedExtracts) {
14191 ArrayRef<Value *> VL = E->Scalars;
14192
14193 Type *ScalarTy = getValueType(VL[0]);
14194 if (!isValidElementType(ScalarTy))
14195 return InstructionCost::getInvalid();
14197
14198 // If we have computed a smaller type for the expression, update VecTy so
14199 // that the costs will be accurate.
14200 auto It = MinBWs.find(E);
14201 Type *OrigScalarTy = ScalarTy;
14202 if (It != MinBWs.end()) {
14203 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
14204 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
14205 if (VecTy)
14206 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
14207 }
14208 auto *VecTy = getWidenedType(ScalarTy, VL.size());
14209 unsigned EntryVF = E->getVectorFactor();
14210 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
14211
14212 if (E->isGather()) {
14213 if (allConstant(VL))
14214 return 0;
14215 if (isa<InsertElementInst>(VL[0]))
14216 return InstructionCost::getInvalid();
14217 if (isa<CmpInst>(VL.front()))
14218 ScalarTy = VL.front()->getType();
14219 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
14220 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
14221 }
14222 if (E->State == TreeEntry::SplitVectorize) {
14223 assert(E->CombinedEntriesWithIndices.size() == 2 &&
14224 "Expected exactly 2 combined entries.");
14225 assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
14226 InstructionCost VectorCost = 0;
14227 if (E->ReorderIndices.empty()) {
14228 VectorCost = ::getShuffleCost(
14229 *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
14230 E->CombinedEntriesWithIndices.back().second,
14231 getWidenedType(
14232 ScalarTy,
14233 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14234 ->getVectorFactor()));
14235 } else {
14236 unsigned CommonVF =
14237 std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
14238 ->getVectorFactor(),
14239 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14240 ->getVectorFactor());
14241 VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
14242 getWidenedType(ScalarTy, CommonVF),
14243 E->getSplitMask(), CostKind);
14244 }
14245 LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
14246 return VectorCost;
14247 }
14248 InstructionCost CommonCost = 0;
14249 SmallVector<int> Mask;
14250 if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
14251 (E->State != TreeEntry::StridedVectorize ||
14252 !isReverseOrder(E->ReorderIndices))) {
14253 SmallVector<int> NewMask;
14254 if (E->getOpcode() == Instruction::Store) {
14255 // For stores the order is actually a mask.
14256 NewMask.resize(E->ReorderIndices.size());
14257 copy(E->ReorderIndices, NewMask.begin());
14258 } else {
14259 inversePermutation(E->ReorderIndices, NewMask);
14260 }
14261 ::addMask(Mask, NewMask);
14262 }
14263 if (!E->ReuseShuffleIndices.empty())
14264 ::addMask(Mask, E->ReuseShuffleIndices);
14265 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
14266 CommonCost =
14267 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
14268 assert((E->State == TreeEntry::Vectorize ||
14269 E->State == TreeEntry::ScatterVectorize ||
14270 E->State == TreeEntry::StridedVectorize ||
14271 E->State == TreeEntry::CompressVectorize) &&
14272 "Unhandled state");
14273 assert(E->getOpcode() &&
14274 ((allSameType(VL) && allSameBlock(VL)) ||
14275 (E->getOpcode() == Instruction::GetElementPtr &&
14276 E->getMainOp()->getType()->isPointerTy()) ||
14277 E->hasCopyableElements()) &&
14278 "Invalid VL");
14279 Instruction *VL0 = E->getMainOp();
14280 unsigned ShuffleOrOp =
14281 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
14282 if (E->CombinedOp != TreeEntry::NotCombinedOp)
14283 ShuffleOrOp = E->CombinedOp;
14284 SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
14285 const unsigned Sz = UniqueValues.size();
14286 SmallBitVector UsedScalars(Sz, false);
14287 for (unsigned I = 0; I < Sz; ++I) {
14288 if (isa<Instruction>(UniqueValues[I]) &&
14289 !E->isCopyableElement(UniqueValues[I]) &&
14290 getTreeEntries(UniqueValues[I]).front() == E)
14291 continue;
14292 UsedScalars.set(I);
14293 }
14294 auto GetCastContextHint = [&](Value *V) {
14295 if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
14296 return getCastContextHint(*OpTEs.front());
14297 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
14298 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
14299 !SrcState.isAltShuffle())
14300 return TTI::CastContextHint::GatherScatter;
14301 return TTI::CastContextHint::None;
14302 };
14303 auto GetCostDiff =
14304 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
14305 function_ref<InstructionCost(InstructionCost)> VectorCost) {
14306 // Calculate the cost of this instruction.
14307 InstructionCost ScalarCost = 0;
14308 if (isa<CastInst, CallInst>(VL0)) {
14309 // For some of the instructions there is no need to calculate the cost
14310 // for each particular instruction; we can use the cost of a single
14311 // instruction times the total number of scalar instructions.
14312 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
14313 } else {
14314 for (unsigned I = 0; I < Sz; ++I) {
14315 if (UsedScalars.test(I))
14316 continue;
14317 ScalarCost += ScalarEltCost(I);
14318 }
14319 }
14320
14321 InstructionCost VecCost = VectorCost(CommonCost);
14322 // Check if the current node must be resized, if the parent node is not
14323 // resized.
14324 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
14325 E->Idx != 0 &&
14326 (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
14327 const EdgeInfo &EI = E->UserTreeIndex;
14328 if (!EI.UserTE->hasState() ||
14329 EI.UserTE->getOpcode() != Instruction::Select ||
14330 EI.EdgeIdx != 0) {
14331 auto UserBWIt = MinBWs.find(EI.UserTE);
14332 Type *UserScalarTy =
14333 (EI.UserTE->isGather() ||
14334 EI.UserTE->State == TreeEntry::SplitVectorize)
14335 ? EI.UserTE->Scalars.front()->getType()
14336 : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
14337 if (UserBWIt != MinBWs.end())
14338 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
14339 UserBWIt->second.first);
14340 if (ScalarTy != UserScalarTy) {
14341 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
14342 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
14343 unsigned VecOpcode;
14344 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
14345 if (BWSz > SrcBWSz)
14346 VecOpcode = Instruction::Trunc;
14347 else
14348 VecOpcode =
14349 It->second.second ? Instruction::SExt : Instruction::ZExt;
14350 TTI::CastContextHint CCH = GetCastContextHint(VL0);
14351 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
14352 CostKind);
14353 }
14354 }
14355 }
14356 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
14357 ScalarCost, "Calculated costs for Tree"));
14358 return VecCost - ScalarCost;
14359 };
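// Hypothetical walk-through of GetCostDiff: for a 4-wide add where each
// scalar add costs 1 and no scalar is reused by another entry,
// ScalarCost == 4; if the vector add costs 1 and CommonCost (the
// reshuffling estimated above) is 0, the lambda returns 1 - 4 == -3, and a
// negative result means the vectorized form is cheaper.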
14360 // Calculate the cost difference from vectorizing a set of GEPs.
14361 // A negative value means vectorizing is profitable.
14362 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
14363 assert((E->State == TreeEntry::Vectorize ||
14364 E->State == TreeEntry::StridedVectorize ||
14365 E->State == TreeEntry::CompressVectorize) &&
14366 "Entry state expected to be Vectorize, StridedVectorize or "
14367 "MaskedLoadCompressVectorize here.");
14368 InstructionCost ScalarCost = 0;
14369 InstructionCost VecCost = 0;
14370 std::tie(ScalarCost, VecCost) = getGEPCosts(
14371 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
14372 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
14373 "Calculated GEPs cost for Tree"));
14374
14375 return VecCost - ScalarCost;
14376 };
14377
14378 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
14379 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
14380 if (MinMaxID == Intrinsic::not_intrinsic)
14381 return InstructionCost::getInvalid();
14382 Type *CanonicalType = Ty;
14383 if (CanonicalType->isPtrOrPtrVectorTy())
14384 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
14385 CanonicalType->getContext(),
14386 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
14387
14388 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
14389 {CanonicalType, CanonicalType});
14390 InstructionCost IntrinsicCost =
14391 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
14392 // If the selects are the only uses of the compares, the compares will
14393 // be dead and we can subtract their cost.
14394 if (VI && SelectOnly) {
14395 assert((!Ty->isVectorTy() || SLPReVec) &&
14396 "Expected only for scalar type.");
14397 auto *CI = cast<CmpInst>(VI->getOperand(0));
14398 IntrinsicCost -= TTI->getCmpSelInstrCost(
14399 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
14400 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
14401 {TTI::OK_AnyValue, TTI::OP_None}, CI);
14402 }
14403 return IntrinsicCost;
14404 };
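// Example of the min/max conversion assumed by GetMinMaxCost (hypothetical
// scalar code): %c = icmp slt i32 %a, %b; %s = select i1 %c, i32 %a, i32 %b
// is treated as smin(%a, %b), so the scalar cost becomes the cost of the
// smin intrinsic, and when the select is the only user of the compare the
// icmp cost is subtracted because the compare will be dead.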
14405 auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
14406 Instruction *VI) {
14407 InstructionCost Cost = canConvertToFMA(VI, S, *DT, *DL, TTI, *TLI);
14408 return Cost;
14409 };
14410 switch (ShuffleOrOp) {
14411 case Instruction::PHI: {
14412 // Count reused scalars.
14413 InstructionCost ScalarCost = 0;
14414 SmallPtrSet<const TreeEntry *, 4> CountedOps;
14415 for (Value *V : UniqueValues) {
14416 auto *PHI = dyn_cast<PHINode>(V);
14417 if (!PHI)
14418 continue;
14419
14420 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
14421 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
14422 Value *Op = PHI->getIncomingValue(I);
14423 Operands[I] = Op;
14424 }
14425 if (const TreeEntry *OpTE =
14426 getSameValuesTreeEntry(Operands.front(), Operands))
14427 if (CountedOps.insert(OpTE).second &&
14428 !OpTE->ReuseShuffleIndices.empty())
14429 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
14430 OpTE->Scalars.size());
14431 }
14432
14433 return CommonCost - ScalarCost;
14434 }
14435 case Instruction::ExtractValue:
14436 case Instruction::ExtractElement: {
14437 APInt DemandedElts;
14438 VectorType *SrcVecTy = nullptr;
14439 auto GetScalarCost = [&](unsigned Idx) {
14440 if (isa<PoisonValue>(UniqueValues[Idx]))
14441 return InstructionCost(TTI::TCC_Free);
14442
14443 auto *I = cast<Instruction>(UniqueValues[Idx]);
14444 if (!SrcVecTy) {
14445 if (ShuffleOrOp == Instruction::ExtractElement) {
14446 auto *EE = cast<ExtractElementInst>(I);
14447 SrcVecTy = EE->getVectorOperandType();
14448 } else {
14449 auto *EV = cast<ExtractValueInst>(I);
14450 Type *AggregateTy = EV->getAggregateOperand()->getType();
14451 unsigned NumElts;
14452 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
14453 NumElts = ATy->getNumElements();
14454 else
14455 NumElts = AggregateTy->getStructNumElements();
14456 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
14457 }
14458 }
14459 if (I->hasOneUse()) {
14460 Instruction *Ext = I->user_back();
14461 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
14462 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
14463 // Use getExtractWithExtendCost() to calculate the cost of
14464 // extractelement/ext pair.
14465 InstructionCost Cost = TTI->getExtractWithExtendCost(
14466 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I),
14467 CostKind);
14468 // Subtract the cost of s|zext which is subtracted separately.
14469 Cost -= TTI->getCastInstrCost(
14470 Ext->getOpcode(), Ext->getType(), I->getType(),
14471 TTI::CastContextHint::None, CostKind);
14472 return Cost;
14473 }
14474 }
14475 if (DemandedElts.isZero())
14476 DemandedElts = APInt::getZero(getNumElements(SrcVecTy));
14477 DemandedElts.setBit(*getExtractIndex(I));
14478 return InstructionCost(TTI::TCC_Free);
14479 };
14480 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
14481 return CommonCost - (DemandedElts.isZero()
14482 ? TTI::TCC_Free
14483 : TTI.getScalarizationOverhead(
14484 SrcVecTy, DemandedElts, /*Insert=*/false,
14485 /*Extract=*/true, CostKind));
14486 };
14487 return GetCostDiff(GetScalarCost, GetVectorCost);
14488 }
14489 case Instruction::InsertElement: {
14490 assert(E->ReuseShuffleIndices.empty() &&
14491 "Unique insertelements only are expected.");
14492 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
14493 unsigned const NumElts = SrcVecTy->getNumElements();
14494 unsigned const NumScalars = VL.size();
14495
14496 unsigned NumOfParts = ::getNumberOfParts(*TTI, SrcVecTy);
14497
14498 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
14499 unsigned OffsetBeg = *getElementIndex(VL.front());
14500 unsigned OffsetEnd = OffsetBeg;
14501 InsertMask[OffsetBeg] = 0;
14502 for (auto [I, V] : enumerate(VL.drop_front())) {
14503 unsigned Idx = *getElementIndex(V);
14504 if (OffsetBeg > Idx)
14505 OffsetBeg = Idx;
14506 else if (OffsetEnd < Idx)
14507 OffsetEnd = Idx;
14508 InsertMask[Idx] = I + 1;
14509 }
14510 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
14511 if (NumOfParts > 0 && NumOfParts < NumElts)
14512 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
14513 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
14514 VecScalarsSz;
14515 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
14516 unsigned InsertVecSz = std::min<unsigned>(
14517 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
14518 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
14519 bool IsWholeSubvector =
14520 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
14521 // Check if we can safely insert a subvector. If it is not possible, just
14522 // generate a whole-sized vector and shuffle the source vector and the new
14523 // subvector.
14524 if (OffsetBeg + InsertVecSz > VecSz) {
14525 // Align OffsetBeg to generate correct mask.
14526 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
14527 InsertVecSz = VecSz;
14528 }
14529
14530 APInt DemandedElts = APInt::getZero(NumElts);
14531 // TODO: Add support for Instruction::InsertValue.
14532 SmallVector<int> Mask;
14533 if (!E->ReorderIndices.empty()) {
14534 inversePermutation(E->ReorderIndices, Mask);
14535 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
14536 } else {
14537 Mask.assign(VecSz, PoisonMaskElem);
14538 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
14539 }
14540 bool IsIdentity = true;
14541 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
14542 Mask.swap(PrevMask);
14543 for (unsigned I = 0; I < NumScalars; ++I) {
14544 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
14545 DemandedElts.setBit(InsertIdx);
14546 IsIdentity &= InsertIdx - OffsetBeg == I;
14547 Mask[InsertIdx - OffsetBeg] = I;
14548 }
14549 assert(Offset < NumElts && "Failed to find vector index offset");
14550
14551 InstructionCost Cost = 0;
14552 Cost -=
14553 getScalarizationOverhead(*TTI, ScalarTy, SrcVecTy, DemandedElts,
14554 /*Insert*/ true, /*Extract*/ false, CostKind);
14555
14556 // First cost - resize to the actual vector size if this is not an
14557 // identity shuffle or the vector needs to be shifted.
14558 // Do not calculate the cost if the actual size is the register size and
14559 // we can merge this shuffle with the following SK_Select.
14560 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
14561 if (!IsIdentity)
14562 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
14563 InsertVecTy, Mask);
14564 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
14565 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
14566 }));
14567 // Second cost - permutation with a subvector, if some elements come from
14568 // the initial vector, or insertion of a subvector.
14569 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
14570 // subvector of ActualVecTy.
14571 SmallBitVector InMask =
14572 isUndefVector(FirstInsert->getOperand(0),
14573 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
14574 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
14575 if (InsertVecSz != VecSz) {
14576 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
14577 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
14578 CostKind, OffsetBeg - Offset, InsertVecTy);
14579 } else {
14580 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
14581 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
14582 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
14583 I <= End; ++I)
14584 if (Mask[I] != PoisonMaskElem)
14585 Mask[I] = I + VecSz;
14586 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
14587 Mask[I] =
14588 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
14589 Cost +=
14590 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
14591 }
14592 }
14593 return Cost;
14594 }
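// Worked example of the insertelement bookkeeping above (hypothetical
// indices, assuming the whole vector fits into one register so
// VecScalarsSz == 8): for inserts into lanes 2, 3 and 4 of an <8 x float>
// vector, OffsetBeg == 2, OffsetEnd == 4, InsertVecSz ==
// PowerOf2Ceil(4 - 2 + 1) == 4 and Offset == 0, so the scalars are modeled
// as a 4-wide subvector inserted at index OffsetBeg - Offset == 2 of the
// 8-wide destination when the original vector still has live lanes.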
14595 case Instruction::ZExt:
14596 case Instruction::SExt:
14597 case Instruction::FPToUI:
14598 case Instruction::FPToSI:
14599 case Instruction::FPExt:
14600 case Instruction::PtrToInt:
14601 case Instruction::IntToPtr:
14602 case Instruction::SIToFP:
14603 case Instruction::UIToFP:
14604 case Instruction::Trunc:
14605 case Instruction::FPTrunc:
14606 case Instruction::BitCast: {
14607 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
14608 Type *SrcScalarTy = VL0->getOperand(0)->getType();
14609 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
14610 unsigned Opcode = ShuffleOrOp;
14611 unsigned VecOpcode = Opcode;
14612 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
14613 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
14614 // Check if the values are candidates to demote.
14615 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
14616 if (SrcIt != MinBWs.end()) {
14617 SrcBWSz = SrcIt->second.first;
14618 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
14619 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
14620 SrcVecTy =
14621 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
14622 }
14623 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
14624 if (BWSz == SrcBWSz) {
14625 VecOpcode = Instruction::BitCast;
14626 } else if (BWSz < SrcBWSz) {
14627 VecOpcode = Instruction::Trunc;
14628 } else if (It != MinBWs.end()) {
14629 assert(BWSz > SrcBWSz && "Invalid cast!");
14630 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
14631 } else if (SrcIt != MinBWs.end()) {
14632 assert(BWSz > SrcBWSz && "Invalid cast!");
14633 VecOpcode =
14634 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
14635 }
14636 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
14637 !SrcIt->second.second) {
14638 VecOpcode = Instruction::UIToFP;
14639 }
14640 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
14641 assert(Idx == 0 && "Expected 0 index only");
14642 return TTI->getCastInstrCost(Opcode, VL0->getType(),
14643 VL0->getOperand(0)->getType(),
14644 TTI::getCastContextHint(VL0), CostKind);
14645 };
14646 auto GetVectorCost = [=](InstructionCost CommonCost) {
14647 // Do not count cost here if minimum bitwidth is in effect and it is just
14648 // a bitcast (here it is just a noop).
14649 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
14650 return CommonCost;
14651 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
14652 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
14653
14654 bool IsArithmeticExtendedReduction =
14655 E->Idx == 0 && UserIgnoreList &&
14656 all_of(*UserIgnoreList, [](Value *V) {
14657 auto *I = cast<Instruction>(V);
14658 return is_contained({Instruction::Add, Instruction::FAdd,
14659 Instruction::Mul, Instruction::FMul,
14660 Instruction::And, Instruction::Or,
14661 Instruction::Xor},
14662 I->getOpcode());
14663 });
14664 if (IsArithmeticExtendedReduction &&
14665 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
14666 return CommonCost;
14667 return CommonCost +
14668 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
14669 VecOpcode == Opcode ? VI : nullptr);
14670 };
14671 return GetCostDiff(GetScalarCost, GetVectorCost);
14672 }
14673 case Instruction::FCmp:
14674 case Instruction::ICmp:
14675 case Instruction::Select: {
14676 CmpPredicate VecPred, SwappedVecPred;
14677 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
14678 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
14679 match(VL0, MatchCmp))
14680 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
14681 else
14682 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
14683 ? CmpInst::BAD_FCMP_PREDICATE
14684 : CmpInst::BAD_ICMP_PREDICATE;
14685 auto GetScalarCost = [&](unsigned Idx) {
14686 if (isa<PoisonValue>(UniqueValues[Idx]))
14687 return InstructionCost(TTI::TCC_Free);
14688
14689 auto *VI = cast<Instruction>(UniqueValues[Idx]);
14690 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
14691 ? CmpInst::BAD_FCMP_PREDICATE
14692 : CmpInst::BAD_ICMP_PREDICATE;
14693 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
14694 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
14695 !match(VI, MatchCmp)) ||
14696 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
14697 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
14698 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
14699 ? CmpInst::BAD_FCMP_PREDICATE
14700 : CmpInst::BAD_ICMP_PREDICATE;
14701
14702 InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
14703 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
14704 CostKind, getOperandInfo(VI->getOperand(0)),
14705 getOperandInfo(VI->getOperand(1)), VI);
14706 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
14707 if (IntrinsicCost.isValid())
14708 ScalarCost = IntrinsicCost;
14709
14710 return ScalarCost;
14711 };
14712 auto GetVectorCost = [&](InstructionCost CommonCost) {
14713 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
14714
14715 InstructionCost VecCost =
14716 TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
14717 CostKind, getOperandInfo(E->getOperand(0)),
14718 getOperandInfo(E->getOperand(1)), VL0);
14719 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
14720 auto *CondType =
14721 getWidenedType(SI->getCondition()->getType(), VL.size());
14722 unsigned CondNumElements = CondType->getNumElements();
14723 unsigned VecTyNumElements = getNumElements(VecTy);
14724 assert(VecTyNumElements >= CondNumElements &&
14725 VecTyNumElements % CondNumElements == 0 &&
14726 "Cannot vectorize Instruction::Select");
14727 if (CondNumElements != VecTyNumElements) {
14728 // When the return type is i1 but the source is a fixed vector type, we
14729 // need to duplicate the condition value.
14730 VecCost += ::getShuffleCost(
14731 *TTI, TTI::SK_PermuteSingleSrc, CondType,
14732 createReplicatedMask(VecTyNumElements / CondNumElements,
14733 CondNumElements));
14734 }
14735 }
14736 return VecCost + CommonCost;
14737 };
14738 return GetCostDiff(GetScalarCost, GetVectorCost);
14739 }
14740 case TreeEntry::MinMax: {
14741 auto GetScalarCost = [&](unsigned Idx) {
14742 return GetMinMaxCost(OrigScalarTy);
14743 };
14744 auto GetVectorCost = [&](InstructionCost CommonCost) {
14745 InstructionCost VecCost = GetMinMaxCost(VecTy);
14746 return VecCost + CommonCost;
14747 };
14748 return GetCostDiff(GetScalarCost, GetVectorCost);
14749 }
14750 case TreeEntry::FMulAdd: {
14751 auto GetScalarCost = [&](unsigned Idx) {
14752 if (isa<PoisonValue>(UniqueValues[Idx]))
14753 return InstructionCost(TTI::TCC_Free);
14754 return GetFMulAddCost(E->getOperations(),
14755 cast<Instruction>(UniqueValues[Idx]));
14756 };
14757 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
14758 FastMathFlags FMF;
14759 FMF.set();
14760 for (Value *V : E->Scalars) {
14761 if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
14762 FMF &= FPCI->getFastMathFlags();
14763 if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
14764 FMF &= FPCIOp->getFastMathFlags();
14765 }
14766 }
14767 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
14768 {VecTy, VecTy, VecTy}, FMF);
14769 InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
14770 return VecCost + CommonCost;
14771 };
14772 return GetCostDiff(GetScalarCost, GetVectorCost);
14773 }
14774 case Instruction::FNeg:
14775 case Instruction::Add:
14776 case Instruction::FAdd:
14777 case Instruction::Sub:
14778 case Instruction::FSub:
14779 case Instruction::Mul:
14780 case Instruction::FMul:
14781 case Instruction::UDiv:
14782 case Instruction::SDiv:
14783 case Instruction::FDiv:
14784 case Instruction::URem:
14785 case Instruction::SRem:
14786 case Instruction::FRem:
14787 case Instruction::Shl:
14788 case Instruction::LShr:
14789 case Instruction::AShr:
14790 case Instruction::And:
14791 case Instruction::Or:
14792 case Instruction::Xor: {
14793 auto GetScalarCost = [&](unsigned Idx) {
14794 if (isa<PoisonValue>(UniqueValues[Idx]))
14795 return InstructionCost(TTI::TCC_Free);
14796
14797 // We cannot retrieve the operand from UniqueValues[Idx] because an
14798 // interchangeable instruction may be used. The order and the actual
14799 // operand might differ from what is retrieved from UniqueValues[Idx].
14800 Value *Op1 = E->getOperand(0)[Idx];
14801 Value *Op2;
14802 SmallVector<const Value *, 2> Operands(1, Op1);
14803 if (isa<UnaryOperator>(UniqueValues[Idx])) {
14804 Op2 = Op1;
14805 } else {
14806 Op2 = E->getOperand(1)[Idx];
14807 Operands.push_back(Op2);
14808 }
14809 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(Op1);
14810 TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(Op2);
14811 InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
14812 ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
14813 if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
14814 I && (ShuffleOrOp == Instruction::FAdd ||
14815 ShuffleOrOp == Instruction::FSub)) {
14816 InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
14817 if (IntrinsicCost.isValid())
14818 ScalarCost = IntrinsicCost;
14819 }
14820 return ScalarCost;
14821 };
14822 auto GetVectorCost = [=](InstructionCost CommonCost) {
14823 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
14824 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
14825 ArrayRef<Value *> Ops = E->getOperand(I);
14826 if (all_of(Ops, [&](Value *Op) {
14827 auto *CI = dyn_cast<ConstantInt>(Op);
14828 return CI && CI->getValue().countr_one() >= It->second.first;
14829 }))
14830 return CommonCost;
14831 }
14832 }
14833 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
14834 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
14835 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
14836 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
14837 Op2Info, {}, nullptr, TLI) +
14838 CommonCost;
14839 };
14840 return GetCostDiff(GetScalarCost, GetVectorCost);
14841 }
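// Example of the 'and' early-out above (hypothetical constants): if the
// entry was minimized to 8 bits and every right-hand operand is a constant
// such as 255 (at least 8 trailing ones), the demoted 'and' is a no-op, so
// only CommonCost is returned instead of adding a vector 'and' cost.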
14842 case Instruction::GetElementPtr: {
14843 return CommonCost + GetGEPCostDiff(VL, VL0);
14844 }
14845 case Instruction::Load: {
14846 auto GetScalarCost = [&](unsigned Idx) {
14847 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
14848 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
14849 VI->getAlign(), VI->getPointerAddressSpace(),
14850 CostKind, TTI::OperandValueInfo(), VI);
14851 };
14852 auto *LI0 = cast<LoadInst>(VL0);
14853 auto GetVectorCost = [&](InstructionCost CommonCost) {
14854 InstructionCost VecLdCost;
14855 switch (E->State) {
14856 case TreeEntry::Vectorize:
14857 if (unsigned Factor = E->getInterleaveFactor()) {
14858 VecLdCost = TTI->getInterleavedMemoryOpCost(
14859 Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
14860 LI0->getPointerAddressSpace(), CostKind);
14861
14862 } else {
14863 VecLdCost = TTI->getMemoryOpCost(
14864 Instruction::Load, VecTy, LI0->getAlign(),
14865 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
14866 }
14867 break;
14868 case TreeEntry::StridedVectorize: {
14869 Align CommonAlignment =
14870 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
14871 VecLdCost = TTI->getStridedMemoryOpCost(
14872 Instruction::Load, VecTy, LI0->getPointerOperand(),
14873 /*VariableMask=*/false, CommonAlignment, CostKind);
14874 break;
14875 }
14876 case TreeEntry::CompressVectorize: {
14877 bool IsMasked;
14878 unsigned InterleaveFactor;
14879 SmallVector<int> CompressMask;
14880 VectorType *LoadVecTy;
14881 SmallVector<Value *> Scalars(VL);
14882 if (!E->ReorderIndices.empty()) {
14883 SmallVector<int> Mask(E->ReorderIndices.begin(),
14884 E->ReorderIndices.end());
14885 reorderScalars(Scalars, Mask);
14886 }
14887 SmallVector<Value *> PointerOps(Scalars.size());
14888 for (auto [I, V] : enumerate(Scalars))
14889 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
14890 [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
14891 Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
14892 *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
14893 CompressMask, LoadVecTy);
14894 assert(IsVectorized && "Failed to vectorize load");
14895 CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
14896 InterleaveFactor, IsMasked);
14897 Align CommonAlignment = LI0->getAlign();
14898 if (InterleaveFactor) {
14899 VecLdCost = TTI->getInterleavedMemoryOpCost(
14900 Instruction::Load, LoadVecTy, InterleaveFactor, {},
14901 CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
14902 } else if (IsMasked) {
14903 VecLdCost = TTI->getMaskedMemoryOpCost(
14904 Instruction::Load, LoadVecTy, CommonAlignment,
14905 LI0->getPointerAddressSpace(), CostKind);
14906 // TODO: include this cost into CommonCost.
14907 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
14908 LoadVecTy, CompressMask, CostKind);
14909 } else {
14910 VecLdCost = TTI->getMemoryOpCost(
14911 Instruction::Load, LoadVecTy, CommonAlignment,
14912 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
14913 // TODO: include this cost into CommonCost.
14914 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
14915 LoadVecTy, CompressMask, CostKind);
14916 }
14917 break;
14918 }
14919 case TreeEntry::ScatterVectorize: {
14920 Align CommonAlignment =
14921 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
14922 VecLdCost = TTI->getGatherScatterOpCost(
14923 Instruction::Load, VecTy, LI0->getPointerOperand(),
14924 /*VariableMask=*/false, CommonAlignment, CostKind);
14925 break;
14926 }
14927 case TreeEntry::CombinedVectorize:
14928 case TreeEntry::SplitVectorize:
14929 case TreeEntry::NeedToGather:
14930 llvm_unreachable("Unexpected vectorization state.");
14931 }
14932 return VecLdCost + CommonCost;
14933 };
14934
14935 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
14936 // If this node generates a masked gather load, then it is not a terminal
14937 // node; hence the address operand cost is estimated separately.
14938 if (E->State == TreeEntry::ScatterVectorize)
14939 return Cost;
14940
14941 // Estimate cost of GEPs since this tree node is a terminator.
14942 SmallVector<Value *> PointerOps(VL.size());
14943 for (auto [I, V] : enumerate(VL))
14944 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
14945 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
14946 }
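// Hypothetical comparison of the load paths above: a consecutive <4 x i32>
// load is costed as a single vector load (TreeEntry::Vectorize), a strided
// access via getStridedMemoryOpCost, and a gather of arbitrary pointers via
// getGatherScatterOpCost; for ScatterVectorize the GEP cost is not added
// here because the address vector is an operand of the node and is costed
// separately.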
14947 case Instruction::Store: {
14948 bool IsReorder = !E->ReorderIndices.empty();
14949 auto GetScalarCost = [=](unsigned Idx) {
14950 auto *VI = cast<StoreInst>(VL[Idx]);
14951 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
14952 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
14953 VI->getAlign(), VI->getPointerAddressSpace(),
14954 CostKind, OpInfo, VI);
14955 };
14956 auto *BaseSI =
14957 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
14958 auto GetVectorCost = [=](InstructionCost CommonCost) {
14959 // We know that we can merge the stores. Calculate the cost.
14960 InstructionCost VecStCost;
14961 if (E->State == TreeEntry::StridedVectorize) {
14962 Align CommonAlignment =
14963 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
14964 VecStCost = TTI->getStridedMemoryOpCost(
14965 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
14966 /*VariableMask=*/false, CommonAlignment, CostKind);
14967 } else {
14968 assert(E->State == TreeEntry::Vectorize &&
14969 "Expected either strided or consecutive stores.");
14970 if (unsigned Factor = E->getInterleaveFactor()) {
14971 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
14972 "No reused shuffles expected");
14973 CommonCost = 0;
14974 VecStCost = TTI->getInterleavedMemoryOpCost(
14975 Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
14976 BaseSI->getPointerAddressSpace(), CostKind);
14977 } else {
14978 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
14979 VecStCost = TTI->getMemoryOpCost(
14980 Instruction::Store, VecTy, BaseSI->getAlign(),
14981 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
14982 }
14983 }
14984 return VecStCost + CommonCost;
14985 };
14986 SmallVector<Value *> PointerOps(VL.size());
14987 for (auto [I, V] : enumerate(VL)) {
14988 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
14989 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
14990 }
14991
14992 return GetCostDiff(GetScalarCost, GetVectorCost) +
14993 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
14994 }
14995 case Instruction::Call: {
14996 auto GetScalarCost = [&](unsigned Idx) {
14997 auto *CI = cast<CallInst>(UniqueValues[Idx]);
14998 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
14999 if (ID != Intrinsic::not_intrinsic) {
15000 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
15001 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
15002 }
15003 return TTI->getCallInstrCost(CI->getCalledFunction(),
15004 CI->getFunctionType()->getReturnType(),
15005 CI->getFunctionType()->params(), CostKind);
15006 };
15007 auto GetVectorCost = [=](InstructionCost CommonCost) {
15008 auto *CI = cast<CallInst>(VL0);
15009 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
15010 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
15011 CI, ID, VecTy->getNumElements(),
15012 It != MinBWs.end() ? It->second.first : 0, TTI);
15013 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
15014 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
15015 };
15016 return GetCostDiff(GetScalarCost, GetVectorCost);
15017 }
15018 case Instruction::ShuffleVector: {
15019 if (!SLPReVec || E->isAltShuffle())
15020 assert(E->isAltShuffle() &&
15021 ((Instruction::isBinaryOp(E->getOpcode()) &&
15022 Instruction::isBinaryOp(E->getAltOpcode())) ||
15023 (Instruction::isCast(E->getOpcode()) &&
15024 Instruction::isCast(E->getAltOpcode())) ||
15025 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
15026 "Invalid Shuffle Vector Operand");
15027 // Try to find the previous shuffle node with the same operands and same
15028 // main/alternate ops.
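// Illustrative (hypothetical) diamond: one alternate node for
// {add, sub, add, sub} and a second one for {sub, add, sub, add} built from
// the same operand vectors. The second node can reuse the already costed
// vector add/sub and only needs a different shuffle, so no extra arithmetic
// cost is added below.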
15029 auto TryFindNodeWithEqualOperands = [=]() {
15030 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
15031 if (TE.get() == E)
15032 break;
15033 if (TE->hasState() && TE->isAltShuffle() &&
15034 ((TE->getOpcode() == E->getOpcode() &&
15035 TE->getAltOpcode() == E->getAltOpcode()) ||
15036 (TE->getOpcode() == E->getAltOpcode() &&
15037 TE->getAltOpcode() == E->getOpcode())) &&
15038 TE->hasEqualOperands(*E))
15039 return true;
15040 }
15041 return false;
15042 };
15043 auto GetScalarCost = [&](unsigned Idx) {
15044 if (isa<PoisonValue>(UniqueValues[Idx]))
15045 return InstructionCost(TTI::TCC_Free);
15046
15047 auto *VI = cast<Instruction>(UniqueValues[Idx]);
15048 assert(E->getMatchingMainOpOrAltOp(VI) &&
15049 "Unexpected main/alternate opcode");
15050 (void)E;
15051 return TTI->getInstructionCost(VI, CostKind);
15052 };
15053 // Need to clear CommonCost since the final shuffle cost is included into
15054 // vector cost.
15055 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
15056 // VecCost is equal to sum of the cost of creating 2 vectors
15057 // and the cost of creating shuffle.
15058 InstructionCost VecCost = 0;
15059 if (TryFindNodeWithEqualOperands()) {
15060 LLVM_DEBUG({
15061 dbgs() << "SLP: diamond match for alternate node found.\n";
15062 E->dump();
15063 });
15064 // No need to add new vector costs here since we're going to reuse
15065 // the same main/alternate vector ops, just with different shuffling.
15066 } else if (Instruction::isBinaryOp(E->getOpcode())) {
15067 VecCost =
15068 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
15069 VecCost +=
15070 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
15071 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
15072 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
15073 VecCost = TTIRef.getCmpSelInstrCost(
15074 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
15075 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15076 VL0);
15077 VecCost += TTIRef.getCmpSelInstrCost(
15078 E->getOpcode(), VecTy, MaskTy,
15079 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
15080 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15081 E->getAltOp());
15082 } else {
15083 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
15084 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
15085 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
15086 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15087 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
15088 unsigned SrcBWSz =
15089 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
15090 if (SrcIt != MinBWs.end()) {
15091 SrcBWSz = SrcIt->second.first;
15092 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
15093 SrcTy = getWidenedType(SrcSclTy, VL.size());
15094 }
15095 if (BWSz <= SrcBWSz) {
15096 if (BWSz < SrcBWSz)
15097 VecCost =
15098 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
15099 TTI::CastContextHint::None, CostKind);
15100 LLVM_DEBUG({
15101 dbgs()
15102 << "SLP: alternate extension, which should be truncated.\n";
15103 E->dump();
15104 });
15105 return VecCost;
15106 }
15107 }
15108 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
15109 TTI::CastContextHint::None, CostKind);
15110 VecCost +=
15111 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
15112 TTI::CastContextHint::None, CostKind);
15113 }
15114 SmallVector<int> Mask;
15115 E->buildAltOpShuffleMask(
15116 [&](Instruction *I) {
15117 assert(E->getMatchingMainOpOrAltOp(I) &&
15118 "Unexpected main/alternate opcode");
15119 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
15120 *TLI);
15121 },
15122 Mask);
15123 VecCost += ::getShuffleCost(TTIRef, TTI::SK_PermuteTwoSrc,
15124 FinalVecTy, Mask, CostKind);
15125 // Patterns like [fadd,fsub] can be combined into a single instruction
15126 // on x86. Reordering them into [fsub,fadd] blocks this pattern. So we
15127 // need to take their order into account when looking for the most used
15128 // order.
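// For instance, on x86 an alternating fadd/fsub lane pattern can typically be
// lowered to a single addsub-style instruction (e.g. vaddsubps), while the
// opposite interleaving cannot; which interleaving is legal is target
// specific, hence the isLegalAltInstr query below (illustrative example).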
15129 unsigned Opcode0 = E->getOpcode();
15130 unsigned Opcode1 = E->getAltOpcode();
15131 SmallBitVector OpcodeMask(
15132 getAltInstrMask(E->Scalars, ScalarTy, Opcode0, Opcode1));
15133 // If this pattern is supported by the target then we consider the
15134 // order.
15135 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
15136 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
15137 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
15138 return AltVecCost < VecCost ? AltVecCost : VecCost;
15139 }
15140 // TODO: Check the reverse order too.
15141 return VecCost;
15142 };
15143 if (SLPReVec && !E->isAltShuffle())
15144 return GetCostDiff(
15145 GetScalarCost, [&](InstructionCost) -> InstructionCost {
15146 // If a group uses mask in order, the shufflevector can be
15147 // eliminated by instcombine. Then the cost is 0.
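// Illustrative (hypothetical) case: VL = { shufflevector <8 x i32> %v, poison,
// <0,1,2,3>, shufflevector <8 x i32> %v, poison, <4,5,6,7> } extracts
// consecutive subvectors in order, so the rebuilt wide vector equals %v and
// the shuffles are expected to fold away for free.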
15149 "Not supported shufflevector usage.");
15150 auto *SV = cast<ShuffleVectorInst>(VL.front());
15151 unsigned SVNumElements =
15152 cast<FixedVectorType>(SV->getOperand(0)->getType())
15153 ->getNumElements();
15154 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
15155 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
15156 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
15157 int NextIndex = 0;
15158 if (!all_of(Group, [&](Value *V) {
15160 "Not supported shufflevector usage.");
15161 auto *SV = cast<ShuffleVectorInst>(V);
15162 int Index;
15163 [[maybe_unused]] bool IsExtractSubvectorMask =
15164 SV->isExtractSubvectorMask(Index);
15165 assert(IsExtractSubvectorMask &&
15166 "Not supported shufflevector usage.");
15167 if (NextIndex != Index)
15168 return false;
15169 NextIndex += SV->getShuffleMask().size();
15170 return true;
15171 }))
15172 return ::getShuffleCost(
15173 *TTI, TTI::SK_PermuteSingleSrc, VecTy,
15174 calculateShufflevectorMask(E->Scalars));
15175 }
15176 return TTI::TCC_Free;
15177 });
15178 return GetCostDiff(GetScalarCost, GetVectorCost);
15179 }
15180 case Instruction::Freeze:
15181 return CommonCost;
15182 default:
15183 llvm_unreachable("Unknown instruction");
15184 }
15185}
15186
15187bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
15188 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
15189 << VectorizableTree.size() << " is fully vectorizable.\n");
15190
15191 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
15192 SmallVector<int> Mask;
15193 return TE->isGather() &&
15194 !any_of(TE->Scalars,
15195 [this](Value *V) { return EphValues.contains(V); }) &&
15196 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
15197 TE->Scalars.size() < Limit ||
15198 (((TE->hasState() &&
15199 TE->getOpcode() == Instruction::ExtractElement) ||
15201 isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
15202 (TE->hasState() && TE->getOpcode() == Instruction::Load &&
15203 !TE->isAltShuffle()) ||
15204 any_of(TE->Scalars, IsaPred<LoadInst>));
15205 };
15206
15207 // We only handle trees of heights 1 and 2.
15208 if (VectorizableTree.size() == 1 &&
15209 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
15210 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
15211 VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
15212 (ForReduction &&
15213 AreVectorizableGathers(VectorizableTree[0].get(),
15214 VectorizableTree[0]->Scalars.size()) &&
15215 VectorizableTree[0]->getVectorFactor() > 2)))
15216 return true;
15217
15218 if (VectorizableTree.size() != 2)
15219 return false;
15220
15221 // Handle splat and all-constant stores. Also try to vectorize tiny trees
15222 // with a second gather node if it has fewer scalar operands than the
15223 // initial tree element (it may be profitable to shuffle the second gather)
15224 // or if its scalars are extractelements, which form a shuffle.
15225 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
15226 AreVectorizableGathers(VectorizableTree[1].get(),
15227 VectorizableTree[0]->Scalars.size()))
15228 return true;
15229
15230 // Gathering cost would be too much for tiny trees.
15231 if (VectorizableTree[0]->isGather() ||
15232 (VectorizableTree[1]->isGather() &&
15233 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
15234 VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
15235 VectorizableTree[0]->State != TreeEntry::CompressVectorize))
15236 return false;
15237
15238 return true;
15239}
15240
15241static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
15242 TargetTransformInfo *TTI,
15243 bool MustMatchOrInst) {
15244 // Look past the root to find a source value. Arbitrarily follow the
15245 // path through operand 0 of any 'or'. Also, peek through optional
15246 // shift-left-by-multiple-of-8-bits.
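// Illustrative (hypothetical) pattern this walk is meant to accept:
//   %b1 = load i8, ptr %p1
//   %z1 = zext i8 %b1 to i32
//   %s1 = shl i32 %z1, 8        ; shift amount is a multiple of 8
//   %or = or i32 %s1, %z0
// Starting at %or, operand 0 leads through the shl to %z1, a zext of a load,
// so FoundOr is set and the candidate is accepted (given a legal total width).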
15247 Value *ZextLoad = Root;
15248 const APInt *ShAmtC;
15249 bool FoundOr = false;
15250 while (!isa<ConstantExpr>(ZextLoad) &&
15251 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
15252 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
15253 ShAmtC->urem(8) == 0))) {
15254 auto *BinOp = cast<BinaryOperator>(ZextLoad);
15255 ZextLoad = BinOp->getOperand(0);
15256 if (BinOp->getOpcode() == Instruction::Or)
15257 FoundOr = true;
15258 }
15259 // Check if the input is an extended load of the required or/shift expression.
15260 Value *Load;
15261 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
15262 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
15263 return false;
15264
15265 // Require that the total load bit width is a legal integer type.
15266 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
15267 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
15268 Type *SrcTy = Load->getType();
15269 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
15270 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
15271 return false;
15272
15273 // Everything matched - assume that we can fold the whole sequence using
15274 // load combining.
15275 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
15276 << *(cast<Instruction>(Root)) << "\n");
15277
15278 return true;
15279}
15280
15281 bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
15282 if (RdxKind != RecurKind::Or)
15283 return false;
15284
15285 unsigned NumElts = VectorizableTree[0]->Scalars.size();
15286 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
15287 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
15288 /* MatchOr */ false);
15289}
15290
15291 bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
15292 // Peek through a final sequence of stores and check if all operations are
15293 // likely to be load-combined.
15294 unsigned NumElts = Stores.size();
15295 for (Value *Scalar : Stores) {
15296 Value *X;
15297 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
15298 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
15299 return false;
15300 }
15301 return true;
15302}
15303
15304bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
15305 if (!DebugCounter::shouldExecute(VectorizedGraphs))
15306 return true;
15307
15308 // Graph is empty - do nothing.
15309 if (VectorizableTree.empty()) {
15310 assert(ExternalUses.empty() && "We shouldn't have any external users");
15311
15312 return true;
15313 }
15314
15315 // No need to vectorize inserts of gathered values.
15316 if (VectorizableTree.size() == 2 &&
15317 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
15318 VectorizableTree[1]->isGather() &&
15319 (VectorizableTree[1]->getVectorFactor() <= 2 ||
15320 !(isSplat(VectorizableTree[1]->Scalars) ||
15321 allConstant(VectorizableTree[1]->Scalars))))
15322 return true;
15323
15324 // If the graph includes only PHI nodes and gathers, it is definitely not
15325 // profitable for vectorization and we can skip it, provided the cost
15326 // threshold is the default. The cost of vectorized PHI nodes is almost
15327 // always 0 plus the cost of gathers/buildvectors.
15328 constexpr int Limit = 4;
15329 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15330 !VectorizableTree.empty() &&
15331 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15332 return (TE->isGather() &&
15333 (!TE->hasState() ||
15334 TE->getOpcode() != Instruction::ExtractElement) &&
15335 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
15336 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
15337 }))
15338 return true;
15339
15340 // Do not vectorize a small tree of phis only, if all vector phis are also
15341 // gathered.
15342 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15343 VectorizableTree.size() <= Limit &&
15344 all_of(VectorizableTree,
15345 [&](const std::unique_ptr<TreeEntry> &TE) {
15346 return (TE->isGather() &&
15347 (!TE->hasState() ||
15348 TE->getOpcode() != Instruction::ExtractElement) &&
15349 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <=
15350 Limit) ||
15351 (TE->hasState() &&
15352 (TE->getOpcode() == Instruction::InsertElement ||
15353 (TE->getOpcode() == Instruction::PHI &&
15354 all_of(TE->Scalars, [&](Value *V) {
15355 return isa<PoisonValue>(V) || MustGather.contains(V);
15356 }))));
15357 }) &&
15358 any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15359 return TE->State == TreeEntry::Vectorize &&
15360 TE->getOpcode() == Instruction::PHI;
15361 }))
15362 return true;
15363
15364 // If the tree contains only phis, buildvectors, split nodes and
15365 // small nodes with reuses, we can skip it.
15366 SmallVector<const TreeEntry *> StoreLoadNodes;
15367 unsigned NumGathers = 0;
15368 constexpr int LimitTreeSize = 36;
15369 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15370 all_of(VectorizableTree,
15371 [&](const std::unique_ptr<TreeEntry> &TE) {
15372 if (!TE->isGather() && TE->hasState() &&
15373 (TE->getOpcode() == Instruction::Load ||
15374 TE->getOpcode() == Instruction::Store)) {
15375 StoreLoadNodes.push_back(TE.get());
15376 return true;
15377 }
15378 if (TE->isGather())
15379 ++NumGathers;
15380 return TE->State == TreeEntry::SplitVectorize ||
15381 (TE->Idx == 0 && TE->Scalars.size() == 2 &&
15382 TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
15383 VectorizableTree.size() > LimitTreeSize) ||
15384 (TE->isGather() &&
15385 none_of(TE->Scalars, IsaPred<ExtractElementInst>)) ||
15386 (TE->hasState() &&
15387 (TE->getOpcode() == Instruction::PHI ||
15388 (TE->hasCopyableElements() &&
15389 static_cast<unsigned>(count_if(
15390 TE->Scalars, IsaPred<PHINode, Constant>)) >=
15391 TE->Scalars.size() / 2) ||
15392 ((!TE->ReuseShuffleIndices.empty() ||
15393 !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
15394 TE->Scalars.size() == 2)));
15395 }) &&
15396 (StoreLoadNodes.empty() ||
15397 (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
15398 (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
15399 return TE->getOpcode() == Instruction::Store ||
15400 all_of(TE->Scalars, [&](Value *V) {
15401 return !isa<LoadInst>(V) ||
15402 areAllUsersVectorized(cast<Instruction>(V));
15403 });
15404 })))))
15405 return true;
15406
15407 // If the tree contains only buildvector, 2 non-buildvectors (with root user
15408 // tree node) and other buildvectors, we can skip it.
15409 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15410 VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
15411 VectorizableTree.size() >= Limit &&
15412 count_if(ArrayRef(VectorizableTree).drop_front(),
15413 [&](const std::unique_ptr<TreeEntry> &TE) {
15414 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
15415 TE->UserTreeIndex.UserTE->Idx == 0;
15416 }) == 2)
15417 return true;
15418
15419 // If the tree contains only vectorization of the phi node from the
15420 // buildvector - skip it.
15421 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15422 VectorizableTree.size() > 2 &&
15423 VectorizableTree.front()->State == TreeEntry::Vectorize &&
15424 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
15425 VectorizableTree[1]->State == TreeEntry::Vectorize &&
15426 VectorizableTree[1]->getOpcode() == Instruction::PHI &&
15427 all_of(
15428 ArrayRef(VectorizableTree).drop_front(2),
15429 [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
15430 return true;
15431
15432 // We can vectorize the tree if its size is greater than or equal to the
15433 // minimum size specified by the MinTreeSize command line option.
15434 if (VectorizableTree.size() >= MinTreeSize)
15435 return false;
15436
15437 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
15438 // can vectorize it if we can prove it fully vectorizable.
15439 if (isFullyVectorizableTinyTree(ForReduction))
15440 return false;
15441
15442 // Check if any of the gather nodes forms an insertelement buildvector
15443 // somewhere.
15444 bool IsAllowedSingleBVNode =
15445 VectorizableTree.size() > 1 ||
15446 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
15447 !VectorizableTree.front()->isAltShuffle() &&
15448 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
15449 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
15450 allSameBlock(VectorizableTree.front()->Scalars));
15451 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15452 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
15453 return isa<ExtractElementInst, Constant>(V) ||
15454 (IsAllowedSingleBVNode &&
15455 !V->hasNUsesOrMore(UsesLimit) &&
15456 any_of(V->users(), IsaPred<InsertElementInst>));
15457 });
15458 }))
15459 return false;
15460
15461 if (VectorizableTree.back()->isGather() &&
15462 VectorizableTree.back()->hasState() &&
15463 VectorizableTree.back()->isAltShuffle() &&
15464 VectorizableTree.back()->getVectorFactor() > 2 &&
15465 allSameBlock(VectorizableTree.back()->Scalars) &&
15466 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
15467 TTI->getScalarizationOverhead(
15468 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
15469 VectorizableTree.back()->getVectorFactor()),
15470 APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
15471 /*Insert=*/true, /*Extract=*/false,
15473 return false;
15474
15475 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
15476 // vectorizable.
15477 return true;
15478}
15479
15480 bool BoUpSLP::isTreeNotExtendable() const {
15481 if (getCanonicalGraphSize() != getTreeSize()) {
15482 constexpr unsigned SmallTree = 3;
15483 if (VectorizableTree.front()->isNonPowOf2Vec() &&
15484 getCanonicalGraphSize() <= SmallTree &&
15485 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
15486 [](const std::unique_ptr<TreeEntry> &TE) {
15487 return TE->isGather() && TE->hasState() &&
15488 TE->getOpcode() == Instruction::Load &&
15489 !allSameBlock(TE->Scalars);
15490 }) == 1)
15491 return true;
15492 return false;
15493 }
15494 bool Res = false;
15495 for (unsigned Idx : seq<unsigned>(getTreeSize())) {
15496 TreeEntry &E = *VectorizableTree[Idx];
15497 if (E.State == TreeEntry::SplitVectorize)
15498 return false;
15499 if (!E.isGather())
15500 continue;
15501 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
15502 (!E.hasState() &&
15504 (isa<ExtractElementInst>(E.Scalars.front()) &&
15505 getSameOpcode(ArrayRef(E.Scalars).drop_front(), *TLI).valid()))
15506 return false;
15507 if (isSplat(E.Scalars) || allConstant(E.Scalars))
15508 continue;
15509 Res = true;
15510 }
15511 return Res;
15512}
15513
15514 InstructionCost BoUpSLP::getSpillCost() {
15515 // Walk from the bottom of the tree to the top, tracking which values are
15516 // live. When we see a call instruction that is not part of our tree,
15517 // query TTI to see if there is a cost to keeping values live over it
15518 // (for example, if spills and fills are required).
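// Illustrative (hypothetical) scenario: a vectorized value that is defined
// before a call which is not part of the tree, e.g.
//   %v = ... vectorized bundle ...
//   call void @foo()
//   ... vectorized users of %v ...
// may have to be spilled and reloaded around the call; that is the cost
// getCostOfKeepingLiveOverCall is queried for below.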
15519
15520 const TreeEntry *Root = VectorizableTree.front().get();
15521 if (Root->isGather())
15522 return 0;
15523
15524 InstructionCost Cost = 0;
15525 SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
15526 EntriesToOperands;
15527 SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
15528 SmallPtrSet<const Instruction *, 8> LastInstructions;
15529 for (const auto &TEPtr : VectorizableTree) {
15530 if (!TEPtr->isGather()) {
15531 Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
15532 EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
15533 LastInstructions.insert(LastInst);
15534 }
15535 if (TEPtr->UserTreeIndex)
15536 EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
15537 }
15538
15539 auto NoCallIntrinsic = [this](const Instruction *I) {
15540 const auto *II = dyn_cast<IntrinsicInst>(I);
15541 if (!II)
15542 return false;
15543 if (II->isAssumeLikeIntrinsic())
15544 return true;
15545 IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
15546 InstructionCost IntrCost =
15547 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
15548 InstructionCost CallCost = TTI->getCallInstrCost(
15549 nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
15550 return IntrCost < CallCost;
15551 };
15552
15553 // Maps the last instruction of an entry to the last instruction of one of
15554 // its operand entries, plus a flag. If the flag is true, there are no calls
15555 // in between these instructions.
15556 SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
15557 CheckedInstructions;
15558 unsigned Budget = 0;
15559 const unsigned BudgetLimit =
15560 ScheduleRegionSizeBudget / VectorizableTree.size();
15561 auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
15562 const Instruction *Last) {
15563 assert(First->getParent() == Last->getParent() &&
15564 "Expected instructions in same block.");
15565 if (auto It = CheckedInstructions.find(Last);
15566 It != CheckedInstructions.end()) {
15567 const Instruction *Checked = It->second.getPointer();
15568 if (Checked == First || Checked->comesBefore(First))
15569 return It->second.getInt() != 0;
15570 Last = Checked;
15571 } else if (Last == First || Last->comesBefore(First)) {
15572 return true;
15573 }
15574 BasicBlock::const_reverse_iterator InstIt =
15575 ++First->getIterator().getReverse(),
15576 PrevInstIt =
15577 Last->getIterator().getReverse();
15578 SmallVector<const Instruction *> LastInstsInRange;
15579 while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
15580 // Debug information does not impact spill cost.
15581 // Vectorized calls, represented as vector intrinsics, do not impact spill
15582 // cost.
15583 if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
15584 CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
15585 for (const Instruction *LastInst : LastInstsInRange)
15586 CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
15587 return false;
15588 }
15589 if (LastInstructions.contains(&*PrevInstIt))
15590 LastInstsInRange.push_back(&*PrevInstIt);
15591
15592 ++PrevInstIt;
15593 ++Budget;
15594 }
15595 for (const Instruction *LastInst : LastInstsInRange)
15596 CheckedInstructions.try_emplace(
15597 LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
15598 Budget <= BudgetLimit ? 1 : 0);
15599 return Budget <= BudgetLimit;
15600 };
15601 auto AddCosts = [&](const TreeEntry *Op) {
15602 Type *ScalarTy = Op->Scalars.front()->getType();
15603 auto It = MinBWs.find(Op);
15604 if (It != MinBWs.end())
15605 ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
15606 auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
15607 Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
15608 if (ScalarTy->isVectorTy()) {
15609 // Handle revec dead vector instructions.
15610 Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
15611 }
15612 };
15613 // Memoize the relationship between blocks, i.e. whether there is (at least
15614 // one) non-vectorized call between them. This allows us to skip analyzing
15615 // the same block paths multiple times.
15616 SmallDenseMap<std::pair<BasicBlock *, BasicBlock *>, bool>
15617 ParentOpParentToPreds;
15618 auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
15619 BasicBlock *OpParent) {
15620 auto Key = std::make_pair(Root, OpParent);
15621 if (auto It = ParentOpParentToPreds.find(Key);
15622 It != ParentOpParentToPreds.end())
15623 return It->second;
15624 SmallVector<BasicBlock *> Worklist;
15625 if (Pred)
15626 Worklist.push_back(Pred);
15627 else
15628 Worklist.append(pred_begin(Root), pred_end(Root));
15629 SmallPtrSet<const BasicBlock *, 16> Visited;
15630 SmallDenseSet<std::pair<BasicBlock *, BasicBlock *>>
15631 ParentsPairsToAdd;
15632 bool Res = false;
15633 auto Cleanup = make_scope_exit([&]() {
15634 for (const auto &KeyPair : ParentsPairsToAdd) {
15635 assert(!ParentOpParentToPreds.contains(KeyPair) &&
15636 "Should not have been added before.");
15637 ParentOpParentToPreds.try_emplace(KeyPair, Res);
15638 }
15639 });
15640 while (!Worklist.empty()) {
15641 BasicBlock *BB = Worklist.pop_back_val();
15642 if (BB == OpParent || !Visited.insert(BB).second)
15643 continue;
15644 auto Pair = std::make_pair(BB, OpParent);
15645 if (auto It = ParentOpParentToPreds.find(Pair);
15646 It != ParentOpParentToPreds.end()) {
15647 Res = It->second;
15648 return Res;
15649 }
15650 ParentsPairsToAdd.insert(Pair);
15651 unsigned BlockSize = BB->size();
15652 if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
15653 return Res;
15654 Budget += BlockSize;
15655 if (Budget > BudgetLimit)
15656 return Res;
15657 if (!isa<CatchSwitchInst>(BB->getTerminator()) &&
15658 !CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
15659 BB->getTerminator()))
15660 return Res;
15661 Worklist.append(pred_begin(BB), pred_end(BB));
15662 }
15663 Res = true;
15664 return Res;
15665 };
15666 SmallVector<const TreeEntry *> LiveEntries(1, Root);
15667 while (!LiveEntries.empty()) {
15668 const TreeEntry *Entry = LiveEntries.pop_back_val();
15669 SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
15670 if (Operands.empty())
15671 continue;
15672 Instruction *LastInst = EntriesToLastInstruction.at(Entry);
15673 BasicBlock *Parent = LastInst->getParent();
15674 for (const TreeEntry *Op : Operands) {
15675 if (!Op->isGather())
15676 LiveEntries.push_back(Op);
15677 if (Entry->State == TreeEntry::SplitVectorize ||
15678 (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
15679 (Op->isGather() && allConstant(Op->Scalars)))
15680 continue;
15681 Budget = 0;
15682 BasicBlock *Pred = nullptr;
15683 if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
15684 Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
15685 BasicBlock *OpParent;
15686 Instruction *OpLastInst;
15687 if (Op->isGather()) {
15688 assert(Entry->getOpcode() == Instruction::PHI &&
15689 "Expected phi node only.");
15690 OpParent = cast<PHINode>(Entry->getMainOp())
15691 ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
15692 OpLastInst = OpParent->getTerminator();
15693 for (Value *V : Op->Scalars) {
15694 auto *Inst = dyn_cast<Instruction>(V);
15695 if (!Inst)
15696 continue;
15697 if (isVectorized(V)) {
15698 OpParent = Inst->getParent();
15699 OpLastInst = Inst;
15700 break;
15701 }
15702 }
15703 } else {
15704 OpLastInst = EntriesToLastInstruction.at(Op);
15705 OpParent = OpLastInst->getParent();
15706 }
15707 // Check the call instructions within the same basic blocks.
15708 if (OpParent == Parent) {
15709 if (Entry->getOpcode() == Instruction::PHI) {
15710 if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
15711 AddCosts(Op);
15712 continue;
15713 }
15714 if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
15715 AddCosts(Op);
15716 continue;
15717 }
15718 // Check for call instruction in between blocks.
15719 // 1. Check entry's block to the head.
15720 if (Entry->getOpcode() != Instruction::PHI &&
15721 !CheckForNonVecCallsInSameBlock(
15722 &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
15723 LastInst)) {
15724 AddCosts(Op);
15725 continue;
15726 }
15727 // 2. Check op's block from the end.
15728 if (!CheckForNonVecCallsInSameBlock(OpLastInst,
15729 OpParent->getTerminator())) {
15730 AddCosts(Op);
15731 continue;
15732 }
15733 // 3. Check the predecessors of entry's block till op's block.
15734 if (!CheckPredecessors(Parent, Pred, OpParent)) {
15735 AddCosts(Op);
15736 continue;
15737 }
15738 }
15739 }
15740
15741 return Cost;
15742}
15743
15744 /// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
15745 /// the buildvector sequence.
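/// Illustrative (hypothetical) chain:
///   %i0 = insertelement <4 x float> poison, float %a, i32 0
///   %i1 = insertelement <4 x float> %i0, float %b, i32 1
/// Here isFirstInsertElement(%i0, %i1) is expected to return true, because
/// %i0 appears earlier in the chain that feeds %i1.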
15746 static bool isFirstInsertElement(const InsertElementInst *IE1,
15747 const InsertElementInst *IE2) {
15748 if (IE1 == IE2)
15749 return false;
15750 const auto *I1 = IE1;
15751 const auto *I2 = IE2;
15752 const InsertElementInst *PrevI1;
15753 const InsertElementInst *PrevI2;
15754 unsigned Idx1 = *getElementIndex(IE1);
15755 unsigned Idx2 = *getElementIndex(IE2);
15756 do {
15757 if (I2 == IE1)
15758 return true;
15759 if (I1 == IE2)
15760 return false;
15761 PrevI1 = I1;
15762 PrevI2 = I2;
15763 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
15764 getElementIndex(I1).value_or(Idx2) != Idx2)
15765 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
15766 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
15767 getElementIndex(I2).value_or(Idx1) != Idx1)
15768 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
15769 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
15770 llvm_unreachable("Two different buildvectors not expected.");
15771}
15772
15773namespace {
15774 /// Returns the incoming Value * if the requested type is Value * too, or a
15775 /// default-constructed value otherwise.
15776struct ValueSelect {
15777 template <typename U>
15778 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
15779 return V;
15780 }
15781 template <typename U>
15782 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
15783 return U();
15784 }
15785};
15786} // namespace
15787
15788 /// Does the analysis of the provided shuffle masks and performs the requested
15789 /// actions on the vectors with the given shuffle masks. It tries to do it in
15790 /// several steps.
15791 /// 1. If the Base vector is not an undef vector, resize the very first mask to
15792 /// have a common VF and perform the action for 2 input vectors (including the
15793 /// non-undef Base). Other shuffle masks are combined with the result of the
15794 /// first stage and processed as a shuffle of 2 elements.
15795 /// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
15796 /// the action only for 1 vector with the given mask, if it is not the identity
15797 /// mask.
15798 /// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
15799 /// vectors, combining the masks properly between the steps.
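/// Illustrative (hypothetical) example for step 3: with masks
/// M0 = <0, 1, poison, poison> over V0 and M1 = <poison, poison, 2, 3> over V1
/// (both of VF 4), the combined two-source mask becomes <0, 1, 6, 7> and
/// Action is invoked once with {V0, V1} and that mask.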
15800template <typename T>
15802 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
15803 function_ref<unsigned(T *)> GetVF,
15804 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
15806 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
15807 SmallVector<int> Mask(ShuffleMask.begin()->second);
15808 auto VMIt = std::next(ShuffleMask.begin());
15809 T *Prev = nullptr;
15810 SmallBitVector UseMask =
15811 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
15812 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
15813 if (!IsBaseUndef.all()) {
15814 // Base is not undef, need to combine it with the next subvectors.
15815 std::pair<T *, bool> Res =
15816 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
15817 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
15818 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
15819 if (Mask[Idx] == PoisonMaskElem)
15820 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
15821 else
15822 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
15823 }
15824 [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
15825 assert((!V || GetVF(V) == Mask.size()) &&
15826 "Expected base vector of VF number of elements.");
15827 Prev = Action(Mask, {nullptr, Res.first});
15828 } else if (ShuffleMask.size() == 1) {
15829 // Base is undef and only 1 vector is shuffled - perform the action only for
15830 // a single vector, if the mask is not the identity mask.
15831 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
15832 /*ForSingleMask=*/true);
15833 if (Res.second)
15834 // Identity mask is found.
15835 Prev = Res.first;
15836 else
15837 Prev = Action(Mask, {ShuffleMask.begin()->first});
15838 } else {
15839 // Base is undef and at least 2 input vectors are shuffled - perform 2-vector
15840 // shuffles step by step, combining the shuffles between the steps.
15841 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
15842 unsigned Vec2VF = GetVF(VMIt->first);
15843 if (Vec1VF == Vec2VF) {
15844 // No need to resize the input vectors since they are of the same size; we
15845 // can shuffle them directly.
15846 ArrayRef<int> SecMask = VMIt->second;
15847 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
15848 if (SecMask[I] != PoisonMaskElem) {
15849 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
15850 Mask[I] = SecMask[I] + Vec1VF;
15851 }
15852 }
15853 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
15854 } else {
15855 // Vectors of different sizes - resize and reshuffle.
15856 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
15857 /*ForSingleMask=*/false);
15858 std::pair<T *, bool> Res2 =
15859 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
15860 ArrayRef<int> SecMask = VMIt->second;
15861 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
15862 if (Mask[I] != PoisonMaskElem) {
15863 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
15864 if (Res1.second)
15865 Mask[I] = I;
15866 } else if (SecMask[I] != PoisonMaskElem) {
15867 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
15868 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
15869 }
15870 }
15871 Prev = Action(Mask, {Res1.first, Res2.first});
15872 }
15873 VMIt = std::next(VMIt);
15874 }
15875 [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
15876 // Perform requested actions for the remaining masks/vectors.
15877 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
15878 // Shuffle other input vectors, if any.
15879 std::pair<T *, bool> Res =
15880 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
15881 ArrayRef<int> SecMask = VMIt->second;
15882 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
15883 if (SecMask[I] != PoisonMaskElem) {
15884 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
15885 "Multiple uses of scalars.");
15886 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
15887 } else if (Mask[I] != PoisonMaskElem) {
15888 Mask[I] = I;
15889 }
15890 }
15891 Prev = Action(Mask, {Prev, Res.first});
15892 }
15893 return Prev;
15894}
15895
15896namespace {
15897/// Data type for handling buildvector sequences with the reused scalars from
15898/// other tree entries.
15899template <typename T> struct ShuffledInsertData {
15900 /// List of insertelements to be replaced by shuffles.
15901 SmallVector<InsertElementInst *> InsertElements;
15902 /// The parent vectors and shuffle mask for the given list of inserts.
15903 MapVector<T, SmallVector<int>> ValueMasks;
15904};
15905} // namespace
15906
15907 InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
15908 InstructionCost ReductionCost) {
15909 InstructionCost Cost = ReductionCost;
15910 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
15911 << VectorizableTree.size() << ".\n");
15912
15913 SmallPtrSet<Value *, 4> CheckedExtracts;
15914 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
15915 TreeEntry &TE = *VectorizableTree[I];
15916 // No need to count the cost for combined entries; they are combined and
15917 // their cost is just skipped.
15918 if (TE.State == TreeEntry::CombinedVectorize) {
15919 LLVM_DEBUG(
15920 dbgs() << "SLP: Skipping cost for combined node that starts with "
15921 << *TE.Scalars[0] << ".\n";
15922 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
15923 continue;
15924 }
15925 if (TE.hasState() &&
15926 (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
15927 if (const TreeEntry *E =
15928 getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
15929 E && E->getVectorFactor() == TE.getVectorFactor()) {
15930 // Some gather nodes might be absolutely the same as some vectorizable
15931 // nodes after reordering, need to handle it.
15932 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
15933 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
15934 << "SLP: Current total cost = " << Cost << "\n");
15935 continue;
15936 }
15937 }
15938
15939 // Exclude the cost of gather load nodes which are not used. These nodes were
15940 // built as part of the final attempt to vectorize gathered loads.
15941 assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
15942 "Expected gather nodes with users only.");
15943
15944 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
15945 Cost += C;
15946 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
15947 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
15948 << "SLP: Current total cost = " << Cost << "\n");
15949 }
15950
15951 if (Cost >= -SLPCostThreshold &&
15952 none_of(ExternalUses, [](const ExternalUser &EU) {
15953 return isa_and_nonnull<InsertElementInst>(EU.User);
15954 }))
15955 return Cost;
15956
15957 SmallPtrSet<Value *, 16> ExtractCostCalculated;
15958 InstructionCost ExtractCost = 0;
15959 SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
15960 SmallVector<APInt> DemandedElts;
15961 SmallDenseSet<Value *, 4> UsedInserts;
15962 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
15963 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
15964 DenseMap<const TreeEntry *, SmallPtrSet<Instruction *, 4>> ExtractsCount;
15965 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
15966 // Keep track of each {Scalar, Index, User} tuple.
15967 // On AArch64, this helps fuse the mov instruction associated with an
15968 // extractelement with an fmul in the backend, so that the extractelement is free.
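// Illustrative (hypothetical) AArch64 case:
//   %e = extractelement <2 x double> %v, i32 1
//   %m = fmul double %e, %x
// can typically use an indexed FMUL that reads the lane directly, so the
// extract is modeled as free when its user and index are known.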
15969 SmallVector<std::tuple<Value *, User *, int>> ScalarUserAndIdx;
15970 for (ExternalUser &EU : ExternalUses) {
15971 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
15972 }
15973 SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
15974 for (ExternalUser &EU : ExternalUses) {
15975 LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
15976 << EU.E.Idx << " in lane " << EU.Lane << "\n");
15977 LLVM_DEBUG(if (EU.User) dbgs() << " User:" << *EU.User << "\n";
15978 else dbgs() << " User: nullptr\n");
15979 LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
15980
15981 // Uses by ephemeral values are free (because the ephemeral value will be
15982 // removed prior to code generation, and so the extraction will be
15983 // removed as well).
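// Illustrative (hypothetical) ephemeral user: if EU.User is
//   %c = icmp ult i32 %x, 16
// and %c only feeds a call to @llvm.assume, %c is removed before codegen,
// so no extract needs to be costed for this use.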
15984 if (EphValues.count(EU.User))
15985 continue;
15986
15987 // Check if the scalar for the given user or all users is accounted already.
15988 if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
15989 (EU.User &&
15990 CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
15991 continue;
15992
15993 // Skip uses in unreachable blocks or in EH pads (rarely executed) or in
15994 // blocks terminated with an unreachable instruction.
15995 if (BasicBlock *UserParent =
15996 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
15997 UserParent &&
15998 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
15999 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
16000 continue;
16001
16002 // We only add extract cost once for the same scalar.
16003 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
16004 !ExtractCostCalculated.insert(EU.Scalar).second)
16005 continue;
16006
16007 // No extract cost for vector "scalar" if REVEC is disabled
16008 if (!SLPReVec && isa<FixedVectorType>(EU.Scalar->getType()))
16009 continue;
16010
16011 // If the found user is an insertelement, do not calculate the extract cost
16012 // but try to detect it as a final shuffled/identity match.
16013 // TODO: what if a user is insertvalue when REVEC is enabled?
16014 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
16015 VU && VU->getOperand(1) == EU.Scalar) {
16016 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
16017 if (!UsedInserts.insert(VU).second)
16018 continue;
16019 std::optional<unsigned> InsertIdx = getElementIndex(VU);
16020 if (InsertIdx) {
16021 const TreeEntry *ScalarTE = &EU.E;
16022 auto *It = find_if(
16023 ShuffledInserts,
16024 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
16025 // Checks if 2 insertelements are from the same buildvector.
16026 InsertElementInst *VecInsert = Data.InsertElements.front();
16027 return areTwoInsertFromSameBuildVector(
16028 VU, VecInsert, [this](InsertElementInst *II) -> Value * {
16029 Value *Op0 = II->getOperand(0);
16030 if (isVectorized(II) && !isVectorized(Op0))
16031 return nullptr;
16032 return Op0;
16033 });
16034 });
16035 int VecId = -1;
16036 if (It == ShuffledInserts.end()) {
16037 auto &Data = ShuffledInserts.emplace_back();
16038 Data.InsertElements.emplace_back(VU);
16039 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
16040 VecId = ShuffledInserts.size() - 1;
16041 auto It = MinBWs.find(ScalarTE);
16042 if (It != MinBWs.end() &&
16043 VectorCasts
16044 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
16045 .second) {
16046 unsigned BWSz = It->second.first;
16047 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
16048 unsigned VecOpcode;
16049 if (DstBWSz < BWSz)
16050 VecOpcode = Instruction::Trunc;
16051 else
16052 VecOpcode =
16053 It->second.second ? Instruction::SExt : Instruction::ZExt;
16055 InstructionCost C = TTI->getCastInstrCost(
16056 VecOpcode, FTy,
16057 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
16058 FTy->getNumElements()),
16060 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16061 << " for extending externally used vector with "
16062 "non-equal minimum bitwidth.\n");
16063 Cost += C;
16064 }
16065 } else {
16066 if (isFirstInsertElement(VU, It->InsertElements.front()))
16067 It->InsertElements.front() = VU;
16068 VecId = std::distance(ShuffledInserts.begin(), It);
16069 }
16070 int InIdx = *InsertIdx;
16071 SmallVectorImpl<int> &Mask =
16072 ShuffledInserts[VecId].ValueMasks[ScalarTE];
16073 if (Mask.empty())
16074 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
16075 Mask[InIdx] = EU.Lane;
16076 DemandedElts[VecId].setBit(InIdx);
16077 continue;
16078 }
16079 }
16080 }
16081
16083 // If we plan to rewrite the tree in a smaller type, we will need to sign
16084 // extend the extracted value back to the original type. Here, we account
16085 // for the extract and the added cost of the sign extend if needed.
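// Illustrative (hypothetical) example: if the tree was narrowed to i8 while
// the external user expects i32, this use is costed roughly as
//   %e = extractelement <4 x i8> %vec, i32 %lane
//   %x = sext i8 %e to i32
// via getExtractWithExtendCost (zext is used when the value is known to be
// non-negative).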
16086 InstructionCost ExtraCost = TTI::TCC_Free;
16087 auto *ScalarTy = EU.Scalar->getType();
16088 const unsigned BundleWidth = EU.E.getVectorFactor();
16089 assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
16090 auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
16091 const TreeEntry *Entry = &EU.E;
16092 auto It = MinBWs.find(Entry);
16093 if (It != MinBWs.end()) {
16094 Type *MinTy = IntegerType::get(F->getContext(), It->second.first);
16095 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy))
16096 MinTy = getWidenedType(MinTy, VecTy->getNumElements());
16097 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
16098 ? Instruction::ZExt
16099 : Instruction::SExt;
16100 VecTy = getWidenedType(MinTy, BundleWidth);
16101 ExtraCost =
16102 getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
16103 LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
16104 << ExtraCost << "\n");
16105 } else {
16106 ExtraCost =
16107 getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy,
16108 CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
16109 LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
16110 << *VecTy << ": " << ExtraCost << "\n");
16111 }
16112 // Leave the scalar instructions as is if they are cheaper than extracts.
16113 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
16114 Entry->getOpcode() == Instruction::Load) {
16115 // Checks if the user of the external scalar is phi in loop body.
16116 auto IsPhiInLoop = [&](const ExternalUser &U) {
16117 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
16118 auto *I = cast<Instruction>(U.Scalar);
16119 const Loop *L = LI->getLoopFor(Phi->getParent());
16120 return L && (Phi->getParent() == I->getParent() ||
16121 L == LI->getLoopFor(I->getParent()));
16122 }
16123 return false;
16124 };
16125 if (!ValueToExtUses) {
16126 ValueToExtUses.emplace();
16127 for (const auto &P : enumerate(ExternalUses)) {
16128 // Ignore phis in loops.
16129 if (IsPhiInLoop(P.value()))
16130 continue;
16131
16132 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
16133 }
16134 }
16135 // Can use the original instruction if no operands are vectorized or they
16136 // are already marked as externally used.
16137 auto *Inst = cast<Instruction>(EU.Scalar);
16138 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
16139 auto OperandIsScalar = [&](Value *V) {
16140 if (!isVectorized(V)) {
16141 // Some extractelements might not be vectorized, but are
16142 // transformed into a shuffle and removed from the function;
16143 // consider that here.
16144 if (auto *EE = dyn_cast<ExtractElementInst>(V))
16145 return !EE->hasOneUse() || !MustGather.contains(EE);
16146 return true;
16147 }
16148 return ValueToExtUses->contains(V);
16149 };
16150 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
16151 bool CanBeUsedAsScalarCast = false;
16152 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
16153 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
16154 Op && all_of(Op->operands(), OperandIsScalar)) {
16155 InstructionCost OpCost =
16156 (isVectorized(Op) && !ValueToExtUses->contains(Op))
16157 ? TTI->getInstructionCost(Op, CostKind)
16158 : 0;
16159 if (ScalarCost + OpCost <= ExtraCost) {
16160 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
16161 ScalarCost += OpCost;
16162 }
16163 }
16164 }
16165 if (CanBeUsedAsScalar) {
16166 bool KeepScalar = ScalarCost <= ExtraCost;
16167 // Try to keep the original scalar if the user is a phi node from the same
16168 // block as the root phis currently being vectorized. This keeps better
16169 // ordering info for the PHIs being vectorized.
16170 bool IsProfitablePHIUser =
16171 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
16172 VectorizableTree.front()->Scalars.size() > 2)) &&
16173 VectorizableTree.front()->hasState() &&
16174 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
16175 !Inst->hasNUsesOrMore(UsesLimit) &&
16176 none_of(Inst->users(),
16177 [&](User *U) {
16178 auto *PHIUser = dyn_cast<PHINode>(U);
16179 return (!PHIUser ||
16180 PHIUser->getParent() !=
16181 cast<Instruction>(
16182 VectorizableTree.front()->getMainOp())
16183 ->getParent()) &&
16184 !isVectorized(U);
16185 }) &&
16186 count_if(Entry->Scalars, [&](Value *V) {
16187 return ValueToExtUses->contains(V);
16188 }) <= 2;
16189 if (IsProfitablePHIUser) {
16190 KeepScalar = true;
16191 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
16192 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
16193 (!GatheredLoadsEntriesFirst.has_value() ||
16194 Entry->Idx < *GatheredLoadsEntriesFirst)) {
16195 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
16196 return ValueToExtUses->contains(V);
16197 });
16198 auto It = ExtractsCount.find(Entry);
16199 if (It != ExtractsCount.end()) {
16200 assert(ScalarUsesCount >= It->getSecond().size() &&
16201 "Expected total number of external uses not less than "
16202 "number of scalar uses.");
16203 ScalarUsesCount -= It->getSecond().size();
16204 }
16205 // Keep the original scalar if the number of externally used
16206 // instructions in the same entry is not a power of 2. It may help to
16207 // do some extra vectorization for now.
16208 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
16209 }
16210 if (KeepScalar) {
16211 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
16212 for (Value *V : Inst->operands()) {
16213 auto It = ValueToExtUses->find(V);
16214 if (It != ValueToExtUses->end()) {
16215 // Replace all uses to avoid compiler crash.
16216 ExternalUses[It->second].User = nullptr;
16217 }
16218 }
16219 ExtraCost = ScalarCost;
16220 if (!IsPhiInLoop(EU))
16221 ExtractsCount[Entry].insert(Inst);
16222 if (CanBeUsedAsScalarCast) {
16223 ScalarOpsFromCasts.insert(Inst->getOperand(0));
16224 // Update the users of the operands of the cast operand to avoid
16225 // compiler crash.
16226 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
16227 for (Value *V : IOp->operands()) {
16228 auto It = ValueToExtUses->find(V);
16229 if (It != ValueToExtUses->end()) {
16230 // Replace all uses to avoid compiler crash.
16231 ExternalUses[It->second].User = nullptr;
16232 }
16233 }
16234 }
16235 }
16236 }
16237 }
16238 }
16239
16240 ExtractCost += ExtraCost;
16241 }
16242 // Insert external uses for the operands of casts that will be emitted as
16243 // scalars instead of extractelement instructions.
16244 for (Value *V : ScalarOpsFromCasts) {
16245 ExternalUsesAsOriginalScalar.insert(V);
16246 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
16247 ExternalUses.emplace_back(V, nullptr, *TEs.front(),
16248 TEs.front()->findLaneForValue(V));
16249 }
16250 }
16251 // Add reduced value cost, if resized.
16252 if (!VectorizedVals.empty()) {
16253 const TreeEntry &Root = *VectorizableTree.front();
16254 auto BWIt = MinBWs.find(&Root);
16255 if (BWIt != MinBWs.end()) {
16256 Type *DstTy = Root.Scalars.front()->getType();
16257 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
16258 unsigned SrcSz =
16259 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
16260 if (OriginalSz != SrcSz) {
16261 unsigned Opcode = Instruction::Trunc;
16262 if (OriginalSz > SrcSz)
16263 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
16264 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
16265 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
16266 assert(SLPReVec && "Only supported by REVEC.");
16267 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
16268 }
16269 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
16270 TTI::CastContextHint::None,
16271 TTI::TCK_RecipThroughput);
16272 }
16273 }
16274 }
16275
16276 Cost += ExtractCost;
16277 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
16278 bool ForSingleMask) {
16279 InstructionCost C = 0;
16280 unsigned VF = Mask.size();
16281 unsigned VecVF = TE->getVectorFactor();
16282 bool HasLargeIndex =
16283 any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
16284 if ((VF != VecVF && HasLargeIndex) ||
16286
16287 if (HasLargeIndex) {
16288 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
16289 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
16290 OrigMask.begin());
16292 getWidenedType(TE->getMainOp()->getType(), VecVF),
16293 OrigMask);
16294 LLVM_DEBUG(
16295 dbgs() << "SLP: Adding cost " << C
16296 << " for final shuffle of insertelement external users.\n";
16297 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16298 Cost += C;
16299 return std::make_pair(TE, true);
16300 }
16301
16302 if (!ForSingleMask) {
16303 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
16304 for (unsigned I = 0; I < VF; ++I) {
16305 if (Mask[I] != PoisonMaskElem)
16306 ResizeMask[Mask[I]] = Mask[I];
16307 }
16308 if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF))
16311 getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask);
16312 LLVM_DEBUG(
16313 dbgs() << "SLP: Adding cost " << C
16314 << " for final shuffle of insertelement external users.\n";
16315 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16316
16317 Cost += C;
16318 }
16319 }
16320 return std::make_pair(TE, false);
16321 };
16322 // Calculate the cost of the reshuffled vectors, if any.
16323 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
16324 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
16325 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
16326 unsigned VF = 0;
16327 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
16329 assert((TEs.size() == 1 || TEs.size() == 2) &&
16330 "Expected exactly 1 or 2 tree entries.");
16331 if (TEs.size() == 1) {
16332 if (VF == 0)
16333 VF = TEs.front()->getVectorFactor();
16334 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
16335 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
16336 !all_of(enumerate(Mask), [=](const auto &Data) {
16337 return Data.value() == PoisonMaskElem ||
16338 (Data.index() < VF &&
16339 static_cast<int>(Data.index()) == Data.value());
16340 })) {
16341 InstructionCost C =
16342 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
16343 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16344 << " for final shuffle of insertelement "
16345 "external users.\n";
16346 TEs.front()->dump();
16347 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16348 Cost += C;
16349 }
16350 } else {
16351 if (VF == 0) {
16352 if (TEs.front() &&
16353 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
16354 VF = TEs.front()->getVectorFactor();
16355 else
16356 VF = Mask.size();
16357 }
16358 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
16359 InstructionCost C =
16360 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
16361 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16362 << " for final shuffle of vector node and external "
16363 "insertelement users.\n";
16364 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
16365 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16366 Cost += C;
16367 }
16368 VF = Mask.size();
16369 return TEs.back();
16370 };
16371 performExtractsShuffleAction<const TreeEntry>(
16372 MutableArrayRef(Vector.data(), Vector.size()), Base,
16373 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
16374 EstimateShufflesCost);
16375 InstructionCost InsertCost = TTI->getScalarizationOverhead(
16376 cast<FixedVectorType>(
16377 ShuffledInserts[I].InsertElements.front()->getType()),
16378 DemandedElts[I],
16379 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
16380 Cost -= InsertCost;
16381 }
16382
16383 // Add the cost for reduced value resize (if required).
16384 if (ReductionBitWidth != 0) {
16385 assert(UserIgnoreList && "Expected reduction tree.");
16386 const TreeEntry &E = *VectorizableTree.front();
16387 auto It = MinBWs.find(&E);
16388 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
16389 unsigned SrcSize = It->second.first;
16390 unsigned DstSize = ReductionBitWidth;
16391 unsigned Opcode = Instruction::Trunc;
16392 if (SrcSize < DstSize) {
16393 bool IsArithmeticExtendedReduction =
16394 all_of(*UserIgnoreList, [](Value *V) {
16395 auto *I = cast<Instruction>(V);
16396 return is_contained({Instruction::Add, Instruction::FAdd,
16397 Instruction::Mul, Instruction::FMul,
16398 Instruction::And, Instruction::Or,
16399 Instruction::Xor},
16400 I->getOpcode());
16401 });
16402 if (IsArithmeticExtendedReduction)
16403 Opcode =
16404 Instruction::BitCast; // Handle it by getExtendedReductionCost
16405 else
16406 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
16407 }
16408 if (Opcode != Instruction::BitCast) {
16409 auto *SrcVecTy =
16410 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
16411 auto *DstVecTy =
16412 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
16413 TTI::CastContextHint CCH = getCastContextHint(E);
16414 InstructionCost CastCost;
16415 switch (E.getOpcode()) {
16416 case Instruction::SExt:
16417 case Instruction::ZExt:
16418 case Instruction::Trunc: {
16419 const TreeEntry *OpTE = getOperandEntry(&E, 0);
16420 CCH = getCastContextHint(*OpTE);
16421 break;
16422 }
16423 default:
16424 break;
16425 }
16426 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
16427 TTI::TCK_RecipThroughput);
16428 Cost += CastCost;
16429 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
16430 << " for final resize for reduction from " << SrcVecTy
16431 << " to " << DstVecTy << "\n";
16432 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16433 }
16434 }
16435 }
16436
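// Note that the relatively expensive spill-cost estimate below is computed
// only once the accumulated cost is already below -SLPCostThreshold, i.e.
// only for trees that would otherwise be reported as profitable.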
16437 std::optional<InstructionCost> SpillCost;
16438 if (Cost < -SLPCostThreshold) {
16439 SpillCost = getSpillCost();
16440 Cost += *SpillCost;
16441 }
16442#ifndef NDEBUG
16443 SmallString<256> Str;
16444 {
16445 raw_svector_ostream OS(Str);
16446 OS << "SLP: Spill Cost = ";
16447 if (SpillCost)
16448 OS << *SpillCost;
16449 else
16450 OS << "<skipped>";
16451 OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
16452 << "SLP: Total Cost = " << Cost << ".\n";
16453 }
16454 LLVM_DEBUG(dbgs() << Str);
16455 if (ViewSLPTree)
16456 ViewGraph(this, "SLP" + F->getName(), false, Str);
16457#endif
16458
16459 return Cost;
16460}
16461
16462/// Tries to find extractelement instructions with constant indices from fixed
16463 /// vector type and gather such instructions into a bunch, which very likely
16464 /// can be detected as a shuffle of 1 or 2 input vectors. If this attempt was
16465/// successful, the matched scalars are replaced by poison values in \p VL for
16466/// future analysis.
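/// For example (illustrative IR, not taken from a particular test), a gather
/// of
/// \code
/// %e0 = extractelement <4 x i32> %v, i32 1
/// %e1 = extractelement <4 x i32> %v, i32 0
/// \endcode
/// can be modeled as a single-source shuffle of %v with mask <1, 0>, so both
/// scalars are replaced by poison in \p VL and the shuffle kind is reported
/// to the caller.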
16467std::optional<TTI::ShuffleKind>
16468 BoUpSLP::tryToGatherSingleRegisterExtractElements(
16469 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
16470 // Scan list of gathered scalars for extractelements that can be represented
16471 // as shuffles.
16472 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
16473 SmallVector<int> UndefVectorExtracts;
16474 for (int I = 0, E = VL.size(); I < E; ++I) {
16475 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
16476 if (!EI) {
16477 if (isa<UndefValue>(VL[I]))
16478 UndefVectorExtracts.push_back(I);
16479 continue;
16480 }
16481 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
16482 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
16483 continue;
16484 std::optional<unsigned> Idx = getExtractIndex(EI);
16485 // Undefined index.
16486 if (!Idx) {
16487 UndefVectorExtracts.push_back(I);
16488 continue;
16489 }
16490 if (Idx >= VecTy->getNumElements()) {
16491 UndefVectorExtracts.push_back(I);
16492 continue;
16493 }
16494 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
16495 ExtractMask.reset(*Idx);
16496 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
16497 UndefVectorExtracts.push_back(I);
16498 continue;
16499 }
16500 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
16501 }
16502 // Sort the vector operands by the maximum number of uses in extractelements.
16503 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
16504 VectorOpToIdx.takeVector();
16505 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
16506 return P1.second.size() > P2.second.size();
16507 });
16508 // Find the best pair of the vectors or a single vector.
16509 const int UndefSz = UndefVectorExtracts.size();
16510 unsigned SingleMax = 0;
16511 unsigned PairMax = 0;
16512 if (!Vectors.empty()) {
16513 SingleMax = Vectors.front().second.size() + UndefSz;
16514 if (Vectors.size() > 1) {
16515 auto *ItNext = std::next(Vectors.begin());
16516 PairMax = SingleMax + ItNext->second.size();
16517 }
16518 }
16519 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
16520 return std::nullopt;
16521 // Check if better to perform a shuffle of 2 vectors or just of a single
16522 // vector.
16523 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
16524 SmallVector<Value *> GatheredExtracts(
16525 VL.size(), PoisonValue::get(VL.front()->getType()));
16526 if (SingleMax >= PairMax && SingleMax) {
16527 for (int Idx : Vectors.front().second)
16528 std::swap(GatheredExtracts[Idx], VL[Idx]);
16529 } else if (!Vectors.empty()) {
16530 for (unsigned Idx : {0, 1})
16531 for (int Idx : Vectors[Idx].second)
16532 std::swap(GatheredExtracts[Idx], VL[Idx]);
16533 }
16534 // Add extracts from undefs too.
16535 for (int Idx : UndefVectorExtracts)
16536 std::swap(GatheredExtracts[Idx], VL[Idx]);
16537 // Check that the gather of extractelements can be represented as just a
16538 // shuffle of one or two vectors from which the scalars are extracted.
16539 std::optional<TTI::ShuffleKind> Res =
16540 isFixedVectorShuffle(GatheredExtracts, Mask, AC);
16541 if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
16542 // TODO: try to check other subsets if possible.
16543 // Restore the original VL if attempt was not successful.
16544 copy(SavedVL, VL.begin());
16545 return std::nullopt;
16546 }
16547 // Restore unused scalars from mask, if some of the extractelements were not
16548 // selected for shuffle.
16549 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
16550 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
16551 isa<UndefValue>(GatheredExtracts[I])) {
16552 std::swap(VL[I], GatheredExtracts[I]);
16553 continue;
16554 }
16555 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
16556 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
16557 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
16558 is_contained(UndefVectorExtracts, I))
16559 continue;
16560 }
16561 return Res;
16562}
16563
16564/// Tries to find extractelement instructions with constant indices from fixed
16565 /// vector type and gather such instructions into a bunch, which very likely
16566 /// can be detected as a shuffle of 1 or 2 input vectors. If this attempt was
16567/// successful, the matched scalars are replaced by poison values in \p VL for
16568/// future analysis.
16569 SmallVector<std::optional<TTI::ShuffleKind>>
16570 BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
16571 SmallVectorImpl<int> &Mask,
16572 unsigned NumParts) const {
16573 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
16574 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
16575 Mask.assign(VL.size(), PoisonMaskElem);
16576 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
16577 for (unsigned Part : seq<unsigned>(NumParts)) {
16578 // Scan list of gathered scalars for extractelements that can be represented
16579 // as shuffles.
16580 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
16581 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
16582 SmallVector<int> SubMask;
16583 std::optional<TTI::ShuffleKind> Res =
16584 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
16585 ShufflesRes[Part] = Res;
16586 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
16587 }
16588 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
16589 return Res.has_value();
16590 }))
16591 ShufflesRes.clear();
16592 return ShufflesRes;
16593}
16594
16595std::optional<TargetTransformInfo::ShuffleKind>
16596BoUpSLP::isGatherShuffledSingleRegisterEntry(
16597 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
16598 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
16599 Entries.clear();
16600 // TODO: currently checking only for Scalars in the tree entry, need to count
16601 // reused elements too for better cost estimation.
16602 auto GetUserEntry = [&](const TreeEntry *TE) {
16603 while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
16604 TE = TE->UserTreeIndex.UserTE;
16605 if (TE == VectorizableTree.front().get())
16606 return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
16607 return TE->UserTreeIndex;
16608 };
16609 auto HasGatherUser = [&](const TreeEntry *TE) {
16610 while (TE->Idx != 0 && TE->UserTreeIndex) {
16611 if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
16612 return true;
16613 TE = TE->UserTreeIndex.UserTE;
16614 }
16615 return false;
16616 };
16617 const EdgeInfo TEUseEI = GetUserEntry(TE);
16618 if (!TEUseEI)
16619 return std::nullopt;
16620 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
16621 const BasicBlock *TEInsertBlock = nullptr;
16622 // Main node of PHI entries keeps the correct order of operands/incoming
16623 // blocks.
16624 if (auto *PHI = dyn_cast_or_null<PHINode>(
16625 TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
16626 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
16627 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
16628 TEInsertPt = TEInsertBlock->getTerminator();
16629 } else {
16630 TEInsertBlock = TEInsertPt->getParent();
16631 }
16632 if (!DT->isReachableFromEntry(TEInsertBlock))
16633 return std::nullopt;
16634 auto *NodeUI = DT->getNode(TEInsertBlock);
16635 assert(NodeUI && "Should only process reachable instructions");
16636 SmallPtrSet<Value *, 4> GatheredScalars(llvm::from_range, VL);
16637 auto CheckOrdering = [&](const Instruction *InsertPt) {
16638 // Argument InsertPt is an instruction where vector code for some other
16639 // tree entry (one that shares one or more scalars with TE) is going to be
16640 // generated. This lambda returns true if insertion point of vector code
16641 // for the TE dominates that point (otherwise dependency is the other way
16642 // around). The other node is not limited to be of a gather kind. Gather
16643 // nodes are not scheduled and their vector code is inserted before their
16644 // first user. If user is PHI, that is supposed to be at the end of a
16645 // predecessor block. Otherwise it is the last instruction among scalars of
16646 // the user node. So, instead of checking dependency between instructions
16647 // themselves, we check dependency between their insertion points for vector
16648 // code (since each scalar instruction ends up as a lane of a vector
16649 // instruction).
16650 const BasicBlock *InsertBlock = InsertPt->getParent();
16651 auto *NodeEUI = DT->getNode(InsertBlock);
16652 if (!NodeEUI)
16653 return false;
16654 assert((NodeUI == NodeEUI) ==
16655 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
16656 "Different nodes should have different DFS numbers");
16657 // Check the order of the gather nodes users.
16658 if (TEInsertPt->getParent() != InsertBlock &&
16659 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
16660 return false;
16661 if (TEInsertPt->getParent() == InsertBlock &&
16662 TEInsertPt->comesBefore(InsertPt))
16663 return false;
16664 return true;
16665 };
16666 // Find all tree entries used by the gathered values. If no common entries
16667 // found - not a shuffle.
16668 // Here we build a set of tree nodes for each gathered value and try to
16669 // find the intersection between these sets. If we have at least one common
16670 // tree node for each gathered value, we have just a permutation of a
16671 // single vector. If we have 2 different sets, we are in a situation where we
16672 // have a permutation of 2 input vectors.
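// For illustration (hypothetical scalars): if VL = {a, b, c, d}, where a and
// b also appear in tree entry E1 and c and d appear in tree entry E2, the
// per-value sets are {E1}, {E1}, {E2}, {E2}; they collapse into two distinct
// sets, so the gather can be modeled as a two-source permutation of the
// vectors built for E1 and E2.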
16673 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
16674 SmallDenseMap<Value *, int> UsedValuesEntry;
16675 SmallPtrSet<const Value *, 16> VisitedValue;
16676 auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
16677 // The node is reused - exit.
16678 if ((TEPtr->getVectorFactor() != VL.size() &&
16679 TEPtr->Scalars.size() != VL.size()) ||
16680 (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
16681 return false;
16682 UsedTEs.clear();
16683 UsedTEs.emplace_back().insert(TEPtr);
16684 for (Value *V : VL) {
16685 if (isConstant(V))
16686 continue;
16687 UsedValuesEntry.try_emplace(V, 0);
16688 }
16689 return true;
16690 };
16691 auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
16692 unsigned EdgeIdx) {
16693 const TreeEntry *Ptr1 = User1;
16694 const TreeEntry *Ptr2 = User2;
16695 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
16696 while (Ptr2) {
16697 PtrToIdx.try_emplace(Ptr2, EdgeIdx);
16698 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
16699 Ptr2 = Ptr2->UserTreeIndex.UserTE;
16700 }
16701 while (Ptr1) {
16702 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
16703 Ptr1 = Ptr1->UserTreeIndex.UserTE;
16704 if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
16705 return Idx < It->second;
16706 }
16707 return false;
16708 };
16709 for (Value *V : VL) {
16710 if (isConstant(V) || !VisitedValue.insert(V).second)
16711 continue;
16712 // Build a list of tree entries where V is used.
16713 SmallPtrSet<const TreeEntry *, 4> VToTEs;
16714 for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
16715 if (TEPtr == TE || TEPtr->Idx == 0)
16716 continue;
16717 assert(any_of(TEPtr->Scalars,
16718 [&](Value *V) { return GatheredScalars.contains(V); }) &&
16719 "Must contain at least single gathered value.");
16720 assert(TEPtr->UserTreeIndex &&
16721 "Expected only single user of a gather node.");
16722 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
16723
16724 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
16725 UseEI.UserTE->hasState())
16726 ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
16727 : nullptr;
16728 Instruction *InsertPt =
16729 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
16730 : &getLastInstructionInBundle(UseEI.UserTE);
16731 if (TEInsertPt == InsertPt) {
16732 // Check nodes, which might be emitted first.
16733 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
16734 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
16735 TEUseEI.UserTE->isAltShuffle()) &&
16736 all_of(TEUseEI.UserTE->Scalars, isUsedOutsideBlock)) {
16737 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
16738 (UseEI.UserTE->hasState() &&
16739 UseEI.UserTE->getOpcode() == Instruction::PHI &&
16740 !UseEI.UserTE->isAltShuffle()) ||
16741 !all_of(UseEI.UserTE->Scalars, isUsedOutsideBlock))
16742 continue;
16743 }
16744
16745 // If the schedulable insertion point is used in multiple entries - just
16746 // exit, no known ordering at this point, available only after real
16747 // scheduling.
16748 if (!doesNotNeedToBeScheduled(InsertPt) &&
16749 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
16750 continue;
16751 // If the users are the PHI nodes with the same incoming blocks - skip.
16752 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
16753 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
16754 UseEI.UserTE->State == TreeEntry::Vectorize &&
16755 UseEI.UserTE->getOpcode() == Instruction::PHI &&
16756 TEUseEI.UserTE != UseEI.UserTE)
16757 continue;
16758 // If 2 gathers are operands of the same entry (regardless of whether
16759 // the user is a PHI or not), compare operand indices and use the earlier
16760 // one as the base.
16761 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
16762 continue;
16763 // If the user instruction is used for some reason in different
16764 // vectorized nodes - make it depend on index.
16765 if (TEUseEI.UserTE != UseEI.UserTE &&
16766 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
16767 HasGatherUser(TEUseEI.UserTE)))
16768 continue;
16769 // If the user node is the operand of the other user node - skip.
16770 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
16771 continue;
16772 }
16773
16774 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
16775 TEUseEI.UserTE->doesNotNeedToSchedule() !=
16776 UseEI.UserTE->doesNotNeedToSchedule() &&
16777 is_contained(UseEI.UserTE->Scalars, TEInsertPt))
16778 continue;
16779 // Check if the user node of the TE comes after user node of TEPtr,
16780 // otherwise TEPtr depends on TE.
16781 if ((TEInsertBlock != InsertPt->getParent() ||
16782 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
16783 !CheckOrdering(InsertPt))
16784 continue;
16785 // The node is reused - exit.
16786 if (CheckAndUseSameNode(TEPtr))
16787 break;
16788 VToTEs.insert(TEPtr);
16789 }
16790 if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
16791 const auto *It = find_if(
16792 VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
16793 if (It != VTEs.end()) {
16794 const TreeEntry *VTE = *It;
16795 if (none_of(TE->CombinedEntriesWithIndices,
16796 [&](const auto &P) { return P.first == VTE->Idx; })) {
16797 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
16798 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
16799 continue;
16800 }
16801 // The node is reused - exit.
16802 if (CheckAndUseSameNode(VTE))
16803 break;
16804 VToTEs.insert(VTE);
16805 }
16806 }
16807 if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
16808 const TreeEntry *VTE = VTEs.front();
16809 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
16810 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
16811 VTEs = VTEs.drop_front();
16812 // Iterate through all vectorized nodes.
16813 const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
16814 return MTE->State == TreeEntry::Vectorize;
16815 });
16816 if (MIt == VTEs.end())
16817 continue;
16818 VTE = *MIt;
16819 }
16820 if (none_of(TE->CombinedEntriesWithIndices,
16821 [&](const auto &P) { return P.first == VTE->Idx; })) {
16822 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
16823 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
16824 continue;
16825 }
16826 // The node is reused - exit.
16827 if (CheckAndUseSameNode(VTE))
16828 break;
16829 VToTEs.insert(VTE);
16830 }
16831 if (VToTEs.empty())
16832 continue;
16833 if (UsedTEs.empty()) {
16834 // The first iteration, just insert the list of nodes to vector.
16835 UsedTEs.push_back(VToTEs);
16836 UsedValuesEntry.try_emplace(V, 0);
16837 } else {
16838 // Need to check if there are any previously used tree nodes which use V.
16839 // If there are no such nodes, consider that we have one more input
16840 // vector.
16841 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
16842 unsigned Idx = 0;
16843 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
16844 // Do we have a non-empty intersection of previously listed tree entries
16845 // and tree entries using current V?
16846 set_intersect(VToTEs, Set);
16847 if (!VToTEs.empty()) {
16848 // Yes, write the new subset and continue analysis for the next
16849 // scalar.
16850 Set.swap(VToTEs);
16851 break;
16852 }
16853 VToTEs = SavedVToTEs;
16854 ++Idx;
16855 }
16856 // No non-empty intersection found - need to add a second set of possible
16857 // source vectors.
16858 if (Idx == UsedTEs.size()) {
16859 // If the number of input vectors is greater than 2 - not a permutation,
16860 // fallback to the regular gather.
16861 // TODO: support multiple reshuffled nodes.
16862 if (UsedTEs.size() == 2)
16863 continue;
16864 UsedTEs.push_back(SavedVToTEs);
16865 Idx = UsedTEs.size() - 1;
16866 }
16867 UsedValuesEntry.try_emplace(V, Idx);
16868 }
16869 }
16870
16871 if (UsedTEs.empty()) {
16872 Entries.clear();
16873 return std::nullopt;
16874 }
16875
16876 unsigned VF = 0;
16877 if (UsedTEs.size() == 1) {
16878 // Keep the order to avoid non-determinism.
16879 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
16880 UsedTEs.front().end());
16881 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
16882 return TE1->Idx < TE2->Idx;
16883 });
16884 // Try to find a perfect match in another gather node first.
16885 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
16886 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
16887 });
16888 if (It != FirstEntries.end() &&
16889 ((*It)->getVectorFactor() == VL.size() ||
16890 ((*It)->getVectorFactor() == TE->Scalars.size() &&
16891 TE->ReuseShuffleIndices.size() == VL.size() &&
16892 (*It)->isSame(TE->Scalars)))) {
16893 Entries.push_back(*It);
16894 if ((*It)->getVectorFactor() == VL.size()) {
16895 std::iota(std::next(Mask.begin(), Part * VL.size()),
16896 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
16897 } else {
16898 SmallVector<int> CommonMask = TE->getCommonMask();
16899 copy(CommonMask, Mask.begin());
16900 }
16901 // Clear undef scalars.
16902 for (unsigned I : seq<unsigned>(VL.size()))
16903 if (isa<PoisonValue>(VL[I]))
16904 Mask[Part * VL.size() + I] = PoisonMaskElem;
16905 return TargetTransformInfo::SK_PermuteSingleSrc;
16906 }
16907 // No perfect match, just shuffle, so choose the first tree node from the
16908 // tree.
16909 Entries.push_back(FirstEntries.front());
16910 // Update mapping between values and corresponding tree entries.
16911 for (auto &P : UsedValuesEntry)
16912 P.second = 0;
16913 VF = FirstEntries.front()->getVectorFactor();
16914 } else {
16915 // Try to find nodes with the same vector factor.
16916 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
16917 // Keep the order of tree nodes to avoid non-determinism.
16918 DenseMap<int, const TreeEntry *> VFToTE;
16919 for (const TreeEntry *TE : UsedTEs.front()) {
16920 unsigned VF = TE->getVectorFactor();
16921 auto It = VFToTE.find(VF);
16922 if (It != VFToTE.end()) {
16923 if (It->second->Idx > TE->Idx)
16924 It->getSecond() = TE;
16925 continue;
16926 }
16927 VFToTE.try_emplace(VF, TE);
16928 }
16929 // Same, keep the order to avoid non-determinism.
16930 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
16931 UsedTEs.back().end());
16932 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
16933 return TE1->Idx < TE2->Idx;
16934 });
16935 for (const TreeEntry *TE : SecondEntries) {
16936 auto It = VFToTE.find(TE->getVectorFactor());
16937 if (It != VFToTE.end()) {
16938 VF = It->first;
16939 Entries.push_back(It->second);
16940 Entries.push_back(TE);
16941 break;
16942 }
16943 }
16944 // No 2 source vectors with the same vector factor - just choose 2 with max
16945 // index.
16946 if (Entries.empty()) {
16947 Entries.push_back(*llvm::max_element(
16948 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
16949 return TE1->Idx < TE2->Idx;
16950 }));
16951 Entries.push_back(SecondEntries.front());
16952 VF = std::max(Entries.front()->getVectorFactor(),
16953 Entries.back()->getVectorFactor());
16954 } else {
16955 VF = Entries.front()->getVectorFactor();
16956 }
16957 SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries;
16958 for (const TreeEntry *E : Entries)
16959 ValuesToEntries.emplace_back().insert(E->Scalars.begin(),
16960 E->Scalars.end());
16961 // Update mapping between values and corresponding tree entries.
16962 for (auto &P : UsedValuesEntry) {
16963 for (unsigned Idx : seq<unsigned>(ValuesToEntries.size()))
16964 if (ValuesToEntries[Idx].contains(P.first)) {
16965 P.second = Idx;
16966 break;
16967 }
16968 }
16969 }
16970
16971 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
16972 // Checks if the 2 PHIs are compatible in terms of high possibility to be
16973 // vectorized.
16974 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
16975 auto *PHI = cast<PHINode>(V);
16976 auto *PHI1 = cast<PHINode>(V1);
16977 // Check that all incoming values are compatible/from same parent (if they
16978 // are instructions).
16979 // The incoming values are compatible if they all are constants, or
16980 // instruction with the same/alternate opcodes from the same basic block.
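// E.g., two PHIs whose I-th incoming values are both constants, or both
// 'add' instructions defined in the same basic block, are considered
// compatible; an 'add' paired with a 'load', or incoming instructions from
// different blocks, make the PHIs incompatible.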
16981 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
16982 Value *In = PHI->getIncomingValue(I);
16983 Value *In1 = PHI1->getIncomingValue(I);
16984 if (isConstant(In) && isConstant(In1))
16985 continue;
16986 if (!getSameOpcode({In, In1}, *TLI))
16987 return false;
16988 if (cast<Instruction>(In)->getParent() !=
16989 cast<Instruction>(In1)->getParent())
16990 return false;
16991 }
16992 return true;
16993 };
16994 // Check if the value can be ignored during analysis for shuffled gathers.
16995 // We suppose it is better to ignore instructions which do not form splats,
16996 // are not vectorized and are not extractelements (those are handled by the
16997 // extractelement processing), or which may form a vector node in the future.
16998 auto MightBeIgnored = [=](Value *V) {
16999 auto *I = dyn_cast<Instruction>(V);
17000 return I && !IsSplatOrUndefs && !isVectorized(I) &&
17001 !isVectorLikeInstWithConstOps(I) &&
17002 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
17003 };
17004 // Check that the neighbor instruction may form a full vector node with the
17005 // current instruction V. It is possible, if they have same/alternate opcode
17006 // and same parent basic block.
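// E.g., two neighboring 'mul' scalars from the same block that are not yet
// vectorized and are not mapped to the same used tree entry are likely to
// form their own vector node later, so they are better left out of this
// shuffle (see the MightBeIgnored check in the loop below).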
17007 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
17008 Value *V1 = VL[Idx];
17009 bool UsedInSameVTE = false;
17010 auto It = UsedValuesEntry.find(V1);
17011 if (It != UsedValuesEntry.end())
17012 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
17013 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
17014 getSameOpcode({V, V1}, *TLI) &&
17015 cast<Instruction>(V)->getParent() ==
17016 cast<Instruction>(V1)->getParent() &&
17017 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
17018 };
17019 // Build a shuffle mask for better cost estimation and vector emission.
17020 SmallBitVector UsedIdxs(Entries.size());
17021 SmallVector<std::pair<unsigned, int>> EntryLanes;
17022 for (int I = 0, E = VL.size(); I < E; ++I) {
17023 Value *V = VL[I];
17024 auto It = UsedValuesEntry.find(V);
17025 if (It == UsedValuesEntry.end())
17026 continue;
17027 // Do not try to shuffle scalars if they are constants, or instructions
17028 // that can be vectorized as a result of the subsequent buildvector
17029 // vectorization.
17030 if (isConstant(V) || (MightBeIgnored(V) &&
17031 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
17032 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
17033 continue;
17034 unsigned Idx = It->second;
17035 EntryLanes.emplace_back(Idx, I);
17036 UsedIdxs.set(Idx);
17037 }
17038 // Iterate through all shuffled scalars and select entries, which can be used
17039 // for final shuffle.
17040 SmallVector<const TreeEntry *> TempEntries;
17041 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
17042 if (!UsedIdxs.test(I))
17043 continue;
17044 // Fix the entry number for the given scalar. If it is the first entry, set
17045 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
17046 // These indices are used when calculating final shuffle mask as the vector
17047 // offset.
17048 for (std::pair<unsigned, int> &Pair : EntryLanes)
17049 if (Pair.first == I)
17050 Pair.first = TempEntries.size();
17051 TempEntries.push_back(Entries[I]);
17052 }
17053 Entries.swap(TempEntries);
17054 if (EntryLanes.size() == Entries.size() &&
17055 !VL.equals(ArrayRef(TE->Scalars)
17056 .slice(Part * VL.size(),
17057 std::min<int>(VL.size(), TE->Scalars.size())))) {
17058 // We may have here 1 or 2 entries only. If the number of scalars is equal
17059 // to the number of entries, no need to do the analysis, it is not very
17060 // profitable. Since VL is not the same as TE->Scalars, it means we already
17061 // have some shuffles before. Cut off the non-profitable case.
17062 Entries.clear();
17063 return std::nullopt;
17064 }
17065 // Build the final mask, check for the identity shuffle, if possible.
17066 bool IsIdentity = Entries.size() == 1;
17067 // Pair.first is the offset to the vector, while Pair.second is the index of
17068 // scalar in the list.
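// Illustrative example: with an entry vector factor VF = 4, a pair (1, 2)
// means lane 2 of VL is taken from the second selected entry, so
// Mask[Part * VL.size() + 2] becomes 4 + <lane of that scalar in the entry>,
// i.e. an index into the second source of the final shuffle.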
17069 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
17070 unsigned Idx = Part * VL.size() + Pair.second;
17071 Mask[Idx] =
17072 Pair.first * VF +
17073 (ForOrder ? std::distance(
17074 Entries[Pair.first]->Scalars.begin(),
17075 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
17076 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
17077 IsIdentity &= Mask[Idx] == Pair.second;
17078 }
17079 if (ForOrder || IsIdentity || Entries.empty()) {
17080 switch (Entries.size()) {
17081 case 1:
17082 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
17083 return TargetTransformInfo::SK_PermuteSingleSrc;
17084 break;
17085 case 2:
17086 if (EntryLanes.size() > 2 || VL.size() <= 2)
17087 return TargetTransformInfo::SK_PermuteTwoSrc;
17088 break;
17089 default:
17090 break;
17091 }
17092 } else if (!isa<VectorType>(VL.front()->getType()) &&
17093 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
17094 // Do the cost estimation to check if a shuffle is more beneficial than a buildvector.
17095 SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
17096 std::next(Mask.begin(), (Part + 1) * VL.size()));
17097 int MinElement = SubMask.front(), MaxElement = SubMask.front();
17098 for (int Idx : SubMask) {
17099 if (Idx == PoisonMaskElem)
17100 continue;
17101 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
17102 MinElement = Idx;
17103 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
17104 MaxElement = Idx;
17105 }
17106 assert(MaxElement >= 0 && MinElement >= 0 &&
17107 MaxElement % VF >= MinElement % VF &&
17108 "Expected at least single element.");
17109 unsigned NewVF = std::max<unsigned>(
17110 VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
17111 (MaxElement % VF) -
17112 (MinElement % VF) + 1));
17113 if (NewVF < VF) {
17114 for (int &Idx : SubMask) {
17115 if (Idx == PoisonMaskElem)
17116 continue;
17117 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
17118 (Idx >= static_cast<int>(VF) ? NewVF : 0);
17119 }
17120 } else {
17121 NewVF = VF;
17122 }
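// Worked example with illustrative numbers: for VF = 8, VL.size() = 4 and
// SubMask = {4, 5, 6, 7}, the used range spans lanes 4..7, so NewVF becomes 4
// (assuming getFullVectorNumberOfElements yields 4 here) and the remapping
// above rewrites SubMask to {0, 1, 2, 3}, letting the cost model price the
// shuffle on a narrower 4-element vector type.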
17123
17124 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17125 auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
17126 auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
17127 auto GetShuffleCost = [&,
17128 &TTI = *TTI](ArrayRef<int> Mask,
17129 ArrayRef<const TreeEntry *> Entries,
17130 VectorType *VecTy) -> InstructionCost {
17131 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
17132 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
17133 Mask, Entries.front()->getInterleaveFactor()))
17134 return TTI::TCC_Free;
17135 return ::getShuffleCost(TTI,
17136 Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
17137 : TTI::SK_PermuteSingleSrc,
17138 VecTy, Mask, CostKind);
17139 };
17140 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
17141 InstructionCost FirstShuffleCost = 0;
17142 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
17143 if (Entries.size() == 1 || !Entries[0]->isGather()) {
17144 FirstShuffleCost = ShuffleCost;
17145 } else {
17146 // Transform the mask to include only the first entry.
17147 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17148 bool IsIdentity = true;
17149 for (auto [I, Idx] : enumerate(FirstMask)) {
17150 if (Idx >= static_cast<int>(NewVF)) {
17151 Idx = PoisonMaskElem;
17152 } else {
17153 DemandedElts.clearBit(I);
17154 if (Idx != PoisonMaskElem)
17155 IsIdentity &= static_cast<int>(I) == Idx;
17156 }
17157 }
17158 if (!IsIdentity)
17159 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
17160 FirstShuffleCost += getScalarizationOverhead(
17161 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17162 /*Extract=*/false, CostKind);
17163 }
17164 InstructionCost SecondShuffleCost = 0;
17165 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
17166 if (Entries.size() == 1 || !Entries[1]->isGather()) {
17167 SecondShuffleCost = ShuffleCost;
17168 } else {
17169 // Transform the mask to include only the second entry.
17170 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17171 bool IsIdentity = true;
17172 for (auto [I, Idx] : enumerate(SecondMask)) {
17173 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
17174 Idx = PoisonMaskElem;
17175 } else {
17176 DemandedElts.clearBit(I);
17177 if (Idx != PoisonMaskElem) {
17178 Idx -= NewVF;
17179 IsIdentity &= static_cast<int>(I) == Idx;
17180 }
17181 }
17182 }
17183 if (!IsIdentity)
17184 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
17185 SecondShuffleCost += getScalarizationOverhead(
17186 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17187 /*Extract=*/false, CostKind);
17188 }
17189 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17190 for (auto [I, Idx] : enumerate(SubMask))
17191 if (Idx == PoisonMaskElem)
17192 DemandedElts.clearBit(I);
17193 InstructionCost BuildVectorCost = getScalarizationOverhead(
17194 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17195 /*Extract=*/false, CostKind);
17196 const TreeEntry *BestEntry = nullptr;
17197 if (FirstShuffleCost < ShuffleCost) {
17198 std::for_each(std::next(Mask.begin(), Part * VL.size()),
17199 std::next(Mask.begin(), (Part + 1) * VL.size()),
17200 [&](int &Idx) {
17201 if (Idx >= static_cast<int>(VF))
17202 Idx = PoisonMaskElem;
17203 });
17204 BestEntry = Entries.front();
17205 ShuffleCost = FirstShuffleCost;
17206 }
17207 if (SecondShuffleCost < ShuffleCost) {
17208 std::for_each(std::next(Mask.begin(), Part * VL.size()),
17209 std::next(Mask.begin(), (Part + 1) * VL.size()),
17210 [&](int &Idx) {
17211 if (Idx < static_cast<int>(VF))
17212 Idx = PoisonMaskElem;
17213 else
17214 Idx -= VF;
17215 });
17216 BestEntry = Entries[1];
17217 ShuffleCost = SecondShuffleCost;
17218 }
17219 if (BuildVectorCost >= ShuffleCost) {
17220 if (BestEntry) {
17221 Entries.clear();
17222 Entries.push_back(BestEntry);
17223 }
17224 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
17225 : TargetTransformInfo::SK_PermuteSingleSrc;
17226 }
17227 }
17228 Entries.clear();
17229 // Clear the corresponding mask elements.
17230 std::fill(std::next(Mask.begin(), Part * VL.size()),
17231 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
17232 return std::nullopt;
17233}
17234
17235 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
17236 BoUpSLP::isGatherShuffledEntry(
17237 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
17238 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
17239 bool ForOrder) {
17240 assert(NumParts > 0 && NumParts < VL.size() &&
17241 "Expected positive number of registers.");
17242 Entries.clear();
17243 // No need to check for the topmost gather node.
17244 if (TE == VectorizableTree.front().get() &&
17245 (!GatheredLoadsEntriesFirst.has_value() ||
17246 none_of(ArrayRef(VectorizableTree).drop_front(),
17247 [](const std::unique_ptr<TreeEntry> &TE) {
17248 return !TE->isGather();
17249 })))
17250 return {};
17251 // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
17252 // implemented yet.
17253 if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
17254 return {};
17255 Mask.assign(VL.size(), PoisonMaskElem);
17256 assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
17257 "Expected only single user of the gather node.");
17258 assert(VL.size() % NumParts == 0 &&
17259 "Number of scalars must be divisible by NumParts.");
17260 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
17261 TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
17262 (TE->Idx == 0 ||
17263 (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
17264 isSplat(TE->Scalars) ||
17265 (TE->hasState() &&
17266 getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars))))
17267 return {};
17268 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
17269 SmallVector<std::optional<TTI::ShuffleKind>> Res;
17270 for (unsigned Part : seq<unsigned>(NumParts)) {
17271 ArrayRef<Value *> SubVL =
17272 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
17273 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
17274 std::optional<TTI::ShuffleKind> SubRes =
17275 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
17276 ForOrder);
17277 if (!SubRes)
17278 SubEntries.clear();
17279 Res.push_back(SubRes);
17280 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
17281 SubEntries.front()->getVectorFactor() == VL.size() &&
17282 (SubEntries.front()->isSame(TE->Scalars) ||
17283 SubEntries.front()->isSame(VL))) {
17284 SmallVector<const TreeEntry *> LocalSubEntries;
17285 LocalSubEntries.swap(SubEntries);
17286 Entries.clear();
17287 Res.clear();
17288 std::iota(Mask.begin(), Mask.end(), 0);
17289 // Clear undef scalars.
17290 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
17291 if (isa<PoisonValue>(VL[I]))
17292 Mask[I] = PoisonMaskElem;
17293 Entries.emplace_back(1, LocalSubEntries.front());
17294 Res.push_back(TTI::SK_PermuteSingleSrc);
17295 return Res;
17296 }
17297 }
17298 if (all_of(Res,
17299 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
17300 Entries.clear();
17301 return {};
17302 }
17303 return Res;
17304}
17305
17306InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
17307 Type *ScalarTy) const {
17308 const unsigned VF = VL.size();
17309 auto *VecTy = getWidenedType(ScalarTy, VF);
17310 // Find the cost of inserting/extracting values from the vector.
17311 // Check if the same elements are inserted several times and count them as
17312 // shuffle candidates.
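// For example (illustrative values), when ForPoisonSrc is false and
// VL = {%a, 7, poison, %b}: the poison lane is skipped, the constant 7 is
// routed through ConstantShuffleMask (its lane selects from the constant
// vector), and only %a and %b contribute to DemandedElements and thus to the
// insertion cost.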
17313 APInt DemandedElements = APInt::getZero(VF);
17314 InstructionCost Cost;
17315 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17316 auto EstimateInsertCost = [&](unsigned I, Value *V) {
17317 DemandedElements.setBit(I);
17318 if (V->getType() != ScalarTy)
17319 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
17320 TTI::CastContextHint::None, CostKind);
17321 };
17322 SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
17323 std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
17324 for (auto [I, V] : enumerate(VL)) {
17325 // No need to shuffle duplicates for constants.
17326 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V))
17327 continue;
17328
17329 if (isConstant(V)) {
17330 ConstantShuffleMask[I] = I + VF;
17331 continue;
17332 }
17333 EstimateInsertCost(I, V);
17334 }
17335 // FIXME: add a cost for constant vector materialization.
17336 bool IsAnyNonUndefConst =
17337 any_of(VL, [](Value *V) { return !isa<UndefValue>(V) && isConstant(V); });
17338 // 1. Shuffle input source vector and constant vector.
17339 if (!ForPoisonSrc && IsAnyNonUndefConst) {
17340 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
17341 ConstantShuffleMask);
17342 }
17343
17344 // 2. Insert unique non-constants.
17345 if (!DemandedElements.isZero())
17346 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
17347 /*Insert=*/true,
17348 /*Extract=*/false, CostKind,
17349 ForPoisonSrc && !IsAnyNonUndefConst, VL);
17350 return Cost;
17351}
17352
17353Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
17354 auto It = EntryToLastInstruction.find(E);
17355 if (It != EntryToLastInstruction.end())
17356 return *cast<Instruction>(It->second);
17357 Instruction *Res = nullptr;
17358 // Get the basic block this bundle is in. All instructions in the bundle
17359 // should be in this block (except for extractelement-like instructions with
17360 // constant indices or gathered loads or copyables).
17361 Instruction *Front;
17362 unsigned Opcode;
17363 if (E->hasState()) {
17364 Front = E->getMainOp();
17365 Opcode = E->getOpcode();
17366 } else {
17367 Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>));
17368 Opcode = Front->getOpcode();
17369 }
17370 auto *BB = Front->getParent();
17371 assert(
17372 ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
17373 E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
17374 E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
17375 all_of(E->Scalars,
17376 [=](Value *V) -> bool {
17377 if (Opcode == Instruction::GetElementPtr &&
17378 !isa<GetElementPtrInst>(V))
17379 return true;
17380 auto *I = dyn_cast<Instruction>(V);
17381 return !I || !E->getMatchingMainOpOrAltOp(I) ||
17382 I->getParent() == BB || isVectorLikeInstWithConstOps(I);
17383 })) &&
17384 "Expected gathered loads or GEPs or instructions from same basic "
17385 "block.");
17386
17387 auto FindLastInst = [&]() {
17388 Instruction *LastInst = Front;
17389 for (Value *V : E->Scalars) {
17390 auto *I = dyn_cast<Instruction>(V);
17391 if (!I)
17392 continue;
17393 if (E->isCopyableElement(I))
17394 continue;
17395 if (LastInst->getParent() == I->getParent()) {
17396 if (LastInst->comesBefore(I))
17397 LastInst = I;
17398 continue;
17399 }
17400 assert(((Opcode == Instruction::GetElementPtr &&
17401 !isa<GetElementPtrInst>(I)) ||
17402 E->State == TreeEntry::SplitVectorize ||
17403 (isVectorLikeInstWithConstOps(LastInst) &&
17404 isVectorLikeInstWithConstOps(I)) ||
17405 (GatheredLoadsEntriesFirst.has_value() &&
17406 Opcode == Instruction::Load && E->isGather() &&
17407 E->Idx < *GatheredLoadsEntriesFirst)) &&
17408 "Expected vector-like or non-GEP in GEP node insts only.");
17409 if (!DT->isReachableFromEntry(LastInst->getParent())) {
17410 LastInst = I;
17411 continue;
17412 }
17413 if (!DT->isReachableFromEntry(I->getParent()))
17414 continue;
17415 auto *NodeA = DT->getNode(LastInst->getParent());
17416 auto *NodeB = DT->getNode(I->getParent());
17417 assert(NodeA && "Should only process reachable instructions");
17418 assert(NodeB && "Should only process reachable instructions");
17419 assert((NodeA == NodeB) ==
17420 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
17421 "Different nodes should have different DFS numbers");
17422 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
17423 LastInst = I;
17424 }
17425 BB = LastInst->getParent();
17426 return LastInst;
17427 };
17428
17429 auto FindFirstInst = [&]() {
17430 Instruction *FirstInst = Front;
17431 for (Value *V : E->Scalars) {
17432 auto *I = dyn_cast<Instruction>(V);
17433 if (!I)
17434 continue;
17435 if (E->isCopyableElement(I))
17436 continue;
17437 if (FirstInst->getParent() == I->getParent()) {
17438 if (I->comesBefore(FirstInst))
17439 FirstInst = I;
17440 continue;
17441 }
17442 assert(((Opcode == Instruction::GetElementPtr &&
17443 !isa<GetElementPtrInst>(I)) ||
17444 (isVectorLikeInstWithConstOps(FirstInst) &&
17445 isVectorLikeInstWithConstOps(I))) &&
17446 "Expected vector-like or non-GEP in GEP node insts only.");
17447 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
17448 FirstInst = I;
17449 continue;
17450 }
17451 if (!DT->isReachableFromEntry(I->getParent()))
17452 continue;
17453 auto *NodeA = DT->getNode(FirstInst->getParent());
17454 auto *NodeB = DT->getNode(I->getParent());
17455 assert(NodeA && "Should only process reachable instructions");
17456 assert(NodeB && "Should only process reachable instructions");
17457 assert((NodeA == NodeB) ==
17458 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
17459 "Different nodes should have different DFS numbers");
17460 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
17461 FirstInst = I;
17462 }
17463 return FirstInst;
17464 };
17465
17466 if (E->State == TreeEntry::SplitVectorize) {
17467 Res = FindLastInst();
17468 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(Res); !Entries.empty()) {
17469 for (auto *E : Entries) {
17470 auto *I = dyn_cast_or_null<Instruction>(E->VectorizedValue);
17471 if (!I)
17472 I = &getLastInstructionInBundle(E);
17473 if (Res->getParent() == I->getParent() && Res->comesBefore(I))
17474 Res = I;
17475 }
17476 }
17477 EntryToLastInstruction.try_emplace(E, Res);
17478 return *Res;
17479 }
17480
17481 // Set insertpoint for gathered loads to the very first load.
17482 if (GatheredLoadsEntriesFirst.has_value() &&
17483 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
17484 Opcode == Instruction::Load) {
17485 Res = FindFirstInst();
17486 EntryToLastInstruction.try_emplace(E, Res);
17487 return *Res;
17488 }
17489
17490 // Set the insert point to the beginning of the basic block if the entry
17491 // should not be scheduled.
17492 auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
17493 if (E->isGather())
17494 return nullptr;
17495 // It was found previously that the instructions do not need to be scheduled.
17496 const auto *It = BlocksSchedules.find(BB);
17497 if (It == BlocksSchedules.end())
17498 return nullptr;
17499 for (Value *V : E->Scalars) {
17500 auto *I = dyn_cast<Instruction>(V);
17501 if (!I || isa<PHINode>(I) ||
17502 (!E->isCopyableElement(I) && doesNotNeedToBeScheduled(I)))
17503 continue;
17504 ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
17505 if (Bundles.empty())
17506 continue;
17507 const auto *It = find_if(
17508 Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
17509 if (It != Bundles.end())
17510 return *It;
17511 }
17512 return nullptr;
17513 };
17514 const ScheduleBundle *Bundle = FindScheduleBundle(E);
17515 if (!E->isGather() && !Bundle) {
17516 if ((Opcode == Instruction::GetElementPtr &&
17517 any_of(E->Scalars,
17518 [](Value *V) {
17519 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
17520 })) ||
17521 all_of(E->Scalars, [&](Value *V) {
17522 return isa<PoisonValue>(V) || E->isCopyableElement(V) ||
17523 (!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V));
17524 }))
17525 Res = FindLastInst();
17526 else
17527 Res = FindFirstInst();
17528 EntryToLastInstruction.try_emplace(E, Res);
17529 return *Res;
17530 }
17531
17532 // Find the last instruction. The common case should be that BB has been
17533 // scheduled, and the last instruction is VL.back(). So we start with
17534 // VL.back() and iterate over schedule data until we reach the end of the
17535 // bundle. The end of the bundle is marked by null ScheduleData.
17536 if (Bundle) {
17537 assert(!E->isGather() && "Gathered instructions should not be scheduled");
17538 Res = Bundle->getBundle().back()->getInst();
17539 EntryToLastInstruction.try_emplace(E, Res);
17540 return *Res;
17541 }
17542
17543 // Res can still be null at this point if there's either not an entry
17544 // for BB in BlocksSchedules or there's no ScheduleData available for
17545 // VL.back(). This can be the case if buildTreeRec aborts for various
17546 // reasons (e.g., the maximum recursion depth is reached, the maximum region
17547 // size is reached, etc.). ScheduleData is initialized in the scheduling
17548 // "dry-run".
17549 //
17550 // If this happens, we can still find the last instruction by brute force. We
17551 // iterate forwards from Front (inclusive) until we either see all
17552 // instructions in the bundle or reach the end of the block. If Front is the
17553 // last instruction in program order, Res will be set to Front, and we
17554 // will visit all the remaining instructions in the block.
17555 //
17556 // One of the reasons we exit early from buildTreeRec is to place an upper
17557 // bound on compile-time. Thus, taking an additional compile-time hit here is
17558 // not ideal. However, this should be exceedingly rare since it requires that
17559 // we both exit early from buildTreeRec and that the bundle be out-of-order
17560 // (causing us to iterate all the way to the end of the block).
17561 if (!Res)
17562 Res = FindLastInst();
17563 assert(Res && "Failed to find last instruction in bundle");
17564 EntryToLastInstruction.try_emplace(E, Res);
17565 return *Res;
17566}
17567
17568void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
17569 auto *Front = E->getMainOp();
17570 Instruction *LastInst = &getLastInstructionInBundle(E);
17571 assert(LastInst && "Failed to find last instruction in bundle");
17572 BasicBlock::iterator LastInstIt = LastInst->getIterator();
17573 // If the instruction is PHI, set the insert point after all the PHIs.
17574 bool IsPHI = isa<PHINode>(LastInst);
17575 if (IsPHI) {
17576 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
17577 if (LastInstIt != LastInst->getParent()->end() &&
17578 LastInstIt->getParent()->isLandingPad())
17579 LastInstIt = std::next(LastInstIt);
17580 }
17581 if (IsPHI ||
17582 (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
17583 E->doesNotNeedToSchedule()) ||
17584 (GatheredLoadsEntriesFirst.has_value() &&
17585 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
17586 E->getOpcode() == Instruction::Load)) {
17587 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
17588 } else {
17589 // Set the insertion point after the last instruction in the bundle. Set the
17590 // debug location to Front.
17591 Builder.SetInsertPoint(
17592 LastInst->getParent(),
17593 LastInst->getNextNode()->getIterator());
17594 }
17595 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
17596}
17597
17598Value *BoUpSLP::gather(
17599 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
17600 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
17601 // List of instructions/lanes from current block and/or the blocks which are
17602 // part of the current loop. These instructions will be inserted at the end to
17603 // make it possible to optimize loops and hoist invariant instructions out of
17604 // the loop's body with better chances for success.
17605 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
17606 SmallSet<int, 4> PostponedIndices;
17607 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
17608 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
17609 SmallPtrSet<BasicBlock *, 4> Visited;
17610 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
17611 InsertBB = InsertBB->getSinglePredecessor();
17612 return InsertBB && InsertBB == InstBB;
17613 };
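// CheckPredecessor returns true when InstBB is reached from InsertBB by
// walking single predecessors, i.e. the scalar is defined in the insertion
// block or on the straight-line path leading to it; such scalars (and the
// loop-resident ones) are postponed and inserted last, so the remaining
// insertelements stay hoistable out of the loop.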
17614 for (int I = 0, E = VL.size(); I < E; ++I) {
17615 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
17616 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
17617 isVectorized(Inst) ||
17618 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
17619 PostponedIndices.insert(I).second)
17620 PostponedInsts.emplace_back(Inst, I);
17621 }
17622
17623 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
17624 Type *Ty) {
17625 Value *Scalar = V;
17626 if (Scalar->getType() != Ty) {
17627 assert(Scalar->getType()->isIntOrIntVectorTy() &&
17628 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
17629 Value *V = Scalar;
17630 if (auto *CI = dyn_cast<CastInst>(Scalar);
17631 isa_and_present<SExtInst, ZExtInst>(CI)) {
17632 Value *Op = CI->getOperand(0);
17633 if (auto *IOp = dyn_cast<Instruction>(Op);
17634 !IOp || !(isDeleted(IOp) || isVectorized(IOp)))
17635 V = Op;
17636 }
17637 Scalar = Builder.CreateIntCast(
17638 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
17639 }
17640
17641 Instruction *InsElt;
17642 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
17643 assert(SLPReVec && "FixedVectorType is not expected.");
17644 Vec =
17645 createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy));
17646 auto *II = dyn_cast<Instruction>(Vec);
17647 if (!II)
17648 return Vec;
17649 InsElt = II;
17650 } else {
17651 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
17652 InsElt = dyn_cast<InsertElementInst>(Vec);
17653 if (!InsElt)
17654 return Vec;
17655 }
17656 GatherShuffleExtractSeq.insert(InsElt);
17657 CSEBlocks.insert(InsElt->getParent());
17658 // Add to our 'need-to-extract' list.
17659 if (isa<Instruction>(V)) {
17660 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V); !Entries.empty()) {
17661 // Find which lane we need to extract.
17662 User *UserOp = nullptr;
17663 if (Scalar != V) {
17664 if (auto *SI = dyn_cast<Instruction>(Scalar))
17665 UserOp = SI;
17666 } else {
17667 if (V->getType()->isVectorTy()) {
17668 if (auto *SV = dyn_cast<ShuffleVectorInst>(InsElt);
17669 SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
17670 // Find shufflevector, caused by resize.
17671 auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
17672 if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
17673 if (SV->getOperand(0) == V)
17674 return SV;
17675 if (SV->getOperand(1) == V)
17676 return SV;
17677 }
17678 return nullptr;
17679 };
17680 InsElt = nullptr;
17681 if (Instruction *User = FindOperand(SV->getOperand(0), V))
17682 InsElt = User;
17683 else if (Instruction *User = FindOperand(SV->getOperand(1), V))
17684 InsElt = User;
17685 assert(InsElt &&
17686 "Failed to find shufflevector, caused by resize.");
17687 }
17688 }
17689 UserOp = InsElt;
17690 }
17691 if (UserOp) {
17692 unsigned FoundLane = Entries.front()->findLaneForValue(V);
17693 ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
17694 }
17695 }
17696 }
17697 return Vec;
17698 };
17699 auto *VecTy = getWidenedType(ScalarTy, VL.size());
17700 Value *Vec = PoisonValue::get(VecTy);
17701 SmallVector<int> NonConsts;
17702 SmallVector<int> Mask(VL.size());
17703 std::iota(Mask.begin(), Mask.end(), 0);
17704 Value *OriginalRoot = Root;
17705 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
17706 SV && isa<PoisonValue>(SV->getOperand(1)) &&
17707 SV->getOperand(0)->getType() == VecTy) {
17708 Root = SV->getOperand(0);
17709 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
17710 }
17711 // Insert constant values at first.
17712 for (int I = 0, E = VL.size(); I < E; ++I) {
17713 if (PostponedIndices.contains(I))
17714 continue;
17715 if (!isConstant(VL[I])) {
17716 NonConsts.push_back(I);
17717 continue;
17718 }
17719 if (isa<PoisonValue>(VL[I]))
17720 continue;
17721 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
17722 Mask[I] = I + E;
17723 }
17724 if (Root) {
17725 if (isa<PoisonValue>(Vec)) {
17726 Vec = OriginalRoot;
17727 } else {
17728 Vec = CreateShuffle(Root, Vec, Mask);
17729 if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
17730 OI && OI->use_empty() &&
17731 none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
17732 return TE->VectorizedValue == OI;
17733 }))
17734 eraseInstruction(OI);
17735 }
17736 }
17737 // Insert non-constant values.
17738 for (int I : NonConsts)
17739 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
17740 // Append instructions, which are/may be part of the loop, in the end to make
17741 // it possible to hoist non-loop-based instructions.
17742 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
17743 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
17744
17745 return Vec;
17746}
17747
17748/// Merges shuffle masks and emits final shuffle instruction, if required. It
17749 /// supports shuffling of 2 input vectors. It implements lazy shuffle emission,
17750 /// where the actual shuffle instruction is generated only if it is really
17751 /// required. Otherwise, the shuffle instruction emission is delayed till the
17752 /// end of the process, to reduce the number of emitted instructions and enable
17753 /// further analysis/transformations.
17754 /// The class will also look through the previously emitted shuffle instructions
17755 /// and properly mark indices in the mask as undef.
17756/// For example, given the code
17757/// \code
17758/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
17759/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
17760/// \endcode
17761 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
17762/// look through %s1 and %s2 and emit
17763/// \code
17764/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
17765/// \endcode
17766/// instead.
17767/// If 2 operands are of different size, the smallest one will be resized and
17768/// the mask recalculated properly.
17769/// For example, given the code
17770/// \code
17771/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
17772/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
17773/// \endcode
17774 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
17775/// look through %s1 and %s2 and emit
17776/// \code
17777/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
17778/// \endcode
17779/// instead.
17780class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
17781 bool IsFinalized = false;
17782 /// Combined mask for all applied operands and masks. It is built during
17783 /// analysis and actual emission of shuffle vector instructions.
17784 SmallVector<int> CommonMask;
17785 /// List of operands for the shuffle vector instruction. It holds at most 2
17786 /// operands. If a 3rd one is going to be added, the first 2 are combined into
17787 /// a shuffle with the \p CommonMask mask, the first operand is set to be the
17788 /// resulting shuffle and the second operand is set to be the newly added
17789 /// operand. The \p CommonMask is transformed in the proper way after that.
17790 SmallVector<Value *, 2> InVectors;
17791 IRBuilderBase &Builder;
17792 BoUpSLP &R;
17793
17794 class ShuffleIRBuilder {
17795 IRBuilderBase &Builder;
17796 /// Holds all of the instructions that we gathered.
17797 SetVector<Instruction *> &GatherShuffleExtractSeq;
17798 /// A list of blocks that we are going to CSE.
17799 DenseSet<BasicBlock *> &CSEBlocks;
17800 /// Data layout.
17801 const DataLayout &DL;
17802
17803 public:
17804 ShuffleIRBuilder(IRBuilderBase &Builder,
17805 SetVector<Instruction *> &GatherShuffleExtractSeq,
17806 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
17807 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
17808 CSEBlocks(CSEBlocks), DL(DL) {}
17809 ~ShuffleIRBuilder() = default;
17810 /// Creates a shufflevector for the 2 operands with the given mask.
17811 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
17812 if (V1->getType() != V2->getType()) {
17813 assert(V1->getType()->isIntOrIntVectorTy() &&
17814 V2->getType()->isIntOrIntVectorTy() &&
17815 "Expected integer vector types only.");
17816 if (V1->getType() != V2->getType()) {
17817 if (cast<VectorType>(V2->getType())
17818 ->getElementType()
17819 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
17820 ->getElementType()
17821 ->getIntegerBitWidth())
17822 V2 = Builder.CreateIntCast(
17823 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
17824 else
17825 V1 = Builder.CreateIntCast(
17826 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
17827 }
17828 }
17829 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
17830 if (auto *I = dyn_cast<Instruction>(Vec)) {
17831 GatherShuffleExtractSeq.insert(I);
17832 CSEBlocks.insert(I->getParent());
17833 }
17834 return Vec;
17835 }
17836 /// Creates a permutation of the single vector operand with the given mask,
17837 /// if it is not an identity mask.
17838 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
17839 if (Mask.empty())
17840 return V1;
17841 unsigned VF = Mask.size();
17842 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
17843 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
17844 return V1;
17845 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
17846 if (auto *I = dyn_cast<Instruction>(Vec)) {
17847 GatherShuffleExtractSeq.insert(I);
17848 CSEBlocks.insert(I->getParent());
17849 }
17850 return Vec;
17851 }
17852 Value *createIdentity(Value *V) { return V; }
17853 Value *createPoison(Type *Ty, unsigned VF) {
17854 return PoisonValue::get(getWidenedType(Ty, VF));
17855 }
17856 /// Resizes the 2 input vectors to match in size, if they are not equal
17857 /// yet. The smaller vector is resized to the size of the larger vector.
17858 void resizeToMatch(Value *&V1, Value *&V2) {
17859 if (V1->getType() == V2->getType())
17860 return;
17861 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
17862 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
17863 int VF = std::max(V1VF, V2VF);
17864 int MinVF = std::min(V1VF, V2VF);
17865 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
17866 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
17867 0);
17868 Value *&Op = MinVF == V1VF ? V1 : V2;
17869 Op = Builder.CreateShuffleVector(Op, IdentityMask);
17870 if (auto *I = dyn_cast<Instruction>(Op)) {
17871 GatherShuffleExtractSeq.insert(I);
17872 CSEBlocks.insert(I->getParent());
17873 }
17874 if (MinVF == V1VF)
17875 V1 = Op;
17876 else
17877 V2 = Op;
17878 }
17879 };
17880
17881 /// Smart shuffle instruction emission, walks through shuffle trees and
17882 /// tries to find the best matching vector for the actual shuffle
17883 /// instruction.
17884 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
17885 assert(V1 && "Expected at least one vector value.");
17886 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
17887 R.CSEBlocks, *R.DL);
17888 return BaseShuffleAnalysis::createShuffle<Value *>(
17889 V1, V2, Mask, ShuffleBuilder, ScalarTy);
17890 }
17891
17892 /// Cast value \p V to the vector type with the same number of elements, but
17893 /// the base type \p ScalarTy.
17894 Value *castToScalarTyElem(Value *V,
17895 std::optional<bool> IsSigned = std::nullopt) {
17896 auto *VecTy = cast<VectorType>(V->getType());
17897 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
17898 if (VecTy->getElementType() == ScalarTy->getScalarType())
17899 return V;
17900 return Builder.CreateIntCast(
17901 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
17902 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
17903 }
17904
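/// Returns the vectorized value of entry \p E, cast to the \p ScalarTy element
/// type if the entry was vectorized with a different (narrower) integer type;
/// the signedness of the cast is inferred from the entry's scalars.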
17905 Value *getVectorizedValue(const TreeEntry &E) {
17906 Value *Vec = E.VectorizedValue;
17907 if (!Vec->getType()->isIntOrIntVectorTy())
17908 return Vec;
17909 return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
17910 return !isa<PoisonValue>(V) &&
17911 !isKnownNonNegative(
17912 V, SimplifyQuery(*R.DL));
17913 }));
17914 }
17915
17916public:
17917 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
17918 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
17919
17920 /// Adjusts extractelements after reusing them.
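/// If all the reused extracts come from a single source vector (or there is
/// only one part), that vector is returned directly; otherwise the per-part
/// source vectors are combined into one wide vector, \p Mask is rewritten to
/// index into it and \p UseVecBaseAsInput is set.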
17921 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
17922 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
17923 unsigned NumParts, bool &UseVecBaseAsInput) {
17924 UseVecBaseAsInput = false;
17925 SmallPtrSet<Value *, 4> UniqueBases;
17926 Value *VecBase = nullptr;
17927 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
17928 if (!E->ReorderIndices.empty()) {
17929 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
17930 E->ReorderIndices.end());
17931 reorderScalars(VL, ReorderMask);
17932 }
17933 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
17934 int Idx = Mask[I];
17935 if (Idx == PoisonMaskElem)
17936 continue;
17937 auto *EI = cast<ExtractElementInst>(VL[I]);
17938 VecBase = EI->getVectorOperand();
17939 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
17940 VecBase = TEs.front()->VectorizedValue;
17941 assert(VecBase && "Expected vectorized value.");
17942 UniqueBases.insert(VecBase);
17943 // If the only use is vectorized, we can delete the extractelement
17944 // itself.
17945 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
17946 (NumParts != 1 && count(VL, EI) > 1) ||
17947 any_of(EI->users(), [&](User *U) {
17948 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
17949 return UTEs.empty() || UTEs.size() > 1 ||
17950 (isa<GetElementPtrInst>(U) &&
17951 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
17952 (!UTEs.empty() &&
17953 count_if(R.VectorizableTree,
17954 [&](const std::unique_ptr<TreeEntry> &TE) {
17955 return TE->UserTreeIndex.UserTE ==
17956 UTEs.front() &&
17957 is_contained(VL, EI);
17958 }) != 1);
17959 }))
17960 continue;
17961 R.eraseInstruction(EI);
17962 }
17963 if (NumParts == 1 || UniqueBases.size() == 1) {
17964 assert(VecBase && "Expected vectorized value.");
17965 return castToScalarTyElem(VecBase);
17966 }
17967 UseVecBaseAsInput = true;
17968 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
17969 for (auto [I, Idx] : enumerate(Mask))
17970 if (Idx != PoisonMaskElem)
17971 Idx = I;
17972 };
17973 // Perform a multi-register vector shuffle, joining the parts into a single
17974 // virtual long vector.
17975 // Need to shuffle each part independently and then insert all these parts
17976 // into a long virtual vector register, forming the original vector.
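// For example (illustrative only), with NumParts == 2 each slice of the mask
// is first shuffled from its own base vector(s) and the two sub-vectors are
// then merged with a final two-source shuffle.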
17977 Value *Vec = nullptr;
17978 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
17979 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
17980 for (unsigned Part : seq<unsigned>(NumParts)) {
17981 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
17982 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
17983 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
17984 constexpr int MaxBases = 2;
17985 SmallVector<Value *, MaxBases> Bases(MaxBases);
17986 auto VLMask = zip(SubVL, SubMask);
17987 const unsigned VF = std::accumulate(
17988 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
17989 if (std::get<1>(D) == PoisonMaskElem)
17990 return S;
17991 Value *VecOp =
17992 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
17993 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
17994 !TEs.empty())
17995 VecOp = TEs.front()->VectorizedValue;
17996 assert(VecOp && "Expected vectorized value.");
17997 const unsigned Size =
17998 cast<FixedVectorType>(VecOp->getType())->getNumElements();
17999 return std::max(S, Size);
18000 });
18001 for (const auto [V, I] : VLMask) {
18002 if (I == PoisonMaskElem)
18003 continue;
18004 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
18005 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
18006 VecOp = TEs.front()->VectorizedValue;
18007 assert(VecOp && "Expected vectorized value.");
18008 VecOp = castToScalarTyElem(VecOp);
18009 Bases[I / VF] = VecOp;
18010 }
18011 if (!Bases.front())
18012 continue;
18013 Value *SubVec;
18014 if (Bases.back()) {
18015 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
18016 TransformToIdentity(SubMask);
18017 } else {
18018 SubVec = Bases.front();
18019 }
18020 if (!Vec) {
18021 Vec = SubVec;
18022 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
18023 [&](unsigned P) {
18024 ArrayRef<int> SubMask =
18025 Mask.slice(P * SliceSize,
18026 getNumElems(Mask.size(),
18027 SliceSize, P));
18028 return all_of(SubMask, [](int Idx) {
18029 return Idx == PoisonMaskElem;
18030 });
18031 })) &&
18032 "Expected first part or all previous parts masked.");
18033 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18034 } else {
18035 unsigned NewVF =
18036 cast<FixedVectorType>(Vec->getType())->getNumElements();
18037 if (Vec->getType() != SubVec->getType()) {
18038 unsigned SubVecVF =
18039 cast<FixedVectorType>(SubVec->getType())->getNumElements();
18040 NewVF = std::max(NewVF, SubVecVF);
18041 }
18042 // Adjust SubMask.
18043 for (int &Idx : SubMask)
18044 if (Idx != PoisonMaskElem)
18045 Idx += NewVF;
18046 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18047 Vec = createShuffle(Vec, SubVec, VecMask);
18048 TransformToIdentity(VecMask);
18049 }
18050 }
18051 copy(VecMask, Mask.begin());
18052 return Vec;
18053 }
18054 /// Checks if the specified entry \p E needs to be delayed because of its
18055 /// dependency nodes.
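/// If some dependency has not been vectorized yet, returns a placeholder
/// value (a load of a poison pointer with the widened type) that stands in
/// for the postponed gather; otherwise returns std::nullopt.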
18056 std::optional<Value *>
18057 needToDelay(const TreeEntry *E,
18058 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
18059 // No need to delay emission if all deps are ready.
18060 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
18061 return all_of(
18062 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
18063 }))
18064 return std::nullopt;
18065 // Postpone gather emission, will be emitted after the end of the
18066 // process to keep correct order.
18067 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
18068 return Builder.CreateAlignedLoad(
18069 ResVecTy,
18070 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
18071 MaybeAlign());
18072 }
18073 /// Reset the builder to handle perfect diamond match.
18074 void resetForSameNode() {
18075 IsFinalized = false;
18076 CommonMask.clear();
18077 InVectors.clear();
18078 }
18079 /// Adds 2 input vectors (in form of tree entries) and the mask for their
18080 /// shuffling.
18081 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
18082 Value *V1 = getVectorizedValue(E1);
18083 Value *V2 = getVectorizedValue(E2);
18084 add(V1, V2, Mask);
18085 }
18086 /// Adds single input vector (in form of tree entry) and the mask for its
18087 /// shuffling.
18088 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
18089 Value *V1 = getVectorizedValue(E1);
18090 add(V1, Mask);
18091 }
18092 /// Adds 2 input vectors and the mask for their shuffling.
18093 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
18094 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
18097 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
18098 V1 = castToScalarTyElem(V1);
18099 V2 = castToScalarTyElem(V2);
18100 if (InVectors.empty()) {
18101 InVectors.push_back(V1);
18102 InVectors.push_back(V2);
18103 CommonMask.assign(Mask.begin(), Mask.end());
18104 return;
18105 }
18106 Value *Vec = InVectors.front();
18107 if (InVectors.size() == 2) {
18108 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18109 transformMaskAfterShuffle(CommonMask, CommonMask);
18110 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
18111 Mask.size()) {
18112 Vec = createShuffle(Vec, nullptr, CommonMask);
18113 transformMaskAfterShuffle(CommonMask, CommonMask);
18114 }
18115 V1 = createShuffle(V1, V2, Mask);
18116 unsigned VF = std::max(getVF(V1), getVF(Vec));
18117 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18118 if (Mask[Idx] != PoisonMaskElem)
18119 CommonMask[Idx] = Idx + VF;
18120 InVectors.front() = Vec;
18121 if (InVectors.size() == 2)
18122 InVectors.back() = V1;
18123 else
18124 InVectors.push_back(V1);
18125 }
18126 /// Adds one more input vector and the mask for the shuffling.
18127 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
18128 assert(isa<FixedVectorType>(V1->getType()) &&
18129 "castToScalarTyElem expects V1 to be FixedVectorType");
18130 V1 = castToScalarTyElem(V1);
18131 if (InVectors.empty()) {
18132 InVectors.push_back(V1);
18133 CommonMask.assign(Mask.begin(), Mask.end());
18134 return;
18135 }
18136 const auto *It = find(InVectors, V1);
18137 if (It == InVectors.end()) {
18138 if (InVectors.size() == 2 ||
18139 InVectors.front()->getType() != V1->getType()) {
18140 Value *V = InVectors.front();
18141 if (InVectors.size() == 2) {
18142 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
18143 transformMaskAfterShuffle(CommonMask, CommonMask);
18144 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
18145 CommonMask.size()) {
18146 V = createShuffle(InVectors.front(), nullptr, CommonMask);
18147 transformMaskAfterShuffle(CommonMask, CommonMask);
18148 }
18149 unsigned VF = std::max(CommonMask.size(), Mask.size());
18150 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18151 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
18152 CommonMask[Idx] = V->getType() != V1->getType()
18153 ? Idx + VF
18154 : Mask[Idx] + getVF(V1);
18155 if (V->getType() != V1->getType())
18156 V1 = createShuffle(V1, nullptr, Mask);
18157 InVectors.front() = V;
18158 if (InVectors.size() == 2)
18159 InVectors.back() = V1;
18160 else
18161 InVectors.push_back(V1);
18162 return;
18163 }
18164 // Check if the second vector is actually required, or if all of its used
18165 // elements are already covered by the first one.
18166 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18167 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
18168 InVectors.push_back(V1);
18169 break;
18170 }
18171 }
18172 unsigned VF = 0;
18173 for (Value *V : InVectors)
18174 VF = std::max(VF, getVF(V));
18175 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18176 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
18177 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
18178 }
18179 /// Adds one more input vector and the order of its elements for the shuffling.
18180 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
18181 SmallVector<int> NewMask;
18182 inversePermutation(Order, NewMask);
18183 add(V1, NewMask);
18184 }
18185 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
18186 Value *Root = nullptr) {
18187 return R.gather(VL, Root, ScalarTy,
18188 [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
18189 return createShuffle(V1, V2, Mask);
18190 });
18191 }
18192 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
18193 /// Finalize emission of the shuffles.
18194 /// \param Action the action (if any) to be performed before final applying of
18195 /// the \p ExtMask mask.
18196 Value *finalize(
18197 ArrayRef<int> ExtMask,
18198 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
18199 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
18200 function_ref<void(Value *&, SmallVectorImpl<int> &,
18201 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
18202 Action = {}) {
18203 IsFinalized = true;
18204 if (Action) {
18205 Value *Vec = InVectors.front();
18206 if (InVectors.size() == 2) {
18207 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18208 InVectors.pop_back();
18209 } else {
18210 Vec = createShuffle(Vec, nullptr, CommonMask);
18211 }
18212 transformMaskAfterShuffle(CommonMask, CommonMask);
18213 assert(VF > 0 &&
18214 "Expected vector length for the final value before action.");
18215 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
18216 if (VecVF < VF) {
18217 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
18218 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
18219 Vec = createShuffle(Vec, nullptr, ResizeMask);
18220 }
18221 Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
18222 return createShuffle(V1, V2, Mask);
18223 });
18224 InVectors.front() = Vec;
18225 }
18226 if (!SubVectors.empty()) {
18227 Value *Vec = InVectors.front();
18228 if (InVectors.size() == 2) {
18229 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18230 InVectors.pop_back();
18231 } else {
18232 Vec = createShuffle(Vec, nullptr, CommonMask);
18233 }
18234 transformMaskAfterShuffle(CommonMask, CommonMask);
18235 auto CreateSubVectors = [&](Value *Vec,
18236 SmallVectorImpl<int> &CommonMask) {
18237 for (auto [E, Idx] : SubVectors) {
18238 Value *V = getVectorizedValue(*E);
18239 unsigned InsertionIndex = Idx * getNumElements(ScalarTy);
18240 // Use the scalar version of ScalarTy to correctly handle shuffles
18241 // for revectorization. The revectorization mode operates on the
18242 // vectors, but here we need to operate on the scalars, because the
18243 // masks were already transformed for the vector elements and we don't
18244 // need to do this transformation again.
18245 Type *OrigScalarTy = ScalarTy;
18246 ScalarTy = ScalarTy->getScalarType();
18247 Vec = createInsertVector(
18248 Builder, Vec, V, InsertionIndex,
18249 std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
18250 _3));
18251 ScalarTy = OrigScalarTy;
18252 if (!CommonMask.empty()) {
18253 std::iota(std::next(CommonMask.begin(), Idx),
18254 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
18255 Idx);
18256 }
18257 }
18258 return Vec;
18259 };
18260 if (SubVectorsMask.empty()) {
18261 Vec = CreateSubVectors(Vec, CommonMask);
18262 } else {
18263 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
18264 copy(SubVectorsMask, SVMask.begin());
18265 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
18266 if (I2 != PoisonMaskElem) {
18267 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
18268 I1 = I2 + CommonMask.size();
18269 }
18270 }
18271 Value *InsertVec =
18272 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
18273 Vec = createShuffle(InsertVec, Vec, SVMask);
18274 transformMaskAfterShuffle(CommonMask, SVMask);
18275 }
18276 InVectors.front() = Vec;
18277 }
18278
18279 if (!ExtMask.empty()) {
18280 if (CommonMask.empty()) {
18281 CommonMask.assign(ExtMask.begin(), ExtMask.end());
18282 } else {
18283 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
18284 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
18285 if (ExtMask[I] == PoisonMaskElem)
18286 continue;
18287 NewMask[I] = CommonMask[ExtMask[I]];
18288 }
18289 CommonMask.swap(NewMask);
18290 }
18291 }
18292 if (CommonMask.empty()) {
18293 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
18294 return InVectors.front();
18295 }
18296 if (InVectors.size() == 2)
18297 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
18298 return createShuffle(InVectors.front(), nullptr, CommonMask);
18299 }
18300
18301 ~ShuffleInstructionBuilder() {
18302 assert((IsFinalized || CommonMask.empty()) &&
18303 "Shuffle construction must be finalized.");
18304 }
18305};
18306
18307Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
18308 return vectorizeTree(getOperandEntry(E, NodeIdx));
18309}
18310
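/// Emits the final vector for the gather node \p E using a shuffle builder of
/// type \p BVTy (instantiated with ShuffleInstructionBuilder in
/// createBuildVector below). It first tries to reuse already vectorized values
/// via gathered extractelements and matching gather shuffles, and only gathers
/// the remaining scalars directly.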
18311template <typename BVTy, typename ResTy, typename... Args>
18312ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
18313 Args &...Params) {
18314 assert(E->isGather() && "Expected gather node.");
18315 unsigned VF = E->getVectorFactor();
18316
18317 bool NeedFreeze = false;
18318 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
18319 // Clear values, to be replaced by insertvector instructions.
18320 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
18321 for_each(MutableArrayRef(GatheredScalars)
18322 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
18323 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
18324 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
18325 E->CombinedEntriesWithIndices.size());
18326 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
18327 [&](const auto &P) {
18328 return std::make_pair(VectorizableTree[P.first].get(), P.second);
18329 });
18330 // Build a mask out of the reorder indices and reorder scalars per this
18331 // mask.
18332 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
18333 E->ReorderIndices.end());
18334 if (!ReorderMask.empty())
18335 reorderScalars(GatheredScalars, ReorderMask);
18336 SmallVector<int> SubVectorsMask;
18337 inversePermutation(E->ReorderIndices, SubVectorsMask);
18338 // Transform non-clustered elements in the mask to poison (-1).
18339 // "Clustered" operations will be reordered using this mask later.
18340 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
18341 for (unsigned I : seq<unsigned>(GatheredScalars.size()))
18342 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
18343 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
18344 } else {
18345 SubVectorsMask.clear();
18346 }
18347 SmallVector<Value *> StoredGS(GatheredScalars);
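// Detects whether this gather node is a splat whose value can be reused from
// an already available input vector of width InputVF; on success the affected
// slice of Mask is rewritten (to an identity or to a broadcast of the reused
// lane) and true is returned.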
18348 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
18349 unsigned I, unsigned SliceSize,
18350 bool IsNotPoisonous) {
18351 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
18352 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
18353 }))
18354 return false;
18355 TreeEntry *UserTE = E->UserTreeIndex.UserTE;
18356 unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
18357 if (UserTE->getNumOperands() != 2)
18358 return false;
18359 if (!IsNotPoisonous) {
18360 auto *It = find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
18361 [=](const std::unique_ptr<TreeEntry> &TE) {
18362 return TE->UserTreeIndex.UserTE == UserTE &&
18363 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
18364 });
18365 if (It == VectorizableTree.end())
18366 return false;
18367 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
18368 if (!(*It)->ReorderIndices.empty()) {
18369 inversePermutation((*It)->ReorderIndices, ReorderMask);
18370 reorderScalars(GS, ReorderMask);
18371 }
18372 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
18373 Value *V0 = std::get<0>(P);
18374 Value *V1 = std::get<1>(P);
18375 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
18376 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
18377 is_contained(E->Scalars, V1));
18378 }))
18379 return false;
18380 }
18381 int Idx;
18382 if ((Mask.size() < InputVF &&
18383 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
18384 Idx == 0) ||
18385 (Mask.size() == InputVF &&
18386 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
18387 std::iota(
18388 std::next(Mask.begin(), I * SliceSize),
18389 std::next(Mask.begin(),
18390 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
18391 0);
18392 } else {
18393 unsigned IVal =
18394 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
18395 std::fill(
18396 std::next(Mask.begin(), I * SliceSize),
18397 std::next(Mask.begin(),
18398 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
18399 IVal);
18400 }
18401 return true;
18402 };
18403 BVTy ShuffleBuilder(ScalarTy, Params...);
18404 ResTy Res = ResTy();
18405 SmallVector<int> Mask;
18406 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
18407 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
18408 Value *ExtractVecBase = nullptr;
18409 bool UseVecBaseAsInput = false;
18410 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
18411 SmallVector<SmallVector<const TreeEntry *>> Entries;
18412 Type *OrigScalarTy = GatheredScalars.front()->getType();
18413 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
18414 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
18415 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
18416 // Check for gathered extracts.
18417 bool Resized = false;
18418 ExtractShuffles =
18419 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
18420 if (!ExtractShuffles.empty()) {
18421 SmallVector<const TreeEntry *> ExtractEntries;
18422 for (auto [Idx, I] : enumerate(ExtractMask)) {
18423 if (I == PoisonMaskElem)
18424 continue;
18425 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
18426 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand());
18427 !TEs.empty())
18428 ExtractEntries.append(TEs.begin(), TEs.end());
18429 }
18430 if (std::optional<ResTy> Delayed =
18431 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
18432 // Delay emission of gathers which are not ready yet.
18433 PostponedGathers.insert(E);
18434 // Postpone gather emission, will be emitted after the end of the
18435 // process to keep correct order.
18436 return *Delayed;
18437 }
18438 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
18439 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
18440 ExtractVecBase = VecBase;
18441 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
18442 if (VF == VecBaseTy->getNumElements() &&
18443 GatheredScalars.size() != VF) {
18444 Resized = true;
18445 GatheredScalars.append(VF - GatheredScalars.size(),
18446 PoisonValue::get(OrigScalarTy));
18447 NumParts =
18448 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF), VF);
18449 }
18450 }
18451 }
18452 // Gather extracts after we check for full matched gathers only.
18453 if (!ExtractShuffles.empty() || !E->hasState() ||
18454 E->getOpcode() != Instruction::Load ||
18455 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
18456 any_of(E->Scalars, IsaPred<LoadInst>)) &&
18457 any_of(E->Scalars,
18458 [this](Value *V) {
18459 return isa<LoadInst>(V) && isVectorized(V);
18460 })) ||
18461 (E->hasState() && E->isAltShuffle()) ||
18462 all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
18463 isSplat(E->Scalars) ||
18464 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
18465 GatherShuffles =
18466 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
18467 }
18468 if (!GatherShuffles.empty()) {
18469 if (std::optional<ResTy> Delayed =
18470 ShuffleBuilder.needToDelay(E, Entries)) {
18471 // Delay emission of gathers which are not ready yet.
18472 PostponedGathers.insert(E);
18473 // Postpone gather emission, will be emitted after the end of the
18474 // process to keep correct order.
18475 return *Delayed;
18476 }
18477 if (GatherShuffles.size() == 1 &&
18478 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
18479 Entries.front().front()->isSame(E->Scalars)) {
18480 // Perfect match in the graph, will reuse the previously vectorized
18481 // node. Cost is 0.
18482 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
18483 << shortBundleName(E->Scalars, E->Idx) << ".\n");
18484 // Restore the mask for previous partially matched values.
18485 Mask.resize(E->Scalars.size());
18486 const TreeEntry *FrontTE = Entries.front().front();
18487 if (FrontTE->ReorderIndices.empty() &&
18488 ((FrontTE->ReuseShuffleIndices.empty() &&
18489 E->Scalars.size() == FrontTE->Scalars.size()) ||
18490 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
18491 std::iota(Mask.begin(), Mask.end(), 0);
18492 } else {
18493 for (auto [I, V] : enumerate(E->Scalars)) {
18494 if (isa<PoisonValue>(V)) {
18495 Mask[I] = PoisonMaskElem;
18496 continue;
18497 }
18498 Mask[I] = FrontTE->findLaneForValue(V);
18499 }
18500 }
18501 // Reset the builder(s) to correctly handle perfect diamond matched
18502 // nodes.
18503 ShuffleBuilder.resetForSameNode();
18504 ShuffleBuilder.add(*FrontTE, Mask);
18505 // Full matched entry found, no need to insert subvectors.
18506 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
18507 return Res;
18508 }
18509 if (!Resized) {
18510 if (GatheredScalars.size() != VF &&
18511 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
18512 return any_of(TEs, [&](const TreeEntry *TE) {
18513 return TE->getVectorFactor() == VF;
18514 });
18515 }))
18516 GatheredScalars.append(VF - GatheredScalars.size(),
18517 PoisonValue::get(OrigScalarTy));
18518 }
18519 // Remove shuffled elements from list of gathers.
18520 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
18521 if (Mask[I] != PoisonMaskElem)
18522 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
18523 }
18524 }
18525 }
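// Packs the remaining gathered scalars: deduplicates repeated values, turns a
// splat into a single lane plus a broadcast mask, and sets NeedFreeze when
// undef lanes have to be filled from a scalar that is not known to be
// non-poisonous.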
18526 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
18527 SmallVectorImpl<int> &ReuseMask,
18528 bool IsRootPoison) {
18529 // For splats we can emit broadcasts instead of gathers, so try to find
18530 // such sequences.
18531 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
18532 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
18533 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
18534 SmallVector<int> UndefPos;
18535 DenseMap<Value *, unsigned> UniquePositions;
18536 // Gather unique non-const values and all constant values.
18537 // For repeated values, just shuffle them.
18538 int NumNonConsts = 0;
18539 int SinglePos = 0;
18540 for (auto [I, V] : enumerate(Scalars)) {
18541 if (isa<UndefValue>(V)) {
18542 if (!isa<PoisonValue>(V)) {
18543 ReuseMask[I] = I;
18544 UndefPos.push_back(I);
18545 }
18546 continue;
18547 }
18548 if (isConstant(V)) {
18549 ReuseMask[I] = I;
18550 continue;
18551 }
18552 ++NumNonConsts;
18553 SinglePos = I;
18554 Value *OrigV = V;
18555 Scalars[I] = PoisonValue::get(OrigScalarTy);
18556 if (IsSplat) {
18557 Scalars.front() = OrigV;
18558 ReuseMask[I] = 0;
18559 } else {
18560 const auto Res = UniquePositions.try_emplace(OrigV, I);
18561 Scalars[Res.first->second] = OrigV;
18562 ReuseMask[I] = Res.first->second;
18563 }
18564 }
18565 if (NumNonConsts == 1) {
18566 // Restore single insert element.
18567 if (IsSplat) {
18568 ReuseMask.assign(VF, PoisonMaskElem);
18569 std::swap(Scalars.front(), Scalars[SinglePos]);
18570 if (!UndefPos.empty() && UndefPos.front() == 0)
18571 Scalars.front() = UndefValue::get(OrigScalarTy);
18572 }
18573 ReuseMask[SinglePos] = SinglePos;
18574 } else if (!UndefPos.empty() && IsSplat) {
18575 // For undef values, try to replace them with the simple broadcast.
18576 // We can do it if the broadcasted value is guaranteed to be
18577 // non-poisonous, or by freezing the incoming scalar value first.
18578 auto *It = find_if(Scalars, [this, E](Value *V) {
18579 return !isa<UndefValue>(V) &&
18580 (isGuaranteedNotToBePoison(V, AC) ||
18581 (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
18582 // Check if the value already used in the same operation in
18583 // one of the nodes already.
18584 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
18585 is_contained(E->UserTreeIndex.UserTE->Scalars,
18586 U.getUser());
18587 })));
18588 });
18589 if (It != Scalars.end()) {
18590 // Replace undefs by the non-poisoned scalars and emit broadcast.
18591 int Pos = std::distance(Scalars.begin(), It);
18592 for (int I : UndefPos) {
18593 // Set the undef position to the non-poisoned scalar.
18594 ReuseMask[I] = Pos;
18595 // Replace the undef by poison; in the mask it has already been replaced
18596 // by the non-poisoned scalar.
18597 if (I != Pos)
18598 Scalars[I] = PoisonValue::get(OrigScalarTy);
18599 }
18600 } else {
18601 // Replace undefs by the poisons, emit broadcast and then emit
18602 // freeze.
18603 for (int I : UndefPos) {
18604 ReuseMask[I] = PoisonMaskElem;
18605 if (isa<UndefValue>(Scalars[I]))
18606 Scalars[I] = PoisonValue::get(OrigScalarTy);
18607 }
18608 NeedFreeze = true;
18609 }
18610 }
18611 };
18612 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
18613 bool IsNonPoisoned = true;
18614 bool IsUsedInExpr = true;
18615 Value *Vec1 = nullptr;
18616 if (!ExtractShuffles.empty()) {
18617 // A gather of extractelements can be represented as just a shuffle of
18618 // the single/two vectors the scalars are extracted from.
18619 // Find input vectors.
18620 Value *Vec2 = nullptr;
18621 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
18622 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
18623 ExtractMask[I] = PoisonMaskElem;
18624 }
18625 if (UseVecBaseAsInput) {
18626 Vec1 = ExtractVecBase;
18627 } else {
18628 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
18629 if (ExtractMask[I] == PoisonMaskElem)
18630 continue;
18631 if (isa<UndefValue>(StoredGS[I]))
18632 continue;
18633 auto *EI = cast<ExtractElementInst>(StoredGS[I]);
18634 Value *VecOp = EI->getVectorOperand();
18635 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
18636 !TEs.empty() && TEs.front()->VectorizedValue)
18637 VecOp = TEs.front()->VectorizedValue;
18638 if (!Vec1) {
18639 Vec1 = VecOp;
18640 } else if (Vec1 != VecOp) {
18641 assert((!Vec2 || Vec2 == VecOp) &&
18642 "Expected only 1 or 2 vectors shuffle.");
18643 Vec2 = VecOp;
18644 }
18645 }
18646 }
18647 if (Vec2) {
18648 IsUsedInExpr = false;
18649 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
18650 isGuaranteedNotToBePoison(Vec2, AC);
18651 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
18652 } else if (Vec1) {
18653 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
18654 IsUsedInExpr &= FindReusedSplat(
18655 ExtractMask,
18656 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
18657 ExtractMask.size(), IsNotPoisonedVec);
18658 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
18659 IsNonPoisoned &= IsNotPoisonedVec;
18660 } else {
18661 IsUsedInExpr = false;
18662 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
18663 /*ForExtracts=*/true);
18664 }
18665 }
18666 if (!GatherShuffles.empty()) {
18667 unsigned SliceSize =
18668 getPartNumElems(E->Scalars.size(),
18669 ::getNumberOfParts(*TTI, VecTy, E->Scalars.size()));
18670 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
18671 for (const auto [I, TEs] : enumerate(Entries)) {
18672 if (TEs.empty()) {
18673 assert(!GatherShuffles[I] &&
18674 "No shuffles with empty entries list expected.");
18675 continue;
18676 }
18677 assert((TEs.size() == 1 || TEs.size() == 2) &&
18678 "Expected shuffle of 1 or 2 entries.");
18679 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
18680 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
18681 VecMask.assign(VecMask.size(), PoisonMaskElem);
18682 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
18683 if (TEs.size() == 1) {
18684 bool IsNotPoisonedVec =
18685 TEs.front()->VectorizedValue
18686 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
18687 : true;
18688 IsUsedInExpr &=
18689 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
18690 SliceSize, IsNotPoisonedVec);
18691 ShuffleBuilder.add(*TEs.front(), VecMask);
18692 IsNonPoisoned &= IsNotPoisonedVec;
18693 } else {
18694 IsUsedInExpr = false;
18695 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
18696 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
18697 IsNonPoisoned &=
18698 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
18699 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
18700 }
18701 }
18702 }
18703 // Try to figure out the best way to combine values: build a shuffle and
18704 // insert elements or just build several shuffles.
18705 // Insert non-constant scalars.
18706 SmallVector<Value *> NonConstants(GatheredScalars);
18707 int EMSz = ExtractMask.size();
18708 int MSz = Mask.size();
18709 // Try to build a constant vector and shuffle with it only if currently we
18710 // have a single permutation and more than 1 scalar constant.
18711 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
18712 bool IsIdentityShuffle =
18713 ((UseVecBaseAsInput ||
18714 all_of(ExtractShuffles,
18715 [](const std::optional<TTI::ShuffleKind> &SK) {
18716 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
18717 TTI::SK_PermuteSingleSrc;
18718 })) &&
18719 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
18720 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
18721 (!GatherShuffles.empty() &&
18722 all_of(GatherShuffles,
18723 [](const std::optional<TTI::ShuffleKind> &SK) {
18724 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
18725 TTI::SK_PermuteSingleSrc;
18726 }) &&
18727 none_of(Mask, [&](int I) { return I >= MSz; }) &&
18728 ShuffleVectorInst::isIdentityMask(Mask, MSz));
18729 bool EnoughConstsForShuffle =
18730 IsSingleShuffle &&
18731 (none_of(GatheredScalars,
18732 [](Value *V) {
18733 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
18734 }) ||
18735 any_of(GatheredScalars,
18736 [](Value *V) {
18737 return isa<Constant>(V) && !isa<UndefValue>(V);
18738 })) &&
18739 (!IsIdentityShuffle ||
18740 (GatheredScalars.size() == 2 &&
18741 any_of(GatheredScalars,
18742 [](Value *V) { return !isa<UndefValue>(V); })) ||
18743 count_if(GatheredScalars, [](Value *V) {
18744 return isa<Constant>(V) && !isa<PoisonValue>(V);
18745 }) > 1);
18746 // The NonConstants array contains just the non-constant values; GatheredScalars
18747 // contains only the constants used to build the final vector, which is then shuffled.
18748 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
18749 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
18750 NonConstants[I] = PoisonValue::get(OrigScalarTy);
18751 else
18752 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
18753 }
18754 // Generate constants for final shuffle and build a mask for them.
18755 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
18756 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
18757 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
18758 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
18759 ShuffleBuilder.add(BV, BVMask);
18760 }
18761 if (all_of(NonConstants, [=](Value *V) {
18762 return isa<PoisonValue>(V) ||
18763 (IsSingleShuffle && ((IsIdentityShuffle &&
18764 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
18765 }))
18766 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
18767 SubVectorsMask);
18768 else
18769 Res = ShuffleBuilder.finalize(
18770 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
18771 [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
18772 bool IsSplat = isSplat(NonConstants);
18773 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
18774 TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
18775 auto CheckIfSplatIsProfitable = [&]() {
18776 // Estimate the cost of splatting + shuffle and compare with
18777 // insert + shuffle.
18778 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
18779 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
18780 if (isa<ExtractElementInst>(V) || isVectorized(V))
18781 return false;
18782 InstructionCost SplatCost = TTI->getVectorInstrCost(
18783 Instruction::InsertElement, VecTy, CostKind, /*Index=*/0,
18784 PoisonValue::get(VecTy), V);
18785 SmallVector<int> NewMask(Mask.begin(), Mask.end());
18786 for (auto [Idx, I] : enumerate(BVMask))
18787 if (I != PoisonMaskElem)
18788 NewMask[Idx] = Mask.size();
18789 SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
18790 NewMask, CostKind);
18791 InstructionCost BVCost = TTI->getVectorInstrCost(
18792 Instruction::InsertElement, VecTy, CostKind,
18793 *find_if(Mask, [](int I) { return I != PoisonMaskElem; }),
18794 Vec, V);
18795 // Shuffle required?
18796 if (count(BVMask, PoisonMaskElem) <
18797 static_cast<int>(BVMask.size() - 1)) {
18798 SmallVector<int> NewMask(Mask.begin(), Mask.end());
18799 for (auto [Idx, I] : enumerate(BVMask))
18800 if (I != PoisonMaskElem)
18801 NewMask[Idx] = I;
18802 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
18803 VecTy, NewMask, CostKind);
18804 }
18805 return SplatCost <= BVCost;
18806 };
18807 if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
18808 for (auto [Idx, I] : enumerate(BVMask))
18809 if (I != PoisonMaskElem)
18810 Mask[Idx] = I;
18811 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
18812 } else {
18813 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
18814 SmallVector<Value *> Values(NonConstants.size(),
18815 PoisonValue::get(ScalarTy));
18816 Values[0] = V;
18817 Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
18818 SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
18819 transform(BVMask, SplatMask.begin(), [](int I) {
18820 return I == PoisonMaskElem ? PoisonMaskElem : 0;
18821 });
18822 if (!ShuffleVectorInst::isIdentityMask(SplatMask, VF))
18823 BV = CreateShuffle(BV, nullptr, SplatMask);
18824 for (auto [Idx, I] : enumerate(BVMask))
18825 if (I != PoisonMaskElem)
18826 Mask[Idx] = BVMask.size() + Idx;
18827 Vec = CreateShuffle(Vec, BV, Mask);
18828 for (auto [Idx, I] : enumerate(Mask))
18829 if (I != PoisonMaskElem)
18830 Mask[Idx] = Idx;
18831 }
18832 });
18833 } else if (!allConstant(GatheredScalars)) {
18834 // Gather unique scalars and all constants.
18835 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
18836 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
18837 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
18838 ShuffleBuilder.add(BV, ReuseMask);
18839 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
18840 SubVectorsMask);
18841 } else {
18842 // Gather all constants.
18843 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
18844 for (auto [I, V] : enumerate(GatheredScalars)) {
18845 if (!isa<PoisonValue>(V))
18846 Mask[I] = I;
18847 }
18848 Value *BV = ShuffleBuilder.gather(GatheredScalars);
18849 ShuffleBuilder.add(BV, Mask);
18850 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
18851 SubVectorsMask);
18852 }
18853
18854 if (NeedFreeze)
18855 Res = ShuffleBuilder.createFreeze(Res);
18856 return Res;
18857}
18858
18859Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
18860 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
18861 (void)vectorizeTree(VectorizableTree[EIdx].get());
18862 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
18863 Builder, *this);
18864}
18865
18866 /// \returns \p Inst after propagating metadata from \p VL, considering only
18867 /// the instructions in \p VL.
18868 static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
18869 SmallVector<Value *> Insts;
18870 for (Value *V : VL)
18871 if (isa<Instruction>(V))
18872 Insts.push_back(V);
18873 return llvm::propagateMetadata(Inst, Insts);
18874}
18875
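/// Returns the debug location of the PHI node \p PN, or an unknown location if
/// it does not have one.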
18876 static DebugLoc getDebugLocFromPHI(PHINode &PN) {
18877 if (DebugLoc DL = PN.getDebugLoc())
18878 return DL;
18879 return DebugLoc::getUnknown();
18880}
18881
18882Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
18883 IRBuilderBase::InsertPointGuard Guard(Builder);
18884
18885 Value *V = E->Scalars.front();
18886 Type *ScalarTy = V->getType();
18887 if (!isa<CmpInst>(V))
18888 ScalarTy = getValueType(V);
18889 auto It = MinBWs.find(E);
18890 if (It != MinBWs.end()) {
18891 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
18892 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
18893 if (VecTy)
18894 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
18895 }
18896 if (E->VectorizedValue)
18897 return E->VectorizedValue;
18898 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
18899 if (E->isGather()) {
18900 // Set insert point for non-reduction initial nodes.
18901 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
18902 setInsertPointAfterBundle(E);
18903 Value *Vec = createBuildVector(E, ScalarTy);
18904 E->VectorizedValue = Vec;
18905 return Vec;
18906 }
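// Split node: the scalars were vectorized as two separate sub-entries whose
// results are cast/resized as needed and then recombined with shuffles.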
18907 if (E->State == TreeEntry::SplitVectorize) {
18908 assert(E->CombinedEntriesWithIndices.size() == 2 &&
18909 "Expected exactly 2 combined entries.");
18910 setInsertPointAfterBundle(E);
18911 TreeEntry &OpTE1 =
18912 *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
18913 assert(OpTE1.isSame(
18914 ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
18915 "Expected same first part of scalars.");
18916 Value *Op1 = vectorizeTree(&OpTE1);
18917 TreeEntry &OpTE2 =
18918 *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
18919 assert(
18920 OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
18921 "Expected same second part of scalars.");
18922 Value *Op2 = vectorizeTree(&OpTE2);
18923 auto GetOperandSignedness = [&](const TreeEntry *OpE) {
18924 bool IsSigned = false;
18925 auto It = MinBWs.find(OpE);
18926 if (It != MinBWs.end())
18927 IsSigned = It->second.second;
18928 else
18929 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
18930 if (isa<PoisonValue>(R))
18931 return false;
18932 return !isKnownNonNegative(R, SimplifyQuery(*DL));
18933 });
18934 return IsSigned;
18935 };
18936 if (cast<VectorType>(Op1->getType())->getElementType() !=
18937 ScalarTy->getScalarType()) {
18938 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
18939 Op1 = Builder.CreateIntCast(
18940 Op1,
18941 getWidenedType(
18942 ScalarTy,
18943 cast<FixedVectorType>(Op1->getType())->getNumElements()),
18944 GetOperandSignedness(&OpTE1));
18945 }
18946 if (cast<VectorType>(Op2->getType())->getElementType() !=
18947 ScalarTy->getScalarType()) {
18948 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
18949 Op2 = Builder.CreateIntCast(
18950 Op2,
18951 getWidenedType(
18952 ScalarTy,
18953 cast<FixedVectorType>(Op2->getType())->getNumElements()),
18954 GetOperandSignedness(&OpTE2));
18955 }
18956 if (E->ReorderIndices.empty()) {
18957 SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
18958 std::iota(
18959 Mask.begin(),
18960 std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
18961 0);
18962 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
18963 if (ScalarTyNumElements != 1) {
18964 assert(SLPReVec && "Only supported by REVEC.");
18965 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, Mask);
18966 }
18967 Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
18968 Vec = createInsertVector(Builder, Vec, Op2,
18969 E->CombinedEntriesWithIndices.back().second *
18970 ScalarTyNumElements);
18971 E->VectorizedValue = Vec;
18972 return Vec;
18973 }
18974 unsigned CommonVF =
18975 std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
18976 if (getNumElements(Op1->getType()) != CommonVF) {
18977 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
18978 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()),
18979 0);
18980 Op1 = Builder.CreateShuffleVector(Op1, Mask);
18981 }
18982 if (getNumElements(Op2->getType()) != CommonVF) {
18983 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
18984 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()),
18985 0);
18986 Op2 = Builder.CreateShuffleVector(Op2, Mask);
18987 }
18988 Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
18989 E->VectorizedValue = Vec;
18990 return Vec;
18991 }
18992
18993 bool IsReverseOrder =
18994 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
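// Applies the entry's reordering/reuse shuffles (and any combined sub-entries)
// to the just-created vector value.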
18995 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
18996 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
18997 if (E->getOpcode() == Instruction::Store &&
18998 E->State == TreeEntry::Vectorize) {
18999 ArrayRef<int> Mask =
19000 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
19001 E->ReorderIndices.size());
19002 ShuffleBuilder.add(V, Mask);
19003 } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
19004 E->State == TreeEntry::CompressVectorize) {
19005 ShuffleBuilder.addOrdered(V, {});
19006 } else {
19007 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
19008 }
19009 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
19010 E->CombinedEntriesWithIndices.size());
19011 transform(
19012 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
19013 return std::make_pair(VectorizableTree[P.first].get(), P.second);
19014 });
19015 assert(
19016 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
19017 "Expected either combined subnodes or reordering");
19018 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
19019 };
19020
19021 assert(!E->isGather() && "Unhandled state");
19022 unsigned ShuffleOrOp =
19023 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
19024 Instruction *VL0 = E->getMainOp();
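// Returns whether the vectorized operand Idx has to be treated as signed
// (sign- rather than zero-extended) when it needs to be cast to a wider type.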
19025 auto GetOperandSignedness = [&](unsigned Idx) {
19026 const TreeEntry *OpE = getOperandEntry(E, Idx);
19027 bool IsSigned = false;
19028 auto It = MinBWs.find(OpE);
19029 if (It != MinBWs.end())
19030 IsSigned = It->second.second;
19031 else
19032 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
19033 if (isa<PoisonValue>(R))
19034 return false;
19035 return !isKnownNonNegative(R, SimplifyQuery(*DL));
19036 });
19037 return IsSigned;
19038 };
19039 switch (ShuffleOrOp) {
19040 case Instruction::PHI: {
19041 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
19042 E != VectorizableTree.front().get() || E->UserTreeIndex) &&
19043 "PHI reordering is free.");
19044 auto *PH = cast<PHINode>(VL0);
19045 Builder.SetInsertPoint(PH->getParent(),
19046 PH->getParent()->getFirstNonPHIIt());
19047 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19048 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
19049 Value *V = NewPhi;
19050
19051 // Adjust insertion point once all PHI's have been generated.
19052 Builder.SetInsertPoint(PH->getParent(),
19053 PH->getParent()->getFirstInsertionPt());
19054 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19055
19056 V = FinalShuffle(V, E);
19057
19058 E->VectorizedValue = V;
19059 // If phi node is fully emitted - exit.
19060 if (NewPhi->getNumIncomingValues() != 0)
19061 return NewPhi;
19062
19063 // PHINodes may have multiple entries from the same block. We want to
19064 // visit every block once.
19065 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
19066
19067 for (unsigned I : seq<unsigned>(PH->getNumIncomingValues())) {
19068 BasicBlock *IBB = PH->getIncomingBlock(I);
19069
19070 // Stop emission if all incoming values are generated.
19071 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
19072 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
19073 return NewPhi;
19074 }
19075
19076 if (!VisitedBBs.insert(IBB).second) {
19077 Value *VecOp = NewPhi->getIncomingValueForBlock(IBB);
19078 NewPhi->addIncoming(VecOp, IBB);
19079 TreeEntry *OpTE = getOperandEntry(E, I);
19080 assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
19081 OpTE->VectorizedValue = VecOp;
19082 continue;
19083 }
19084
19085 Builder.SetInsertPoint(IBB->getTerminator());
19086 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19087 Value *Vec = vectorizeOperand(E, I);
19088 if (VecTy != Vec->getType()) {
19089 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
19090 MinBWs.contains(getOperandEntry(E, I))) &&
19091 "Expected item in MinBWs.");
19092 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
19093 }
19094 NewPhi->addIncoming(Vec, IBB);
19095 }
19096
19097 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
19098 "Invalid number of incoming values");
19099 assert(E->VectorizedValue && "Expected vectorized value.");
19100 return E->VectorizedValue;
19101 }
19102
19103 case Instruction::ExtractElement: {
19104 Value *V = E->getSingleOperand(0);
19105 setInsertPointAfterBundle(E);
19106 V = FinalShuffle(V, E);
19107 E->VectorizedValue = V;
19108 return V;
19109 }
19110 case Instruction::ExtractValue: {
19111 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
19112 Builder.SetInsertPoint(LI);
19113 Value *Ptr = LI->getPointerOperand();
19114 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
19115 Value *NewV = ::propagateMetadata(V, E->Scalars);
19116 NewV = FinalShuffle(NewV, E);
19117 E->VectorizedValue = NewV;
19118 return NewV;
19119 }
19120 case Instruction::InsertElement: {
19121 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
19122 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
19123 Value *V = vectorizeOperand(E, 1);
19124 ArrayRef<Value *> Op = E->getOperand(1);
19125 Type *ScalarTy = Op.front()->getType();
19126 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
19127 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19128 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
19129 assert(Res.first > 0 && "Expected item in MinBWs.");
19130 V = Builder.CreateIntCast(
19131 V,
19132 getWidenedType(
19133 ScalarTy,
19134 cast<FixedVectorType>(V->getType())->getNumElements()),
19135 Res.second);
19136 }
19137
19138 // Create InsertVector shuffle if necessary
19139 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
19140 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
19141 }));
19142 const unsigned NumElts =
19143 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
19144 const unsigned NumScalars = E->Scalars.size();
19145
19146 unsigned Offset = *getElementIndex(VL0);
19147 assert(Offset < NumElts && "Failed to find vector index offset");
19148
19149 // Create shuffle to resize vector
19150 SmallVector<int> Mask;
19151 if (!E->ReorderIndices.empty()) {
19152 inversePermutation(E->ReorderIndices, Mask);
19153 Mask.append(NumElts - NumScalars, PoisonMaskElem);
19154 } else {
19155 Mask.assign(NumElts, PoisonMaskElem);
19156 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
19157 }
19158 // Create InsertVector shuffle if necessary
19159 bool IsIdentity = true;
19160 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
19161 Mask.swap(PrevMask);
19162 for (unsigned I = 0; I < NumScalars; ++I) {
19163 Value *Scalar = E->Scalars[PrevMask[I]];
19164 unsigned InsertIdx = *getElementIndex(Scalar);
19165 IsIdentity &= InsertIdx - Offset == I;
19166 Mask[InsertIdx - Offset] = I;
19167 }
19168 if (!IsIdentity || NumElts != NumScalars) {
19169 Value *V2 = nullptr;
19170 bool IsVNonPoisonous =
19171 isGuaranteedNotToBePoison(V, AC);
19172 SmallVector<int> InsertMask(Mask);
19173 if (NumElts != NumScalars && Offset == 0) {
19174 // Follow all insert element instructions from the current buildvector
19175 // sequence.
19176 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
19177 do {
19178 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
19179 if (!InsertIdx)
19180 break;
19181 if (InsertMask[*InsertIdx] == PoisonMaskElem)
19182 InsertMask[*InsertIdx] = *InsertIdx;
19183 if (!Ins->hasOneUse())
19184 break;
19185 Ins = dyn_cast_or_null<InsertElementInst>(
19186 Ins->getUniqueUndroppableUser());
19187 } while (Ins);
19188 SmallBitVector UseMask =
19189 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19190 SmallBitVector IsFirstPoison =
19191 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19192 SmallBitVector IsFirstUndef =
19193 isUndefVector(FirstInsert->getOperand(0), UseMask);
19194 if (!IsFirstPoison.all()) {
19195 unsigned Idx = 0;
19196 for (unsigned I = 0; I < NumElts; I++) {
19197 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
19198 IsFirstUndef.test(I)) {
19199 if (IsVNonPoisonous) {
19200 InsertMask[I] = I < NumScalars ? I : 0;
19201 continue;
19202 }
19203 if (!V2)
19204 V2 = UndefValue::get(V->getType());
19205 if (Idx >= NumScalars)
19206 Idx = NumScalars - 1;
19207 InsertMask[I] = NumScalars + Idx;
19208 ++Idx;
19209 } else if (InsertMask[I] != PoisonMaskElem &&
19210 Mask[I] == PoisonMaskElem) {
19211 InsertMask[I] = PoisonMaskElem;
19212 }
19213 }
19214 } else {
19215 InsertMask = Mask;
19216 }
19217 }
19218 if (!V2)
19219 V2 = PoisonValue::get(V->getType());
19220 V = Builder.CreateShuffleVector(V, V2, InsertMask);
19221 if (auto *I = dyn_cast<Instruction>(V)) {
19222 GatherShuffleExtractSeq.insert(I);
19223 CSEBlocks.insert(I->getParent());
19224 }
19225 }
19226
19227 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
19228 for (unsigned I = 0; I < NumElts; I++) {
19229 if (Mask[I] != PoisonMaskElem)
19230 InsertMask[Offset + I] = I;
19231 }
19232 SmallBitVector UseMask =
19233 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19234 SmallBitVector IsFirstUndef =
19235 isUndefVector(FirstInsert->getOperand(0), UseMask);
19236 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
19237 NumElts != NumScalars) {
19238 if (IsFirstUndef.all()) {
19239 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
19240 SmallBitVector IsFirstPoison =
19241 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19242 if (!IsFirstPoison.all()) {
19243 for (unsigned I = 0; I < NumElts; I++) {
19244 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
19245 InsertMask[I] = I + NumElts;
19246 }
19247 }
19248 V = Builder.CreateShuffleVector(
19249 V,
19250 IsFirstPoison.all() ? PoisonValue::get(V->getType())
19251 : FirstInsert->getOperand(0),
19252 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
19253 if (auto *I = dyn_cast<Instruction>(V)) {
19254 GatherShuffleExtractSeq.insert(I);
19255 CSEBlocks.insert(I->getParent());
19256 }
19257 }
19258 } else {
19259 SmallBitVector IsFirstPoison =
19260 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19261 for (unsigned I = 0; I < NumElts; I++) {
19262 if (InsertMask[I] == PoisonMaskElem)
19263 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
19264 else
19265 InsertMask[I] += NumElts;
19266 }
19267 V = Builder.CreateShuffleVector(
19268 FirstInsert->getOperand(0), V, InsertMask,
19269 cast<Instruction>(E->Scalars.back())->getName());
19270 if (auto *I = dyn_cast<Instruction>(V)) {
19271 GatherShuffleExtractSeq.insert(I);
19272 CSEBlocks.insert(I->getParent());
19273 }
19274 }
19275 }
19276
19277 ++NumVectorInstructions;
19278 E->VectorizedValue = V;
19279 return V;
19280 }
19281 case Instruction::ZExt:
19282 case Instruction::SExt:
19283 case Instruction::FPToUI:
19284 case Instruction::FPToSI:
19285 case Instruction::FPExt:
19286 case Instruction::PtrToInt:
19287 case Instruction::IntToPtr:
19288 case Instruction::SIToFP:
19289 case Instruction::UIToFP:
19290 case Instruction::Trunc:
19291 case Instruction::FPTrunc:
19292 case Instruction::BitCast: {
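// All scalar cast opcodes are handled by this one path. If either the source
// or this entry was demoted to a narrower integer type (recorded in MinBWs),
// the vector cast opcode may be rewritten below (e.g. into a Trunc, SExt/ZExt,
// or a plain BitCast when the bit widths end up equal).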
19293 setInsertPointAfterBundle(E);
19294
19295 Value *InVec = vectorizeOperand(E, 0);
19296
19297 auto *CI = cast<CastInst>(VL0);
19298 Instruction::CastOps VecOpcode = CI->getOpcode();
19299 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
19300 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
19301 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
19302 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
19303 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
19304 // Check if the values are candidates to demote.
19305 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
19306 if (SrcIt != MinBWs.end())
19307 SrcBWSz = SrcIt->second.first;
19308 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
19309 if (BWSz == SrcBWSz) {
19310 VecOpcode = Instruction::BitCast;
19311 } else if (BWSz < SrcBWSz) {
19312 VecOpcode = Instruction::Trunc;
19313 } else if (It != MinBWs.end()) {
19314 assert(BWSz > SrcBWSz && "Invalid cast!");
19315 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
19316 } else if (SrcIt != MinBWs.end()) {
19317 assert(BWSz > SrcBWSz && "Invalid cast!");
19318 VecOpcode =
19319 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
19320 }
19321 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
19322 !SrcIt->second.second) {
19323 VecOpcode = Instruction::UIToFP;
19324 }
19325 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
19326 ? InVec
19327 : Builder.CreateCast(VecOpcode, InVec, VecTy);
19328 V = FinalShuffle(V, E);
19329
19330 E->VectorizedValue = V;
19331 ++NumVectorInstructions;
19332 return V;
19333 }
19334 case Instruction::FCmp:
19335 case Instruction::ICmp: {
19336 setInsertPointAfterBundle(E);
19337
19338 Value *L = vectorizeOperand(E, 0);
19339 Value *R = vectorizeOperand(E, 1);
19340 if (L->getType() != R->getType()) {
19341 assert((getOperandEntry(E, 0)->isGather() ||
19342 getOperandEntry(E, 1)->isGather() ||
19343 MinBWs.contains(getOperandEntry(E, 0)) ||
19344 MinBWs.contains(getOperandEntry(E, 1))) &&
19345 "Expected item in MinBWs.");
19346 if (cast<VectorType>(L->getType())
19347 ->getElementType()
19348 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
19349 ->getElementType()
19350 ->getIntegerBitWidth()) {
19351 Type *CastTy = R->getType();
19352 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
19353 } else {
19354 Type *CastTy = L->getType();
19355 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
19356 }
19357 }
19358
19359 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
19360 Value *V = Builder.CreateCmp(P0, L, R);
19361 propagateIRFlags(V, E->Scalars, VL0);
19362 if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
19363 ICmp->setSameSign(/*B=*/false);
19364 // Do not cast for cmps.
19365 VecTy = cast<FixedVectorType>(V->getType());
19366 V = FinalShuffle(V, E);
19367
19368 E->VectorizedValue = V;
19369 ++NumVectorInstructions;
19370 return V;
19371 }
19372 case Instruction::Select: {
19373 setInsertPointAfterBundle(E);
19374
19375 Value *Cond = vectorizeOperand(E, 0);
19376 Value *True = vectorizeOperand(E, 1);
19377 Value *False = vectorizeOperand(E, 2);
19378 if (True->getType() != VecTy || False->getType() != VecTy) {
19379 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
19380 getOperandEntry(E, 2)->isGather() ||
19381 MinBWs.contains(getOperandEntry(E, 1)) ||
19382 MinBWs.contains(getOperandEntry(E, 2))) &&
19383 "Expected item in MinBWs.");
19384 if (True->getType() != VecTy)
19385 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
19386 if (False->getType() != VecTy)
19387 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
19388 }
19389
19390 unsigned CondNumElements = getNumElements(Cond->getType());
19391 unsigned TrueNumElements = getNumElements(True->getType());
19392 assert(TrueNumElements >= CondNumElements &&
19393 TrueNumElements % CondNumElements == 0 &&
19394 "Cannot vectorize Instruction::Select");
19395 assert(TrueNumElements == getNumElements(False->getType()) &&
19396 "Cannot vectorize Instruction::Select");
19397 if (CondNumElements != TrueNumElements) {
19398 // When the return type is i1 but the source is fixed vector type, we
19399 // need to duplicate the condition value.
19400 Cond = Builder.CreateShuffleVector(
19401 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
19402 CondNumElements));
19403 }
19404 assert(getNumElements(Cond->getType()) == TrueNumElements &&
19405 "Cannot vectorize Instruction::Select");
19406 Value *V = Builder.CreateSelect(Cond, True, False);
19407 V = FinalShuffle(V, E);
19408
19409 E->VectorizedValue = V;
19410 ++NumVectorInstructions;
19411 return V;
19412 }
19413 case Instruction::FNeg: {
19414 setInsertPointAfterBundle(E);
19415
19416 Value *Op = vectorizeOperand(E, 0);
19417
19418 Value *V = Builder.CreateUnOp(
19419 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
19420 propagateIRFlags(V, E->Scalars, VL0);
19421 if (auto *I = dyn_cast<Instruction>(V))
19422 V = ::propagateMetadata(I, E->Scalars);
19423
19424 V = FinalShuffle(V, E);
19425
19426 E->VectorizedValue = V;
19427 ++NumVectorInstructions;
19428
19429 return V;
19430 }
19431 case Instruction::Freeze: {
19432 setInsertPointAfterBundle(E);
19433
19434 Value *Op = vectorizeOperand(E, 0);
19435
19436 if (Op->getType() != VecTy) {
19437 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
19438 MinBWs.contains(getOperandEntry(E, 0))) &&
19439 "Expected item in MinBWs.");
19440 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
19441 }
19442 Value *V = Builder.CreateFreeze(Op);
19443 V = FinalShuffle(V, E);
19444
19445 E->VectorizedValue = V;
19446 ++NumVectorInstructions;
19447
19448 return V;
19449 }
19450 case Instruction::Add:
19451 case Instruction::FAdd:
19452 case Instruction::Sub:
19453 case Instruction::FSub:
19454 case Instruction::Mul:
19455 case Instruction::FMul:
19456 case Instruction::UDiv:
19457 case Instruction::SDiv:
19458 case Instruction::FDiv:
19459 case Instruction::URem:
19460 case Instruction::SRem:
19461 case Instruction::FRem:
19462 case Instruction::Shl:
19463 case Instruction::LShr:
19464 case Instruction::AShr:
19465 case Instruction::And:
19466 case Instruction::Or:
19467 case Instruction::Xor: {
19468 setInsertPointAfterBundle(E);
19469
19470 Value *LHS = vectorizeOperand(E, 0);
19471 Value *RHS = vectorizeOperand(E, 1);
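// Special case: an 'and' whose node was demoted to a narrower bit width and
// whose constant operand has at least that many trailing ones is a no-op on
// the demoted value, so the other operand can be reused directly.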
19472 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
19473 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
19474 ArrayRef<Value *> Ops = E->getOperand(I);
19475 if (all_of(Ops, [&](Value *Op) {
19476 auto *CI = dyn_cast<ConstantInt>(Op);
19477 return CI && CI->getValue().countr_one() >= It->second.first;
19478 })) {
19479 V = FinalShuffle(I == 0 ? RHS : LHS, E);
19480 E->VectorizedValue = V;
19481 ++NumVectorInstructions;
19482 return V;
19483 }
19484 }
19485 }
19486 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
19487 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
19488 getOperandEntry(E, 1)->isGather() ||
19489 MinBWs.contains(getOperandEntry(E, 0)) ||
19490 MinBWs.contains(getOperandEntry(E, 1))) &&
19491 "Expected item in MinBWs.");
19492 if (LHS->getType() != VecTy)
19493 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
19494 if (RHS->getType() != VecTy)
19495 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
19496 }
19497
19498 Value *V = Builder.CreateBinOp(
19499 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
19500 RHS);
19501 propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end());
19502 if (auto *I = dyn_cast<Instruction>(V)) {
19503 V = ::propagateMetadata(I, E->Scalars);
19504 // Drop nuw flags for abs(sub(commutative), true).
19505 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
19506 any_of(E->Scalars, [](Value *V) {
19507 return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
19508 }))
19509 I->setHasNoUnsignedWrap(/*b=*/false);
19510 }
19511
19512 V = FinalShuffle(V, E);
19513
19514 E->VectorizedValue = V;
19515 ++NumVectorInstructions;
19516
19517 return V;
19518 }
19519 case Instruction::Load: {
19520 // Loads are inserted at the head of the tree because we don't want to
19521 // sink them all the way down past store instructions.
19522 setInsertPointAfterBundle(E);
19523
19524 LoadInst *LI = cast<LoadInst>(VL0);
19525 Instruction *NewLI;
19526 FixedVectorType *StridedLoadTy = nullptr;
19527 Value *PO = LI->getPointerOperand();
19528 if (E->State == TreeEntry::Vectorize) {
19529 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
19530 } else if (E->State == TreeEntry::CompressVectorize) {
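// Compressed loads: load a wider contiguous chunk (masked if some lanes must
// not be accessed) and then shuffle the needed elements into their positions
// using CompressMask.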
19531 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
19532 CompressEntryToData.at(E);
19533 Align CommonAlignment = LI->getAlign();
19534 if (IsMasked) {
19535 unsigned VF = getNumElements(LoadVecTy);
19536 SmallVector<Constant *> MaskValues(
19537 VF / getNumElements(LI->getType()),
19538 ConstantInt::getFalse(VecTy->getContext()));
19539 for (int I : CompressMask)
19540 MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
19541 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
19542 assert(SLPReVec && "Only supported by REVEC.");
19543 MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
19544 }
19545 Constant *MaskValue = ConstantVector::get(MaskValues);
19546 NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
19547 MaskValue);
19548 } else {
19549 NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
19550 }
19551 NewLI = ::propagateMetadata(NewLI, E->Scalars);
19552 // TODO: include this cost into CommonCost.
19553 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
19554 assert(SLPReVec && "FixedVectorType is not expected.");
19555 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
19556 CompressMask);
19557 }
19558 NewLI =
19559 cast<Instruction>(Builder.CreateShuffleVector(NewLI, CompressMask));
19560 } else if (E->State == TreeEntry::StridedVectorize) {
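// Strided loads are emitted as llvm.experimental.vp.strided.load with a byte
// stride computed from the element size; the stride is negated when the
// scalars are accessed in reverse order.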
19561 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
19562 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
19563 PO = IsReverseOrder ? PtrN : Ptr0;
19564 Type *StrideTy = DL->getIndexType(PO->getType());
19565 Value *StrideVal;
19566 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
19567 StridedLoadTy = SPtrInfo.Ty;
19568 assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
19569 unsigned StridedLoadEC =
19570 StridedLoadTy->getElementCount().getKnownMinValue();
19571
19572 Value *Stride = SPtrInfo.StrideVal;
19573 if (!Stride) {
19574 const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
19575 assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
19576 SCEVExpander Expander(*SE, *DL, "strided-load-vec");
19577 Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
19578 &*Builder.GetInsertPoint());
19579 }
19580 Value *NewStride =
19581 Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
19582 StrideVal = Builder.CreateMul(
19583 NewStride, ConstantInt::get(
19584 StrideTy, (IsReverseOrder ? -1 : 1) *
19585 static_cast<int>(
19586 DL->getTypeAllocSize(ScalarTy))));
19587 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
19588 auto *Inst = Builder.CreateIntrinsic(
19589 Intrinsic::experimental_vp_strided_load,
19590 {StridedLoadTy, PO->getType(), StrideTy},
19591 {PO, StrideVal,
19592 Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)),
19593 Builder.getInt32(StridedLoadEC)});
19594 Inst->addParamAttr(
19595 /*ArgNo=*/0,
19596 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
19597 NewLI = Inst;
19598 } else {
19599 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
19600 Value *VecPtr = vectorizeOperand(E, 0);
19601 if (isa<FixedVectorType>(ScalarTy)) {
19602 assert(SLPReVec && "FixedVectorType is not expected.");
19603 // CreateMaskedGather expects VecTy and VecPtr have same size. We need
19604 // to expand VecPtr if ScalarTy is a vector type.
19605 unsigned ScalarTyNumElements =
19606 cast<FixedVectorType>(ScalarTy)->getNumElements();
19607 unsigned VecTyNumElements =
19608 cast<FixedVectorType>(VecTy)->getNumElements();
19609 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
19610 "Cannot expand getelementptr.");
19611 unsigned VF = VecTyNumElements / ScalarTyNumElements;
19612 SmallVector<Constant *> Indices(VecTyNumElements);
19613 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
19614 return Builder.getInt64(I % ScalarTyNumElements);
19615 });
19616 VecPtr = Builder.CreateGEP(
19617 VecTy->getElementType(),
19618 Builder.CreateShuffleVector(
19619 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
19620 ConstantVector::get(Indices));
19621 }
19622 // Use the minimum alignment of the gathered loads.
19623 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
19624 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
19625 }
19626 Value *V = E->State == TreeEntry::CompressVectorize
19627 ? NewLI
19628 : ::propagateMetadata(NewLI, E->Scalars);
19629
19630 V = FinalShuffle(V, E);
19631 E->VectorizedValue = V;
19632 ++NumVectorInstructions;
19633 return V;
19634 }
19635 case Instruction::Store: {
19636 auto *SI = cast<StoreInst>(VL0);
19637
19638 setInsertPointAfterBundle(E);
19639
19640 Value *VecValue = vectorizeOperand(E, 0);
19641 if (VecValue->getType() != VecTy)
19642 VecValue =
19643 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
19644 VecValue = FinalShuffle(VecValue, E);
19645
19646 Value *Ptr = SI->getPointerOperand();
19647 Instruction *ST;
19648 if (E->State == TreeEntry::Vectorize) {
19649 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
19650 } else {
19651 assert(E->State == TreeEntry::StridedVectorize &&
19652 "Expected either strided or consecutive stores.");
19653 if (!E->ReorderIndices.empty()) {
19654 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
19655 Ptr = SI->getPointerOperand();
19656 }
19657 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
19658 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
19659 auto *Inst = Builder.CreateIntrinsic(
19660 Intrinsic::experimental_vp_strided_store,
19661 {VecTy, Ptr->getType(), StrideTy},
19662 {VecValue, Ptr,
19663 ConstantInt::get(
19664 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
19665 Builder.getAllOnesMask(VecTy->getElementCount()),
19666 Builder.getInt32(E->Scalars.size())});
19667 Inst->addParamAttr(
19668 /*ArgNo=*/1,
19669 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
19670 ST = Inst;
19671 }
19672
19673 Value *V = ::propagateMetadata(ST, E->Scalars);
19674
19675 E->VectorizedValue = V;
19676 ++NumVectorInstructions;
19677 return V;
19678 }
19679 case Instruction::GetElementPtr: {
19680 auto *GEP0 = cast<GetElementPtrInst>(VL0);
19681 setInsertPointAfterBundle(E);
19682
19683 Value *Op0 = vectorizeOperand(E, 0);
19684
19685 SmallVector<Value *> OpVecs;
19686 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
19687 Value *OpVec = vectorizeOperand(E, J);
19688 OpVecs.push_back(OpVec);
19689 }
19690
19691 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
19692 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
19693 SmallVector<Value *> GEPs;
19694 for (Value *V : E->Scalars) {
19695 if (isa<GetElementPtrInst>(V))
19696 GEPs.push_back(V);
19697 }
19698 V = ::propagateMetadata(I, GEPs);
19699 }
19700
19701 V = FinalShuffle(V, E);
19702
19703 E->VectorizedValue = V;
19704 ++NumVectorInstructions;
19705
19706 return V;
19707 }
19708 case Instruction::Call: {
19709 CallInst *CI = cast<CallInst>(VL0);
19710 setInsertPointAfterBundle(E);
19711
19712 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
19713
19714 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
19715 CI, ID, VecTy->getNumElements(),
19716 It != MinBWs.end() ? It->second.first : 0, TTI);
19717 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
19718 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
19719 VecCallCosts.first <= VecCallCosts.second;
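// The call is widened either to a vector intrinsic (when its estimated cost
// does not exceed the library-call cost) or to a vector library function
// found via VFDatabase.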
19720
19721 Value *ScalarArg = nullptr;
19722 SmallVector<Value *> OpVecs;
19723 SmallVector<Type *, 2> TysForDecl;
19724 // Add return type if intrinsic is overloaded on it.
19725 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
19726 TysForDecl.push_back(VecTy);
19727 auto *CEI = cast<CallInst>(VL0);
19728 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
19729 // Some intrinsics have scalar arguments. This argument should not be
19730 // vectorized.
19731 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
19732 ScalarArg = CEI->getArgOperand(I);
19733 // If we decided to reduce the bitwidth of the abs intrinsic, its second
19734 // argument must be set to false (do not return poison if the value is signed min).
19735 if (ID == Intrinsic::abs && It != MinBWs.end() &&
19736 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
19737 ScalarArg = Builder.getFalse();
19738 OpVecs.push_back(ScalarArg);
19739 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
19740 TysForDecl.push_back(ScalarArg->getType());
19741 continue;
19742 }
19743
19744 Value *OpVec = vectorizeOperand(E, I);
19745 ScalarArg = CEI->getArgOperand(I);
19746 if (cast<VectorType>(OpVec->getType())->getElementType() !=
19747 ScalarArg->getType()->getScalarType() &&
19748 It == MinBWs.end()) {
19749 auto *CastTy =
19750 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
19751 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
19752 } else if (It != MinBWs.end()) {
19753 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
19754 }
19755 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
19756 OpVecs.push_back(OpVec);
19757 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
19758 TysForDecl.push_back(OpVec->getType());
19759 }
19760
19761 Function *CF;
19762 if (!UseIntrinsic) {
19763 VFShape Shape =
19764 VFShape::get(CI->getFunctionType(),
19765 ElementCount::getFixed(VecTy->getNumElements()),
19766 false /*HasGlobalPred*/);
19767 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
19768 } else {
19769 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
19770 }
19771
19772 SmallVector<OperandBundleDef, 1> OpBundles;
19773 CI->getOperandBundlesAsDefs(OpBundles);
19774 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
19775
19776 propagateIRFlags(V, E->Scalars, VL0);
19777 V = FinalShuffle(V, E);
19778
19779 E->VectorizedValue = V;
19780 ++NumVectorInstructions;
19781 return V;
19782 }
19783 case Instruction::ShuffleVector: {
19784 Value *V;
19785 if (SLPReVec && !E->isAltShuffle()) {
19786 setInsertPointAfterBundle(E);
19787 Value *Src = vectorizeOperand(E, 0);
19788 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
19789 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
19790 SmallVector<int> NewMask(ThisMask.size());
19791 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
19792 return SVSrc->getShuffleMask()[Mask];
19793 });
19794 V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
19795 SVSrc->getOperand(1), NewMask);
19796 } else {
19797 V = Builder.CreateShuffleVector(Src, ThisMask);
19798 }
19799 propagateIRFlags(V, E->Scalars, VL0);
19800 if (auto *I = dyn_cast<Instruction>(V))
19801 V = ::propagateMetadata(I, E->Scalars);
19802 V = FinalShuffle(V, E);
19803 } else {
19804 assert(E->isAltShuffle() &&
19805 ((Instruction::isBinaryOp(E->getOpcode()) &&
19806 Instruction::isBinaryOp(E->getAltOpcode())) ||
19807 (Instruction::isCast(E->getOpcode()) &&
19808 Instruction::isCast(E->getAltOpcode())) ||
19809 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
19810 "Invalid Shuffle Vector Operand");
19811
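// Alternate-opcode nodes emit both the main and the alternate vector
// operation and then blend the two results with a shuffle built by
// buildAltOpShuffleMask.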
19812 Value *LHS = nullptr, *RHS = nullptr;
19813 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
19814 setInsertPointAfterBundle(E);
19815 LHS = vectorizeOperand(E, 0);
19816 RHS = vectorizeOperand(E, 1);
19817 } else {
19818 setInsertPointAfterBundle(E);
19819 LHS = vectorizeOperand(E, 0);
19820 }
19821 if (LHS && RHS &&
19822 ((Instruction::isBinaryOp(E->getOpcode()) &&
19823 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
19824 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
19825 assert((It != MinBWs.end() ||
19826 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
19827 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
19828 MinBWs.contains(getOperandEntry(E, 0)) ||
19829 MinBWs.contains(getOperandEntry(E, 1))) &&
19830 "Expected item in MinBWs.");
19831 Type *CastTy = VecTy;
19832 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
19833 if (cast<VectorType>(LHS->getType())
19834 ->getElementType()
19835 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
19836 ->getElementType()
19837 ->getIntegerBitWidth())
19838 CastTy = RHS->getType();
19839 else
19840 CastTy = LHS->getType();
19841 }
19842 if (LHS->getType() != CastTy)
19843 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
19844 if (RHS->getType() != CastTy)
19845 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
19846 }
19847
19848 Value *V0, *V1;
19849 if (Instruction::isBinaryOp(E->getOpcode())) {
19850 V0 = Builder.CreateBinOp(
19851 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
19852 V1 = Builder.CreateBinOp(
19853 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
19854 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
19855 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
19856 auto *AltCI = cast<CmpInst>(E->getAltOp());
19857 CmpInst::Predicate AltPred = AltCI->getPredicate();
19858 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
19859 } else {
19860 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
19861 unsigned SrcBWSz = DL->getTypeSizeInBits(
19862 cast<VectorType>(LHS->getType())->getElementType());
19863 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
19864 if (BWSz <= SrcBWSz) {
19865 if (BWSz < SrcBWSz)
19866 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
19867 assert(LHS->getType() == VecTy &&
19868 "Expected same type as operand.");
19869 if (auto *I = dyn_cast<Instruction>(LHS))
19870 LHS = ::propagateMetadata(I, E->Scalars);
19871 LHS = FinalShuffle(LHS, E);
19872 E->VectorizedValue = LHS;
19873 ++NumVectorInstructions;
19874 return LHS;
19875 }
19876 }
19877 V0 = Builder.CreateCast(
19878 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
19879 V1 = Builder.CreateCast(
19880 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
19881 }
19882 // Add V0 and V1 to later analysis to try to find and remove matching
19883 // instruction, if any.
19884 for (Value *V : {V0, V1}) {
19885 if (auto *I = dyn_cast<Instruction>(V)) {
19886 GatherShuffleExtractSeq.insert(I);
19887 CSEBlocks.insert(I->getParent());
19888 }
19889 }
19890
19891 // Create shuffle to take alternate operations from the vector.
19892 // Also, gather up main and alt scalar ops to propagate IR flags to
19893 // each vector operation.
19894 ValueList OpScalars, AltScalars;
19895 SmallVector<int> Mask;
19896 E->buildAltOpShuffleMask(
19897 [E, this](Instruction *I) {
19898 assert(E->getMatchingMainOpOrAltOp(I) &&
19899 "Unexpected main/alternate opcode");
19900 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
19901 *TLI);
19902 },
19903 Mask, &OpScalars, &AltScalars);
19904
19905 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
19906 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
19907 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
19908 // Drop nuw flags for abs(sub(commutative), true).
19909 if (auto *I = dyn_cast<Instruction>(Vec);
19910 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
19911 any_of(E->Scalars, [](Value *V) {
19912 if (isa<PoisonValue>(V))
19913 return false;
19914 auto *IV = cast<Instruction>(V);
19915 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
19916 }))
19917 I->setHasNoUnsignedWrap(/*b=*/false);
19918 };
19919 DropNuwFlag(V0, E->getOpcode());
19920 DropNuwFlag(V1, E->getAltOpcode());
19921
19922 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
19923 assert(SLPReVec && "FixedVectorType is not expected.");
19924 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
19925 }
19926 V = Builder.CreateShuffleVector(V0, V1, Mask);
19927 if (auto *I = dyn_cast<Instruction>(V)) {
19928 V = ::propagateMetadata(I, E->Scalars);
19929 GatherShuffleExtractSeq.insert(I);
19930 CSEBlocks.insert(I->getParent());
19931 }
19932 }
19933
19934 E->VectorizedValue = V;
19935 ++NumVectorInstructions;
19936
19937 return V;
19938 }
19939 default:
19940 llvm_unreachable("unknown inst");
19941 }
19942 return nullptr;
19943}
19944
19945 Value *BoUpSLP::vectorizeTree() {
19946 ExtraValueToDebugLocsMap ExternallyUsedValues;
19947 return vectorizeTree(ExternallyUsedValues);
19948}
19949
19950 Value *BoUpSLP::vectorizeTree(
19951 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
19952 Instruction *ReductionRoot,
19953 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
19954 // Clear the Entry-to-LastInstruction table. It can be affected by scheduling,
19955 // so it needs to be rebuilt.
19956 EntryToLastInstruction.clear();
19957 // All blocks must be scheduled before any instructions are inserted.
19958 for (auto &BSIter : BlocksSchedules)
19959 scheduleBlock(*this, BSIter.second.get());
19960 // Cache last instructions for the nodes to avoid side effects, which may
19961 // appear during vectorization, like extra uses, etc.
19962 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
19963 if (TE->isGather())
19964 continue;
19965 (void)getLastInstructionInBundle(TE.get());
19966 }
19967
19968 if (ReductionRoot)
19969 Builder.SetInsertPoint(ReductionRoot->getParent(),
19970 ReductionRoot->getIterator());
19971 else
19972 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
19973
19974 // Vectorize gather operands of the nodes with the external uses only.
19975 SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries;
19976 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
19977 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
19978 TE->UserTreeIndex.UserTE->hasState() &&
19979 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
19980 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
19981 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
19982 !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
19983 all_of(TE->UserTreeIndex.UserTE->Scalars,
19984 [](Value *V) { return isUsedOutsideBlock(V); })) {
19985 Instruction &LastInst =
19986 getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
19987 GatherEntries.emplace_back(TE.get(), &LastInst);
19988 }
19989 }
19990 for (auto &Entry : GatherEntries) {
19991 IRBuilderBase::InsertPointGuard Guard(Builder);
19992 Builder.SetInsertPoint(Entry.second);
19993 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
19994 (void)vectorizeTree(Entry.first);
19995 }
19996 // Emit gathered loads first to emit better code for the users of those
19997 // gathered loads.
19998 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
19999 if (GatheredLoadsEntriesFirst.has_value() &&
20000 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
20001 (!TE->isGather() || TE->UserTreeIndex)) {
20002 assert((TE->UserTreeIndex ||
20003 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
20004 "Expected gathered load node.");
20005 (void)vectorizeTree(TE.get());
20006 }
20007 }
20008 (void)vectorizeTree(VectorizableTree[0].get());
20009 // Run through the list of postponed gathers and emit them, replacing the temp
20010 // emitted allocas with actual vector instructions.
20011 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
20012 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
20013 for (const TreeEntry *E : PostponedNodes) {
20014 auto *TE = const_cast<TreeEntry *>(E);
20015 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
20016 TE->VectorizedValue = nullptr;
20017 auto *UserI = cast<Instruction>(TE->UserTreeIndex.UserTE->VectorizedValue);
20018 // If the user is a PHI node, its vector code has to be inserted right before
20019 // the block terminator. Since the node was delayed, there were some unresolved
20020 // dependencies at the moment when the stub instruction was emitted. If any of
20021 // these dependencies turn out to be an operand of another PHI coming from this
20022 // same block, the position of the stub instruction becomes invalid. This is
20023 // because the source vector that is supposed to feed this gather node was
20024 // inserted at the end of the block [after the stub instruction]. So we need
20025 // to adjust the insertion point again to the end of the block.
20026 if (isa<PHINode>(UserI)) {
20027 // Insert before all users.
20028 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
20029 for (User *U : PrevVec->users()) {
20030 if (U == UserI)
20031 continue;
20032 auto *UI = dyn_cast<Instruction>(U);
20033 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
20034 continue;
20035 if (UI->comesBefore(InsertPt))
20036 InsertPt = UI;
20037 }
20038 Builder.SetInsertPoint(InsertPt);
20039 } else {
20040 Builder.SetInsertPoint(PrevVec);
20041 }
20042 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
20043 Value *Vec = vectorizeTree(TE);
20044 if (auto *VecI = dyn_cast<Instruction>(Vec);
20045 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
20046 Builder.GetInsertPoint()->comesBefore(VecI))
20047 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
20048 Builder.GetInsertPoint());
20049 if (Vec->getType() != PrevVec->getType()) {
20050 assert(Vec->getType()->isIntOrIntVectorTy() &&
20051 PrevVec->getType()->isIntOrIntVectorTy() &&
20052 "Expected integer vector types only.");
20053 std::optional<bool> IsSigned;
20054 for (Value *V : TE->Scalars) {
20055 if (isVectorized(V)) {
20056 for (const TreeEntry *MNTE : getTreeEntries(V)) {
20057 auto It = MinBWs.find(MNTE);
20058 if (It != MinBWs.end()) {
20059 IsSigned = IsSigned.value_or(false) || It->second.second;
20060 if (*IsSigned)
20061 break;
20062 }
20063 }
20064 if (IsSigned.value_or(false))
20065 break;
20066 // Scan through gather nodes.
20067 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
20068 auto It = MinBWs.find(BVE);
20069 if (It != MinBWs.end()) {
20070 IsSigned = IsSigned.value_or(false) || It->second.second;
20071 if (*IsSigned)
20072 break;
20073 }
20074 }
20075 if (IsSigned.value_or(false))
20076 break;
20077 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
20078 IsSigned =
20079 IsSigned.value_or(false) ||
20080 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
20081 continue;
20082 }
20083 if (IsSigned.value_or(false))
20084 break;
20085 }
20086 }
20087 if (IsSigned.value_or(false)) {
20088 // Final attempt - check user node.
20089 auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
20090 if (It != MinBWs.end())
20091 IsSigned = It->second.second;
20092 }
20093 assert(IsSigned &&
20094 "Expected user node or perfect diamond match in MinBWs.");
20095 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
20096 }
20097 PrevVec->replaceAllUsesWith(Vec);
20098 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
20099 // Replace the stub vector node, if it was used before for one of the
20100 // buildvector nodes already.
20101 auto It = PostponedValues.find(PrevVec);
20102 if (It != PostponedValues.end()) {
20103 for (TreeEntry *VTE : It->getSecond())
20104 VTE->VectorizedValue = Vec;
20105 }
20106 eraseInstruction(PrevVec);
20107 }
20108
20109 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
20110 << " values .\n");
20111
20112 SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
20113 // Maps vector instruction to original insertelement instruction
20114 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
20115 // Maps extract Scalar to the corresponding extractelement instruction in the
20116 // basic block. Only one extractelement per block should be emitted.
20117 SmallDenseMap<Value *, SmallDenseMap<BasicBlock *, std::pair<Value *, Value *>>>
20118 ScalarToEEs;
20119 SmallDenseSet<Value *, 4> UsedInserts;
20120 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
20121 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
20122 SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
20123 // Extract all of the elements with the external uses.
20124 for (const auto &ExternalUse : ExternalUses) {
20125 Value *Scalar = ExternalUse.Scalar;
20126 llvm::User *User = ExternalUse.User;
20127
20128 // Skip users that we already RAUW. This happens when one instruction
20129 // has multiple uses of the same value.
20130 if (User && !is_contained(Scalar->users(), User))
20131 continue;
20132 const TreeEntry *E = &ExternalUse.E;
20133 assert(E && "Invalid scalar");
20134 assert(!E->isGather() && "Extracting from a gather list");
20135 // Non-instruction pointers are not deleted, just skip them.
20136 if (E->getOpcode() == Instruction::GetElementPtr &&
20137 !isa<GetElementPtrInst>(Scalar))
20138 continue;
20139
20140 Value *Vec = E->VectorizedValue;
20141 assert(Vec && "Can't find vectorizable value");
20142
20143 Value *Lane = Builder.getInt32(ExternalUse.Lane);
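// Emits (or reuses) an extractelement for Scalar from its vectorized value
// and, if the vector was emitted in a narrower integer type, extends the
// extracted value back to the original scalar type.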
20144 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
20145 if (Scalar->getType() != Vec->getType()) {
20146 Value *Ex = nullptr;
20147 Value *ExV = nullptr;
20148 auto *Inst = dyn_cast<Instruction>(Scalar);
20149 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
20150 auto It = ScalarToEEs.find(Scalar);
20151 if (It != ScalarToEEs.end()) {
20152 // No need to emit many extracts, just move the only one in the
20153 // current block.
20154 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
20155 : Builder.GetInsertBlock());
20156 if (EEIt != It->second.end()) {
20157 Value *PrevV = EEIt->second.first;
20158 if (auto *I = dyn_cast<Instruction>(PrevV);
20159 I && !ReplaceInst &&
20160 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
20161 Builder.GetInsertPoint()->comesBefore(I)) {
20162 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
20163 Builder.GetInsertPoint());
20164 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
20165 CI->moveAfter(I);
20166 }
20167 Ex = PrevV;
20168 ExV = EEIt->second.second ? EEIt->second.second : Ex;
20169 }
20170 }
20171 if (!Ex) {
20172 // "Reuse" the existing extract to improve final codegen.
20173 if (ReplaceInst) {
20174 // Leave the instruction as is if it is a cheaper extract and all its
20175 // operands are scalar.
20176 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
20177 IgnoredExtracts.insert(EE);
20178 Ex = EE;
20179 } else {
20180 auto *CloneInst = Inst->clone();
20181 CloneInst->insertBefore(Inst->getIterator());
20182 if (Inst->hasName())
20183 CloneInst->takeName(Inst);
20184 Ex = CloneInst;
20185 }
20186 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
20187 ES && isa<Instruction>(Vec)) {
20188 Value *V = ES->getVectorOperand();
20189 auto *IVec = cast<Instruction>(Vec);
20190 if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
20191 V = ETEs.front()->VectorizedValue;
20192 if (auto *IV = dyn_cast<Instruction>(V);
20193 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
20194 IV->comesBefore(IVec))
20195 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
20196 else
20197 Ex = Builder.CreateExtractElement(Vec, Lane);
20198 } else if (auto *VecTy =
20199 dyn_cast<FixedVectorType>(Scalar->getType())) {
20200 assert(SLPReVec && "FixedVectorType is not expected.");
20201 unsigned VecTyNumElements = VecTy->getNumElements();
20202 // When REVEC is enabled, we need to extract a vector.
20203 // Note: The element size of Scalar may be different from the
20204 // element size of Vec.
20205 Ex = createExtractVector(Builder, Vec, VecTyNumElements,
20206 ExternalUse.Lane * VecTyNumElements);
20207 } else {
20208 Ex = Builder.CreateExtractElement(Vec, Lane);
20209 }
20210 // If necessary, sign-extend or zero-extend ScalarRoot
20211 // to the larger type.
20212 ExV = Ex;
20213 if (Scalar->getType() != Ex->getType())
20214 ExV = Builder.CreateIntCast(
20215 Ex, Scalar->getType(),
20216 !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
20217 auto *I = dyn_cast<Instruction>(Ex);
20218 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
20219 : &F->getEntryBlock(),
20220 std::make_pair(Ex, ExV));
20221 }
20222 // The 'then' branch of the previous 'if' may produce constants, since
20223 // operand 0 might be a constant.
20224 if (auto *ExI = dyn_cast<Instruction>(Ex);
20225 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
20226 GatherShuffleExtractSeq.insert(ExI);
20227 CSEBlocks.insert(ExI->getParent());
20228 }
20229 return ExV;
20230 }
20231 assert(isa<FixedVectorType>(Scalar->getType()) &&
20232 isa<InsertElementInst>(Scalar) &&
20233 "In-tree scalar of vector type is not insertelement?");
20234 auto *IE = cast<InsertElementInst>(Scalar);
20235 VectorToInsertElement.try_emplace(Vec, IE);
20236 return Vec;
20237 };
20238 // If User == nullptr, the Scalar remains as scalar in vectorized
20239 // instructions or is used as extra arg. Generate ExtractElement instruction
20240 // and update the record for this scalar in ExternallyUsedValues.
20241 if (!User) {
20242 if (!ScalarsWithNullptrUser.insert(Scalar).second)
20243 continue;
20244 assert(
20245 (ExternallyUsedValues.count(Scalar) ||
20246 ExternalUsesWithNonUsers.count(Scalar) ||
20247 ExternalUsesAsOriginalScalar.contains(Scalar) ||
20248 any_of(
20249 Scalar->users(),
20250 [&, TTI = TTI](llvm::User *U) {
20251 if (ExternalUsesAsOriginalScalar.contains(U))
20252 return true;
20253 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
20254 return !UseEntries.empty() &&
20255 (E->State == TreeEntry::Vectorize ||
20256 E->State == TreeEntry::StridedVectorize ||
20257 E->State == TreeEntry::CompressVectorize) &&
20258 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
20259 return (UseEntry->State == TreeEntry::Vectorize ||
20260 UseEntry->State ==
20261 TreeEntry::StridedVectorize ||
20262 UseEntry->State ==
20263 TreeEntry::CompressVectorize) &&
20264 doesInTreeUserNeedToExtract(
20265 Scalar, getRootEntryInstruction(*UseEntry),
20266 TLI, TTI);
20267 });
20268 })) &&
20269 "Scalar with nullptr User must be registered in "
20270 "ExternallyUsedValues map or remain as scalar in vectorized "
20271 "instructions");
20272 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
20273 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
20274 if (PHI->getParent()->isLandingPad())
20275 Builder.SetInsertPoint(
20276 PHI->getParent(),
20277 std::next(
20278 PHI->getParent()->getLandingPadInst()->getIterator()));
20279 else
20280 Builder.SetInsertPoint(PHI->getParent(),
20281 PHI->getParent()->getFirstNonPHIIt());
20282 } else {
20283 Builder.SetInsertPoint(VecI->getParent(),
20284 std::next(VecI->getIterator()));
20285 }
20286 } else {
20287 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20288 }
20289 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20290 // Required to update internally referenced instructions.
20291 if (Scalar != NewInst) {
20292 assert((!isa<ExtractElementInst>(Scalar) ||
20293 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
20294 "Extractelements should not be replaced.");
20295 Scalar->replaceAllUsesWith(NewInst);
20296 }
20297 continue;
20298 }
20299
20300 if (auto *VU = dyn_cast<InsertElementInst>(User);
20301 VU && VU->getOperand(1) == Scalar) {
20302 // Skip if the scalar is another vector op or Vec is not an instruction.
20303 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
20304 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
20305 if (!UsedInserts.insert(VU).second)
20306 continue;
20307 // Need to use original vector, if the root is truncated.
20308 auto BWIt = MinBWs.find(E);
20309 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
20310 auto *ScalarTy = FTy->getElementType();
20311 auto Key = std::make_pair(Vec, ScalarTy);
20312 auto VecIt = VectorCasts.find(Key);
20313 if (VecIt == VectorCasts.end()) {
20314 IRBuilderBase::InsertPointGuard Guard(Builder);
20315 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
20316 if (IVec->getParent()->isLandingPad())
20317 Builder.SetInsertPoint(IVec->getParent(),
20318 std::next(IVec->getParent()
20319 ->getLandingPadInst()
20320 ->getIterator()));
20321 else
20322 Builder.SetInsertPoint(
20323 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
20324 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
20325 Builder.SetInsertPoint(IVec->getNextNode());
20326 }
20327 Vec = Builder.CreateIntCast(
20328 Vec,
20329 getWidenedType(
20330 ScalarTy,
20331 cast<FixedVectorType>(Vec->getType())->getNumElements()),
20332 BWIt->second.second);
20333 VectorCasts.try_emplace(Key, Vec);
20334 } else {
20335 Vec = VecIt->second;
20336 }
20337 }
20338
20339 std::optional<unsigned> InsertIdx = getElementIndex(VU);
20340 if (InsertIdx) {
20341 auto *It = find_if(
20342 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
20343 // Checks if 2 insertelements are from the same buildvector.
20344 InsertElementInst *VecInsert = Data.InsertElements.front();
20345 return areTwoInsertFromSameBuildVector(
20346 VU, VecInsert,
20347 [](InsertElementInst *II) { return II->getOperand(0); });
20348 });
20349 unsigned Idx = *InsertIdx;
20350 if (It == ShuffledInserts.end()) {
20351 (void)ShuffledInserts.emplace_back();
20352 It = std::next(ShuffledInserts.begin(),
20353 ShuffledInserts.size() - 1);
20354 }
20355 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
20356 if (Mask.empty())
20357 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
20358 Mask[Idx] = ExternalUse.Lane;
20359 It->InsertElements.push_back(cast<InsertElementInst>(User));
20360 continue;
20361 }
20362 }
20363 }
20364 }
20365
20366 // Generate extracts for out-of-tree users.
20367 // Find the insertion point for the extractelement lane.
20368 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
20369 if (PHINode *PH = dyn_cast<PHINode>(User)) {
20370 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
20371 if (PH->getIncomingValue(I) == Scalar) {
20372 Instruction *IncomingTerminator =
20373 PH->getIncomingBlock(I)->getTerminator();
20374 if (isa<CatchSwitchInst>(IncomingTerminator)) {
20375 Builder.SetInsertPoint(VecI->getParent(),
20376 std::next(VecI->getIterator()));
20377 } else {
20378 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
20379 }
20380 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20381 PH->setOperand(I, NewInst);
20382 }
20383 }
20384 } else {
20385 Builder.SetInsertPoint(cast<Instruction>(User));
20386 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20387 User->replaceUsesOfWith(Scalar, NewInst);
20388 }
20389 } else {
20390 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20391 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20392 User->replaceUsesOfWith(Scalar, NewInst);
20393 }
20394
20395 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
20396 }
20397
20398 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
20399 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
20400 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
20401 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
20402 for (int I = 0, E = Mask.size(); I < E; ++I) {
20403 if (Mask[I] < VF)
20404 CombinedMask1[I] = Mask[I];
20405 else
20406 CombinedMask2[I] = Mask[I] - VF;
20407 }
20408 ShuffleInstructionBuilder ShuffleBuilder(
20409 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
20410 ShuffleBuilder.add(V1, CombinedMask1);
20411 if (V2)
20412 ShuffleBuilder.add(V2, CombinedMask2);
20413 return ShuffleBuilder.finalize({}, {}, {});
20414 };
20415
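// ResizeToVF adapts a vectorized value to the vector factor expected by the
// insertelement chain, shuffling in the referenced lanes or resizing with a
// poison-padded mask where needed.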
20416 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
20417 bool ForSingleMask) {
20418 unsigned VF = Mask.size();
20419 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
20420 if (VF != VecVF) {
20421 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
20422 Vec = CreateShuffle(Vec, nullptr, Mask);
20423 return std::make_pair(Vec, true);
20424 }
20425 if (!ForSingleMask) {
20426 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
20427 for (unsigned I = 0; I < VF; ++I) {
20428 if (Mask[I] != PoisonMaskElem)
20429 ResizeMask[Mask[I]] = Mask[I];
20430 }
20431 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
20432 }
20433 }
20434
20435 return std::make_pair(Vec, false);
20436 };
20437 // Perform shuffling of the vectorize tree entries for better handling of
20438 // external extracts.
20439 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
20440 // Find the first and the last instruction in the list of insertelements.
20441 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
20442 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
20443 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
20444 Builder.SetInsertPoint(LastInsert);
20445 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
20446 Value *NewInst = performExtractsShuffleAction<Value>(
20447 MutableArrayRef(Vector.data(), Vector.size()),
20448 FirstInsert->getOperand(0),
20449 [](Value *Vec) {
20450 return cast<VectorType>(Vec->getType())
20451 ->getElementCount()
20452 .getKnownMinValue();
20453 },
20454 ResizeToVF,
20455 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
20456 ArrayRef<Value *> Vals) {
20457 assert((Vals.size() == 1 || Vals.size() == 2) &&
20458 "Expected exactly 1 or 2 input values.");
20459 if (Vals.size() == 1) {
20460 // Do not create shuffle if the mask is a simple identity
20461 // non-resizing mask.
20462 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
20463 ->getNumElements() ||
20464 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
20465 return CreateShuffle(Vals.front(), nullptr, Mask);
20466 return Vals.front();
20467 }
20468 return CreateShuffle(Vals.front() ? Vals.front()
20469 : FirstInsert->getOperand(0),
20470 Vals.back(), Mask);
20471 });
20472 auto It = ShuffledInserts[I].InsertElements.rbegin();
20473 // Rebuild buildvector chain.
20474 InsertElementInst *II = nullptr;
20475 if (It != ShuffledInserts[I].InsertElements.rend())
20476 II = *It;
20477 SmallVector<Instruction *> Inserts;
20478 while (It != ShuffledInserts[I].InsertElements.rend()) {
20479 assert(II && "Must be an insertelement instruction.");
20480 if (*It == II)
20481 ++It;
20482 else
20483 Inserts.push_back(cast<Instruction>(II));
20484 II = dyn_cast<InsertElementInst>(II->getOperand(0));
20485 }
20486 for (Instruction *II : reverse(Inserts)) {
20487 II->replaceUsesOfWith(II->getOperand(0), NewInst);
20488 if (auto *NewI = dyn_cast<Instruction>(NewInst))
20489 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
20490 II->moveAfter(NewI);
20491 NewInst = II;
20492 }
20493 LastInsert->replaceAllUsesWith(NewInst);
20494 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
20495 IE->replaceUsesOfWith(IE->getOperand(0),
20496 PoisonValue::get(IE->getOperand(0)->getType()));
20497 IE->replaceUsesOfWith(IE->getOperand(1),
20498 PoisonValue::get(IE->getOperand(1)->getType()));
20499 eraseInstruction(IE);
20500 }
20501 CSEBlocks.insert(LastInsert->getParent());
20502 }
20503
20504 SmallVector<Instruction *> RemovedInsts;
20505 // For each vectorized value:
20506 for (auto &TEPtr : VectorizableTree) {
20507 TreeEntry *Entry = TEPtr.get();
20508
20509 // No need to handle users of gathered values.
20510 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
20511 continue;
20512
20513 assert(Entry->VectorizedValue && "Can't find vectorizable value");
20514
20515 // For each lane:
20516 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
20517 Value *Scalar = Entry->Scalars[Lane];
20518
20519 if (Entry->getOpcode() == Instruction::GetElementPtr &&
20520 !isa<GetElementPtrInst>(Scalar))
20521 continue;
20522 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
20523 EE && IgnoredExtracts.contains(EE))
20524 continue;
20525 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
20526 continue;
20527#ifndef NDEBUG
20528 Type *Ty = Scalar->getType();
20529 if (!Ty->isVoidTy()) {
20530 for (User *U : Scalar->users()) {
20531 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
20532
20533 // It is legal to delete users in the ignorelist.
20534 assert((isVectorized(U) ||
20535 (UserIgnoreList && UserIgnoreList->contains(U)) ||
20538 "Deleting out-of-tree value");
20539 }
20540 }
20541#endif
20542 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
20543 auto *I = cast<Instruction>(Scalar);
20544 RemovedInsts.push_back(I);
20545 }
20546 }
20547
20548 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
20549 // new vector instruction.
20550 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
20551 V->mergeDIAssignID(RemovedInsts);
20552
20553 // Clear up reduction references, if any.
20554 if (UserIgnoreList) {
20555 for (Instruction *I : RemovedInsts) {
20556 const TreeEntry *IE = getTreeEntries(I).front();
20557 if (IE->Idx != 0 &&
20558 !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
20559 (ValueToGatherNodes.lookup(I).contains(
20560 VectorizableTree.front().get()) ||
20561 (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
20562 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
20563 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
20564 IE->UserTreeIndex &&
20565 is_contained(VectorizableTree.front()->Scalars, I)) &&
20566 !(GatheredLoadsEntriesFirst.has_value() &&
20567 IE->Idx >= *GatheredLoadsEntriesFirst &&
20568 VectorizableTree.front()->isGather() &&
20569 is_contained(VectorizableTree.front()->Scalars, I)) &&
20570 !(!VectorizableTree.front()->isGather() &&
20571 VectorizableTree.front()->isCopyableElement(I)))
20572 continue;
20573 SmallVector<SelectInst *> LogicalOpSelects;
20574 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
20575 // Do not replace condition of the logical op in form select <cond>.
20576 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
20577 (match(U.getUser(), m_LogicalAnd()) ||
20578 match(U.getUser(), m_LogicalOr())) &&
20579 U.getOperandNo() == 0;
20580 if (IsPoisoningLogicalOp) {
20581 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
20582 return false;
20583 }
20584 return UserIgnoreList->contains(U.getUser());
20585 });
20586 // Replace conditions of the poisoning logical ops with the non-poison
20587 // constant value.
20588 for (SelectInst *SI : LogicalOpSelects)
20589 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
20590 }
20591 }
20592 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
20593 // cache correctness.
20594 // NOTE: removeInstructionAndOperands only marks the instruction for deletion
20595 // - instructions are not deleted until later.
20596 removeInstructionsAndOperands(ArrayRef(RemovedInsts), VectorValuesAndScales);
20597
20598 Builder.ClearInsertionPoint();
20599 InstrElementSize.clear();
20600
20601 const TreeEntry &RootTE = *VectorizableTree.front();
20602 Value *Vec = RootTE.VectorizedValue;
20603 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
20604 It != MinBWs.end() &&
20605 ReductionBitWidth != It->second.first) {
20606 IRBuilder<>::InsertPointGuard Guard(Builder);
20607 Builder.SetInsertPoint(ReductionRoot->getParent(),
20608 ReductionRoot->getIterator());
20609 Vec = Builder.CreateIntCast(
20610 Vec,
20611 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
20612 cast<VectorType>(Vec->getType())->getElementCount()),
20613 It->second.second);
20614 }
20615 return Vec;
20616}
20617
20619 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
20620 << " gather sequences instructions.\n");
20621 // LICM InsertElementInst sequences.
20622 for (Instruction *I : GatherShuffleExtractSeq) {
20623 if (isDeleted(I))
20624 continue;
20625
20626 // Check if this block is inside a loop.
20627 Loop *L = LI->getLoopFor(I->getParent());
20628 if (!L)
20629 continue;
20630
20631 // Check if it has a preheader.
20632 BasicBlock *PreHeader = L->getLoopPreheader();
20633 if (!PreHeader)
20634 continue;
20635
20636 // If the vector or the element that we insert into it are
20637 // instructions that are defined in this basic block then we can't
20638 // hoist this instruction.
20639 if (any_of(I->operands(), [L](Value *V) {
20640 auto *OpI = dyn_cast<Instruction>(V);
20641 return OpI && L->contains(OpI);
20642 }))
20643 continue;
20644
20645 // We can hoist this instruction. Move it to the pre-header.
20646 I->moveBefore(PreHeader->getTerminator()->getIterator());
20647 CSEBlocks.insert(PreHeader);
20648 }
20649
20650 // Make a list of all reachable blocks in our CSE queue.
20651 SmallVector<const DomTreeNode *, 8> CSEWorkList;
20652 CSEWorkList.reserve(CSEBlocks.size());
20653 for (BasicBlock *BB : CSEBlocks)
20654 if (DomTreeNode *N = DT->getNode(BB)) {
20655 assert(DT->isReachableFromEntry(N));
20656 CSEWorkList.push_back(N);
20657 }
20658
20659 // Sort blocks by domination. This ensures we visit a block after all blocks
20660 // dominating it are visited.
20661 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
20662 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
20663 "Different nodes should have different DFS numbers");
20664 return A->getDFSNumIn() < B->getDFSNumIn();
20665 });
20666
20667 // Less defined shuffles can be replaced by the more defined copies.
20668 // Between two shuffles one is less defined if it has the same vector operands
20669 // and its mask indices are the same as in the first one or undefs. E.g.
20670 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
20671 // poison, <0, 0, 0, 0>.
20672 auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
20673 Instruction *I2,
20674 SmallVectorImpl<int> &NewMask) {
20675 if (I1->getType() != I2->getType())
20676 return false;
20677 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
20678 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
20679 if (!SI1 || !SI2)
20680 return I1->isIdenticalTo(I2);
20681 if (SI1->isIdenticalTo(SI2))
20682 return true;
20683 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
20684 if (SI1->getOperand(I) != SI2->getOperand(I))
20685 return false;
20686 // Check if the second instruction is more defined than the first one.
20687 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
20688 ArrayRef<int> SM1 = SI1->getShuffleMask();
20689 // Count trailing undefs in the mask to check the final number of used
20690 // registers.
20691 unsigned LastUndefsCnt = 0;
20692 for (int I = 0, E = NewMask.size(); I < E; ++I) {
20693 if (SM1[I] == PoisonMaskElem)
20694 ++LastUndefsCnt;
20695 else
20696 LastUndefsCnt = 0;
20697 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
20698 NewMask[I] != SM1[I])
20699 return false;
20700 if (NewMask[I] == PoisonMaskElem)
20701 NewMask[I] = SM1[I];
20702 }
20703 // Check if the last undefs actually change the final number of used vector
20704 // registers.
20705 return SM1.size() - LastUndefsCnt > 1 &&
20706 ::getNumberOfParts(*TTI, SI1->getType()) ==
20707 ::getNumberOfParts(
20708 *TTI, getWidenedType(SI1->getType()->getElementType(),
20709 SM1.size() - LastUndefsCnt));
20710 };
20711 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
20712 // instructions. TODO: We can further optimize this scan if we split the
20713 // instructions into different buckets based on the insert lane.
20714 SmallVector<Instruction *, 16> Visited;
20715 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
20716 assert(*I &&
20717 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
20718 "Worklist not sorted properly!");
20719 BasicBlock *BB = (*I)->getBlock();
20720 // For all instructions in blocks containing gather sequences:
20721 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
20722 if (isDeleted(&In))
20723 continue;
20724 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
20725 !GatherShuffleExtractSeq.contains(&In))
20726 continue;
20727
20728 // Check if we can replace this instruction with any of the
20729 // visited instructions.
20730 bool Replaced = false;
20731 for (Instruction *&V : Visited) {
20732 SmallVector<int> NewMask;
20733 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
20734 DT->dominates(V->getParent(), In.getParent())) {
20735 In.replaceAllUsesWith(V);
20736 eraseInstruction(&In);
20737 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
20738 if (!NewMask.empty())
20739 SI->setShuffleMask(NewMask);
20740 Replaced = true;
20741 break;
20742 }
20743 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
20744 GatherShuffleExtractSeq.contains(V) &&
20745 IsIdenticalOrLessDefined(V, &In, NewMask) &&
20746 DT->dominates(In.getParent(), V->getParent())) {
20747 In.moveAfter(V);
20748 V->replaceAllUsesWith(&In);
20749 eraseInstruction(V);
20750 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
20751 if (!NewMask.empty())
20752 SI->setShuffleMask(NewMask);
20753 V = &In;
20754 Replaced = true;
20755 break;
20756 }
20757 }
20758 if (!Replaced) {
20759 assert(!is_contained(Visited, &In));
20760 Visited.push_back(&In);
20761 }
20762 }
20763 }
20764 CSEBlocks.clear();
20765 GatherShuffleExtractSeq.clear();
20766}
20767
20768BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
20769 ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
20770 auto &BundlePtr =
20771 ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
20772 for (Value *V : VL) {
20773 if (S.isNonSchedulable(V))
20774 continue;
20775 auto *I = cast<Instruction>(V);
20776 if (S.isCopyableElement(V)) {
20777 // Add a copyable element model.
20778 ScheduleCopyableData &SD =
20779 addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
20780 // Group the instructions to a bundle.
20781 BundlePtr->add(&SD);
20782 continue;
20783 }
20784 ScheduleData *BundleMember = getScheduleData(V);
20785 assert(BundleMember && "no ScheduleData for bundle member "
20786 "(maybe not in same basic block)");
20787 // Group the instructions to a bundle.
20788 BundlePtr->add(BundleMember);
20789 ScheduledBundles.try_emplace(I).first->getSecond().push_back(
20790 BundlePtr.get());
20791 }
20792 assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
20793 return *BundlePtr;
20794}
20795
20796// Groups the instructions to a bundle (which is then a single scheduling entity)
20797// and schedules instructions until the bundle gets ready.
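// E.g. (illustrative), for VL = { %a = add i32 %x, %y, %b = add i32 %z, %w }
// both adds become members of a single ScheduleBundle that the scheduler
// treats as one unit.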
20798std::optional<BoUpSLP::ScheduleBundle *>
20799BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
20800 const InstructionsState &S,
20801 const EdgeInfo &EI) {
20802 // No need to schedule PHIs, insertelement, extractelement and extractvalue
20803 // instructions.
20804 if (isa<PHINode>(S.getMainOp()) ||
20805 isVectorLikeInstWithConstOps(S.getMainOp()))
20806 return nullptr;
20807 bool HasCopyables = S.areInstructionsWithCopyableElements();
20808 if (((!HasCopyables && doesNotNeedToSchedule(VL)) ||
20809 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) {
20810 // If all operands were replaced by copyables, the operands of this node
20811 // might not be, so we need to recalculate dependencies for schedule data
20812 // replaced by copyable schedule data.
20813 SmallVector<ScheduleData *> ControlDependentMembers;
20814 for (Value *V : VL) {
20815 auto *I = dyn_cast<Instruction>(V);
20816 if (!I || (HasCopyables && S.isCopyableElement(V)))
20817 continue;
20818 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
20819 for (const Use &U : I->operands()) {
20820 unsigned &NumOps =
20821 UserOpToNumOps.try_emplace(std::make_pair(I, U.get()), 0)
20822 .first->getSecond();
20823 ++NumOps;
20824 if (auto *Op = dyn_cast<Instruction>(U.get());
20825 Op && areAllOperandsReplacedByCopyableData(I, Op, *SLP, NumOps)) {
20826 if (ScheduleData *OpSD = getScheduleData(Op);
20827 OpSD && OpSD->hasValidDependencies()) {
20828 OpSD->clearDirectDependencies();
20829 if (RegionHasStackSave ||
20830 !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
20831 ControlDependentMembers.push_back(OpSD);
20832 }
20833 }
20834 }
20835 }
20836 if (!ControlDependentMembers.empty()) {
20837 ScheduleBundle Invalid = ScheduleBundle::invalid();
20838 calculateDependencies(Invalid, /*InsertInReadyList=*/true, SLP,
20839 ControlDependentMembers);
20840 }
20841 return nullptr;
20842 }
20843
20844 // Initialize the instruction bundle.
20845 Instruction *OldScheduleEnd = ScheduleEnd;
20846 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
20847
20848 auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
20849 // Clear deps or recalculate the region, if the memory instruction is a
20850 // copyable. It may have memory deps, which must be recalculated.
20851 SmallVector<ScheduleData *> ControlDependentMembers;
20852 auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
20853 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
20854 for (ScheduleEntity *SE : Bundle.getBundle()) {
20855 if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(SE)) {
20856 if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
20857 BundleMember && BundleMember->hasValidDependencies()) {
20858 BundleMember->clearDirectDependencies();
20859 if (RegionHasStackSave ||
20860 !isGuaranteedToTransferExecutionToSuccessor(
20861 BundleMember->getInst()))
20862 ControlDependentMembers.push_back(BundleMember);
20863 }
20864 continue;
20865 }
20866 auto *SD = cast<ScheduleData>(SE);
20867 if (SD->hasValidDependencies() &&
20868 (!S.areInstructionsWithCopyableElements() ||
20869 !S.isCopyableElement(SD->getInst())) &&
20870 !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
20871 EI.UserTE->hasState() &&
20872 (!EI.UserTE->hasCopyableElements() ||
20873 !EI.UserTE->isCopyableElement(SD->getInst())))
20874 SD->clearDirectDependencies();
20875 for (const Use &U : SD->getInst()->operands()) {
20876 unsigned &NumOps =
20877 UserOpToNumOps
20878 .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
20879 .first->getSecond();
20880 ++NumOps;
20881 if (auto *Op = dyn_cast<Instruction>(U.get());
20882 Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
20883 *SLP, NumOps)) {
20884 if (ScheduleData *OpSD = getScheduleData(Op);
20885 OpSD && OpSD->hasValidDependencies()) {
20886 OpSD->clearDirectDependencies();
20887 if (RegionHasStackSave ||
20888 !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
20889 ControlDependentMembers.push_back(OpSD);
20890 }
20891 }
20892 }
20893 }
20894 };
20895 // The scheduling region got new instructions at the lower end (or it is a
20896 // new region for the first bundle). This makes it necessary to
20897 // recalculate all dependencies.
20898 // It is seldom that this needs to be done a second time after adding the
20899 // initial bundle to the region.
20900 if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
20901 for_each(ScheduleDataMap, [&](auto &P) {
20902 if (BB != P.first->getParent())
20903 return;
20904 ScheduleData *SD = P.second;
20905 if (isInSchedulingRegion(*SD))
20906 SD->clearDependencies();
20907 });
20908 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
20909 for_each(P.second, [&](ScheduleCopyableData *SD) {
20910 if (isInSchedulingRegion(*SD))
20911 SD->clearDependencies();
20912 });
20913 });
20914 ReSchedule = true;
20915 }
20916 // Check if the bundle data already has deps for copyable elements. In
20917 // that case we need to reset the deps and recalculate them.
20918 if (Bundle && !Bundle.getBundle().empty()) {
20919 if (S.areInstructionsWithCopyableElements() ||
20920 !ScheduleCopyableDataMap.empty())
20921 CheckIfNeedToClearDeps(Bundle);
20922 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
20923 << BB->getName() << "\n");
20924 calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
20925 ControlDependentMembers);
20926 } else if (!ControlDependentMembers.empty()) {
20927 ScheduleBundle Invalid = ScheduleBundle::invalid();
20928 calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
20929 ControlDependentMembers);
20930 }
20931
20932 if (ReSchedule) {
20933 resetSchedule();
20934 initialFillReadyList(ReadyInsts);
20935 }
20936
20937 // Now try to schedule the new bundle or (if no bundle) just calculate
20938 // dependencies. As soon as the bundle is "ready" it means that there are no
20939 // cyclic dependencies and we can schedule it. Note that it's important that we
20940 // don't "schedule" the bundle yet.
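// E.g. (illustrative), if the bundle contains %a and %b, but %a transitively
// depends on %b through an instruction outside the bundle, the bundle can
// never become ready here and the vectorization attempt is rejected.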
20941 while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
20942 !ReadyInsts.empty()) {
20943 ScheduleEntity *Picked = ReadyInsts.pop_back_val();
20944 assert(Picked->isReady() && "must be ready to schedule");
20945 schedule(*SLP, S, EI, Picked, ReadyInsts);
20946 if (Picked == &Bundle)
20947 break;
20948 }
20949 };
20950
20951 // Make sure that the scheduling region contains all
20952 // instructions of the bundle.
20953 for (Value *V : VL) {
20954 if (S.isNonSchedulable(V))
20955 continue;
20956 if (!extendSchedulingRegion(V, S)) {
20957 // The scheduling region got new instructions at the lower end (or it is
20958 // a new region for the first bundle), which makes it necessary to
20959 // recalculate all dependencies.
20960 // Otherwise the compiler may crash trying to incorrectly calculate
20961 // dependencies and emit instruction in the wrong order at the actual
20962 // scheduling.
20963 ScheduleBundle Invalid = ScheduleBundle::invalid();
20964 TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
20965 return std::nullopt;
20966 }
20967 }
20968
20969 bool ReSchedule = false;
20970 for (Value *V : VL) {
20971 if (S.isNonSchedulable(V))
20972 continue;
20973 SmallVector<ScheduleCopyableData *> CopyableData =
20974 getScheduleCopyableData(cast<Instruction>(V));
20975 if (!CopyableData.empty()) {
20976 for (ScheduleCopyableData *SD : CopyableData)
20977 ReadyInsts.remove(SD);
20978 }
20979 ScheduleData *BundleMember = getScheduleData(V);
20980 assert((BundleMember || S.isCopyableElement(V)) &&
20981 "no ScheduleData for bundle member (maybe not in same basic block)");
20982 if (!BundleMember)
20983 continue;
20984
20985 // Make sure we don't leave the pieces of the bundle in the ready list when
20986 // the whole bundle might not be ready.
20987 ReadyInsts.remove(BundleMember);
20988 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
20989 !Bundles.empty()) {
20990 for (ScheduleBundle *B : Bundles)
20991 ReadyInsts.remove(B);
20992 }
20993
20994 if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
20995 continue;
20996 // A bundle member was scheduled as a single instruction before and now
20997 // needs to be scheduled as part of the bundle. We just get rid of the
20998 // existing schedule.
20999 // A bundle member may have had its deps calculated before it became a
21000 // copyable element - need to reschedule.
21001 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
21002 << " was already scheduled\n");
21003 ReSchedule = true;
21004 }
21005
21006 ScheduleBundle &Bundle = buildBundle(VL, S, EI);
21007 TryScheduleBundleImpl(ReSchedule, Bundle);
21008 if (!Bundle.isReady()) {
21009 for (ScheduleEntity *BD : Bundle.getBundle()) {
21010 // Copyable data scheduling is just removed.
21011 if (isa<ScheduleCopyableData>(BD))
21012 continue;
21013 if (BD->isReady()) {
21014 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
21015 if (Bundles.empty()) {
21016 ReadyInsts.insert(BD);
21017 continue;
21018 }
21019 for (ScheduleBundle *B : Bundles)
21020 if (B->isReady())
21021 ReadyInsts.insert(B);
21022 }
21023 }
21024 ScheduledBundlesList.pop_back();
21025 SmallVector<ScheduleData *> ControlDependentMembers;
21026 SmallPtrSet<Instruction *, 4> Visited;
21027 for (Value *V : VL) {
21028 if (S.isNonSchedulable(V))
21029 continue;
21030 auto *I = cast<Instruction>(V);
21031 if (S.isCopyableElement(I)) {
21032 // Remove the copyable data from the scheduling region and restore
21033 // previous mappings.
21034 auto KV = std::make_pair(EI, I);
21035 assert(ScheduleCopyableDataMap.contains(KV) &&
21036 "no ScheduleCopyableData for copyable element");
21037 ScheduleCopyableData *SD =
21038 ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
21039 ScheduleCopyableDataMapByUsers[I].remove(SD);
21040 if (EI.UserTE) {
21041 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
21042 const auto *It = find(Op, I);
21043 assert(It != Op.end() && "Lane not set");
21044 SmallPtrSet<Instruction *, 4> Visited;
21045 do {
21046 int Lane = std::distance(Op.begin(), It);
21047 assert(Lane >= 0 && "Lane not set");
21048 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
21049 !EI.UserTE->ReorderIndices.empty())
21050 Lane = EI.UserTE->ReorderIndices[Lane];
21051 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
21052 "Couldn't find extract lane");
21053 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
21054 if (!Visited.insert(In).second) {
21055 It = find(make_range(std::next(It), Op.end()), I);
21056 break;
21057 }
21058 ScheduleCopyableDataMapByInstUser
21059 [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
21060 .pop_back();
21061 It = find(make_range(std::next(It), Op.end()), I);
21062 } while (It != Op.end());
21063 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
21064 if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
21065 ScheduleCopyableDataMapByUsers[I].insert(UserCD);
21066 }
21067 if (ScheduleCopyableDataMapByUsers[I].empty())
21068 ScheduleCopyableDataMapByUsers.erase(I);
21069 ScheduleCopyableDataMap.erase(KV);
21070 // Need to recalculate dependencies for the actual schedule data.
21071 if (ScheduleData *OpSD = getScheduleData(I);
21072 OpSD && OpSD->hasValidDependencies()) {
21073 OpSD->clearDirectDependencies();
21074 if (RegionHasStackSave ||
21075 !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
21076 ControlDependentMembers.push_back(OpSD);
21077 }
21078 continue;
21079 }
21080 ScheduledBundles.find(I)->getSecond().pop_back();
21081 }
21082 if (!ControlDependentMembers.empty()) {
21083 ScheduleBundle Invalid = ScheduleBundle::invalid();
21084 calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
21085 ControlDependentMembers);
21086 }
21087 return std::nullopt;
21088 }
21089 return &Bundle;
21090}
21091
21092BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
21093 // Allocate a new ScheduleData for the instruction.
21094 if (ChunkPos >= ChunkSize) {
21095 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
21096 ChunkPos = 0;
21097 }
21098 return &(ScheduleDataChunks.back()[ChunkPos++]);
21099}
21100
21101bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
21102 Value *V, const InstructionsState &S) {
21103 auto *I = dyn_cast<Instruction>(V);
21104 assert(I && "bundle member must be an instruction");
21105 if (getScheduleData(I))
21106 return true;
21107 if (!ScheduleStart) {
21108 // It's the first instruction in the new region.
21109 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
21110 ScheduleStart = I;
21111 ScheduleEnd = I->getNextNode();
21112 assert(ScheduleEnd && "tried to vectorize a terminator?");
21113 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
21114 return true;
21115 }
21116 // Search up and down at the same time, because we don't know if the new
21117 // instruction is above or below the existing scheduling region.
21118 // Ignore debug info (and other "AssumeLike" intrinsics) so that it's not
21119 // counted against the budget. Otherwise debug info could affect codegen.
21120 BasicBlock::reverse_iterator UpIter =
21121 ++ScheduleStart->getIterator().getReverse();
21122 BasicBlock::reverse_iterator UpperEnd = BB->rend();
21123 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
21124 BasicBlock::iterator LowerEnd = BB->end();
21125 auto IsAssumeLikeIntr = [](const Instruction &I) {
21126 if (auto *II = dyn_cast<IntrinsicInst>(&I))
21127 return II->isAssumeLikeIntrinsic();
21128 return false;
21129 };
21130 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
21131 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
21132 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
21133 &*DownIter != I) {
21134 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
21135 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
21136 return false;
21137 }
21138
21139 ++UpIter;
21140 ++DownIter;
21141
21142 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
21143 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
21144 }
21145 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
21146 assert(I->getParent() == ScheduleStart->getParent() &&
21147 "Instruction is in wrong basic block.");
21148 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
21149 ScheduleStart = I;
21150 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
21151 << "\n");
21152 return true;
21153 }
21154 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
21155 "Expected to reach top of the basic block or instruction down the "
21156 "lower end.");
21157 assert(I->getParent() == ScheduleEnd->getParent() &&
21158 "Instruction is in wrong basic block.");
21159 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
21160 nullptr);
21161 ScheduleEnd = I->getNextNode();
21162 assert(ScheduleEnd && "tried to vectorize a terminator?");
21163 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
21164 return true;
21165}
21166
21167void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
21168 Instruction *ToI,
21169 ScheduleData *PrevLoadStore,
21170 ScheduleData *NextLoadStore) {
21171 ScheduleData *CurrentLoadStore = PrevLoadStore;
21172 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
21173 // No need to allocate data for non-schedulable instructions.
21174 if (isa<PHINode>(I))
21175 continue;
21176 ScheduleData *SD = ScheduleDataMap.lookup(I);
21177 if (!SD) {
21178 SD = allocateScheduleDataChunks();
21179 ScheduleDataMap[I] = SD;
21180 }
21181 assert(!isInSchedulingRegion(*SD) &&
21182 "new ScheduleData already in scheduling region");
21183 SD->init(SchedulingRegionID, I);
21184
21185 if (I->mayReadOrWriteMemory() &&
21186 (!isa<IntrinsicInst>(I) ||
21187 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
21188 cast<IntrinsicInst>(I)->getIntrinsicID() !=
21189 Intrinsic::pseudoprobe))) {
21190 // Update the linked list of memory accessing instructions.
21191 if (CurrentLoadStore) {
21192 CurrentLoadStore->setNextLoadStore(SD);
21193 } else {
21194 FirstLoadStoreInRegion = SD;
21195 }
21196 CurrentLoadStore = SD;
21197 }
21198
21199 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
21200 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21201 RegionHasStackSave = true;
21202 }
21203 if (NextLoadStore) {
21204 if (CurrentLoadStore)
21205 CurrentLoadStore->setNextLoadStore(NextLoadStore);
21206 } else {
21207 LastLoadStoreInRegion = CurrentLoadStore;
21208 }
21209}
21210
21211void BoUpSLP::BlockScheduling::calculateDependencies(
21212 ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
21213 ArrayRef<ScheduleData *> ControlDeps) {
21214 SmallVector<ScheduleEntity *> WorkList;
21215 auto ProcessNode = [&](ScheduleEntity *SE) {
21216 if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
21217 if (CD->hasValidDependencies())
21218 return;
21219 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *CD << "\n");
21220 CD->initDependencies();
21221 CD->resetUnscheduledDeps();
21222 const EdgeInfo &EI = CD->getEdgeInfo();
21223 if (EI.UserTE) {
21224 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
21225 const auto *It = find(Op, CD->getInst());
21226 assert(It != Op.end() && "Lane not set");
21227 SmallPtrSet<Instruction *, 4> Visited;
21228 do {
21229 int Lane = std::distance(Op.begin(), It);
21230 assert(Lane >= 0 && "Lane not set");
21231 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
21232 !EI.UserTE->ReorderIndices.empty())
21233 Lane = EI.UserTE->ReorderIndices[Lane];
21234 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
21235 "Couldn't find extract lane");
21236 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
21237 if (EI.UserTE->isCopyableElement(In)) {
21238 // We may not have related copyable scheduling data if the
21239 // instruction is non-schedulable.
21240 if (ScheduleCopyableData *UseSD =
21241 getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
21242 CD->incDependencies();
21243 if (!UseSD->isScheduled())
21244 CD->incrementUnscheduledDeps(1);
21245 if (!UseSD->hasValidDependencies() ||
21246 (InsertInReadyList && UseSD->isReady()))
21247 WorkList.push_back(UseSD);
21248 }
21249 } else if (Visited.insert(In).second) {
21250 if (ScheduleData *UseSD = getScheduleData(In)) {
21251 CD->incDependencies();
21252 if (!UseSD->isScheduled())
21253 CD->incrementUnscheduledDeps(1);
21254 if (!UseSD->hasValidDependencies() ||
21255 (InsertInReadyList && UseSD->isReady()))
21256 WorkList.push_back(UseSD);
21257 }
21258 }
21259 It = find(make_range(std::next(It), Op.end()), CD->getInst());
21260 } while (It != Op.end());
21261 if (CD->isReady() && CD->getDependencies() == 0 &&
21262 (EI.UserTE->hasState() &&
21263 (EI.UserTE->getMainOp()->getParent() !=
21264 CD->getInst()->getParent() ||
21265 (isa<PHINode>(EI.UserTE->getMainOp()) &&
21266 (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
21267 any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
21268 auto *IU = dyn_cast<Instruction>(U);
21269 if (!IU)
21270 return true;
21271 return IU->getParent() == EI.UserTE->getMainOp()->getParent();
21272 })))))) {
21273 // If no uses in the block - mark as having pseudo-use, which cannot
21274 // be scheduled.
21275 // Prevents incorrect def-use tracking between external user and
21276 // actual instruction.
21277 CD->incDependencies();
21278 CD->incrementUnscheduledDeps(1);
21279 }
21280 }
21281 return;
21282 }
21283 auto *BundleMember = cast<ScheduleData>(SE);
21284 if (BundleMember->hasValidDependencies())
21285 return;
21286 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
21287 BundleMember->initDependencies();
21288 BundleMember->resetUnscheduledDeps();
21289 // Handle def-use chain dependencies.
21290 SmallDenseMap<Value *, unsigned> UserToNumOps;
21291 for (User *U : BundleMember->getInst()->users()) {
21292 if (isa<PHINode>(U))
21293 continue;
21294 if (ScheduleData *UseSD = getScheduleData(U)) {
21295 // The operand is a copyable element - skip.
21296 unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
21297 ++NumOps;
21298 if (areAllOperandsReplacedByCopyableData(
21299 cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
21300 continue;
21301 BundleMember->incDependencies();
21302 if (!UseSD->isScheduled())
21303 BundleMember->incrementUnscheduledDeps(1);
21304 if (!UseSD->hasValidDependencies() ||
21305 (InsertInReadyList && UseSD->isReady()))
21306 WorkList.push_back(UseSD);
21307 }
21308 }
21309 for (ScheduleCopyableData *UseSD :
21310 getScheduleCopyableDataUsers(BundleMember->getInst())) {
21311 BundleMember->incDependencies();
21312 if (!UseSD->isScheduled())
21313 BundleMember->incrementUnscheduledDeps(1);
21314 if (!UseSD->hasValidDependencies() ||
21315 (InsertInReadyList && UseSD->isReady()))
21316 WorkList.push_back(UseSD);
21317 }
21318
21319 SmallPtrSet<const Instruction *, 4> Visited;
21320 auto MakeControlDependent = [&](Instruction *I) {
21321 // Do not mark control dependent twice.
21322 if (!Visited.insert(I).second)
21323 return;
21324 auto *DepDest = getScheduleData(I);
21325 assert(DepDest && "must be in schedule window");
21326 DepDest->addControlDependency(BundleMember);
21327 BundleMember->incDependencies();
21328 if (!DepDest->isScheduled())
21329 BundleMember->incrementUnscheduledDeps(1);
21330 if (!DepDest->hasValidDependencies() ||
21331 (InsertInReadyList && DepDest->isReady()))
21332 WorkList.push_back(DepDest);
21333 };
21334
21335 // Any instruction which isn't safe to speculate at the beginning of the
21336 // block is control dependent on any early exit or non-willreturn call
21337 // which precedes it.
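// E.g. (illustrative), a division that may trap must not be reordered above a
// preceding call that may throw or not return; MakeControlDependent below
// records this ordering constraint in the dependency graph.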
21338 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->getInst())) {
21339 for (Instruction *I = BundleMember->getInst()->getNextNode();
21340 I != ScheduleEnd; I = I->getNextNode()) {
21341 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
21342 continue;
21343
21344 // Add the dependency
21345 MakeControlDependent(I);
21346
21347 if (!isGuaranteedToTransferExecutionToSuccessor(I))
21348 // Everything past here must be control dependent on I.
21349 break;
21350 }
21351 }
21352
21353 if (RegionHasStackSave) {
21354 // If we have an inalloca alloca instruction, it needs to be scheduled
21355 // after any preceding stacksave. We also need to prevent any alloca
21356 // from reordering above a preceding stackrestore.
21357 if (match(BundleMember->getInst(), m_Intrinsic<Intrinsic::stacksave>()) ||
21358 match(BundleMember->getInst(),
21359 m_Intrinsic<Intrinsic::stackrestore>())) {
21360 for (Instruction *I = BundleMember->getInst()->getNextNode();
21361 I != ScheduleEnd; I = I->getNextNode()) {
21362 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
21363 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21364 // Any allocas past here must be control dependent on I, and I
21365 // must be memory dependent on BundleMember->Inst.
21366 break;
21367
21368 if (!isa<AllocaInst>(I))
21369 continue;
21370
21371 // Add the dependency
21372 MakeControlDependent(I);
21373 }
21374 }
21375
21376 // In addition to the cases handled just above, we need to prevent
21377 // allocas and loads/stores from moving below a stacksave or a
21378 // stackrestore. Avoiding moving allocas below stackrestore is currently
21379 // thought to be conservatism. Moving loads/stores below a stackrestore
21380 // can lead to incorrect code.
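// E.g. (illustrative), a store to stack memory allocated between a
// stacksave/stackrestore pair must not sink below the stackrestore, because
// the restore logically frees that memory.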
21381 if (isa<AllocaInst>(BundleMember->getInst()) ||
21382 BundleMember->getInst()->mayReadOrWriteMemory()) {
21383 for (Instruction *I = BundleMember->getInst()->getNextNode();
21384 I != ScheduleEnd; I = I->getNextNode()) {
21385 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
21386 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21387 continue;
21388
21389 // Add the dependency
21390 MakeControlDependent(I);
21391 break;
21392 }
21393 }
21394 }
21395
21396 // Handle the memory dependencies (if any).
21397 ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
21398 if (!NextLoadStore)
21399 return;
21400 Instruction *SrcInst = BundleMember->getInst();
21401 assert(SrcInst->mayReadOrWriteMemory() &&
21402 "NextLoadStore list for non-memory-affecting bundle?");
21403 MemoryLocation SrcLoc = getLocation(SrcInst);
21404 bool SrcMayWrite = SrcInst->mayWriteToMemory();
21405 unsigned NumAliased = 0;
21406 unsigned DistToSrc = 1;
21407 bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
21408
21409 for (ScheduleData *DepDest = NextLoadStore; DepDest;
21410 DepDest = DepDest->getNextLoadStore()) {
21411 assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
21412
21413 // We have two limits to reduce the complexity:
21414 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
21415 // SLP->isAliased (which is the expensive part in this loop).
21416 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
21417 // the whole loop (even if the loop is fast, it's quadratic).
21418 // It's important for the loop break condition (see below) to
21419 // check this limit even between two read-only instructions.
21420 if (DistToSrc >= MaxMemDepDistance ||
21421 ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
21422 (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
21423 SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
21424
21425 // We increment the counter only if the locations are aliased
21426 // (instead of counting all alias checks). This gives a better
21427 // balance between reduced runtime and accurate dependencies.
21428 NumAliased++;
21429
21430 DepDest->addMemoryDependency(BundleMember);
21431 BundleMember->incDependencies();
21432 if (!DepDest->isScheduled())
21433 BundleMember->incrementUnscheduledDeps(1);
21434 if (!DepDest->hasValidDependencies() ||
21435 (InsertInReadyList && DepDest->isReady()))
21436 WorkList.push_back(DepDest);
21437 }
21438
21439 // Example, explaining the loop break condition: Let's assume our
21440 // starting instruction is i0 and MaxMemDepDistance = 3.
21441 //
21442 // +--------v--v--v
21443 // i0,i1,i2,i3,i4,i5,i6,i7,i8
21444 // +--------^--^--^
21445 //
21446 // MaxMemDepDistance let us stop alias-checking at i3 and we add
21447 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
21448 // Previously we already added dependencies from i3 to i6,i7,i8
21449 // (because of MaxMemDepDistance). As we added a dependency from
21450 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
21451 // and we can abort this loop at i6.
21452 if (DistToSrc >= 2 * MaxMemDepDistance)
21453 break;
21454 DistToSrc++;
21455 }
21456 };
21457
21458 assert((Bundle || !ControlDeps.empty()) &&
21459 "expected at least one instruction to schedule");
21460 if (Bundle)
21461 WorkList.push_back(Bundle.getBundle().front());
21462 WorkList.append(ControlDeps.begin(), ControlDeps.end());
21463 SmallPtrSet<ScheduleBundle *, 16> Visited;
21464 while (!WorkList.empty()) {
21465 ScheduleEntity *SD = WorkList.pop_back_val();
21466 SmallVector<ScheduleBundle *, 1> CopyableBundle;
21467 ArrayRef<ScheduleBundle *> Bundles;
21468 if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) {
21469 CopyableBundle.push_back(&CD->getBundle());
21470 Bundles = CopyableBundle;
21471 } else {
21472 Bundles = getScheduleBundles(SD->getInst());
21473 }
21474 if (Bundles.empty()) {
21475 if (!SD->hasValidDependencies())
21476 ProcessNode(SD);
21477 if (InsertInReadyList && SD->isReady()) {
21478 ReadyInsts.insert(SD);
21479 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
21480 }
21481 continue;
21482 }
21483 for (ScheduleBundle *Bundle : Bundles) {
21484 if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
21485 continue;
21486 assert(isInSchedulingRegion(*Bundle) &&
21487 "ScheduleData not in scheduling region");
21488 for_each(Bundle->getBundle(), ProcessNode);
21489 }
21490 if (InsertInReadyList && SD->isReady()) {
21491 for (ScheduleBundle *Bundle : Bundles) {
21492 assert(isInSchedulingRegion(*Bundle) &&
21493 "ScheduleData not in scheduling region");
21494 if (!Bundle->isReady())
21495 continue;
21496 ReadyInsts.insert(Bundle);
21497 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *Bundle
21498 << "\n");
21499 }
21500 }
21501 }
21502}
21503
21504void BoUpSLP::BlockScheduling::resetSchedule() {
21505 assert(ScheduleStart &&
21506 "tried to reset schedule on block which has not been scheduled");
21507 for_each(ScheduleDataMap, [&](auto &P) {
21508 if (BB != P.first->getParent())
21509 return;
21510 ScheduleData *SD = P.second;
21511 if (isInSchedulingRegion(*SD)) {
21512 SD->setScheduled(/*Scheduled=*/false);
21513 SD->resetUnscheduledDeps();
21514 }
21515 });
21516 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
21517 for_each(P.second, [&](ScheduleCopyableData *SD) {
21518 if (isInSchedulingRegion(*SD)) {
21519 SD->setScheduled(/*Scheduled=*/false);
21520 SD->resetUnscheduledDeps();
21521 }
21522 });
21523 });
21524 for_each(ScheduledBundles, [&](auto &P) {
21525 for_each(P.second, [&](ScheduleBundle *Bundle) {
21526 if (isInSchedulingRegion(*Bundle))
21527 Bundle->setScheduled(/*Scheduled=*/false);
21528 });
21529 });
21530 // Reset schedule data for copyable elements.
21531 for (auto &P : ScheduleCopyableDataMap) {
21532 if (isInSchedulingRegion(*P.second)) {
21533 P.second->setScheduled(/*Scheduled=*/false);
21534 P.second->resetUnscheduledDeps();
21535 }
21536 }
21537 ReadyInsts.clear();
21538}
21539
21540void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
21541 if (!BS->ScheduleStart)
21542 return;
21543
21544 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
21545
21546 // A key point - if we got here, pre-scheduling was able to find a valid
21547 // scheduling of the sub-graph of the scheduling window which consists
21548 // of all vector bundles and their transitive users. As such, we do not
21549 // need to reschedule anything *outside of* that subgraph.
21550
21551 BS->resetSchedule();
21552
21553 // For the real scheduling we use a more sophisticated ready-list: it is
21554 // sorted by the original instruction location. This lets the final schedule
21555 // be as close as possible to the original instruction order.
21556 // WARNING: If changing this order causes a correctness issue, that means
21557 // there is some missing dependence edge in the schedule data graph.
21558 struct ScheduleDataCompare {
21559 bool operator()(const ScheduleEntity *SD1,
21560 const ScheduleEntity *SD2) const {
21561 return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
21562 }
21563 };
21564 std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;
21565
21566 // Ensure that all dependency data is updated (for nodes in the sub-graph)
21567 // and fill the ready-list with initial instructions.
21568 int Idx = 0;
21569 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
21570 I = I->getNextNode()) {
21571 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
21572 if (!Bundles.empty()) {
21573 for (ScheduleBundle *Bundle : Bundles) {
21574 Bundle->setSchedulingPriority(Idx++);
21575 if (!Bundle->hasValidDependencies())
21576 BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this);
21577 }
21578 SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
21579 for (ScheduleCopyableData *SD : reverse(SDs)) {
21580 ScheduleBundle &Bundle = SD->getBundle();
21581 Bundle.setSchedulingPriority(Idx++);
21582 if (!Bundle.hasValidDependencies())
21583 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21584 }
21585 continue;
21586 }
21587 SmallVector<ScheduleCopyableData *> CopyableData =
21588 BS->getScheduleCopyableDataUsers(I);
21589 if (ScheduleData *SD = BS->getScheduleData(I)) {
21590 [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
21591 assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
21592 SDTEs.front()->doesNotNeedToSchedule() ||
21593 doesNotNeedToSchedule(SDTEs.front()->Scalars)) &&
21594 "scheduler and vectorizer bundle mismatch");
21595 SD->setSchedulingPriority(Idx++);
21596 if (!SD->hasValidDependencies() &&
21597 (!CopyableData.empty() ||
21598 any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
21599 assert(TE->isGather() && "expected gather node");
21600 return TE->hasState() && TE->hasCopyableElements() &&
21601 TE->isCopyableElement(I);
21602 }))) {
21603 // Need to calculate deps for these nodes to correctly handle copyable
21604 // dependencies, even if they were cancelled.
21605 // If copyables bundle was cancelled, the deps are cleared and need to
21606 // recalculate them.
21607 ScheduleBundle Bundle;
21608 Bundle.add(SD);
21609 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21610 }
21611 }
21612 for (ScheduleCopyableData *SD : reverse(CopyableData)) {
21613 ScheduleBundle &Bundle = SD->getBundle();
21614 Bundle.setSchedulingPriority(Idx++);
21615 if (!Bundle.hasValidDependencies())
21616 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21617 }
21618 }
21619 BS->initialFillReadyList(ReadyInsts);
21620
21621 Instruction *LastScheduledInst = BS->ScheduleEnd;
21622
21623 // Do the "real" scheduling.
21624 SmallPtrSet<Instruction *, 16> Scheduled;
21625 while (!ReadyInsts.empty()) {
21626 auto *Picked = *ReadyInsts.begin();
21627 ReadyInsts.erase(ReadyInsts.begin());
21628
21629 // Move the scheduled instruction(s) to their dedicated places, if not
21630 // there yet.
21631 if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
21632 for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
21633 Instruction *PickedInst = BundleMember->getInst();
21634 // If a copyable must be scheduled as part of something else, skip it.
21635 bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
21636 if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
21637 (!IsCopyable && !Scheduled.insert(PickedInst).second))
21638 continue;
21639 if (PickedInst->getNextNode() != LastScheduledInst)
21640 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
21641 LastScheduledInst = PickedInst;
21642 }
21643 EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
21644 LastScheduledInst);
21645 } else {
21646 auto *SD = cast<ScheduleData>(Picked);
21647 Instruction *PickedInst = SD->getInst();
21648 if (PickedInst->getNextNode() != LastScheduledInst)
21649 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
21650 LastScheduledInst = PickedInst;
21651 }
21652 auto Invalid = InstructionsState::invalid();
21653 BS->schedule(R, Invalid, EdgeInfo(), Picked, ReadyInsts);
21654 }
21655
21656 // Check that we didn't break any of our invariants.
21657#ifdef EXPENSIVE_CHECKS
21658 BS->verify();
21659#endif
21660
21661#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
21662 // Check that all schedulable entities got scheduled
21663 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
21664 I = I->getNextNode()) {
21665 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
21666 assert(all_of(Bundles,
21667 [](const ScheduleBundle *Bundle) {
21668 return Bundle->isScheduled();
21669 }) &&
21670 "must be scheduled at this point");
21671 }
21672#endif
21673
21674 // Avoid duplicate scheduling of the block.
21675 BS->ScheduleStart = nullptr;
21676}
21677
21678 unsigned BoUpSLP::getVectorElementSize(Value *V) {
21679 // If V is a store, just return the width of the stored value (or value
21680 // truncated just before storing) without traversing the expression tree.
21681 // This is the common case.
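// E.g. (illustrative), for "store i16 %v, ptr %p" the element size is 16,
// no matter how %v was computed.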
21682 if (auto *Store = dyn_cast<StoreInst>(V))
21683 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
21684
21685 if (auto *IEI = dyn_cast<InsertElementInst>(V))
21686 return getVectorElementSize(IEI->getOperand(1));
21687
21688 auto E = InstrElementSize.find(V);
21689 if (E != InstrElementSize.end())
21690 return E->second;
21691
21692 // If V is not a store, we can traverse the expression tree to find loads
21693 // that feed it. The type of the loaded value may indicate a more suitable
21694 // width than V's type. We want to base the vector element size on the width
21695 // of memory operations where possible.
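// E.g. (illustrative), for an i32 add whose operands are zexts of i16 loads,
// the traversal can reach the loads and return 16 instead of 32.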
21696 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
21697 SmallPtrSet<Instruction *, 16> Visited;
21698 if (auto *I = dyn_cast<Instruction>(V)) {
21699 Worklist.emplace_back(I, I->getParent(), 0);
21700 Visited.insert(I);
21701 }
21702
21703 // Traverse the expression tree in bottom-up order looking for loads. If we
21704 // encounter an instruction we don't yet handle, we give up.
21705 auto Width = 0u;
21706 Value *FirstNonBool = nullptr;
21707 while (!Worklist.empty()) {
21708 auto [I, Parent, Level] = Worklist.pop_back_val();
21709
21710 // We should only be looking at scalar instructions here. If the current
21711 // instruction has a vector type, skip.
21712 auto *Ty = I->getType();
21713 if (isa<VectorType>(Ty))
21714 continue;
21715 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
21716 FirstNonBool = I;
21717 if (Level > RecursionMaxDepth)
21718 continue;
21719
21720 // If the current instruction is a load, update MaxWidth to reflect the
21721 // width of the loaded value.
21722 if (isa<LoadInst>(I))
21723 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
21724
21725 // Otherwise, we need to visit the operands of the instruction. We only
21726 // handle the interesting cases from buildTree here. If an operand is an
21727 // instruction we haven't yet visited and from the same basic block as the
21728 // user or the use is a PHI node, we add it to the worklist.
21729 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
21730 BinaryOperator, UnaryOperator>(I)) {
21731 for (Use &U : I->operands()) {
21732 if (auto *J = dyn_cast<Instruction>(U.get()))
21733 if (Visited.insert(J).second &&
21734 (isa<PHINode>(I) || J->getParent() == Parent)) {
21735 Worklist.emplace_back(J, J->getParent(), Level + 1);
21736 continue;
21737 }
21738 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
21739 FirstNonBool = U.get();
21740 }
21741 } else {
21742 break;
21743 }
21744 }
21745
21746 // If we didn't encounter a memory access in the expression tree, or if we
21747 // gave up for some reason, just return the width of V. Otherwise, return the
21748 // maximum width we found.
21749 if (!Width) {
21750 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
21751 V = FirstNonBool;
21752 Width = DL->getTypeSizeInBits(V->getType());
21753 }
21754
21755 for (Instruction *I : Visited)
21756 InstrElementSize[I] = Width;
21757
21758 return Width;
21759}
21760
21761bool BoUpSLP::collectValuesToDemote(
21762 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
21763 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
21764 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
21765 bool &IsProfitableToDemote, bool IsTruncRoot) const {
21766 // We can always demote constants.
21767 if (all_of(E.Scalars, IsaPred<Constant>))
21768 return true;
21769
21770 unsigned OrigBitWidth =
21771 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
21772 if (OrigBitWidth == BitWidth) {
21773 MaxDepthLevel = 1;
21774 return true;
21775 }
21776
21777 // Check if the node was analyzed already and must keep its original bitwidth.
21778 if (NodesToKeepBWs.contains(E.Idx))
21779 return false;
21780
21781 // If the value is not a vectorized instruction in the expression and not used
21782 // by the insertelement instruction and not used in multiple vector nodes, it
21783 // cannot be demoted.
21784 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
21785 if (isa<PoisonValue>(R))
21786 return false;
21787 return !isKnownNonNegative(R, SimplifyQuery(*DL));
21788 });
21789 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
21790 if (isa<PoisonValue>(V))
21791 return true;
21792 if (getTreeEntries(V).size() > 1)
21793 return false;
21794 // For the last shuffle of sext/zext with many uses, need to check the extra bit
21795 // for unsigned values, otherwise may have incorrect casting for reused
21796 // scalars.
21797 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
21798 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
21799 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
21800 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
21801 return true;
21802 }
21803 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
21804 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
21805 if (IsSignedNode)
21806 ++BitWidth1;
21807 if (auto *I = dyn_cast<Instruction>(V)) {
21808 APInt Mask = DB->getDemandedBits(I);
21809 unsigned BitWidth2 =
21810 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
21811 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
21812 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
21813 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
21814 break;
21815 BitWidth2 *= 2;
21816 }
21817 BitWidth1 = std::min(BitWidth1, BitWidth2);
21818 }
21819 BitWidth = std::max(BitWidth, BitWidth1);
21820 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
21821 };
21822 auto FinalAnalysis = [&, TTI = TTI]() {
21823 if (!IsProfitableToDemote)
21824 return false;
21825 bool Res = all_of(
21826 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
21827 // Demote gathers.
21828 if (Res && E.isGather()) {
21829 if (E.hasState()) {
21830 if (const TreeEntry *SameTE =
21831 getSameValuesTreeEntry(E.getMainOp(), E.Scalars);
21832 SameTE)
21833 if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
21834 ToDemote, Visited, NodesToKeepBWs,
21835 MaxDepthLevel, IsProfitableToDemote,
21836 IsTruncRoot)) {
21837 ToDemote.push_back(E.Idx);
21838 return true;
21839 }
21840 }
21841 // Check possible extractelement instructions bases and final vector
21842 // length.
21843 SmallPtrSet<Value *, 4> UniqueBases;
21844 for (Value *V : E.Scalars) {
21845 auto *EE = dyn_cast<ExtractElementInst>(V);
21846 if (!EE)
21847 continue;
21848 UniqueBases.insert(EE->getVectorOperand());
21849 }
21850 const unsigned VF = E.Scalars.size();
21851 Type *OrigScalarTy = E.Scalars.front()->getType();
21852 if (UniqueBases.size() <= 2 ||
21853 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) >=
21854 ::getNumberOfParts(
21855 *TTI,
21856 getWidenedType(
21857 IntegerType::get(OrigScalarTy->getContext(), BitWidth),
21858 VF))) {
21859 ToDemote.push_back(E.Idx);
21860 return true;
21861 }
21862 }
21863 return Res;
21864 };
21865 if (E.isGather() || !Visited.insert(&E).second ||
21866 any_of(E.Scalars, [&](Value *V) {
21867 return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
21868 return isa<InsertElementInst>(U) && !isVectorized(U);
21869 });
21870 }))
21871 return FinalAnalysis();
21872
21873 if (any_of(E.Scalars, [&](Value *V) {
21874 return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
21875 return isVectorized(U) ||
21876 (E.Idx == 0 && UserIgnoreList &&
21877 UserIgnoreList->contains(U)) ||
21878 (!isa<CmpInst>(U) && U->getType()->isSized() &&
21879 !U->getType()->isScalableTy() &&
21880 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
21881 }) && !IsPotentiallyTruncated(V, BitWidth);
21882 }))
21883 return false;
21884
21885 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
21886 bool &NeedToExit) {
21887 NeedToExit = false;
21888 unsigned InitLevel = MaxDepthLevel;
21889 for (const TreeEntry *Op : Operands) {
21890 unsigned Level = InitLevel;
21891 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
21892 ToDemote, Visited, NodesToKeepBWs, Level,
21893 IsProfitableToDemote, IsTruncRoot)) {
21894 if (!IsProfitableToDemote)
21895 return false;
21896 NeedToExit = true;
21897 if (!FinalAnalysis())
21898 return false;
21899 continue;
21900 }
21901 MaxDepthLevel = std::max(MaxDepthLevel, Level);
21902 }
21903 return true;
21904 };
21905 auto AttemptCheckBitwidth =
21906 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
21907 // Try all bitwidth < OrigBitWidth.
21908 NeedToExit = false;
21909 unsigned BestFailBitwidth = 0;
21910 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
21911 if (Checker(BitWidth, OrigBitWidth))
21912 return true;
21913 if (BestFailBitwidth == 0 && FinalAnalysis())
21914 BestFailBitwidth = BitWidth;
21915 }
21916 if (BitWidth >= OrigBitWidth) {
21917 if (BestFailBitwidth == 0) {
21918 BitWidth = OrigBitWidth;
21919 return false;
21920 }
21921 MaxDepthLevel = 1;
21922 BitWidth = BestFailBitwidth;
21923 NeedToExit = true;
21924 return true;
21925 }
21926 return false;
21927 };
21928 auto TryProcessInstruction =
21929 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
21930 function_ref<bool(unsigned, unsigned)> Checker = {}) {
21931 if (Operands.empty()) {
21932 if (!IsTruncRoot)
21933 MaxDepthLevel = 1;
21934 for (Value *V : E.Scalars)
21935 (void)IsPotentiallyTruncated(V, BitWidth);
21936 } else {
21937 // Several vectorized uses? Check if we can truncate it, otherwise -
21938 // exit.
21939 if (any_of(E.Scalars, [&](Value *V) {
21940 return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
21941 }))
21942 return false;
21943 bool NeedToExit = false;
21944 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
21945 return false;
21946 if (NeedToExit)
21947 return true;
21948 if (!ProcessOperands(Operands, NeedToExit))
21949 return false;
21950 if (NeedToExit)
21951 return true;
21952 }
21953
21954 ++MaxDepthLevel;
21955 // Record the entry that we can demote.
21956 ToDemote.push_back(E.Idx);
21957 return IsProfitableToDemote;
21958 };
21959
21960 if (E.State == TreeEntry::SplitVectorize)
21961 return TryProcessInstruction(
21962 BitWidth,
21963 {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
21964 VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
21965
21966 switch (E.getOpcode()) {
21967
21968 // We can always demote truncations and extensions. Since truncations can
21969 // seed additional demotion, we save the truncated value.
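// E.g. (illustrative), for "trunc i32 (add i32 %a, %b) to i8" the add (and,
// transitively, its operands) may also be evaluated in i8.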
21970 case Instruction::Trunc:
21971 if (IsProfitableToDemoteRoot)
21972 IsProfitableToDemote = true;
21973 return TryProcessInstruction(BitWidth);
21974 case Instruction::ZExt:
21975 case Instruction::SExt:
21976 if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
21977 E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
21978 E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
21979 return false;
21980 IsProfitableToDemote = true;
21981 return TryProcessInstruction(BitWidth);
21982
21983 // We can demote certain binary operations if we can demote both of their
21984 // operands.
21985 case Instruction::Add:
21986 case Instruction::Sub:
21987 case Instruction::Mul:
21988 case Instruction::And:
21989 case Instruction::Or:
21990 case Instruction::Xor: {
21991 return TryProcessInstruction(
21992 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
21993 }
21994 case Instruction::Freeze:
21995 return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
21996 case Instruction::Shl: {
21997 // If we are truncating the result of this SHL, and if it's a shift of an
21998 // in-range amount, we can always perform a SHL in a smaller type.
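// E.g. (illustrative), "trunc i32 (shl i32 %x, 3) to i16" can instead be
// computed as "shl i16 (trunc i32 %x to i16), 3" because the shift amount 3
// is known to be less than 16.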
21999 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
22000 return all_of(E.Scalars, [&](Value *V) {
22001 if (isa<PoisonValue>(V))
22002 return true;
22003 auto *I = cast<Instruction>(V);
22004 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
22005 return AmtKnownBits.getMaxValue().ult(BitWidth);
22006 });
22007 };
22008 return TryProcessInstruction(
22009 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
22010 }
22011 case Instruction::LShr: {
22012 // If this is a truncate of a logical shr, we can truncate it to a smaller
22013 // lshr iff we know that the bits we would otherwise be shifting in are
22014 // already zeros.
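// E.g. (illustrative), "lshr i32 %x, %s" can be narrowed to i16 only if the
// shift amount is known to be less than 16 and bits 16..31 of %x are known
// to be zero.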
22015 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22016 return all_of(E.Scalars, [&](Value *V) {
22017 if (isa<PoisonValue>(V))
22018 return true;
22019 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22020 if (E.isCopyableElement(V))
22021 return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
22022 auto *I = cast<Instruction>(V);
22023 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
22024 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
22025 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
22026 SimplifyQuery(*DL));
22027 });
22028 };
22029 return TryProcessInstruction(
22030 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
22031 LShrChecker);
22032 }
22033 case Instruction::AShr: {
22034 // If this is a truncate of an arithmetic shr, we can truncate it to a
22035 // smaller ashr iff we know that all the bits from the sign bit of the
22036 // original type and the sign bit of the truncate type are similar.
22037 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22038 return all_of(E.Scalars, [&](Value *V) {
22039 if (isa<PoisonValue>(V))
22040 return true;
22041 auto *I = cast<Instruction>(V);
22042 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
22043 unsigned ShiftedBits = OrigBitWidth - BitWidth;
22044 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
22045 ShiftedBits <
22046 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22047 });
22048 };
22049 return TryProcessInstruction(
22050 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
22051 AShrChecker);
22052 }
22053 case Instruction::UDiv:
22054 case Instruction::URem: {
22055 // UDiv and URem can be truncated if all the truncated bits are zero.
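// E.g. (illustrative), a "udiv i32" whose operands both have their upper 24
// bits known to be zero can be performed as a "udiv i8".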
22056 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22057 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22058 return all_of(E.Scalars, [&](Value *V) {
22059 auto *I = cast<Instruction>(V);
22060 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22061 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
22062 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
22063 });
22064 };
22065 return TryProcessInstruction(
22066 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
22067 }
22068
22069 // We can demote selects if we can demote their true and false values.
22070 case Instruction::Select: {
22071 return TryProcessInstruction(
22072 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
22073 }
22074
22075 // We can demote phis if we can demote all their incoming operands.
22076 case Instruction::PHI: {
22077 const unsigned NumOps = E.getNumOperands();
22078 SmallVector<const TreeEntry *> Ops(NumOps, nullptr);
22079 transform(seq<unsigned>(0, NumOps), Ops.begin(),
22080 [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
22081
22082 return TryProcessInstruction(BitWidth, Ops);
22083 }
22084
22085 case Instruction::Call: {
22086 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
22087 if (!IC)
22088 break;
22089 Intrinsic::ID ID = IC->getIntrinsicID();
22090 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
22091 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
22092 break;
22093 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
22094 function_ref<bool(unsigned, unsigned)> CallChecker;
22095 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22096 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22097 return all_of(E.Scalars, [&](Value *V) {
22098 auto *I = cast<Instruction>(V);
22099 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
22100 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22101 return MaskedValueIsZero(I->getOperand(0), Mask,
22102 SimplifyQuery(*DL)) &&
22103 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
22104 }
22105 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
22106 "Expected min/max intrinsics only.");
22107 unsigned SignBits = OrigBitWidth - BitWidth;
22108 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
22109 unsigned Op0SignBits =
22110 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22111 unsigned Op1SignBits =
22112 ComputeNumSignBits(I->getOperand(1), *DL, AC, nullptr, DT);
22113 return SignBits <= Op0SignBits &&
22114 ((SignBits != Op0SignBits &&
22115 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
22116 MaskedValueIsZero(I->getOperand(0), Mask,
22117 SimplifyQuery(*DL))) &&
22118 SignBits <= Op1SignBits &&
22119 ((SignBits != Op1SignBits &&
22120 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
22121 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
22122 });
22123 };
22124 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22125 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22126 return all_of(E.Scalars, [&](Value *V) {
22127 auto *I = cast<Instruction>(V);
22128 unsigned SignBits = OrigBitWidth - BitWidth;
22129 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
22130 unsigned Op0SignBits =
22131 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22132 return SignBits <= Op0SignBits &&
22133 ((SignBits != Op0SignBits &&
22134 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
22135 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
22136 });
22137 };
22138 if (ID != Intrinsic::abs) {
22139 Operands.push_back(getOperandEntry(&E, 1));
22140 CallChecker = CompChecker;
22141 } else {
22142 CallChecker = AbsChecker;
22143 }
22144 InstructionCost BestCost =
22145 std::numeric_limits<InstructionCost::CostType>::max();
22146 unsigned BestBitWidth = BitWidth;
22147 unsigned VF = E.Scalars.size();
22148 // Choose the best bitwidth based on cost estimations.
22149 auto Checker = [&](unsigned BitWidth, unsigned) {
22150 unsigned MinBW = PowerOf2Ceil(BitWidth);
22151 SmallVector<Type *> ArgTys =
22152 buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
22153 auto VecCallCosts = getVectorCallCosts(
22154 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
22155 TTI, TLI, ArgTys);
22156 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
22157 if (Cost < BestCost) {
22158 BestCost = Cost;
22159 BestBitWidth = BitWidth;
22160 }
22161 return false;
22162 };
22163 [[maybe_unused]] bool NeedToExit;
22164 (void)AttemptCheckBitwidth(Checker, NeedToExit);
22165 BitWidth = BestBitWidth;
22166 return TryProcessInstruction(BitWidth, Operands, CallChecker);
22167 }
22168
22169 // Otherwise, conservatively give up.
22170 default:
22171 break;
22172 }
22173 MaxDepthLevel = 1;
22174 return FinalAnalysis();
22175}
22176
22177static RecurKind getRdxKind(Value *V);
22178
22179 void BoUpSLP::computeMinimumValueSizes() {
22180 // We only attempt to truncate integer expressions.
22181 bool IsStoreOrInsertElt =
22182 VectorizableTree.front()->hasState() &&
22183 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
22184 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
22185 if ((IsStoreOrInsertElt || UserIgnoreList) &&
22186 ExtraBitWidthNodes.size() <= 1 &&
22187 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
22188 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
22189 return;
22190
22191 unsigned NodeIdx = 0;
22192 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
22193 NodeIdx = 1;
22194
22195 // Ensure the roots of the vectorizable tree don't form a cycle.
22196 assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
22197 !VectorizableTree[NodeIdx]->UserTreeIndex) &&
22198 "Unexpected tree is graph.");
22199
22200 // If the first value node for a store/insertelement is a sext/zext/trunc,
22201 // skip it and resize to the final type.
22202 bool IsTruncRoot = false;
22203 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
22204 SmallVector<unsigned> RootDemotes;
22205 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
22206 if (NodeIdx != 0 &&
22207 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
22208 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
22209 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
22210 IsTruncRoot = true;
22211 RootDemotes.push_back(NodeIdx);
22212 IsProfitableToDemoteRoot = true;
22213 ++NodeIdx;
22214 }
22215
22216 // The reduction has already been analyzed and found not profitable - exit.
22217 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
22218 return;
22219
22220 SmallVector<unsigned> ToDemote;
22221 auto ComputeMaxBitWidth =
22222 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
22223 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
22224 ToDemote.clear();
22225 // Check if the root is trunc and the next node is gather/buildvector, then
22226 // keep trunc in scalars, which is free in most cases.
22227 if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
22228 !NodesToKeepBWs.contains(E.Idx) &&
22229 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
22230 all_of(E.Scalars, [&](Value *V) {
22231 return V->hasOneUse() || isa<Constant>(V) ||
22232 (!V->hasNUsesOrMore(UsesLimit) &&
22233 none_of(V->users(), [&](User *U) {
22234 ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
22235 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
22236 if (TEs.empty() || is_contained(TEs, UserTE))
22237 return false;
22238 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
22239 SelectInst>(U) ||
22240 isa<SIToFPInst, UIToFPInst>(U) ||
22241 (UserTE->hasState() &&
22242 (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
22243 SelectInst>(UserTE->getMainOp()) ||
22244 isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
22245 return true;
22246 unsigned UserTESz = DL->getTypeSizeInBits(
22247 UserTE->Scalars.front()->getType());
22248 if (all_of(TEs, [&](const TreeEntry *TE) {
22249 auto It = MinBWs.find(TE);
22250 return It != MinBWs.end() &&
22251 It->second.first > UserTESz;
22252 }))
22253 return true;
22254 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
22255 }));
22256 })) {
22257 ToDemote.push_back(E.Idx);
22258 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
22259 auto It = MinBWs.find(UserTE);
22260 if (It != MinBWs.end())
22261 return It->second.first;
22262 unsigned MaxBitWidth =
22263 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
22264 MaxBitWidth = bit_ceil(MaxBitWidth);
22265 if (MaxBitWidth < 8 && MaxBitWidth > 1)
22266 MaxBitWidth = 8;
22267 return MaxBitWidth;
22268 }
22269
22270 if (!E.hasState())
22271 return 0u;
22272
22273 unsigned VF = E.getVectorFactor();
22274 Type *ScalarTy = E.Scalars.front()->getType();
22275 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
22276 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
22277 if (!TreeRootIT)
22278 return 0u;
22279
22280 if (any_of(E.Scalars,
22281 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
22282 return 0u;
22283
22284 unsigned NumParts = ::getNumberOfParts(
22285 *TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
22286
22287 // The maximum bit width required to represent all the values that can be
22288 // demoted without loss of precision. It would be safe to truncate the roots
22289 // of the expression to this width.
22290 unsigned MaxBitWidth = 1u;
22291
22292 // True if the roots can be zero-extended back to their original type,
22293 // rather than sign-extended. We know that if the leading bits are not
22294 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
22295 // True.
22296 // Determine if the sign bit of all the roots is known to be zero. If not,
22297 // IsKnownPositive is set to False.
22298 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
22299 if (isa<PoisonValue>(R))
22300 return true;
22301 KnownBits Known = computeKnownBits(R, *DL);
22302 return Known.isNonNegative();
22303 });
22304
22305 if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
22306 E.UserTreeIndex.UserTE->hasState() &&
22307 E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
22308 MaxBitWidth =
22309 std::min(DL->getTypeSizeInBits(
22310 E.UserTreeIndex.UserTE->Scalars.front()->getType()),
22311 DL->getTypeSizeInBits(ScalarTy));
22312
22313 // We first check if all the bits of the roots are demanded. If they're not,
22314 // we can truncate the roots to this narrower type.
22315 for (Value *Root : E.Scalars) {
22316 if (isa<PoisonValue>(Root))
22317 continue;
22318 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, AC, nullptr, DT);
22319 TypeSize NumTypeBits =
22320 DL->getTypeSizeInBits(Root->getType()->getScalarType());
22321 unsigned BitWidth1 = NumTypeBits - NumSignBits;
22322 // If we can't prove that the sign bit is zero, we must add one to the
22323 // maximum bit width to account for the unknown sign bit. This preserves
22324 // the existing sign bit so we can safely sign-extend the root back to the
22325 // original type. Otherwise, if we know the sign bit is zero, we will
22326 // zero-extend the root instead.
22327 //
22328 // FIXME: This is somewhat suboptimal, as there will be cases where adding
22329 // one to the maximum bit width will yield a larger-than-necessary
22330 // type. In general, we need to add an extra bit only if we can't
22331 // prove that the upper bit of the original type is equal to the
22332 // upper bit of the proposed smaller type. If these two bits are
22333 // the same (either zero or one) we know that sign-extending from
22334 // the smaller type will result in the same value. Here, since we
22335 // can't yet prove this, we are just making the proposed smaller
22336 // type larger to ensure correctness.
22337 if (!IsKnownPositive)
22338 ++BitWidth1;
22339
22340 auto *I = dyn_cast<Instruction>(Root);
22341 if (!I) {
22342 MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
22343 continue;
22344 }
22345 APInt Mask = DB->getDemandedBits(I);
22346 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
22347 MaxBitWidth =
22348 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
22349 }
22350
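// Widths between 2 and 7 bits are rounded up to 8 below, since sub-byte
// integer element types are generally not worth materializing in vectors.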
22351 if (MaxBitWidth < 8 && MaxBitWidth > 1)
22352 MaxBitWidth = 8;
22353
22354 // If the original type is large but the reduced type does not improve
22355 // register usage, ignore it.
22356 if (NumParts > 1 &&
22357 NumParts ==
22358 ::getNumberOfParts(
22359 *TTI, getWidenedType(IntegerType::get(F->getContext(),
22360 bit_ceil(MaxBitWidth)),
22361 VF)))
22362 return 0u;
22363
22364 unsigned Opcode = E.getOpcode();
22365 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
22366 Opcode == Instruction::SExt ||
22367 Opcode == Instruction::ZExt || NumParts > 1;
22368 // Conservatively determine if we can actually truncate the roots of the
22369 // expression. Collect the values that can be demoted in ToDemote and
22370 // additional roots that require investigating in Roots.
22371 DenseSet<const TreeEntry *> Visited;
22372 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
22373 bool NeedToDemote = IsProfitableToDemote;
22374
22375 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
22376 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
22377 NeedToDemote, IsTruncRoot) ||
22378 (MaxDepthLevel <= Limit &&
22379 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
22380 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
22381 DL->getTypeSizeInBits(TreeRootIT) /
22382 DL->getTypeSizeInBits(
22383 E.getMainOp()->getOperand(0)->getType()) >
22384 2)))))
22385 return 0u;
22386 // Round MaxBitWidth up to the next power-of-two.
22387 MaxBitWidth = bit_ceil(MaxBitWidth);
22388
22389 return MaxBitWidth;
22390 };
22391
22392 // If we can truncate the root, we must collect additional values that might
22393 // be demoted as a result. That is, those seeded by truncations we will
22394 // modify.
22395 // Add reduction ops sizes, if any.
22396 if (UserIgnoreList &&
22397 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
22398 // Convert vector_reduce_add(ZExt(<n x i1>)) to
22399 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
22400 if (all_of(*UserIgnoreList,
22401 [](Value *V) {
22402 return isa<PoisonValue>(V) ||
22403 cast<Instruction>(V)->getOpcode() == Instruction::Add;
22404 }) &&
22405 VectorizableTree.front()->State == TreeEntry::Vectorize &&
22406 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
22407 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
22408 Builder.getInt1Ty()) {
22409 ReductionBitWidth = 1;
22410 } else {
22411 for (Value *V : *UserIgnoreList) {
22412 if (isa<PoisonValue>(V))
22413 continue;
22414 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
22415 TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
22416 unsigned BitWidth1 = NumTypeBits - NumSignBits;
22417 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
22418 ++BitWidth1;
22419 unsigned BitWidth2 = BitWidth1;
22420 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
22421 APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
22422 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
22423 }
22424 ReductionBitWidth =
22425 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
22426 }
22427 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
22428 ReductionBitWidth = 8;
22429
22430 ReductionBitWidth = bit_ceil(ReductionBitWidth);
22431 }
22432 }
22433 bool IsTopRoot = NodeIdx == 0;
22434 while (NodeIdx < VectorizableTree.size() &&
22435 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
22436 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
22437 RootDemotes.push_back(NodeIdx);
22438 ++NodeIdx;
22439 IsTruncRoot = true;
22440 }
22441 bool IsSignedCmp = false;
22442 if (UserIgnoreList &&
22443 all_of(*UserIgnoreList,
22445 m_SMax(m_Value(), m_Value())))))
22446 IsSignedCmp = true;
22447 while (NodeIdx < VectorizableTree.size()) {
22448 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
22449 unsigned Limit = 2;
22450 if (IsTopRoot &&
22451 ReductionBitWidth ==
22452 DL->getTypeSizeInBits(
22453 VectorizableTree.front()->Scalars.front()->getType()))
22454 Limit = 3;
22455 unsigned MaxBitWidth = ComputeMaxBitWidth(
22456 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
22457 IsTruncRoot, IsSignedCmp);
22458 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
22459 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
22460 ReductionBitWidth = bit_ceil(MaxBitWidth);
22461 else if (MaxBitWidth == 0)
22462 ReductionBitWidth = 0;
22463 }
22464
22465 for (unsigned Idx : RootDemotes) {
22466 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
22467 uint32_t OrigBitWidth =
22468 DL->getTypeSizeInBits(V->getType()->getScalarType());
22469 if (OrigBitWidth > MaxBitWidth) {
22470 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
22471 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
22472 }
22473 return false;
22474 }))
22475 ToDemote.push_back(Idx);
22476 }
22477 RootDemotes.clear();
22478 IsTopRoot = false;
22479 IsProfitableToDemoteRoot = true;
22480
22481 if (ExtraBitWidthNodes.empty()) {
22482 NodeIdx = VectorizableTree.size();
22483 } else {
22484 unsigned NewIdx = 0;
22485 do {
22486 NewIdx = *ExtraBitWidthNodes.begin();
22487 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
22488 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
22489 NodeIdx = NewIdx;
22490 IsTruncRoot =
22491 NodeIdx < VectorizableTree.size() &&
22492 VectorizableTree[NodeIdx]->UserTreeIndex &&
22493 VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
22494 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
22495 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
22496 Instruction::Trunc &&
22497 !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
22498 IsSignedCmp =
22499 NodeIdx < VectorizableTree.size() &&
22500 VectorizableTree[NodeIdx]->UserTreeIndex &&
22501 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
22502 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
22503 Instruction::ICmp &&
22504 any_of(
22505 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
22506 [&](Value *V) {
22507 auto *IC = dyn_cast<ICmpInst>(V);
22508 return IC && (IC->isSigned() ||
22509 !isKnownNonNegative(IC->getOperand(0),
22510 SimplifyQuery(*DL)) ||
22511 !isKnownNonNegative(IC->getOperand(1),
22512 SimplifyQuery(*DL)));
22513 });
22514 }
22515
22516 // If the maximum bit width we compute is less than the width of the roots'
22517 // type, we can proceed with the narrowing. Otherwise, do nothing.
22518 if (MaxBitWidth == 0 ||
22519 MaxBitWidth >=
22520 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
22521 ->getBitWidth()) {
22522 if (UserIgnoreList)
22523 AnalyzedMinBWVals.insert_range(TreeRoot);
22524 NodesToKeepBWs.insert_range(ToDemote);
22525 continue;
22526 }
22527
22528 // Finally, map the values we can demote to the maximum bit width we
22529 // computed.
22530 for (unsigned Idx : ToDemote) {
22531 TreeEntry *TE = VectorizableTree[Idx].get();
22532 if (MinBWs.contains(TE))
22533 continue;
22534 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
22535 if (isa<PoisonValue>(R))
22536 return false;
22537 return !isKnownNonNegative(R, SimplifyQuery(*DL));
22538 });
22539 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
22540 }
22541 }
22542}
22543 PreservedAnalyses SLPVectorizerPass::run(Function &F,
22544 FunctionAnalysisManager &AM) {
22545 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
22546 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
22547 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
22548 auto *AA = &AM.getResult<AAManager>(F);
22549 auto *LI = &AM.getResult<LoopAnalysis>(F);
22550 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
22551 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
22552 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
22553 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
22554
22555 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
22556 if (!Changed)
22557 return PreservedAnalyses::all();
22558
22561 return PA;
22562}
22563
22564 bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
22565 TargetTransformInfo *TTI_,
22566 TargetLibraryInfo *TLI_, AAResults *AA_,
22567 LoopInfo *LI_, DominatorTree *DT_,
22568 AssumptionCache *AC_, DemandedBits *DB_,
22569 OptimizationRemarkEmitter *ORE_) {
22570 if (!RunSLPVectorization)
22571 return false;
22572 SE = SE_;
22573 TTI = TTI_;
22574 TLI = TLI_;
22575 AA = AA_;
22576 LI = LI_;
22577 DT = DT_;
22578 AC = AC_;
22579 DB = DB_;
22580 DL = &F.getDataLayout();
22581
22582 Stores.clear();
22583 GEPs.clear();
22584 bool Changed = false;
22585
22586 // If the target claims to have no vector registers don't attempt
22587 // vectorization.
22588 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
22589 LLVM_DEBUG(
22590 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
22591 return false;
22592 }
22593
22594 // Don't vectorize when the attribute NoImplicitFloat is used.
22595 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
22596 return false;
22597
22598 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
22599
22600 // Use the bottom up slp vectorizer to construct chains that start with
22601 // store instructions.
22602 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
22603
22604 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
22605 // delete instructions.
22606
22607 // Update DFS numbers now so that we can use them for ordering.
22608 DT->updateDFSNumbers();
22609
22610 // Scan the blocks in the function in post order.
22611 for (auto *BB : post_order(&F.getEntryBlock())) {
22613 continue;
22614
22615 // Start new block - clear the list of reduction roots.
22616 R.clearReductionData();
22617 collectSeedInstructions(BB);
22618
22619 // Vectorize trees that end at stores.
22620 if (!Stores.empty()) {
22621 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
22622 << " underlying objects.\n");
22623 Changed |= vectorizeStoreChains(R);
22624 }
22625
22626 // Vectorize trees that end at reductions.
22627 Changed |= vectorizeChainsInBlock(BB, R);
22628
22629 // Vectorize the index computations of getelementptr instructions. This
22630 // is primarily intended to catch gather-like idioms ending at
22631 // non-consecutive loads.
22632 if (!GEPs.empty()) {
22633 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
22634 << " underlying objects.\n");
22635 Changed |= vectorizeGEPIndices(BB, R);
22636 }
22637 }
22638
22639 if (Changed) {
22640 R.optimizeGatherSequence();
22641 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
22642 }
22643 return Changed;
22644}
22645
22646std::optional<bool>
22647SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
22648 unsigned Idx, unsigned MinVF,
22649 unsigned &Size) {
22650 Size = 0;
22651 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
22652 << "\n");
22653 const unsigned Sz = R.getVectorElementSize(Chain[0]);
22654 unsigned VF = Chain.size();
22655
22656 if (!has_single_bit(Sz) ||
22657 !hasFullVectorsOrPowerOf2(
22658 *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
22659 VF) ||
22660 VF < 2 || VF < MinVF) {
22661 // Check if vectorizing with a non-power-of-2 VF should be considered. At
22662 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
22663 // all vector lanes are used.
22664 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
22665 return false;
22666 }
22667
22668 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
22669 << "\n");
22670
22671 SetVector<Value *> ValOps;
22672 for (Value *V : Chain)
22673 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
22674 // Exit if the operands do not share same/alt opcodes or form a non-power-of-2 set of unique values.
22675 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
22676 InstructionsState S = Analysis.buildInstructionsState(
22677 ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
22678 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
22679 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
22680 bool IsAllowedSize =
22681 hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
22682 ValOps.size()) ||
22683 (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
22684 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
22685 (!S.getMainOp()->isSafeToRemove() ||
22686 any_of(ValOps.getArrayRef(),
22687 [&](Value *V) {
22688 return !isa<ExtractElementInst>(V) &&
22689 (V->getNumUses() > Chain.size() ||
22690 any_of(V->users(), [&](User *U) {
22691 return !Stores.contains(U);
22692 }));
22693 }))) ||
22694 (ValOps.size() > Chain.size() / 2 && !S)) {
22695 Size = (!IsAllowedSize && S) ? 1 : 2;
22696 return false;
22697 }
22698 }
22699 if (R.isLoadCombineCandidate(Chain))
22700 return true;
22701 R.buildTree(Chain);
22702 // Check if the tree is tiny and the store itself or its value is not vectorized.
22703 if (R.isTreeTinyAndNotFullyVectorizable()) {
22704 if (R.isGathered(Chain.front()) ||
22705 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
22706 return std::nullopt;
22707 Size = R.getCanonicalGraphSize();
22708 return false;
22709 }
22710 if (R.isProfitableToReorder()) {
22711 R.reorderTopToBottom();
22712 R.reorderBottomToTop();
22713 }
22714 R.transformNodes();
22715 R.buildExternalUses();
22716
22717 R.computeMinimumValueSizes();
22718
22719 Size = R.getCanonicalGraphSize();
22720 if (S && S.getOpcode() == Instruction::Load)
22721 Size = 2; // cut off masked gather small trees
22722 InstructionCost Cost = R.getTreeCost();
22723
22724 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
22725 if (Cost < -SLPCostThreshold) {
22726 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
22727
22728 using namespace ore;
22729
22730 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
22731 cast<StoreInst>(Chain[0]))
22732 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
22733 << " and with tree size "
22734 << NV("TreeSize", R.getTreeSize()));
22735
22736 R.vectorizeTree();
22737 return true;
22738 }
22739
22740 return false;
22741}
22742
22743/// Checks if the quadratic mean deviation is less than 90% of the mean size.
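/// Sizes equal to 1 mean "not yet vectorized" and are ignored; e.g. the sizes
/// {1, 4, 4, 4} give Mean = 4 with zero deviation, so the check passes.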
22744static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
22745 bool First) {
22746 unsigned Num = 0;
22747 uint64_t Sum = std::accumulate(
22748 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
22749 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
22750 unsigned Size = First ? Val.first : Val.second;
22751 if (Size == 1)
22752 return V;
22753 ++Num;
22754 return V + Size;
22755 });
22756 if (Num == 0)
22757 return true;
22758 uint64_t Mean = Sum / Num;
22759 if (Mean == 0)
22760 return true;
22761 uint64_t Dev = std::accumulate(
22762 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
22763 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
22764 unsigned P = First ? Val.first : Val.second;
22765 if (P == 1)
22766 return V;
22767 return V + (P - Mean) * (P - Mean);
22768 }) /
22769 Num;
22770 return Dev * 96 / (Mean * Mean) == 0;
22771}
22772
22773namespace {
22774
22775/// A group of stores that we'll try to bundle together using vector ops.
22776/// They are ordered using the signed distance of their address operand to the
22777/// address of this group's BaseInstr.
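/// For example, i32 stores at byte offsets 0, 8 and 4 from the base store's
/// address are recorded with element distances 0, 2 and 1 respectively.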
22778class RelatedStoreInsts {
22779public:
22780 RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
22781 : AllStores(AllStores) {
22782 reset(BaseInstrIdx);
22783 }
22784
22785 void reset(unsigned NewBaseInstr) {
22786 assert(NewBaseInstr < AllStores.size() &&
22787 "Instruction index out of bounds");
22788 BaseInstrIdx = NewBaseInstr;
22789 Instrs.clear();
22790 insertOrLookup(NewBaseInstr, 0);
22791 }
22792
22793 /// Tries to insert \p InstrIdx as the store with a pointer distance of
22794 /// \p PtrDist.
22795 /// Does nothing if there is already a store with that \p PtrDist.
22796 /// \returns The previously associated Instruction index, or std::nullopt
22797 std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
22798 auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
22799 return Inserted ? std::nullopt : std::make_optional(It->second);
22800 }
22801
22802 using DistToInstMap = std::map<int64_t, unsigned>;
22803 const DistToInstMap &getStores() const { return Instrs; }
22804
22805 /// If \p SI is related to this group of stores, return the distance of its
22806 /// pointer operand to the one the group's BaseInstr.
22807 std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
22808 ScalarEvolution &SE) const {
22809 StoreInst &BaseStore = *AllStores[BaseInstrIdx];
22810 return getPointersDiff(
22811 BaseStore.getValueOperand()->getType(), BaseStore.getPointerOperand(),
22812 SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
22813 /*StrictCheck=*/true);
22814 }
22815
22816 /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
22817 /// Stores whose index is less than \p MinSafeIdx will be dropped.
22818 void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
22819 int64_t DistFromCurBase) {
22820 DistToInstMap PrevSet = std::move(Instrs);
22821 reset(NewBaseInstIdx);
22822
22823 // Re-insert stores that come after MinSafeIdx to try and vectorize them
22824 // again. Their distance will be "rebased" to use NewBaseInstIdx as
22825 // reference.
22826 for (auto [Dist, InstIdx] : PrevSet) {
22827 if (InstIdx >= MinSafeIdx)
22828 insertOrLookup(InstIdx, Dist - DistFromCurBase);
22829 }
22830 }
22831
22832 /// Remove all stores that have been vectorized from this group.
22833 void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) {
22834 DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
22835 reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
22836 return VectorizedStores.contains(AllStores[DistAndIdx.second]);
22837 });
22838
22839 // Get a forward iterator pointing after the last vectorized store and erase
22840 // all stores before it so we don't try to vectorize them again.
22841 DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
22842 Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
22843 }
22844
22845private:
22846 /// The index of the Base instruction, i.e. the one with a 0 pointer distance.
22847 unsigned BaseInstrIdx;
22848
22849 /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
22850 DistToInstMap Instrs;
22851
22852 /// Reference to all the stores in the BB being analyzed.
22853 ArrayRef<StoreInst *> AllStores;
22854};
22855
22856} // end anonymous namespace
22857
22858bool SLPVectorizerPass::vectorizeStores(
22859 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
22860 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
22861 &Visited) {
22862 // We may run into multiple chains that merge into a single chain. We mark the
22863 // stores that we vectorized so that we don't visit the same store twice.
22864 BoUpSLP::ValueSet VectorizedStores;
22865 bool Changed = false;
22866
22867 auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
22868 int64_t PrevDist = -1;
22870 // Collect the chain into a list.
22871 for (auto [Idx, Data] : enumerate(StoreSeq)) {
22872 auto &[Dist, InstIdx] = Data;
22873 if (Operands.empty() || Dist - PrevDist == 1) {
22874 Operands.push_back(Stores[InstIdx]);
22875 PrevDist = Dist;
22876 if (Idx != StoreSeq.size() - 1)
22877 continue;
22878 }
22879 auto E = make_scope_exit([&, &Dist = Dist, &InstIdx = InstIdx]() {
22880 Operands.clear();
22881 Operands.push_back(Stores[InstIdx]);
22882 PrevDist = Dist;
22883 });
22884
22885 if (Operands.size() <= 1 ||
22886 !Visited
22887 .insert({Operands.front(),
22888 cast<StoreInst>(Operands.front())->getValueOperand(),
22889 Operands.back(),
22890 cast<StoreInst>(Operands.back())->getValueOperand(),
22891 Operands.size()})
22892 .second)
22893 continue;
22894
22895 unsigned MaxVecRegSize = R.getMaxVecRegSize();
22896 unsigned EltSize = R.getVectorElementSize(Operands[0]);
22897 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
22898
22899 unsigned MaxVF =
22900 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
22901 auto *Store = cast<StoreInst>(Operands[0]);
22902 Type *StoreTy = Store->getValueOperand()->getType();
22903 Type *ValueTy = StoreTy;
22904 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
22905 ValueTy = Trunc->getSrcTy();
22906 // When REVEC is enabled, StoreTy and ValueTy may be FixedVectorType. But
22907 // getStoreMinimumVF only supports scalar types as arguments. As a result,
22908 // we need to use the element type of StoreTy and ValueTy to retrieve the
22909 // VF and then transform it back.
22910 // Remember: VF is defined as the number of operands we want to vectorize,
22911 // not the number of elements in the final vector.
22912 Type *StoreScalarTy = StoreTy->getScalarType();
22913 unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
22914 R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
22915 ValueTy->getScalarType()));
22916 MinVF /= getNumElements(StoreTy);
22917 MinVF = std::max<unsigned>(2, MinVF);
22918
22919 if (MaxVF < MinVF) {
22920 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
22921 << ") < "
22922 << "MinVF (" << MinVF << ")\n");
22923 continue;
22924 }
22925
22926 unsigned NonPowerOf2VF = 0;
22927 if (VectorizeNonPowerOf2) {
22928 // First try vectorizing with a non-power-of-2 VF. At the moment, only
22929 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
22930 // lanes are used.
22931 unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
22932 if (has_single_bit(CandVF + 1)) {
22933 NonPowerOf2VF = CandVF;
22934 assert(NonPowerOf2VF != MaxVF &&
22935 "Non-power-of-2 VF should not be equal to MaxVF");
22936 }
22937 }
22938
22939 // MaxRegVF represents the number of instructions (scalar, or vector in
22940 // case of revec) that can be vectorized to naturally fit in a vector
22941 // register.
22942 unsigned MaxRegVF = MaxVF;
22943
22944 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
22945 if (MaxVF < MinVF) {
22946 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
22947 << ") < "
22948 << "MinVF (" << MinVF << ")\n");
22949 continue;
22950 }
22951
22952 SmallVector<unsigned> CandidateVFs;
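// E.g. with 7 candidate stores, MinVF = 2, a large enough MaxVF and
// non-power-of-2 vectorization enabled, the VFs tried are 7, then 4, then 2.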
22953 for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
22954 VF = divideCeil(VF, 2))
22955 CandidateVFs.push_back(VF);
22956
22957 unsigned End = Operands.size();
22958 unsigned Repeat = 0;
22959 constexpr unsigned MaxAttempts = 4;
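// For each store in Operands, RangeSizes keeps a pair of tree sizes from
// earlier attempts: the first member is updated by in-register VF attempts,
// the second by wider-than-register ones; 1 means not tried yet and 0 means
// already vectorized.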
22960 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
22961 for (std::pair<unsigned, unsigned> &P : RangeSizes)
22962 P.first = P.second = 1;
22963 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
22964 auto IsNotVectorized = [](bool First,
22965 const std::pair<unsigned, unsigned> &P) {
22966 return First ? P.first > 0 : P.second > 0;
22967 };
22968 auto IsVectorized = [](bool First,
22969 const std::pair<unsigned, unsigned> &P) {
22970 return First ? P.first == 0 : P.second == 0;
22971 };
22972 auto VFIsProfitable = [](bool First, unsigned Size,
22973 const std::pair<unsigned, unsigned> &P) {
22974 return First ? Size >= P.first : Size >= P.second;
22975 };
22976 auto FirstSizeSame = [](unsigned Size,
22977 const std::pair<unsigned, unsigned> &P) {
22978 return Size == P.first;
22979 };
22980 while (true) {
22981 ++Repeat;
22982 bool RepeatChanged = false;
22983 bool AnyProfitableGraph = false;
22984 for (unsigned VF : CandidateVFs) {
22985 AnyProfitableGraph = false;
22986 unsigned FirstUnvecStore =
22987 std::distance(RangeSizes.begin(),
22988 find_if(RangeSizes, std::bind(IsNotVectorized,
22989 VF >= MaxRegVF, _1)));
22990
22991 // Form slices of size VF starting from FirstUnvecStore and try to
22992 // vectorize them.
22993 while (FirstUnvecStore < End) {
22994 unsigned FirstVecStore = std::distance(
22995 RangeSizes.begin(),
22996 find_if(RangeSizes.drop_front(FirstUnvecStore),
22997 std::bind(IsVectorized, VF >= MaxRegVF, _1)));
22998 unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
22999 for (unsigned SliceStartIdx = FirstUnvecStore;
23000 SliceStartIdx + VF <= MaxSliceEnd;) {
23001 if (!checkTreeSizes(RangeSizes.slice(SliceStartIdx, VF),
23002 VF >= MaxRegVF)) {
23003 ++SliceStartIdx;
23004 continue;
23005 }
23006 ArrayRef<Value *> Slice =
23007 ArrayRef(Operands).slice(SliceStartIdx, VF);
23008 assert(all_of(Slice,
23009 [&](Value *V) {
23010 return cast<StoreInst>(V)
23011 ->getValueOperand()
23012 ->getType() ==
23013 cast<StoreInst>(Slice.front())
23014 ->getValueOperand()
23015 ->getType();
23016 }) &&
23017 "Expected all operands of same type.");
23018 if (!NonSchedulable.empty()) {
23019 auto [NonSchedSizeMax, NonSchedSizeMin] =
23020 NonSchedulable.lookup(Slice.front());
23021 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
23022 // VF is too ambitious. Try to vectorize another slice before
23023 // trying a smaller VF.
23024 SliceStartIdx += NonSchedSizeMax;
23025 continue;
23026 }
23027 }
23028 unsigned TreeSize;
23029 std::optional<bool> Res =
23030 vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
23031 if (!Res) {
23032 // Update the range of non schedulable VFs for slices starting
23033 // at SliceStartIdx.
23034 NonSchedulable
23035 .try_emplace(Slice.front(), std::make_pair(VF, VF))
23036 .first->getSecond()
23037 .second = VF;
23038 } else if (*Res) {
23039 // Mark the vectorized stores so that we don't vectorize them
23040 // again.
23041 VectorizedStores.insert_range(Slice);
23044 AnyProfitableGraph = RepeatChanged = Changed = true;
23045 // If we vectorized initial block, no need to try to vectorize
23046 // it again.
23047 for (std::pair<unsigned, unsigned> &P :
23048 RangeSizes.slice(SliceStartIdx, VF))
23049 P.first = P.second = 0;
23050 if (SliceStartIdx < FirstUnvecStore + MinVF) {
23051 for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
23052 FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
23053 P.first = P.second = 0;
23054 FirstUnvecStore = SliceStartIdx + VF;
23055 }
23056 if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
23057 for (std::pair<unsigned, unsigned> &P :
23058 RangeSizes.slice(SliceStartIdx + VF,
23059 MaxSliceEnd - (SliceStartIdx + VF)))
23060 P.first = P.second = 0;
23061 if (MaxSliceEnd == End)
23062 End = SliceStartIdx;
23063 MaxSliceEnd = SliceStartIdx;
23064 }
23065 SliceStartIdx += VF;
23066 continue;
23067 }
23068 if (VF > 2 && Res &&
23069 !all_of(RangeSizes.slice(SliceStartIdx, VF),
23070 std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
23071 _1))) {
23072 SliceStartIdx += VF;
23073 continue;
23074 }
23075 // For very big VFs, check that we are not rebuilding the same
23076 // trees, just with a larger number of elements.
23077 if (VF > MaxRegVF && TreeSize > 1 &&
23078 all_of(RangeSizes.slice(SliceStartIdx, VF),
23079 std::bind(FirstSizeSame, TreeSize, _1))) {
23080 SliceStartIdx += VF;
23081 while (SliceStartIdx != MaxSliceEnd &&
23082 RangeSizes[SliceStartIdx].first == TreeSize)
23083 ++SliceStartIdx;
23084 continue;
23085 }
23086 if (TreeSize > 1) {
23087 for (std::pair<unsigned, unsigned> &P :
23088 RangeSizes.slice(SliceStartIdx, VF)) {
23089 if (VF >= MaxRegVF)
23090 P.second = std::max(P.second, TreeSize);
23091 else
23092 P.first = std::max(P.first, TreeSize);
23093 }
23094 }
23095 ++SliceStartIdx;
23096 AnyProfitableGraph = true;
23097 }
23098 if (FirstUnvecStore >= End)
23099 break;
23100 if (MaxSliceEnd - FirstUnvecStore < VF &&
23101 MaxSliceEnd - FirstUnvecStore >= MinVF)
23102 AnyProfitableGraph = true;
23103 FirstUnvecStore = std::distance(
23104 RangeSizes.begin(),
23105 find_if(RangeSizes.drop_front(MaxSliceEnd),
23106 std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
23107 }
23108 if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
23109 break;
23110 }
23111 // All values vectorized - exit.
23112 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
23113 return P.first == 0 && P.second == 0;
23114 }))
23115 break;
23116 // Check if we have used all attempts or there is no need for further attempts.
23117 if (Repeat >= MaxAttempts ||
23118 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
23119 break;
23120 constexpr unsigned StoresLimit = 64;
23121 const unsigned MaxTotalNum = std::min<unsigned>(
23122 Operands.size(),
23123 static_cast<unsigned>(
23124 End -
23125 std::distance(
23126 RangeSizes.begin(),
23127 find_if(RangeSizes, std::bind(IsNotVectorized, true, _1))) +
23128 1));
23129 unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
23130 unsigned Limit =
23131 getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
23132 CandidateVFs.clear();
23133 if (bit_floor(Limit) == VF)
23134 CandidateVFs.push_back(Limit);
23135 if (VF > MaxTotalNum || VF >= StoresLimit)
23136 break;
23137 for (std::pair<unsigned, unsigned> &P : RangeSizes) {
23138 if (P.first != 0)
23139 P.first = std::max(P.second, P.first);
23140 }
23141 // Last attempt to vectorize the maximum number of elements, if all
23142 // previous attempts were unsuccessful because of cost issues.
23143 CandidateVFs.push_back(VF);
23144 }
23145 }
23146 };
23147
23148 /// Groups of stores to vectorize
23149 SmallVector<RelatedStoreInsts> SortedStores;
23150
23151 // Inserts the specified store SI with the given index Idx into the set of
23152 // stores. If a store with the same distance has already been recorded, stop
23153 // the insertion and try to vectorize the stores found so far. If some stores
23154 // from this sequence were not vectorized, try to vectorize them together
23155 // with the new store later, but only those stores that come before the
23156 // previous store with the same distance.
23157 // Example:
23158 // 1. store x, %p
23159 // 2. store y, %p+1
23160 // 3. store z, %p+2
23161 // 4. store a, %p
23162 // 5. store b, %p+3
23163 // - Scan this from the last to first store. The very first bunch of stores is
23164 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
23165 // vector).
23166 // - The next store in the list - #1 - has the same distance from store #5 as
23167 // the store #4.
23168 // - Try to vectorize sequence of stores 4,2,3,5.
23169 // - If all these stores are vectorized - just drop them.
23170 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
23171 // - Start new stores sequence.
23172 // The new bunch of stores is {1, {1, 0}}.
23173 // - Add the stores from previous sequence, that were not vectorized.
23174 // Here we consider the stores in reverse order relative to how they appear
23175 // in the IR (Stores are reversed already, see vectorizeStoreChains()).
23176 // Store #3 can be added -> comes after store #4 with the same distance as
23177 // store #1.
23178 // Store #5 cannot be added - comes before store #4.
23179 // This logic improves compile time: we assume that stores after a previous
23180 // store with the same distance most likely have memory dependencies, so
23181 // there is no need to spend compile time trying to vectorize them.
23182 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
23183 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
23184 std::optional<int64_t> PtrDist;
23185 auto *RelatedStores = find_if(
23186 SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
23187 PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
23188 return PtrDist.has_value();
23189 });
23190
23191 // We did not find a comparable store, start a new group.
23192 if (RelatedStores == SortedStores.end()) {
23193 SortedStores.emplace_back(Idx, Stores);
23194 return;
23195 }
23196
23197 // If there is already a store in the group with the same PtrDiff, try to
23198 // vectorize the existing instructions before adding the current store.
23199 // Otherwise, insert this store and keep collecting.
23200 if (std::optional<unsigned> PrevInst =
23201 RelatedStores->insertOrLookup(Idx, *PtrDist)) {
23202 TryToVectorize(RelatedStores->getStores());
23203 RelatedStores->clearVectorizedStores(VectorizedStores);
23204 RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
23205 /*NewBaseInstIdx=*/Idx,
23206 /*DistFromCurBase=*/*PtrDist);
23207 }
23208 };
23209 Type *PrevValTy = nullptr;
23210 for (auto [I, SI] : enumerate(Stores)) {
23211 if (R.isDeleted(SI))
23212 continue;
23213 if (!PrevValTy)
23214 PrevValTy = SI->getValueOperand()->getType();
23215 // Check that we do not try to vectorize stores of different types.
23216 if (PrevValTy != SI->getValueOperand()->getType()) {
23217 for (RelatedStoreInsts &StoreSeq : SortedStores)
23218 TryToVectorize(StoreSeq.getStores());
23219 SortedStores.clear();
23220 PrevValTy = SI->getValueOperand()->getType();
23221 }
23222 FillStoresSet(I, SI);
23223 }
23224
23225 // Final vectorization attempt.
23226 for (RelatedStoreInsts &StoreSeq : SortedStores)
23227 TryToVectorize(StoreSeq.getStores());
23228
23229 return Changed;
23230}
23231
23232void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
23233 // Initialize the collections. We will make a single pass over the block.
23234 Stores.clear();
23235 GEPs.clear();
23236
23237 // Visit the store and getelementptr instructions in BB and organize them in
23238 // Stores and GEPs according to the underlying objects of their pointer
23239 // operands.
23240 for (Instruction &I : *BB) {
23241 // Ignore store instructions that are volatile or have a pointer operand
23242 // that doesn't point to a scalar type.
23243 if (auto *SI = dyn_cast<StoreInst>(&I)) {
23244 if (!SI->isSimple())
23245 continue;
23246 if (!isValidElementType(SI->getValueOperand()->getType()))
23247 continue;
23248 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
23249 }
23250
23251 // Ignore getelementptr instructions that have more than one index, a
23252 // constant index, or a pointer operand that doesn't point to a scalar
23253 // type.
23254 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
23255 if (GEP->getNumIndices() != 1)
23256 continue;
23257 Value *Idx = GEP->idx_begin()->get();
23258 if (isa<Constant>(Idx))
23259 continue;
23260 if (!isValidElementType(Idx->getType()))
23261 continue;
23262 if (GEP->getType()->isVectorTy())
23263 continue;
23264 GEPs[GEP->getPointerOperand()].push_back(GEP);
23265 }
23266 }
23267}
23268
23269bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
23270 bool MaxVFOnly) {
23271 if (VL.size() < 2)
23272 return false;
23273
23274 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
23275 << VL.size() << ".\n");
23276
23277 // Check that all of the parts are instructions of the same type;
23278 // we permit an alternate opcode via InstructionsState.
23279 InstructionsState S = getSameOpcode(VL, *TLI);
23280 if (!S)
23281 return false;
23282
23283 Instruction *I0 = S.getMainOp();
23284 // Make sure invalid types (including vector type) are rejected before
23285 // determining vectorization factor for scalar instructions.
23286 for (Value *V : VL) {
23287 Type *Ty = V->getType();
23288 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
23289 // NOTE: the following will give the user an internal LLVM type name,
23290 // which may not be useful.
23291 R.getORE()->emit([&]() {
23292 std::string TypeStr;
23293 llvm::raw_string_ostream OS(TypeStr);
23294 Ty->print(OS);
23295 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
23296 << "Cannot SLP vectorize list: type "
23297 << TypeStr + " is unsupported by vectorizer";
23298 });
23299 return false;
23300 }
23301 }
23302
23303 Type *ScalarTy = getValueType(VL[0]);
23304 unsigned Sz = R.getVectorElementSize(I0);
23305 unsigned MinVF = R.getMinVF(Sz);
23306 unsigned MaxVF = std::max<unsigned>(
23307 getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF);
23308 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
23309 if (MaxVF < 2) {
23310 R.getORE()->emit([&]() {
23311 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
23312 << "Cannot SLP vectorize list: vectorization factor "
23313 << "less than 2 is not supported";
23314 });
23315 return false;
23316 }
23317
23318 bool Changed = false;
23319 bool CandidateFound = false;
23320 InstructionCost MinCost = SLPCostThreshold.getValue();
23321
23322 unsigned NextInst = 0, MaxInst = VL.size();
23323 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
23324 VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
23325 // No actual vectorization should happen if the number of parts is the same
23326 // as the provided vectorization factor (i.e. the scalar type is used for
23327 // the vector code during codegen).
23328 auto *VecTy = getWidenedType(ScalarTy, VF);
23329 if (TTI->getNumberOfParts(VecTy) == VF)
23330 continue;
23331 for (unsigned I = NextInst; I < MaxInst; ++I) {
23332 unsigned ActualVF = std::min(MaxInst - I, VF);
23333
23334 if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
23335 continue;
23336
23337 if (MaxVFOnly && ActualVF < MaxVF)
23338 break;
23339 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
23340 break;
23341
23342 SmallVector<Value *> Ops(ActualVF, nullptr);
23343 unsigned Idx = 0;
23344 for (Value *V : VL.drop_front(I)) {
23345 // Check that a previous iteration of this loop did not delete the
23346 // Value.
23347 if (auto *Inst = dyn_cast<Instruction>(V);
23348 !Inst || !R.isDeleted(Inst)) {
23349 Ops[Idx] = V;
23350 ++Idx;
23351 if (Idx == ActualVF)
23352 break;
23353 }
23354 }
23355 // Not enough vectorizable instructions - exit.
23356 if (Idx != ActualVF)
23357 break;
23358
23359 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
23360 << "\n");
23361
23362 R.buildTree(Ops);
23363 if (R.isTreeTinyAndNotFullyVectorizable())
23364 continue;
23365 if (R.isProfitableToReorder()) {
23366 R.reorderTopToBottom();
23367 R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front()));
23368 }
23369 R.transformNodes();
23370 R.buildExternalUses();
23371
23372 R.computeMinimumValueSizes();
23373 InstructionCost Cost = R.getTreeCost();
23374 CandidateFound = true;
23375 MinCost = std::min(MinCost, Cost);
23376
23377 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
23378 << " for VF=" << ActualVF << "\n");
23379 if (Cost < -SLPCostThreshold) {
23380 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
23381 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
23382 cast<Instruction>(Ops[0]))
23383 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
23384 << " and with tree size "
23385 << ore::NV("TreeSize", R.getTreeSize()));
23386
23387 R.vectorizeTree();
23388 // Move to the next bundle.
23389 I += VF - 1;
23390 NextInst = I + 1;
23391 Changed = true;
23392 }
23393 }
23394 }
23395
23396 if (!Changed && CandidateFound) {
23397 R.getORE()->emit([&]() {
23398 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
23399 << "List vectorization was possible but not beneficial with cost "
23400 << ore::NV("Cost", MinCost) << " >= "
23401 << ore::NV("Treshold", -SLPCostThreshold);
23402 });
23403 } else if (!Changed) {
23404 R.getORE()->emit([&]() {
23405 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
23406 << "Cannot SLP vectorize list: vectorization was impossible"
23407 << " with available vectorization factors";
23408 });
23409 }
23410 return Changed;
23411}
23412
23413namespace {
23414
23415/// Model horizontal reductions.
23416///
23417/// A horizontal reduction is a tree of reduction instructions that has values
23418/// that can be put into a vector as its leaves. For example:
23419///
23420/// mul mul mul mul
23421/// \ / \ /
23422/// + +
23423/// \ /
23424/// +
23425/// This tree has "mul" as its leaf values and "+" as its reduction
23426/// instructions. A reduction can feed into a store or a binary operation
23427/// feeding a phi.
23428/// ...
23429/// \ /
23430/// +
23431/// |
23432/// phi +=
23433///
23434/// Or:
23435/// ...
23436/// \ /
23437/// +
23438/// |
23439/// *p =
23440///
23441class HorizontalReduction {
23442 using ReductionOpsType = SmallVector<Value *, 16>;
23443 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
23444 ReductionOpsListType ReductionOps;
23446 /// List of possibly reduced values.
SmallVector<SmallVector<Value *>> ReducedVals;
23447 /// Maps reduced value to the corresponding reduction operation.
23448 SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
23449 WeakTrackingVH ReductionRoot;
23450 /// The type of reduction operation.
23451 RecurKind RdxKind;
23452 /// Checks if the optimization of original scalar identity operations on
23453 /// matched horizontal reductions is enabled and allowed.
23454 bool IsSupportedHorRdxIdentityOp = false;
23455 /// The minimum number of the reduced values.
23456 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
23457 /// Contains vector values for reduction including their scale factor and
23458 /// signedness.
23460
23461 static bool isCmpSelMinMax(Instruction *I) {
23462 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
23463 RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
23464 }
23465
23466 // And/or are potentially poison-safe logical patterns like:
23467 // select x, y, false
23468 // select x, true, y
23469 static bool isBoolLogicOp(Instruction *I) {
23470 return isa<SelectInst>(I) &&
23471 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
23472 }
23473
23474 /// Checks if instruction is associative and can be vectorized.
23475 static bool isVectorizable(RecurKind Kind, Instruction *I,
23476 bool TwoElementReduction = false) {
23477 if (Kind == RecurKind::None)
23478 return false;
23479
23480 // Integer ops that map to select instructions or intrinsics are fine.
23481 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
23482 isBoolLogicOp(I))
23483 return true;
23484
23485 // No need to check for associativity if there are only 2 reduced values.
23486 if (TwoElementReduction)
23487 return true;
23488
23489 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
23490 // FP min/max are associative except for NaN and -0.0. We do not
23491 // have to rule out -0.0 here because the intrinsic semantics do not
23492 // specify a fixed result for it.
23493 return I->getFastMathFlags().noNaNs();
23494 }
23495
23496 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
23497 return true;
23498
23499 return I->isAssociative();
23500 }
23501
23502 static Value *getRdxOperand(Instruction *I, unsigned Index) {
23503 // Poison-safe 'or' takes the form: select X, true, Y
23504 // To make that work with the normal operand processing, we skip the
23505 // true value operand.
23506 // TODO: Change the code and data structures to handle this without a hack.
23507 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
23508 return I->getOperand(2);
23509 return I->getOperand(Index);
23510 }
23511
23512 /// Creates reduction operation with the current opcode.
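/// For i1 operands with UseSelect, RecurKind::Or is emitted as
/// "select i1 %lhs, i1 true, i1 %rhs" and RecurKind::And as
/// "select i1 %lhs, i1 %rhs, i1 false", keeping the logic poison-safe.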
23513 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
23514 Value *RHS, const Twine &Name, bool UseSelect) {
23515 Type *OpTy = LHS->getType();
23516 assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
23517 switch (Kind) {
23518 case RecurKind::Or: {
23519 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
23520 return Builder.CreateSelect(
23521 LHS, ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)),
23522 RHS, Name);
23523 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23524 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23525 Name);
23526 }
23527 case RecurKind::And: {
23528 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
23529 return Builder.CreateSelect(
23530 LHS, RHS,
23531 ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)), Name);
23532 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23533 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23534 Name);
23535 }
23536 case RecurKind::Add:
23537 case RecurKind::Mul:
23538 case RecurKind::Xor:
23539 case RecurKind::FAdd:
23540 case RecurKind::FMul: {
23541 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23542 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23543 Name);
23544 }
23545 case RecurKind::SMax:
23546 case RecurKind::SMin:
23547 case RecurKind::UMax:
23548 case RecurKind::UMin:
23549 if (UseSelect) {
23550 CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind);
23551 Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
23552 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
23553 }
23554 [[fallthrough]];
23555 case RecurKind::FMax:
23556 case RecurKind::FMin:
23557 case RecurKind::FMaximum:
23558 case RecurKind::FMinimum:
23559 case RecurKind::FMaximumNum:
23560 case RecurKind::FMinimumNum: {
23561 Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(Kind);
23562 return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
23563 }
23564 default:
23565 llvm_unreachable("Unknown reduction operation.");
23566 }
23567 }
23568
23569 /// Creates reduction operation with the current opcode with the IR flags
23570 /// from \p ReductionOps, dropping nuw/nsw flags.
23571 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
23572 Value *RHS, const Twine &Name,
23573 const ReductionOpsListType &ReductionOps) {
23574 bool UseSelect = ReductionOps.size() == 2 ||
23575 // Logical or/and.
23576 (ReductionOps.size() == 1 &&
23577 any_of(ReductionOps.front(), IsaPred<SelectInst>));
23578 assert((!UseSelect || ReductionOps.size() != 2 ||
23579 isa<SelectInst>(ReductionOps[1][0])) &&
23580 "Expected cmp + select pairs for reduction");
23581 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
23582 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
23583 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
23584 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
23585 /*IncludeWrapFlags=*/false);
23586 propagateIRFlags(Op, ReductionOps[1], nullptr,
23587 /*IncludeWrapFlags=*/false);
23588 return Op;
23589 }
23590 }
23591 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
23592 return Op;
23593 }
23594
23595public:
23596 static RecurKind getRdxKind(Value *V) {
23597 auto *I = dyn_cast<Instruction>(V);
23598 if (!I)
23599 return RecurKind::None;
23600 if (match(I, m_Add(m_Value(), m_Value())))
23601 return RecurKind::Add;
23602 if (match(I, m_Mul(m_Value(), m_Value())))
23603 return RecurKind::Mul;
23604 if (match(I, m_And(m_Value(), m_Value())) ||
23605 match(I, m_LogicalAnd(m_Value(), m_Value())))
23606 return RecurKind::And;
23607 if (match(I, m_Or(m_Value(), m_Value())) ||
23608 match(I, m_LogicalOr(m_Value(), m_Value())))
23609 return RecurKind::Or;
23610 if (match(I, m_Xor(m_Value(), m_Value())))
23611 return RecurKind::Xor;
23612 if (match(I, m_FAdd(m_Value(), m_Value())))
23613 return RecurKind::FAdd;
23614 if (match(I, m_FMul(m_Value(), m_Value())))
23615 return RecurKind::FMul;
23616
23618 return RecurKind::FMax;
23620 return RecurKind::FMin;
23621
23622 if (match(I, m_FMaximum(m_Value(), m_Value())))
23623 return RecurKind::FMaximum;
23624 if (match(I, m_FMinimum(m_Value(), m_Value())))
23625 return RecurKind::FMinimum;
23626 // This matches either cmp+select or intrinsics. SLP is expected to handle
23627 // either form.
23628 // TODO: If we are canonicalizing to intrinsics, we can remove several
23629 // special-case paths that deal with selects.
23630 if (match(I, m_SMax(m_Value(), m_Value())))
23631 return RecurKind::SMax;
23632 if (match(I, m_SMin(m_Value(), m_Value())))
23633 return RecurKind::SMin;
23634 if (match(I, m_UMax(m_Value(), m_Value())))
23635 return RecurKind::UMax;
23636 if (match(I, m_UMin(m_Value(), m_Value())))
23637 return RecurKind::UMin;
23638
23639 if (auto *Select = dyn_cast<SelectInst>(I)) {
23640 // Try harder: look for min/max pattern based on instructions producing
23641 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
23642 // During the intermediate stages of SLP, it's very common to have
23643 // pattern like this (since optimizeGatherSequence is run only once
23644 // at the end):
23645 // %1 = extractelement <2 x i32> %a, i32 0
23646 // %2 = extractelement <2 x i32> %a, i32 1
23647 // %cond = icmp sgt i32 %1, %2
23648 // %3 = extractelement <2 x i32> %a, i32 0
23649 // %4 = extractelement <2 x i32> %a, i32 1
23650 // %select = select i1 %cond, i32 %3, i32 %4
23651 CmpPredicate Pred;
23652 Instruction *L1;
23653 Instruction *L2;
23654
23655 Value *LHS = Select->getTrueValue();
23656 Value *RHS = Select->getFalseValue();
23657 Value *Cond = Select->getCondition();
23658
23659 // TODO: Support inverse predicates.
23660 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
23661 if (!isa<ExtractElementInst>(RHS) ||
23662 !L2->isIdenticalTo(cast<Instruction>(RHS)))
23663 return RecurKind::None;
23664 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
23665 if (!isa<ExtractElementInst>(LHS) ||
23666 !L1->isIdenticalTo(cast<Instruction>(LHS)))
23667 return RecurKind::None;
23668 } else {
23669 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
23670 return RecurKind::None;
23671 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
23672 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
23673 !L2->isIdenticalTo(cast<Instruction>(RHS)))
23674 return RecurKind::None;
23675 }
23676
23677 switch (Pred) {
23678 default:
23679 return RecurKind::None;
23680 case CmpInst::ICMP_SGT:
23681 case CmpInst::ICMP_SGE:
23682 return RecurKind::SMax;
23683 case CmpInst::ICMP_SLT:
23684 case CmpInst::ICMP_SLE:
23685 return RecurKind::SMin;
23686 case CmpInst::ICMP_UGT:
23687 case CmpInst::ICMP_UGE:
23688 return RecurKind::UMax;
23689 case CmpInst::ICMP_ULT:
23690 case CmpInst::ICMP_ULE:
23691 return RecurKind::UMin;
23692 }
23693 }
23694 return RecurKind::None;
23695 }
23696
23697 /// Get the index of the first operand.
23698 static unsigned getFirstOperandIndex(Instruction *I) {
23699 return isCmpSelMinMax(I) ? 1 : 0;
23700 }
23701
23702private:
23703 /// Total number of operands in the reduction operation.
23704 static unsigned getNumberOfOperands(Instruction *I) {
23705 return isCmpSelMinMax(I) ? 3 : 2;
23706 }
23707
23708 /// Checks if the instruction is in basic block \p BB.
23709 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
23710 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
23711 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
23712 auto *Sel = cast<SelectInst>(I);
23713 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
23714 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
23715 }
23716 return I->getParent() == BB;
23717 }
23718
23719 /// Expected number of uses for reduction operations/reduced values.
23720 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
23721 if (IsCmpSelMinMax) {
23722 // SelectInst must be used twice while the condition op must have single
23723 // use only.
23724 if (auto *Sel = dyn_cast<SelectInst>(I))
23725 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
23726 return I->hasNUses(2);
23727 }
23728
23729 // Arithmetic reduction operation must be used once only.
23730 return I->hasOneUse();
23731 }
23732
23733 /// Initializes the list of reduction operations.
23734 void initReductionOps(Instruction *I) {
23735 if (isCmpSelMinMax(I))
23736 ReductionOps.assign(2, ReductionOpsType());
23737 else
23738 ReductionOps.assign(1, ReductionOpsType());
23739 }
23740
23741 /// Add all reduction operations for the reduction instruction \p I.
23742 void addReductionOps(Instruction *I) {
23743 if (isCmpSelMinMax(I)) {
23744 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
23745 ReductionOps[1].emplace_back(I);
23746 } else {
23747 ReductionOps[0].emplace_back(I);
23748 }
23749 }
23750
23751 static bool isGoodForReduction(ArrayRef<Value *> Data) {
23752 int Sz = Data.size();
23753 auto *I = dyn_cast<Instruction>(Data.front());
23754 return Sz > 1 || isConstant(Data.front()) ||
23755 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
23756 }
23757
23758public:
23759 HorizontalReduction() = default;
23760 HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
23761 : ReductionRoot(I), ReductionLimit(2) {
23762 RdxKind = HorizontalReduction::getRdxKind(I);
23763 ReductionOps.emplace_back().push_back(I);
23764 ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
23765 for (Value *V : Ops)
23766 ReducedValsToOps[V].push_back(I);
23767 }
23768
23769 bool matchReductionForOperands() const {
23770 // Analyze "regular" integer/FP types for reductions - no target-specific
23771 // types or pointers.
23772 assert(ReductionRoot && "Reduction root is not set!");
23773 if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
23774 all_of(ReducedVals, [](ArrayRef<Value *> Ops) {
23775 return Ops.size() == 2;
23776 })))
23777 return false;
23778
23779 return true;
23780 }
23781
23782 /// Try to find a reduction tree.
23783 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
23784 ScalarEvolution &SE, const DataLayout &DL,
23785 const TargetLibraryInfo &TLI) {
23786 RdxKind = HorizontalReduction::getRdxKind(Root);
23787 if (!isVectorizable(RdxKind, Root))
23788 return false;
23789
23790 // Analyze "regular" integer/FP types for reductions - no target-specific
23791 // types or pointers.
23792 Type *Ty = Root->getType();
23793 if (!isValidElementType(Ty) || Ty->isPointerTy())
23794 return false;
23795
23796 // Though the ultimate reduction may have multiple uses, its condition must
23797 // have only a single use.
23798 if (auto *Sel = dyn_cast<SelectInst>(Root))
23799 if (!Sel->getCondition()->hasOneUse())
23800 return false;
23801
23802 ReductionRoot = Root;
23803
23804 // Iterate through all the operands of the possible reduction tree and
23805 // gather all the reduced values, sorting them by their value id.
23806 BasicBlock *BB = Root->getParent();
23807 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
23808 SmallVector<std::pair<Instruction *, unsigned>> Worklist(
23809 1, std::make_pair(Root, 0));
23810 // Checks if the operands of the \p TreeN instruction are also reduction
23811 // operations or should be treated as reduced values or an extra argument,
23812 // which is not part of the reduction.
23813 auto CheckOperands = [&](Instruction *TreeN,
23814 SmallVectorImpl<Value *> &PossibleReducedVals,
23815 SmallVectorImpl<Instruction *> &ReductionOps,
23816 unsigned Level) {
23817 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
23818 getNumberOfOperands(TreeN)))) {
23819 Value *EdgeVal = getRdxOperand(TreeN, I);
23820 ReducedValsToOps[EdgeVal].push_back(TreeN);
23821 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
23822 // If the edge is not an instruction, differs from the main reduction
23823 // opcode, or has too many uses, treat it as a possible reduced value.
23824 // Also, do not try to reduce constant values if the operation is not
23825 // foldable.
23826 if (!EdgeInst || Level > RecursionMaxDepth ||
23827 getRdxKind(EdgeInst) != RdxKind ||
23828 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
23829 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
23830 !isVectorizable(RdxKind, EdgeInst) ||
23831 (R.isAnalyzedReductionRoot(EdgeInst) &&
23832 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
23833 PossibleReducedVals.push_back(EdgeVal);
23834 continue;
23835 }
23836 ReductionOps.push_back(EdgeInst);
23837 }
23838 };
23839 // Try to regroup the reduced values so that reducing them becomes more
23840 // profitable. Values are grouped by their value ids, instructions - by
23841 // instruction op id and/or alternate op id, plus do extra analysis for
23842 // loads (grouping them by the distance between pointers) and cmp
23843 // instructions (grouping them by the predicate).
23844 SmallMapVector<
23845 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
23846 8>
23847 PossibleReducedVals;
23848 initReductionOps(Root);
23849 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
23850 SmallSet<size_t, 2> LoadKeyUsed;
23851
23852 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
23853 Key = hash_combine(hash_value(LI->getParent()), Key);
23854 Value *Ptr =
23855 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
23856 if (!LoadKeyUsed.insert(Key).second) {
23857 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
23858 if (LIt != LoadsMap.end()) {
23859 for (LoadInst *RLI : LIt->second) {
23860 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
23861 LI->getType(), LI->getPointerOperand(), DL, SE,
23862 /*StrictCheck=*/true))
23863 return hash_value(RLI->getPointerOperand());
23864 }
23865 for (LoadInst *RLI : LIt->second) {
23866 if (arePointersCompatible(RLI->getPointerOperand(),
23867 LI->getPointerOperand(), TLI)) {
23868 hash_code SubKey = hash_value(RLI->getPointerOperand());
23869 return SubKey;
23870 }
23871 }
23872 if (LIt->second.size() > 2) {
23873 hash_code SubKey =
23874 hash_value(LIt->second.back()->getPointerOperand());
23875 return SubKey;
23876 }
23877 }
23878 }
23879 LoadsMap.try_emplace(std::make_pair(Key, Ptr))
23880 .first->second.push_back(LI);
23881 return hash_value(LI->getPointerOperand());
23882 };
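// Rough intent of the lambda above (a sketch, not a guarantee of
// vectorization): loads feeding the reduction are bucketed by a subkey derived
// from their pointer operands, so loads from p, p+1, p+2 hash to the subkey of
// the first compatible load from p and land in the same group, which later
// gives them a chance to become a single vector load.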
23883
23884 while (!Worklist.empty()) {
23885 auto [TreeN, Level] = Worklist.pop_back_val();
23886 SmallVector<Value *> PossibleRedVals;
23887 SmallVector<Instruction *> PossibleReductionOps;
23888 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
23889 addReductionOps(TreeN);
23890 // Add reduction values. The values are sorted for better vectorization
23891 // results.
23892 for (Value *V : PossibleRedVals) {
23893 size_t Key, Idx;
23894 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
23895 /*AllowAlternate=*/false);
23896 ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
23897 }
23898 for (Instruction *I : reverse(PossibleReductionOps))
23899 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
23900 }
23901 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
23902 // Sort values by the total number of value kinds to start the reduction
23903 // from the longest possible sequences of reduced values.
23904 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
23905 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
23906 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
23907 for (auto &Slice : PossibleRedVals) {
23908 PossibleRedValsVect.emplace_back();
23909 auto RedValsVect = Slice.second.takeVector();
23910 stable_sort(RedValsVect, llvm::less_second());
23911 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
23912 PossibleRedValsVect.back().append(Data.second, Data.first);
23913 }
23914 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
23915 return P1.size() > P2.size();
23916 });
23917 bool First = true;
23918 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
23919 if (First) {
23920 First = false;
23921 ReducedVals.emplace_back();
23922 } else if (!isGoodForReduction(Data)) {
23923 auto *LI = dyn_cast<LoadInst>(Data.front());
23924 auto *LastLI = dyn_cast<LoadInst>(ReducedVals.back().front());
23925 if (!LI || !LastLI ||
23926 getUnderlyingObject(LI->getPointerOperand()) !=
23927 getUnderlyingObject(LastLI->getPointerOperand()))
23928 ReducedVals.emplace_back();
23929 }
23930 ReducedVals.back().append(Data.rbegin(), Data.rend());
23931 }
23932 }
23933 // Sort the reduced values by number of same/alternate opcode and/or pointer
23934 // operand.
23935 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
23936 return P1.size() > P2.size();
23937 });
23938 return true;
23939 }
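// For example, for a balanced integer add reduction (a sketch):
//   %s0 = add i32 %a, %b
//   %s1 = add i32 %c, %d
//   %r  = add i32 %s0, %s1
// matchAssociativeReduction(%r) records %r, %s0 and %s1 as reduction ops and
// gathers %a, %b, %c, %d into ReducedVals, grouped and sorted by value kind.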
23940
23941 /// Attempt to vectorize the tree found by matchAssociativeReduction.
23942 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
23943 const TargetLibraryInfo &TLI, AssumptionCache *AC,
23944 DominatorTree &DT) {
23945 constexpr unsigned RegMaxNumber = 4;
23946 constexpr unsigned RedValsMaxNumber = 128;
23947 // If there are a sufficient number of reduction values, reduce
23948 // to a nearby power-of-2. We can safely generate oversized
23949 // vectors and rely on the backend to split them to legal sizes.
23950 if (unsigned NumReducedVals = std::accumulate(
23951 ReducedVals.begin(), ReducedVals.end(), 0,
23952 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
23953 if (!isGoodForReduction(Vals))
23954 return Num;
23955 return Num + Vals.size();
23956 });
23957 NumReducedVals < ReductionLimit &&
23958 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
23959 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
23960 })) {
23961 for (ReductionOpsType &RdxOps : ReductionOps)
23962 for (Value *RdxOp : RdxOps)
23963 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
23964 return nullptr;
23965 }
23966
23967 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
23968 TargetFolder(DL));
23969 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
23970
23971 // Track the reduced values in case they are replaced by extractelement
23972 // instructions because of the vectorization.
23973 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
23974 ReducedVals.front().size());
23975
23976 // The compare instruction of a min/max is the insertion point for new
23977 // instructions and may be replaced with a new compare instruction.
23978 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
23979 assert(isa<SelectInst>(RdxRootInst) &&
23980 "Expected min/max reduction to have select root instruction");
23981 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
23982 assert(isa<Instruction>(ScalarCond) &&
23983 "Expected min/max reduction to have compare condition");
23984 return cast<Instruction>(ScalarCond);
23985 };
23986
23987 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
23988 return isBoolLogicOp(cast<Instruction>(V));
23989 });
23990 // Return new VectorizedTree, based on previous value.
23991 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
23992 if (VectorizedTree) {
23993 // Update the final value in the reduction.
23994 Builder.SetCurrentDebugLocation(
23995 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
23996 if (AnyBoolLogicOp) {
23997 auto It = ReducedValsToOps.find(VectorizedTree);
23998 auto It1 = ReducedValsToOps.find(Res);
23999 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
24000 isGuaranteedNotToBePoison(VectorizedTree, AC) ||
24001 (It != ReducedValsToOps.end() &&
24002 any_of(It->getSecond(), [&](Instruction *I) {
24003 return isBoolLogicOp(I) &&
24004 getRdxOperand(I, 0) == VectorizedTree;
24005 }))) {
24006 ;
24007 } else if (isGuaranteedNotToBePoison(Res, AC) ||
24008 (It1 != ReducedValsToOps.end() &&
24009 any_of(It1->getSecond(), [&](Instruction *I) {
24010 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
24011 }))) {
24012 std::swap(VectorizedTree, Res);
24013 } else {
24014 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
24015 }
24016 }
24017
24018 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
24019 ReductionOps);
24020 }
24021 // Initialize the final value in the reduction.
24022 return Res;
24023 };
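// The lambda above simply chains partial results: a new partial result Res is
// combined as "op.rdx = <rdx opcode> VectorizedTree, Res". For boolean logic
// reductions one of the two sides may be frozen (or the operands swapped)
// first, so that poison from a value that used to be guarded by
// short-circuit-like select semantics cannot leak into the combined result.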
24024 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
24025 ReductionOps.front().size());
24026 for (ReductionOpsType &RdxOps : ReductionOps)
24027 for (Value *RdxOp : RdxOps) {
24028 if (!RdxOp)
24029 continue;
24030 IgnoreList.insert(RdxOp);
24031 }
24032 // Intersect the fast-math-flags from all reduction operations.
24033 FastMathFlags RdxFMF;
24034 RdxFMF.set();
24035 for (Value *U : IgnoreList)
24036 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
24037 RdxFMF &= FPMO->getFastMathFlags();
24038 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
24039
24040 // Need to track reduced vals, they may be changed during vectorization of
24041 // subvectors.
24042 for (ArrayRef<Value *> Candidates : ReducedVals)
24043 for (Value *V : Candidates)
24044 TrackedVals.try_emplace(V, V);
24045
24046 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
24047 Value *V) -> unsigned & {
24048 auto *It = MV.find(V);
24049 assert(It != MV.end() && "Unable to find given key.");
24050 return It->second;
24051 };
24052
24053 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
24054 // List of the values that were reduced in other trees as part of gather
24055 // nodes and thus require an extract if they are fully vectorized there.
24056 SmallPtrSet<Value *, 4> RequiredExtract;
24057 WeakTrackingVH VectorizedTree = nullptr;
24058 bool CheckForReusedReductionOps = false;
24059 // Try to vectorize elements based on their type.
24060 SmallVector<InstructionsState> States;
24061 for (ArrayRef<Value *> RV : ReducedVals)
24062 States.push_back(getSameOpcode(RV, TLI));
24063 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
24064 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
24065 InstructionsState S = States[I];
24066 SmallVector<Value *> Candidates;
24067 Candidates.reserve(2 * OrigReducedVals.size());
24068 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
24069 for (Value *ReducedVal : OrigReducedVals) {
24070 Value *RdxVal = TrackedVals.at(ReducedVal);
24071 // Check if the reduction value was not overridden by the extractelement
24072 // instruction because of the vectorization and exclude it, if it is not
24073 // compatible with other values.
24074 // Also check if the instruction was folded to constant/other value.
24075 auto *Inst = dyn_cast<Instruction>(RdxVal);
24076 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
24077 (!S || !S.getMatchingMainOpOrAltOp(Inst))) ||
24078 (S && !Inst))
24079 continue;
24080 Candidates.push_back(RdxVal);
24081 TrackedToOrig.try_emplace(RdxVal, ReducedVal);
24082 }
24083 bool ShuffledExtracts = false;
24084 // Try to handle shuffled extractelements.
24085 if (S && S.getOpcode() == Instruction::ExtractElement &&
24086 !S.isAltShuffle() && I + 1 < E) {
24087 SmallVector<Value *> CommonCandidates(Candidates);
24088 for (Value *RV : ReducedVals[I + 1]) {
24089 Value *RdxVal = TrackedVals.at(RV);
24090 // Check if the reduction value was not overridden by the
24091 // extractelement instruction because of the vectorization and
24092 // exclude it, if it is not compatible with other values.
24093 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
24094 if (!Inst)
24095 continue;
24096 CommonCandidates.push_back(RdxVal);
24097 TrackedToOrig.try_emplace(RdxVal, RV);
24098 }
24099 SmallVector<int> Mask;
24100 if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
24101 ++I;
24102 Candidates.swap(CommonCandidates);
24103 ShuffledExtracts = true;
24104 }
24105 }
24106
24107 // Emit code for constant values.
24108 if (Candidates.size() > 1 && allConstant(Candidates)) {
24109 Value *Res = Candidates.front();
24110 Value *OrigV = TrackedToOrig.at(Candidates.front());
24111 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24112 for (Value *VC : ArrayRef(Candidates).drop_front()) {
24113 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
24114 Value *OrigV = TrackedToOrig.at(VC);
24115 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24116 if (auto *ResI = dyn_cast<Instruction>(Res))
24117 V.analyzedReductionRoot(ResI);
24118 }
24119 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
24120 continue;
24121 }
24122
24123 unsigned NumReducedVals = Candidates.size();
24124 if (NumReducedVals < ReductionLimit &&
24125 (NumReducedVals < 2 || !isSplat(Candidates)))
24126 continue;
24127
24128 // Check if we support repeated scalar values processing (optimization of
24129 // original scalar identity operations on matched horizontal reductions).
24130 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
24131 RdxKind != RecurKind::FMul &&
24132 RdxKind != RecurKind::FMulAdd;
24133 // Gather same values.
24134 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
24135 if (IsSupportedHorRdxIdentityOp)
24136 for (Value *V : Candidates) {
24137 Value *OrigV = TrackedToOrig.at(V);
24138 ++SameValuesCounter.try_emplace(OrigV).first->second;
24139 }
24140 // Used to check if the reduced values are used the same number of times.
24141 // In this case the compiler may produce better code. E.g. if reduced values are
24142 // aabbccdd (8 x values), then the first node of the tree will have a node
24143 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
24144 // Plus, the final reduction will be performed on <8 x aabbccdd>.
24145 // Instead, the compiler may build the <4 x abcd> tree immediately and then
24146 // compute reduce(<4 x abcd>) * 2.
24147 // Currently it only handles add/fadd/xor. and/or/min/max do not require
24148 // this analysis, other operations may require an extra estimation of
24149 // the profitability.
24150 bool SameScaleFactor = false;
24151 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
24152 SameValuesCounter.size() != Candidates.size();
24153 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
24154 if (OptReusedScalars) {
24155 SameScaleFactor =
24156 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
24157 RdxKind == RecurKind::Xor) &&
24158 all_of(drop_begin(SameValuesCounter),
24159 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
24160 return P.second == SameValuesCounter.front().second;
24161 });
24162 Candidates.resize(SameValuesCounter.size());
24163 transform(SameValuesCounter, Candidates.begin(),
24164 [&](const auto &P) { return TrackedVals.at(P.first); });
24165 NumReducedVals = Candidates.size();
24166 // Have a reduction of the same element.
24167 if (NumReducedVals == 1) {
24168 Value *OrigV = TrackedToOrig.at(Candidates.front());
24169 unsigned Cnt = At(SameValuesCounter, OrigV);
24170 Value *RedVal =
24171 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
24172 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24173 VectorizedVals.try_emplace(OrigV, Cnt);
24174 ExternallyUsedValues.insert(OrigV);
24175 continue;
24176 }
24177 }
24178
24179 unsigned MaxVecRegSize = V.getMaxVecRegSize();
24180 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
24181 const unsigned MaxElts = std::clamp<unsigned>(
24182 llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
24183 RegMaxNumber * RedValsMaxNumber);
24184
24185 unsigned ReduxWidth = NumReducedVals;
24186 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
24187 unsigned NumParts, NumRegs;
24188 Type *ScalarTy = Candidates.front()->getType();
24189 ReduxWidth =
24190 getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
24191 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
24192 NumParts = ::getNumberOfParts(TTI, Tp);
24193 NumRegs =
24194 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
24195 while (NumParts > NumRegs) {
24196 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
24197 ReduxWidth = bit_floor(ReduxWidth - 1);
24198 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
24199 NumParts = ::getNumberOfParts(TTI, Tp);
24200 NumRegs =
24201 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
24202 }
24203 if (NumParts > NumRegs / 2)
24204 ReduxWidth = bit_floor(ReduxWidth);
24205 return ReduxWidth;
24206 };
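// Rough behavior of GetVectorFactor (the exact numbers depend on the target's
// register information): the requested width is first floored to a whole
// number of full vector registers, and while the widened type would still
// split into more parts than there are registers of its class, the width is
// dropped to the next smaller power of two.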
24207 if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
24208 ReduxWidth = GetVectorFactor(ReduxWidth);
24209 ReduxWidth = std::min(ReduxWidth, MaxElts);
24210
24211 unsigned Start = 0;
24212 unsigned Pos = Start;
24213 // Restarts vectorization attempt with lower vector factor.
24214 unsigned PrevReduxWidth = ReduxWidth;
24215 bool CheckForReusedReductionOpsLocal = false;
24216 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
24217 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
24218 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
24219 // Check if any of the reduction ops are gathered. If so, it is worth
24220 // trying again with a smaller number of reduction ops.
24221 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
24222 }
24223 ++Pos;
24224 if (Pos < NumReducedVals - ReduxWidth + 1)
24225 return IsAnyRedOpGathered;
24226 Pos = Start;
24227 --ReduxWidth;
24228 if (ReduxWidth > 1)
24229 ReduxWidth = GetVectorFactor(ReduxWidth);
24230 return IsAnyRedOpGathered;
24231 };
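// Together with the loop below this forms a sliding-window retry scheme:
// every window of ReduxWidth consecutive candidates is tried first (Pos
// advances), and only after all positions fail is the vector factor reduced
// and the scan restarted from Start.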
24232 bool AnyVectorized = false;
24233 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
24234 while (Pos < NumReducedVals - ReduxWidth + 1 &&
24235 ReduxWidth >= ReductionLimit) {
24236 // Dependency in tree of the reduction ops - drop this attempt, try
24237 // later.
24238 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
24239 Start == 0) {
24240 CheckForReusedReductionOps = true;
24241 break;
24242 }
24243 PrevReduxWidth = ReduxWidth;
24244 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
24245 // Been analyzed already - skip.
24246 if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
24247 (!has_single_bit(ReduxWidth) &&
24248 (IgnoredCandidates.contains(
24249 std::make_pair(Pos, bit_floor(ReduxWidth))) ||
24250 IgnoredCandidates.contains(
24251 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
24252 bit_floor(ReduxWidth))))) ||
24253 V.areAnalyzedReductionVals(VL)) {
24254 (void)AdjustReducedVals(/*IgnoreVL=*/true);
24255 continue;
24256 }
24257 // Early exit if any of the reduction values were deleted during
24258 // previous vectorization attempts.
24259 if (any_of(VL, [&V](Value *RedVal) {
24260 auto *RedValI = dyn_cast<Instruction>(RedVal);
24261 return RedValI && V.isDeleted(RedValI);
24262 }))
24263 break;
24264 V.buildTree(VL, IgnoreList);
24265 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
24266 if (!AdjustReducedVals())
24267 V.analyzedReductionVals(VL);
24268 continue;
24269 }
24270 if (V.isLoadCombineReductionCandidate(RdxKind)) {
24271 if (!AdjustReducedVals())
24272 V.analyzedReductionVals(VL);
24273 continue;
24274 }
24275 V.reorderTopToBottom();
24276 // No need to reorder the root node at all for reassociative reduction.
24277 V.reorderBottomToTop(/*IgnoreReorder=*/RdxFMF.allowReassoc() ||
24278 VL.front()->getType()->isIntOrIntVectorTy() ||
24279 ReductionLimit > 2);
24280 // Keep extracted other reduction values, if they are used in the
24281 // vectorization trees.
24282 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
24283 ExternallyUsedValues);
24284 // The reduction root is used as the insertion point for new
24285 // instructions, so set it as externally used to prevent it from being
24286 // deleted.
24287 LocalExternallyUsedValues.insert(ReductionRoot);
24288 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
24289 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
24290 continue;
24291 for (Value *V : ReducedVals[Cnt])
24292 if (isa<Instruction>(V))
24293 LocalExternallyUsedValues.insert(TrackedVals[V]);
24294 }
24295 if (!IsSupportedHorRdxIdentityOp) {
24296 // Number of uses of the candidates in the vector of values.
24297 assert(SameValuesCounter.empty() &&
24298 "Reused values counter map is not empty");
24299 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24300 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24301 continue;
24302 Value *V = Candidates[Cnt];
24303 Value *OrigV = TrackedToOrig.at(V);
24304 ++SameValuesCounter.try_emplace(OrigV).first->second;
24305 }
24306 }
24307 V.transformNodes();
24308 SmallPtrSet<Value *, 4> VLScalars(llvm::from_range, VL);
24309 // Gather externally used values.
24310 SmallPtrSet<Value *, 4> Visited;
24311 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24312 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24313 continue;
24314 Value *RdxVal = Candidates[Cnt];
24315 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
24316 RdxVal = It->second;
24317 if (!Visited.insert(RdxVal).second)
24318 continue;
24319 // Check if the scalar was vectorized as part of the vectorization
24320 // tree but not the top node.
24321 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
24322 LocalExternallyUsedValues.insert(RdxVal);
24323 continue;
24324 }
24325 Value *OrigV = TrackedToOrig.at(RdxVal);
24326 unsigned NumOps =
24327 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
24328 if (NumOps != ReducedValsToOps.at(OrigV).size())
24329 LocalExternallyUsedValues.insert(RdxVal);
24330 }
24331 // Do not need the list of reused scalars in regular mode anymore.
24332 if (!IsSupportedHorRdxIdentityOp)
24333 SameValuesCounter.clear();
24334 for (Value *RdxVal : VL)
24335 if (RequiredExtract.contains(RdxVal))
24336 LocalExternallyUsedValues.insert(RdxVal);
24337 V.buildExternalUses(LocalExternallyUsedValues);
24338
24339 V.computeMinimumValueSizes();
24340
24341 // Estimate cost.
24342 InstructionCost ReductionCost =
24343 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
24344 InstructionCost Cost = V.getTreeCost(VL, ReductionCost);
24345 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
24346 << " for reduction\n");
24347 if (!Cost.isValid())
24348 break;
24349 if (Cost >= -SLPCostThreshold) {
24350 V.getORE()->emit([&]() {
24351 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
24352 ReducedValsToOps.at(VL[0]).front())
24353 << "Vectorizing horizontal reduction is possible "
24354 << "but not beneficial with cost " << ore::NV("Cost", Cost)
24355 << " and threshold "
24356 << ore::NV("Threshold", -SLPCostThreshold);
24357 });
24358 if (!AdjustReducedVals()) {
24359 V.analyzedReductionVals(VL);
24360 unsigned Offset = Pos == Start ? Pos : Pos - 1;
24361 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
24362 // Add subvectors of VL to the list of the analyzed values.
24363 for (unsigned VF = getFloorFullVectorNumberOfElements(
24364 *TTI, VL.front()->getType(), ReduxWidth - 1);
24365 VF >= ReductionLimit;
24366 VF = getFloorFullVectorNumberOfElements(
24367 *TTI, VL.front()->getType(), VF - 1)) {
24368 if (has_single_bit(VF) &&
24369 V.getCanonicalGraphSize() != V.getTreeSize())
24370 continue;
24371 for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
24372 IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
24373 }
24374 }
24375 }
24376 continue;
24377 }
24378
24379 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
24380 << Cost << ". (HorRdx)\n");
24381 V.getORE()->emit([&]() {
24382 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
24383 ReducedValsToOps.at(VL[0]).front())
24384 << "Vectorized horizontal reduction with cost "
24385 << ore::NV("Cost", Cost) << " and with tree size "
24386 << ore::NV("TreeSize", V.getTreeSize());
24387 });
24388
24389 Builder.setFastMathFlags(RdxFMF);
24390
24391 // Emit a reduction. If the root is a select (min/max idiom), the insert
24392 // point is the compare condition of that select.
24393 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
24394 Instruction *InsertPt = RdxRootInst;
24395 if (IsCmpSelMinMax)
24396 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
24397
24398 // Vectorize a tree.
24399 Value *VectorizedRoot = V.vectorizeTree(
24400 LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
24401 // Update TrackedToOrig mapping, since the tracked values might be
24402 // updated.
24403 for (Value *RdxVal : Candidates) {
24404 Value *OrigVal = TrackedToOrig.at(RdxVal);
24405 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
24406 if (TransformedRdxVal != RdxVal)
24407 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
24408 }
24409
24410 Builder.SetInsertPoint(InsertPt);
24411
24412 // To prevent poison from leaking across what used to be sequential,
24413 // safe, scalar boolean logic operations, the reduction operand must be
24414 // frozen.
24415 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
24416 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
24417
24418 // Emit code to correctly handle reused reduced values, if required.
24419 if (OptReusedScalars && !SameScaleFactor) {
24420 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
24421 SameValuesCounter, TrackedToOrig);
24422 }
24423
24424 Type *ScalarTy = VL.front()->getType();
24425 Type *VecTy = VectorizedRoot->getType();
24426 Type *RedScalarTy = VecTy->getScalarType();
24427 VectorValuesAndScales.emplace_back(
24428 VectorizedRoot,
24429 OptReusedScalars && SameScaleFactor
24430 ? SameValuesCounter.front().second
24431 : 1,
24432 RedScalarTy != ScalarTy->getScalarType()
24433 ? V.isSignedMinBitwidthRootNode()
24434 : true);
24435
24436 // Count vectorized reduced values to exclude them from final reduction.
24437 for (Value *RdxVal : VL) {
24438 Value *OrigV = TrackedToOrig.at(RdxVal);
24439 if (IsSupportedHorRdxIdentityOp) {
24440 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
24441 continue;
24442 }
24443 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24444 if (!V.isVectorized(RdxVal))
24445 RequiredExtract.insert(RdxVal);
24446 }
24447 Pos += ReduxWidth;
24448 Start = Pos;
24449 ReduxWidth = NumReducedVals - Pos;
24450 if (ReduxWidth > 1)
24451 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
24452 AnyVectorized = true;
24453 }
24454 if (OptReusedScalars && !AnyVectorized) {
24455 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
24456 Value *RdxVal = TrackedVals.at(P.first);
24457 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
24458 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24459 VectorizedVals.try_emplace(P.first, P.second);
24460 }
24461 continue;
24462 }
24463 }
24464 if (!VectorValuesAndScales.empty())
24465 VectorizedTree = GetNewVectorizedTree(
24466 VectorizedTree,
24467 emitReduction(Builder, *TTI, ReductionRoot->getType()));
24468
24469 if (!VectorizedTree) {
24470 if (!CheckForReusedReductionOps) {
24471 for (ReductionOpsType &RdxOps : ReductionOps)
24472 for (Value *RdxOp : RdxOps)
24473 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
24474 }
24475 return nullptr;
24476 }
24477
24478 // Reorder operands of bool logical op in the natural order to avoid
24479 // possible problem with poison propagation. If not possible to reorder
24480 // (both operands are originally RHS), emit an extra freeze instruction
24481 // for the LHS operand.
24482 // I.e., if we have original code like this:
24483 // RedOp1 = select i1 ?, i1 LHS, i1 false
24484 // RedOp2 = select i1 RHS, i1 ?, i1 false
24485
24486 // Then, we swap LHS/RHS to create a new op that matches the poison
24487 // semantics of the original code.
24488
24489 // If we have original code like this and both values could be poison:
24490 // RedOp1 = select i1 ?, i1 LHS, i1 false
24491 // RedOp2 = select i1 ?, i1 RHS, i1 false
24492
24493 // Then, we must freeze LHS in the new op.
24494 auto FixBoolLogicalOps =
24495 [&, VectorizedTree](Value *&LHS, Value *&RHS, Instruction *RedOp1,
24496 Instruction *RedOp2, bool InitStep) {
24497 if (!AnyBoolLogicOp)
24498 return;
24499 if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
24500 getRdxOperand(RedOp1, 0) == LHS ||
24501 isGuaranteedNotToBePoison(LHS, AC)))
24502 return;
24503 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
24504 getRdxOperand(RedOp2, 0) == RHS ||
24505 isGuaranteedNotToBePoison(RHS, AC))) {
24506 std::swap(LHS, RHS);
24507 return;
24508 }
24509 if (LHS != VectorizedTree)
24510 LHS = Builder.CreateFreeze(LHS);
24511 };
24512 // Finish the reduction.
24513 // Need to add extra arguments and any possible reduction values that were
24514 // not vectorized. Try to avoid dependencies between the scalar remainders.
24515 auto FinalGen = [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
24516 bool InitStep) {
24517 unsigned Sz = InstVals.size();
24518 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 + Sz % 2);
24519 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
24520 Instruction *RedOp = InstVals[I + 1].first;
24521 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
24522 Value *RdxVal1 = InstVals[I].second;
24523 Value *StableRdxVal1 = RdxVal1;
24524 auto It1 = TrackedVals.find(RdxVal1);
24525 if (It1 != TrackedVals.end())
24526 StableRdxVal1 = It1->second;
24527 Value *RdxVal2 = InstVals[I + 1].second;
24528 Value *StableRdxVal2 = RdxVal2;
24529 auto It2 = TrackedVals.find(RdxVal2);
24530 if (It2 != TrackedVals.end())
24531 StableRdxVal2 = It2->second;
24532 // To prevent poison from leaking across what used to be sequential,
24533 // safe, scalar boolean logic operations, the reduction operand must be
24534 // frozen.
24535 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
24536 RedOp, InitStep);
24537 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
24538 StableRdxVal2, "op.rdx", ReductionOps);
24539 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
24540 }
24541 if (Sz % 2 == 1)
24542 ExtraReds[Sz / 2] = InstVals.back();
24543 return ExtraReds;
24544 };
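// FinalGen pairs up the remaining scalar values and emits one reduction op per
// pair, so each call halves the list; e.g. for leftover values a, b, c, d, e
// (a sketch):
//   round 1: {a op b, c op d, e}
//   round 2: {(a op b) op (c op d), e}
//   round 3: {((a op b) op (c op d)) op e}
// which keeps the scalar remainder shallow instead of one serial chain.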
24545 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
24546 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
24547 VectorizedTree);
24548 SmallPtrSet<Value *, 8> Visited;
24549 for (ArrayRef<Value *> Candidates : ReducedVals) {
24550 for (Value *RdxVal : Candidates) {
24551 if (!Visited.insert(RdxVal).second)
24552 continue;
24553 unsigned NumOps = VectorizedVals.lookup(RdxVal);
24554 for (Instruction *RedOp :
24555 ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
24556 ExtraReductions.emplace_back(RedOp, RdxVal);
24557 }
24558 }
24559 // Iterate through all not-vectorized reduction values/extra arguments.
24560 bool InitStep = true;
24561 while (ExtraReductions.size() > 1) {
24562 SmallVector<std::pair<Instruction *, Value *>> NewReds =
24563 FinalGen(ExtraReductions, InitStep);
24564 ExtraReductions.swap(NewReds);
24565 InitStep = false;
24566 }
24567 VectorizedTree = ExtraReductions.front().second;
24568
24569 ReductionRoot->replaceAllUsesWith(VectorizedTree);
24570
24571 // The original scalar reduction is expected to have no remaining
24572 // uses outside the reduction tree itself. Assert that we got this
24573 // correct, replace internal uses with poison, and mark for eventual
24574 // deletion.
24575#ifndef NDEBUG
24576 SmallPtrSet<Value *, 4> IgnoreSet;
24577 for (ArrayRef<Value *> RdxOps : ReductionOps)
24578 IgnoreSet.insert_range(RdxOps);
24579#endif
24580 for (ArrayRef<Value *> RdxOps : ReductionOps) {
24581 for (Value *Ignore : RdxOps) {
24582 if (!Ignore)
24583 continue;
24584#ifndef NDEBUG
24585 for (auto *U : Ignore->users()) {
24586 assert(IgnoreSet.count(U) &&
24587 "All users must be either in the reduction ops list.");
24588 }
24589#endif
24590 if (!Ignore->use_empty()) {
24591 Value *P = PoisonValue::get(Ignore->getType());
24592 Ignore->replaceAllUsesWith(P);
24593 }
24594 }
24595 V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
24596 }
24597 return VectorizedTree;
24598 }
24599
24600private:
24601 /// Creates the reduction from the given \p Vec vector value with the given
24602 /// scale \p Scale and signedness \p IsSigned.
24603 Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
24604 Value *Vec, unsigned Scale, bool IsSigned,
24605 Type *DestTy) {
24606 Value *Rdx;
24607 if (auto *VecTy = dyn_cast<FixedVectorType>(DestTy)) {
24608 unsigned DestTyNumElements = getNumElements(VecTy);
24609 unsigned VF = getNumElements(Vec->getType()) / DestTyNumElements;
24610 Rdx = PoisonValue::get(
24611 getWidenedType(Vec->getType()->getScalarType(), DestTyNumElements));
24612 for (unsigned I : seq<unsigned>(DestTyNumElements)) {
24613 // Do reduction for each lane.
24614 // e.g., do reduce add for
24615 // VL[0] = <4 x Ty> <a, b, c, d>
24616 // VL[1] = <4 x Ty> <e, f, g, h>
24617 // Lane[0] = <2 x Ty> <a, e>
24618 // Lane[1] = <2 x Ty> <b, f>
24619 // Lane[2] = <2 x Ty> <c, g>
24620 // Lane[3] = <2 x Ty> <d, h>
24621 // result[0] = reduce add Lane[0]
24622 // result[1] = reduce add Lane[1]
24623 // result[2] = reduce add Lane[2]
24624 // result[3] = reduce add Lane[3]
24625 SmallVector<int, 16> Mask = createStrideMask(I, DestTyNumElements, VF);
24626 Value *Lane = Builder.CreateShuffleVector(Vec, Mask);
24627 Rdx = Builder.CreateInsertElement(
24628 Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
24629 }
24630 } else {
24631 Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
24632 }
24633 if (Rdx->getType() != DestTy)
24634 Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
24635 // Improved analysis for add/fadd/xor reductions with same scale
24636 // factor for all operands of reductions. We can emit scalar ops for
24637 // them instead.
24638 if (Scale > 1)
24639 Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
24640 return Rdx;
24641 }
24642
24643 /// Calculate the cost of a reduction.
24644 InstructionCost getReductionCost(TargetTransformInfo *TTI,
24645 ArrayRef<Value *> ReducedVals,
24646 bool IsCmpSelMinMax, FastMathFlags FMF,
24647 const BoUpSLP &R, DominatorTree &DT,
24648 const DataLayout &DL,
24649 const TargetLibraryInfo &TLI) {
24650 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
24651 Type *ScalarTy = ReducedVals.front()->getType();
24652 unsigned ReduxWidth = ReducedVals.size();
24653 FixedVectorType *VectorTy = R.getReductionType();
24654 InstructionCost VectorCost = 0, ScalarCost;
24655 // If all of the reduced values are constant, the vector cost is 0, since
24656 // the reduction value can be calculated at compile time.
24657 bool AllConsts = allConstant(ReducedVals);
24658 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
24659 InstructionCost Cost = 0;
24660 // Scalar cost is repeated for N-1 elements.
24661 int Cnt = ReducedVals.size();
24662 for (Value *RdxVal : ReducedVals) {
24663 if (Cnt == 1)
24664 break;
24665 --Cnt;
24666 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
24667 Cost += GenCostFn();
24668 continue;
24669 }
24670 InstructionCost ScalarCost = 0;
24671 for (User *U : RdxVal->users()) {
24672 auto *RdxOp = cast<Instruction>(U);
24673 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
24674 if (RdxKind == RecurKind::FAdd) {
24675 InstructionCost FMACost = canConvertToFMA(
24676 RdxOp, getSameOpcode(RdxOp, TLI), DT, DL, *TTI, TLI);
24677 if (FMACost.isValid()) {
24678 LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n");
24679 if (auto *I = dyn_cast<Instruction>(RdxVal)) {
24680 // Also, exclude scalar fmul cost.
24681 InstructionCost FMulCost =
24682 TTI->getInstructionCost(I, CostKind);
24683 LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n");
24684 FMACost -= FMulCost;
24685 }
24686 ScalarCost += FMACost;
24687 continue;
24688 }
24689 }
24690 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
24691 continue;
24692 }
24693 ScalarCost = InstructionCost::getInvalid();
24694 break;
24695 }
24696 if (ScalarCost.isValid())
24697 Cost += ScalarCost;
24698 else
24699 Cost += GenCostFn();
24700 }
24701 return Cost;
24702 };
24703 // Require the reduction cost if:
24704 // 1. This type is not a full register type and there are no other vectors
24705 // with the same type in the storage (first vector with a small type).
24706 // 2. The storage does not have any vector with full vector use (first
24707 // vector with full register use).
24708 bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
24709 switch (RdxKind) {
24710 case RecurKind::Add:
24711 case RecurKind::Mul:
24712 case RecurKind::Or:
24713 case RecurKind::And:
24714 case RecurKind::Xor:
24715 case RecurKind::FAdd:
24716 case RecurKind::FMul: {
24717 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
24718 if (!AllConsts) {
24719 if (DoesRequireReductionOp) {
24720 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
24721 assert(SLPReVec && "FixedVectorType is not expected.");
24722 unsigned ScalarTyNumElements = VecTy->getNumElements();
24723 for (unsigned I : seq<unsigned>(ReducedVals.size())) {
24724 VectorCost += TTI->getShuffleCost(
24725 TTI::SK_PermuteSingleSrc,
24726 getWidenedType(VecTy->getElementType(),
24727 ReducedVals.size()),
24728 VectorTy,
24729 createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
24730 VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy,
24731 FMF, CostKind);
24732 }
24733 VectorCost += TTI->getScalarizationOverhead(
24734 VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
24735 /*Extract*/ false, TTI::TCK_RecipThroughput);
24736 } else {
24737 Type *RedTy = VectorTy->getElementType();
24738 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24739 std::make_pair(RedTy, true));
24740 if (RType == RedTy) {
24741 VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
24742 FMF, CostKind);
24743 } else {
24744 VectorCost = TTI->getExtendedReductionCost(
24745 RdxOpcode, !IsSigned, RedTy,
24746 getWidenedType(RType, ReduxWidth), FMF, CostKind);
24747 }
24748 }
24749 } else {
24750 Type *RedTy = VectorTy->getElementType();
24751 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24752 std::make_pair(RedTy, true));
24753 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
24754 InstructionCost FMACost = InstructionCost::getInvalid();
24755 if (RdxKind == RecurKind::FAdd) {
24756 // Check if the reduction operands can be converted to FMA.
24757 SmallVector<Value *> Ops;
24758 FastMathFlags FMF;
24759 FMF.set();
24760 for (Value *RdxVal : ReducedVals) {
24761 if (!RdxVal->hasOneUse()) {
24762 Ops.clear();
24763 break;
24764 }
24765 if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
24766 FMF &= FPCI->getFastMathFlags();
24767 Ops.push_back(RdxVal->user_back());
24768 }
24769 if (!Ops.empty()) {
24770 FMACost = canConvertToFMA(Ops, getSameOpcode(Ops, TLI), DT, DL,
24771 *TTI, TLI);
24772 if (FMACost.isValid()) {
24773 // Calculate actual FMAD cost.
24774 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
24775 {RVecTy, RVecTy, RVecTy}, FMF);
24776 FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);
24777
24778 LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n");
24779 // Also, exclude vector fmul cost.
24780 InstructionCost FMulCost = TTI->getArithmeticInstrCost(
24781 Instruction::FMul, RVecTy, CostKind);
24782 LLVM_DEBUG(dbgs()
24783 << "Minus vector FMul cost: " << FMulCost << "\n");
24784 FMACost -= FMulCost;
24785 }
24786 }
24787 }
24788 if (FMACost.isValid())
24789 VectorCost += FMACost;
24790 else
24791 VectorCost +=
24792 TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
24793 if (RType != RedTy) {
24794 unsigned Opcode = Instruction::Trunc;
24795 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
24796 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
24797 VectorCost += TTI->getCastInstrCost(
24798 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
24799 }
24800 }
24801 }
24802 ScalarCost = EvaluateScalarCost([&]() {
24803 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
24804 });
24805 break;
24806 }
24807 case RecurKind::FMax:
24808 case RecurKind::FMin:
24809 case RecurKind::FMaximum:
24810 case RecurKind::FMinimum:
24811 case RecurKind::SMax:
24812 case RecurKind::SMin:
24813 case RecurKind::UMax:
24814 case RecurKind::UMin: {
24815 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
24816 if (!AllConsts) {
24817 if (DoesRequireReductionOp) {
24818 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
24819 } else {
24820 // Check if the previous reduction already exists and account for it as a
24821 // series of operations plus a single reduction.
24822 Type *RedTy = VectorTy->getElementType();
24823 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24824 std::make_pair(RedTy, true));
24825 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
24826 IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
24827 VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
24828 if (RType != RedTy) {
24829 unsigned Opcode = Instruction::Trunc;
24830 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
24831 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
24832 VectorCost += TTI->getCastInstrCost(
24833 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
24834 }
24835 }
24836 }
24837 ScalarCost = EvaluateScalarCost([&]() {
24838 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
24839 return TTI->getIntrinsicInstrCost(ICA, CostKind);
24840 });
24841 break;
24842 }
24843 default:
24844 llvm_unreachable("Expected arithmetic or min/max reduction operation");
24845 }
24846
24847 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
24848 << " for reduction of " << shortBundleName(ReducedVals)
24849 << " (It is a splitting reduction)\n");
24850 return VectorCost - ScalarCost;
24851 }
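// The returned value is VectorCost - ScalarCost, so a negative result means
// the vectorized reduction is expected to be cheaper than the scalar chain;
// the caller later compares the whole tree cost against -SLPCostThreshold
// before actually vectorizing.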
24852
24853 /// Splits the values, stored in VectorValuesAndScales, into registers/free
24854 /// sub-registers, combines them with the given reduction operation as a
24855 /// vector operation and then performs single (small enough) reduction.
24856 Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
24857 Type *DestTy) {
24858 Value *ReducedSubTree = nullptr;
24859 // Creates reduction and combines with the previous reduction.
24860 auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
24861 Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
24862 if (ReducedSubTree)
24863 ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
24864 "op.rdx", ReductionOps);
24865 else
24866 ReducedSubTree = Rdx;
24867 };
24868 if (VectorValuesAndScales.size() == 1) {
24869 const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
24870 CreateSingleOp(Vec, Scale, IsSigned);
24871 return ReducedSubTree;
24872 }
24873 // Scales Vec using the given Cnt scale factor and then combines it as a
24874 // vector operation with the previous value of VecRes.
24875 Value *VecRes = nullptr;
24876 bool VecResSignedness = false;
24877 auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
24878 Type *ScalarTy = Vec->getType()->getScalarType();
24879 // Scale Vec using given Cnt scale factor.
24880 if (Cnt > 1) {
24881 ElementCount EC = cast<VectorType>(Vec->getType())->getElementCount();
24882 switch (RdxKind) {
24883 case RecurKind::Add: {
24884 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
24885 unsigned VF = getNumElements(Vec->getType());
24886 LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << "of " << Vec
24887 << ". (HorRdx)\n");
24888 SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
24889 for (unsigned I : seq<unsigned>(Cnt))
24890 std::iota(std::next(Mask.begin(), VF * I),
24891 std::next(Mask.begin(), VF * (I + 1)), 0);
24892 ++NumVectorInstructions;
24893 Vec = Builder.CreateShuffleVector(Vec, Mask);
24894 break;
24895 }
24896 // res = mul vv, n
24897 if (ScalarTy != DestTy->getScalarType())
24898 Vec = Builder.CreateIntCast(
24899 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
24900 IsSigned);
24901 Value *Scale = ConstantVector::getSplat(
24902 EC, ConstantInt::get(DestTy->getScalarType(), Cnt));
24903 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
24904 << ". (HorRdx)\n");
24905 ++NumVectorInstructions;
24906 Vec = Builder.CreateMul(Vec, Scale);
24907 break;
24908 }
24909 case RecurKind::Xor: {
24910 // res = n % 2 ? 0 : vv
24912 << "SLP: Xor " << Cnt << "of " << Vec << ". (HorRdx)\n");
24913 if (Cnt % 2 == 0)
24914 Vec = Constant::getNullValue(Vec->getType());
24915 break;
24916 }
24917 case RecurKind::FAdd: {
24918 // res = fmul v, n
24919 Value *Scale =
24920 ConstantVector::getSplat(EC, ConstantFP::get(ScalarTy, Cnt));
24921 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
24922 << ". (HorRdx)\n");
24923 ++NumVectorInstructions;
24924 Vec = Builder.CreateFMul(Vec, Scale);
24925 break;
24926 }
24927 case RecurKind::And:
24928 case RecurKind::Or:
24929 case RecurKind::SMax:
24930 case RecurKind::SMin:
24931 case RecurKind::UMax:
24932 case RecurKind::UMin:
24933 case RecurKind::FMax:
24934 case RecurKind::FMin:
24935 case RecurKind::FMaximum:
24936 case RecurKind::FMinimum:
24937 // res = vv
24938 break;
24939 case RecurKind::Sub:
24940 case RecurKind::AddChainWithSubs:
24941 case RecurKind::Mul:
24942 case RecurKind::FMul:
24943 case RecurKind::FMulAdd:
24944 case RecurKind::AnyOf:
24945 case RecurKind::FindFirstIVSMin:
24946 case RecurKind::FindFirstIVUMin:
24947 case RecurKind::FindLastIVSMax:
24948 case RecurKind::FindLastIVUMax:
24949 case RecurKind::FMaxNum:
24950 case RecurKind::FMinNum:
24951 case RecurKind::FMaximumNum:
24952 case RecurKind::FMinimumNum:
24953 case RecurKind::None:
24954 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
24955 }
24956 }
24957 // Combine Vec with the previous VecRes.
24958 if (!VecRes) {
24959 VecRes = Vec;
24960 VecResSignedness = IsSigned;
24961 } else {
24962 ++NumVectorInstructions;
24963 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
24964 VecRes->getType()->getScalarType() == Builder.getInt1Ty()) {
24965 // Handle ctpop.
24966 unsigned VecResVF = getNumElements(VecRes->getType());
24967 unsigned VecVF = getNumElements(Vec->getType());
24968 SmallVector<int> Mask(VecResVF + VecVF, PoisonMaskElem);
24969 std::iota(Mask.begin(), Mask.end(), 0);
24970 // Ensure that VecRes is always larger than Vec
24971 if (VecResVF < VecVF) {
24972 std::swap(VecRes, Vec);
24973 std::swap(VecResVF, VecVF);
24974 }
24975 if (VecResVF != VecVF) {
24976 SmallVector<int> ResizeMask(VecResVF, PoisonMaskElem);
24977 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
24978 Vec = Builder.CreateShuffleVector(Vec, ResizeMask);
24979 }
24980 VecRes = Builder.CreateShuffleVector(VecRes, Vec, Mask, "rdx.op");
24981 return;
24982 }
24983 if (VecRes->getType()->getScalarType() != DestTy->getScalarType())
24984 VecRes = Builder.CreateIntCast(
24985 VecRes, getWidenedType(DestTy, getNumElements(VecRes->getType())),
24986 VecResSignedness);
24987 if (ScalarTy != DestTy->getScalarType())
24988 Vec = Builder.CreateIntCast(
24989 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
24990 IsSigned);
24991 unsigned VecResVF = getNumElements(VecRes->getType());
24992 unsigned VecVF = getNumElements(Vec->getType());
24993 // Ensure that VecRes is always larger than Vec
24994 if (VecResVF < VecVF) {
24995 std::swap(VecRes, Vec);
24996 std::swap(VecResVF, VecVF);
24997 }
24998 // extract + op + insert
24999 Value *Op = VecRes;
25000 if (VecResVF != VecVF)
25001 Op = createExtractVector(Builder, VecRes, VecVF, /*Index=*/0);
25002 Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
25003 if (VecResVF != VecVF)
25004 Op = createInsertVector(Builder, VecRes, Op, /*Index=*/0);
25005 VecRes = Op;
25006 }
25007 };
25008 for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
25009 CreateVecOp(Vec, Scale, IsSigned);
25010 CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);
25011
25012 return ReducedSubTree;
25013 }
25014
25015 /// Emit a horizontal reduction of the vectorized value.
25016 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
25017 const TargetTransformInfo *TTI, Type *DestTy) {
25018 assert(VectorizedValue && "Need to have a vectorized tree node");
25019 assert(RdxKind != RecurKind::FMulAdd &&
25020 "A call to the llvm.fmuladd intrinsic is not handled yet");
25021
25022 auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
25023 if (FTy->getScalarType() == Builder.getInt1Ty() &&
25024 RdxKind == RecurKind::Add &&
25025 DestTy->getScalarType() != FTy->getScalarType()) {
25026 // Convert vector_reduce_add(ZExt(<n x i1>)) to
25027 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
25028 Value *V = Builder.CreateBitCast(
25029 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
25030 ++NumVectorInstructions;
25031 return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
25032 }
25033 ++NumVectorInstructions;
25034 return createSimpleReduction(Builder, VectorizedValue, RdxKind);
25035 }
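// Concretely, for something like an add reduction of <8 x i1> that is consumed
// as i32 (a sketch), the branch above emits:
//   %bits = bitcast <8 x i1> %mask to i8
//   %cnt  = call i8 @llvm.ctpop.i8(i8 %bits)
// and the caller then zero-extends/truncates %cnt to the destination type.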
25036
25037 /// Emits optimized code for unique scalar value reused \p Cnt times.
25038 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
25039 unsigned Cnt) {
25040 assert(IsSupportedHorRdxIdentityOp &&
25041 "The optimization of matched scalar identity horizontal reductions "
25042 "must be supported.");
25043 if (Cnt == 1)
25044 return VectorizedValue;
25045 switch (RdxKind) {
25046 case RecurKind::Add: {
25047 // res = mul vv, n
25048 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
25049 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
25050 << VectorizedValue << ". (HorRdx)\n");
25051 return Builder.CreateMul(VectorizedValue, Scale);
25052 }
25053 case RecurKind::Xor: {
25054 // res = n % 2 ? 0 : vv
25055 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
25056 << ". (HorRdx)\n");
25057 if (Cnt % 2 == 0)
25058 return Constant::getNullValue(VectorizedValue->getType());
25059 return VectorizedValue;
25060 }
25061 case RecurKind::FAdd: {
25062 // res = fmul v, n
25063 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
25064 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
25065 << VectorizedValue << ". (HorRdx)\n");
25066 return Builder.CreateFMul(VectorizedValue, Scale);
25067 }
25068 case RecurKind::And:
25069 case RecurKind::Or:
25070 case RecurKind::SMax:
25071 case RecurKind::SMin:
25072 case RecurKind::UMax:
25073 case RecurKind::UMin:
25074 case RecurKind::FMax:
25075 case RecurKind::FMin:
25076 case RecurKind::FMaximum:
25077 case RecurKind::FMinimum:
25078 // res = vv
25079 return VectorizedValue;
25080 case RecurKind::Sub:
25081 case RecurKind::AddChainWithSubs:
25082 case RecurKind::Mul:
25083 case RecurKind::FMul:
25084 case RecurKind::FMulAdd:
25085 case RecurKind::AnyOf:
25086 case RecurKind::FindFirstIVSMin:
25087 case RecurKind::FindFirstIVUMin:
25088 case RecurKind::FindLastIVSMax:
25089 case RecurKind::FindLastIVUMax:
25090 case RecurKind::FMaxNum:
25091 case RecurKind::FMinNum:
25092 case RecurKind::FMaximumNum:
25093 case RecurKind::FMinimumNum:
25094 case RecurKind::None:
25095 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
25096 }
25097 return nullptr;
25098 }
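// For example, if a reduced value %v appears three times in an integer add
// reduction, the scalar remainder is emitted as "mul i32 %v, 3" instead of
// "%v + %v + %v"; for xor an even repeat count folds to 0 and an odd count
// folds to %v itself.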
25099
25100 /// Emits actual operation for the scalar identity values, found during
25101 /// horizontal reduction analysis.
25102 Value *
25103 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
25104 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
25105 const DenseMap<Value *, Value *> &TrackedToOrig) {
25106 assert(IsSupportedHorRdxIdentityOp &&
25107 "The optimization of matched scalar identity horizontal reductions "
25108 "must be supported.");
25109 ArrayRef<Value *> VL = R.getRootNodeScalars();
25110 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
25111 if (VTy->getElementType() != VL.front()->getType()) {
25112 VectorizedValue = Builder.CreateIntCast(
25113 VectorizedValue,
25114 getWidenedType(VL.front()->getType(), VTy->getNumElements()),
25115 R.isSignedMinBitwidthRootNode());
25116 }
25117 switch (RdxKind) {
25118 case RecurKind::Add: {
25119 // root = mul prev_root, <1, 1, n, 1>
25120 SmallVector<Constant *> Vals;
25121 for (Value *V : VL) {
25122 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25123 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
25124 }
25125 auto *Scale = ConstantVector::get(Vals);
25126 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
25127 << VectorizedValue << ". (HorRdx)\n");
25128 return Builder.CreateMul(VectorizedValue, Scale);
25129 }
25130 case RecurKind::And:
25131 case RecurKind::Or:
25132 // No need for multiple or/and(s).
25133 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
25134 << ". (HorRdx)\n");
25135 return VectorizedValue;
25136 case RecurKind::SMax:
25137 case RecurKind::SMin:
25138 case RecurKind::UMax:
25139 case RecurKind::UMin:
25140 case RecurKind::FMax:
25141 case RecurKind::FMin:
25142 case RecurKind::FMaximum:
25143 case RecurKind::FMinimum:
25144 // No need for multiple min/max(s) of the same value.
25145 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
25146 << ". (HorRdx)\n");
25147 return VectorizedValue;
25148 case RecurKind::Xor: {
25149 // Replace values with an even number of repeats with 0, since
25150 // x xor x = 0.
25151 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
25152 // 7>, if the 4th and 6th elements have an even number of repeats.
25153 SmallVector<int> Mask(
25154 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
25155 PoisonMaskElem);
25156 std::iota(Mask.begin(), Mask.end(), 0);
25157 bool NeedShuffle = false;
25158 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
25159 Value *V = VL[I];
25160 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25161 if (Cnt % 2 == 0) {
25162 Mask[I] = VF;
25163 NeedShuffle = true;
25164 }
25165 }
25166 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
25167 : Mask) dbgs()
25168 << I << " ";
25169 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
25170 if (NeedShuffle)
25171 VectorizedValue = Builder.CreateShuffleVector(
25172 VectorizedValue,
25173 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
25174 return VectorizedValue;
25175 }
25176 case RecurKind::FAdd: {
25177 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
25178 SmallVector<Constant *> Vals;
25179 for (Value *V : VL) {
25180 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25181 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
25182 }
25183 auto *Scale = ConstantVector::get(Vals);
25184 return Builder.CreateFMul(VectorizedValue, Scale);
25185 }
25186 case RecurKind::Sub:
25187 case RecurKind::AddChainWithSubs:
25188 case RecurKind::Mul:
25189 case RecurKind::FMul:
25190 case RecurKind::FMulAdd:
25191 case RecurKind::AnyOf:
25192 case RecurKind::FindFirstIVSMin:
25193 case RecurKind::FindFirstIVUMin:
25194 case RecurKind::FindLastIVSMax:
25195 case RecurKind::FindLastIVUMax:
25196 case RecurKind::FMaxNum:
25197 case RecurKind::FMinNum:
25198 case RecurKind::FMaximumNum:
25199 case RecurKind::FMinimumNum:
25200 case RecurKind::None:
25201 llvm_unreachable("Unexpected reduction kind for reused scalars.");
25202 }
25203 return nullptr;
25204 }
25205};
25206} // end anonymous namespace
25207
25208/// Gets recurrence kind from the specified value.
25209 static RecurKind getRdxKind(Value *V) {
25210 return HorizontalReduction::getRdxKind(V);
25211}
25212static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
25213 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
25214 return cast<FixedVectorType>(IE->getType())->getNumElements();
25215
25216 unsigned AggregateSize = 1;
25217 auto *IV = cast<InsertValueInst>(InsertInst);
25218 Type *CurrentType = IV->getType();
25219 do {
25220 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
25221 for (auto *Elt : ST->elements())
25222 if (Elt != ST->getElementType(0)) // check homogeneity
25223 return std::nullopt;
25224 AggregateSize *= ST->getNumElements();
25225 CurrentType = ST->getElementType(0);
25226 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
25227 AggregateSize *= AT->getNumElements();
25228 CurrentType = AT->getElementType();
25229 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
25230 AggregateSize *= VT->getNumElements();
25231 return AggregateSize;
25232 } else if (CurrentType->isSingleValueType()) {
25233 return AggregateSize;
25234 } else {
25235 return std::nullopt;
25236 }
25237 } while (true);
25238}
25239
25240static void findBuildAggregateRec(Instruction *LastInsertInst,
25241 TargetTransformInfo *TTI,
25242 SmallVectorImpl<Value *> &BuildVectorOpds,
25243 SmallVectorImpl<Value *> &InsertElts,
25244 unsigned OperandOffset, const BoUpSLP &R) {
25245 do {
25246 Value *InsertedOperand = LastInsertInst->getOperand(1);
25247 std::optional<unsigned> OperandIndex =
25248 getElementIndex(LastInsertInst, OperandOffset);
25249 if (!OperandIndex || R.isDeleted(LastInsertInst))
25250 return;
25251 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
25252 findBuildAggregateRec(cast<Instruction>(InsertedOperand), TTI,
25253 BuildVectorOpds, InsertElts, *OperandIndex, R);
25254
25255 } else {
25256 BuildVectorOpds[*OperandIndex] = InsertedOperand;
25257 InsertElts[*OperandIndex] = LastInsertInst;
25258 }
25259 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
25260 } while (LastInsertInst != nullptr &&
25261 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
25262 LastInsertInst->hasOneUse());
25263}
25264
25265/// Recognize construction of vectors like
25266/// %ra = insertelement <4 x float> poison, float %s0, i32 0
25267/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
25268/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
25269/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
25270/// starting from the last insertelement or insertvalue instruction.
25271///
25272/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
25273/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
25274/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
25275///
25276/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
25277///
25278/// \return true if it matches.
25279static bool findBuildAggregate(Instruction *LastInsertInst,
25280 TargetTransformInfo *TTI,
25281 SmallVectorImpl<Value *> &BuildVectorOpds,
25282 SmallVectorImpl<Value *> &InsertElts,
25283 const BoUpSLP &R) {
25284
25285 assert((isa<InsertElementInst>(LastInsertInst) ||
25286 isa<InsertValueInst>(LastInsertInst)) &&
25287 "Expected insertelement or insertvalue instruction!");
25288
25289 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
25290 "Expected empty result vectors!");
25291
25292 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
25293 if (!AggregateSize)
25294 return false;
25295 BuildVectorOpds.resize(*AggregateSize);
25296 InsertElts.resize(*AggregateSize);
25297
25298 findBuildAggregateRec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0, R);
25299 llvm::erase(BuildVectorOpds, nullptr);
25300 llvm::erase(InsertElts, nullptr);
25301 if (BuildVectorOpds.size() >= 2)
25302 return true;
25303
25304 return false;
25305}
25306
25307/// Try and get a reduction instruction from a phi node.
25308///
25309/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
25310/// if they come from either \p ParentBB or a containing loop latch.
25311///
25312/// \returns A candidate reduction value if possible, or \code nullptr \endcode
25313/// if not possible.
25314 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
25315 BasicBlock *ParentBB, LoopInfo *LI) {
25316 // There are situations where the reduction value is not dominated by the
25317 // reduction phi. Vectorizing such cases has been reported to cause
25318 // miscompiles. See PR25787.
25319 auto DominatedReduxValue = [&](Value *R) {
25320 return isa<Instruction>(R) &&
25321 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
25322 };
25323
25324 Instruction *Rdx = nullptr;
25325
25326 // Return the incoming value if it comes from the same BB as the phi node.
25327 if (P->getIncomingBlock(0) == ParentBB) {
25328 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
25329 } else if (P->getIncomingBlock(1) == ParentBB) {
25330 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
25331 }
25332
25333 if (Rdx && DominatedReduxValue(Rdx))
25334 return Rdx;
25335
25336 // Otherwise, check whether we have a loop latch to look at.
25337 Loop *BBL = LI->getLoopFor(ParentBB);
25338 if (!BBL)
25339 return nullptr;
25340 BasicBlock *BBLatch = BBL->getLoopLatch();
25341 if (!BBLatch)
25342 return nullptr;
25343
25344 // There is a loop latch, return the incoming value if it comes from
25345 // that. This reduction pattern occasionally turns up.
25346 if (P->getIncomingBlock(0) == BBLatch) {
25347 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
25348 } else if (P->getIncomingBlock(1) == BBLatch) {
25349 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
25350 }
25351
25352 if (Rdx && DominatedReduxValue(Rdx))
25353 return Rdx;
25354
25355 return nullptr;
25356}
25357
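/// Matches a reduction binary operation: either a plain binary operator or
/// one of the min/max intrinsic patterns, extracting its two operands into
/// \p V0 and \p V1.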
25358static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
25359 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
25360 return true;
25361 if (match(I, m_FMaxNum(m_Value(V0), m_Value(V1))))
25362 return true;
25363 if (match(I, m_FMinNum(m_Value(V0), m_Value(V1))))
25364 return true;
25365 if (match(I, m_FMaximum(m_Value(V0), m_Value(V1))))
25366 return true;
25367 if (match(I, m_FMinimum(m_Value(V0), m_Value(V1))))
25368 return true;
25369 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
25370 return true;
25371 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
25372 return true;
25373 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
25374 return true;
25375 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
25376 return true;
25377 return false;
25378}
25379
25380/// We could have an initial reduction that is not an add.
25381/// r *= v1 + v2 + v3 + v4
25382/// In such a case start looking for a tree rooted in the first '+'.
25383/// \Returns the new root if found, which may be nullptr if not an instruction.
25385 Instruction *Root) {
25386 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
25387 isa<IntrinsicInst>(Root)) &&
25388 "Expected binop, select, or intrinsic for reduction matching");
25389 Value *LHS =
25390 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
25391 Value *RHS =
25392 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
25393 if (LHS == Phi)
25394 return dyn_cast<Instruction>(RHS);
25395 if (RHS == Phi)
25396 return dyn_cast<Instruction>(LHS);
25397 return nullptr;
25398}
25399
25400/// \p Returns the first operand of \p I that does not match \p Phi. If
25401/// operand is not an instruction it returns nullptr.
25403 Value *Op0 = nullptr;
25404 Value *Op1 = nullptr;
25405 if (!matchRdxBop(I, Op0, Op1))
25406 return nullptr;
25407 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
25408}
25409
25410/// \Returns true if \p I is a candidate instruction for reduction vectorization.
25412 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
25413 Value *B0 = nullptr, *B1 = nullptr;
25414 bool IsBinop = matchRdxBop(I, B0, B1);
25415 return IsBinop || IsSelect;
25416}
25417
25418bool SLPVectorizerPass::vectorizeHorReduction(
25419 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
25420 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
25421 if (!ShouldVectorizeHor)
25422 return false;
25423 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
25424
25425 if (Root->getParent() != BB || isa<PHINode>(Root))
25426 return false;
25427
25428 // If we can find a secondary reduction root, use that instead.
25429 auto SelectRoot = [&]() {
25430 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
25431 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
25432 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
25433 return NewRoot;
25434 return Root;
25435 };
25436
25437 // Start analysis starting from Root instruction. If horizontal reduction is
25438 // found, try to vectorize it. If it is not a horizontal reduction or
25439 // vectorization is not possible or not effective, and currently analyzed
25440 // instruction is a binary operation, try to vectorize the operands, using
25441 // pre-order DFS traversal order. If the operands were not vectorized, repeat
25442 // the same procedure considering each operand as a possible root of the
25443 // horizontal reduction.
25444 // Interrupt the process if the Root instruction itself was vectorized or all
25445 // sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
25446 // If a horizontal reduction was not matched or vectorized, we collect the
25447 // instructions for possible later attempts at vectorization.
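// For example, if the root is an fadd chain that fails to match as a
// horizontal reduction, its instruction operands are queued (up to
// RecursionMaxDepth) and each is re-tried as a reduction root; instructions
// that still cannot be reduced are recorded in PostponedInsts (unless they
// are cmp/insert instructions, which are handled separately) for a later
// bundle-vectorization attempt (illustrative summary).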
25448 std::queue<std::pair<Instruction *, unsigned>> Stack;
25449 Stack.emplace(SelectRoot(), 0);
25450 SmallPtrSet<Value *, 8> VisitedInstrs;
25451 bool Res = false;
25452 auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * {
25453 if (R.isAnalyzedReductionRoot(Inst))
25454 return nullptr;
25455 if (!isReductionCandidate(Inst))
25456 return nullptr;
25457 HorizontalReduction HorRdx;
25458 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
25459 return nullptr;
25460 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
25461 };
25462 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
25463 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
25464 FutureSeed = getNonPhiOperand(Root, P);
25465 if (!FutureSeed)
25466 return false;
25467 }
25468 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
25469 // analysis is done separately.
25470 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
25471 PostponedInsts.push_back(FutureSeed);
25472 return true;
25473 };
25474
25475 while (!Stack.empty()) {
25476 Instruction *Inst;
25477 unsigned Level;
25478 std::tie(Inst, Level) = Stack.front();
25479 Stack.pop();
25480 // Do not try to analyze an instruction that has already been vectorized.
25481 // This may happen when we vectorize instruction operands on a previous
25482 // iteration while the stack was populated before that happened.
25483 if (R.isDeleted(Inst))
25484 continue;
25485 if (Value *VectorizedV = TryToReduce(Inst)) {
25486 Res = true;
25487 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
25488 // Try to find another reduction.
25489 Stack.emplace(I, Level);
25490 continue;
25491 }
25492 if (R.isDeleted(Inst))
25493 continue;
25494 } else {
25495 // We could not vectorize `Inst` so try to use it as a future seed.
25496 if (!TryAppendToPostponedInsts(Inst)) {
25497 assert(Stack.empty() && "Expected empty stack");
25498 break;
25499 }
25500 }
25501
25502 // Try to vectorize operands.
25503 // Continue analysis for the instruction from the same basic block only to
25504 // save compile time.
25505 if (++Level < RecursionMaxDepth)
25506 for (auto *Op : Inst->operand_values())
25507 if (VisitedInstrs.insert(Op).second)
25508 if (auto *I = dyn_cast<Instruction>(Op))
25509 // Do not try to vectorize CmpInst operands, this is done
25510 // separately.
25511 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
25512 !R.isDeleted(I) && I->getParent() == BB)
25513 Stack.emplace(I, Level);
25514 }
25515 return Res;
25516}
25517
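/// Try to vectorize \p I (a binary operator or compare) together with its
/// operands, either as a two-element horizontal reduction over the operand
/// pair or as a regular bundle via tryToVectorizeList; when several operand
/// pairs are possible, the most promising one is chosen with
/// BoUpSLP::findBestRootPair.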
25518bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
25519 if (!I)
25520 return false;
25521
25522 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
25523 return false;
25524 // Skip potential FMA candidates.
25525 if ((I->getOpcode() == Instruction::FAdd ||
25526 I->getOpcode() == Instruction::FSub) &&
25527 canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI)
25528 .isValid())
25529 return false;
25530
25531 Value *P = I->getParent();
25532
25533 // Vectorize in current basic block only.
25534 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
25535 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
25536 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
25537 R.isDeleted(Op0) || R.isDeleted(Op1))
25538 return false;
25539
25540 // First collect all possible candidates
25541 SmallVector<std::pair<Value *, Value *>> Candidates;
25542 Candidates.emplace_back(Op0, Op1);
25543
25544 auto *A = dyn_cast<BinaryOperator>(Op0);
25545 auto *B = dyn_cast<BinaryOperator>(Op1);
25546 // Try to skip B.
25547 if (A && B && B->hasOneUse()) {
25548 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
25549 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
25550 if (B0 && B0->getParent() == P && !R.isDeleted(B0))
25551 Candidates.emplace_back(A, B0);
25552 if (B1 && B1->getParent() == P && !R.isDeleted(B1))
25553 Candidates.emplace_back(A, B1);
25554 }
25555 // Try to skip A.
25556 if (B && A && A->hasOneUse()) {
25557 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
25558 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
25559 if (A0 && A0->getParent() == P && !R.isDeleted(A0))
25560 Candidates.emplace_back(A0, B);
25561 if (A1 && A1->getParent() == P && !R.isDeleted(A1))
25562 Candidates.emplace_back(A1, B);
25563 }
25564
25565 auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
25566 ArrayRef<Value *> Ops) {
25567 if (!isReductionCandidate(Inst))
25568 return false;
25569 Type *Ty = Inst->getType();
25570 if (!isValidElementType(Ty) || Ty->isPointerTy())
25571 return false;
25572 HorizontalReduction HorRdx(Inst, Ops);
25573 if (!HorRdx.matchReductionForOperands())
25574 return false;
25575 // Check the cost of operations.
25576 VectorType *VecTy = getWidenedType(Ty, Ops.size());
25577 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
25578 InstructionCost ScalarCost =
25579 TTI.getScalarizationOverhead(
25580 VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false,
25581 /*Extract=*/true, CostKind) +
25582 TTI.getInstructionCost(Inst, CostKind);
25583 InstructionCost RedCost;
25584 switch (::getRdxKind(Inst)) {
25585 case RecurKind::Add:
25586 case RecurKind::Mul:
25587 case RecurKind::Or:
25588 case RecurKind::And:
25589 case RecurKind::Xor:
25590 case RecurKind::FAdd:
25591 case RecurKind::FMul: {
25592 FastMathFlags FMF;
25593 if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
25594 FMF = FPCI->getFastMathFlags();
25595 RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
25596 CostKind);
25597 break;
25598 }
25599 default:
25600 return false;
25601 }
25602 if (RedCost >= ScalarCost)
25603 return false;
25604
25605 return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
25606 };
25607 if (Candidates.size() == 1)
25608 return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
25609
25610 // We have multiple options. Try to pick the single best.
25611 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
25612 if (!BestCandidate)
25613 return false;
25614 return (*BestCandidate == 0 &&
25615 TryToReduce(I, {Candidates[*BestCandidate].first,
25616 Candidates[*BestCandidate].second})) ||
25617 tryToVectorizeList({Candidates[*BestCandidate].first,
25618 Candidates[*BestCandidate].second},
25619 R);
25620}
25621
25622bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
25623 BasicBlock *BB, BoUpSLP &R) {
25624 SmallVector<WeakTrackingVH> PostponedInsts;
25625 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
25626 Res |= tryToVectorize(PostponedInsts, R);
25627 return Res;
25628}
25629
25630bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
25631 BoUpSLP &R) {
25632 bool Res = false;
25633 for (Value *V : Insts)
25634 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
25635 Res |= tryToVectorize(Inst, R);
25636 return Res;
25637}
25638
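/// Try to vectorize the build-aggregate sequence that ends at \p IVI, i.e. a
/// chain of insertvalue instructions filling a vector-mappable aggregate, by
/// passing the collected scalar operands to tryToVectorizeList.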
25639bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
25640 BasicBlock *BB, BoUpSLP &R,
25641 bool MaxVFOnly) {
25642 if (!R.canMapToVector(IVI->getType()))
25643 return false;
25644
25645 SmallVector<Value *, 16> BuildVectorOpds;
25646 SmallVector<Value *, 16> BuildVectorInsts;
25647 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
25648 return false;
25649
25650 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
25651 R.getORE()->emit([&]() {
25652 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
25653 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
25654 "trying reduction first.";
25655 });
25656 return false;
25657 }
25658 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
25659 // Aggregate value is unlikely to be processed in vector register.
25660 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
25661}
25662
25663bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
25664 BasicBlock *BB, BoUpSLP &R,
25665 bool MaxVFOnly) {
25666 SmallVector<Value *, 16> BuildVectorInsts;
25667 SmallVector<Value *, 16> BuildVectorOpds;
25668 SmallVector<int> Mask;
25669 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
25670 (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
25671 isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
25672 return false;
25673
25674 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
25675 R.getORE()->emit([&]() {
25676 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
25677 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
25678 "trying reduction first.";
25679 });
25680 return false;
25681 }
25682 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
25683 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
25684}
25685
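/// Sorts \p Incoming with \p Comparator, splits it into runs of values that
/// \p AreCompatible considers mutually compatible, and calls
/// \p TryToVectorizeHelper on each run; leftovers of the same type are
/// retried in a final pass (and, if \p MaxVFOnly was used, once more in
/// smaller compatible groups).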
25686template <typename T>
25687 static bool tryToVectorizeSequence(
25688 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
25689 function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
25690 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
25691 bool MaxVFOnly, BoUpSLP &R) {
25692 bool Changed = false;
25693 // Sort by type, parent, operands.
25694 stable_sort(Incoming, Comparator);
25695
25696 // Try to vectorize elements based on their type.
25697 SmallVector<T *> Candidates;
25698 SmallVector<T *> VL;
25699 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
25700 VL.clear()) {
25701 // Look for the next elements with the same type, parent and operand
25702 // kinds.
25703 auto *I = dyn_cast<Instruction>(*IncIt);
25704 if (!I || R.isDeleted(I)) {
25705 ++IncIt;
25706 continue;
25707 }
25708 auto *SameTypeIt = IncIt;
25709 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
25710 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
25711 AreCompatible(VL, *SameTypeIt))) {
25712 auto *I = dyn_cast<Instruction>(*SameTypeIt);
25713 ++SameTypeIt;
25714 if (I && !R.isDeleted(I))
25715 VL.push_back(cast<T>(I));
25716 }
25717
25718 // Try to vectorize them.
25719 unsigned NumElts = VL.size();
25720 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
25721 << NumElts << ")\n");
25722 // The vectorization is a 3-state attempt:
25723 // 1. Try to vectorize instructions with the same/alternate opcodes with the
25724 // size of maximal register at first.
25725 // 2. Try to vectorize remaining instructions with the same type, if
25726 // possible. This may produce better results than vectorizing only
25727 // instructions with the same/alternate opcodes.
25728 // 3. Final attempt to try to vectorize all instructions with the
25729 // same/alternate ops only, this may result in some extra final
25730 // vectorization.
25731 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
25732 // Success. Start over because instructions might have been changed.
25733 Changed = true;
25734 VL.swap(Candidates);
25735 Candidates.clear();
25736 for (T *V : VL) {
25737 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
25738 Candidates.push_back(V);
25739 }
25740 } else {
25741 /// \Returns the minimum number of elements that we will attempt to
25742 /// vectorize.
25743 auto GetMinNumElements = [&R](Value *V) {
25744 unsigned EltSize = R.getVectorElementSize(V);
25745 return std::max(2U, R.getMaxVecRegSize() / EltSize);
25746 };
25747 if (NumElts < GetMinNumElements(*IncIt) &&
25748 (Candidates.empty() ||
25749 Candidates.front()->getType() == (*IncIt)->getType())) {
25750 for (T *V : VL) {
25751 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
25752 Candidates.push_back(V);
25753 }
25754 }
25755 }
25756 // Final attempt to vectorize instructions with the same types.
25757 if (Candidates.size() > 1 &&
25758 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
25759 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
25760 // Success. Start over because instructions might have been changed.
25761 Changed = true;
25762 } else if (MaxVFOnly) {
25763 // Try to vectorize using small vectors.
25764 SmallVector<T *> VL;
25765 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
25766 VL.clear()) {
25767 auto *I = dyn_cast<Instruction>(*It);
25768 if (!I || R.isDeleted(I)) {
25769 ++It;
25770 continue;
25771 }
25772 auto *SameTypeIt = It;
25773 while (SameTypeIt != End &&
25774 (!isa<Instruction>(*SameTypeIt) ||
25775 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
25776 AreCompatible(*SameTypeIt, *It))) {
25777 auto *I = dyn_cast<Instruction>(*SameTypeIt);
25778 ++SameTypeIt;
25779 if (I && !R.isDeleted(I))
25780 VL.push_back(cast<T>(I));
25781 }
25782 unsigned NumElts = VL.size();
25783 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
25784 /*MaxVFOnly=*/false))
25785 Changed = true;
25786 It = SameTypeIt;
25787 }
25788 }
25789 Candidates.clear();
25790 }
25791
25792 // Start over at the next instruction of a different type (or the end).
25793 IncIt = SameTypeIt;
25794 }
25795 return Changed;
25796}
25797
25798/// Compare two cmp instructions. If IsCompatibility is true, function returns
25799/// true if 2 cmps have same/swapped predicates and mos compatible corresponding
25800/// operands. If IsCompatibility is false, function implements strict weak
25801/// ordering relation between two cmp instructions, returning true if the first
25802/// instruction is "less" than the second, i.e. its predicate is less than the
25803/// predicate of the second or the operands IDs are less than the operands IDs
25804/// of the second cmp instruction.
25805template <bool IsCompatibility>
25806static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
25807 const DominatorTree &DT) {
25808 assert(isValidElementType(V->getType()) &&
25809 isValidElementType(V2->getType()) &&
25810 "Expected valid element types only.");
25811 if (V == V2)
25812 return IsCompatibility;
25813 auto *CI1 = cast<CmpInst>(V);
25814 auto *CI2 = cast<CmpInst>(V2);
25815 if (CI1->getOperand(0)->getType()->getTypeID() <
25816 CI2->getOperand(0)->getType()->getTypeID())
25817 return !IsCompatibility;
25818 if (CI1->getOperand(0)->getType()->getTypeID() >
25819 CI2->getOperand(0)->getType()->getTypeID())
25820 return false;
25821 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
25822 CI2->getOperand(0)->getType()->getScalarSizeInBits())
25823 return !IsCompatibility;
25824 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
25825 CI2->getOperand(0)->getType()->getScalarSizeInBits())
25826 return false;
25827 CmpInst::Predicate Pred1 = CI1->getPredicate();
25828 CmpInst::Predicate Pred2 = CI2->getPredicate();
25829 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
25830 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
25831 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
25832 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
25833 if (BasePred1 < BasePred2)
25834 return !IsCompatibility;
25835 if (BasePred1 > BasePred2)
25836 return false;
25837 // Compare operands.
25838 bool CI1Preds = Pred1 == BasePred1;
25839 bool CI2Preds = Pred2 == BasePred1;
25840 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
25841 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
25842 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
25843 if (Op1 == Op2)
25844 continue;
25845 if (Op1->getValueID() < Op2->getValueID())
25846 return !IsCompatibility;
25847 if (Op1->getValueID() > Op2->getValueID())
25848 return false;
25849 if (auto *I1 = dyn_cast<Instruction>(Op1))
25850 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
25851 if (IsCompatibility) {
25852 if (I1->getParent() != I2->getParent())
25853 return false;
25854 } else {
25855 // Try to compare nodes with same parent.
25856 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
25857 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
25858 if (!NodeI1)
25859 return NodeI2 != nullptr;
25860 if (!NodeI2)
25861 return false;
25862 assert((NodeI1 == NodeI2) ==
25863 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
25864 "Different nodes should have different DFS numbers");
25865 if (NodeI1 != NodeI2)
25866 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
25867 }
25868 InstructionsState S = getSameOpcode({I1, I2}, TLI);
25869 if (S && (IsCompatibility || !S.isAltShuffle()))
25870 continue;
25871 if (IsCompatibility)
25872 return false;
25873 if (I1->getOpcode() != I2->getOpcode())
25874 return I1->getOpcode() < I2->getOpcode();
25875 }
25876 }
25877 return IsCompatibility;
25878}
25879
25880template <typename ItT>
25881bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
25882 BasicBlock *BB, BoUpSLP &R) {
25883 bool Changed = false;
25884 // Try to find reductions first.
25885 for (CmpInst *I : CmpInsts) {
25886 if (R.isDeleted(I))
25887 continue;
25888 for (Value *Op : I->operands())
25889 if (auto *RootOp = dyn_cast<Instruction>(Op)) {
25890 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
25891 if (R.isDeleted(I))
25892 break;
25893 }
25894 }
25895 // Try to vectorize operands as vector bundles.
25896 for (CmpInst *I : CmpInsts) {
25897 if (R.isDeleted(I))
25898 continue;
25899 Changed |= tryToVectorize(I, R);
25900 }
25901 // Try to vectorize list of compares.
25902 // Sort by type, compare predicate, etc.
25903 auto CompareSorter = [&](Value *V, Value *V2) {
25904 if (V == V2)
25905 return false;
25906 return compareCmp<false>(V, V2, *TLI, *DT);
25907 };
25908
25909 auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V1) {
25910 if (VL.empty() || VL.back() == V1)
25911 return true;
25912 return compareCmp<true>(V1, VL.back(), *TLI, *DT);
25913 };
25914
25915 SmallVector<Value *> Vals;
25916 for (Instruction *V : CmpInsts)
25917 if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
25918 Vals.push_back(V);
25919 if (Vals.size() <= 1)
25920 return Changed;
25921 Changed |= tryToVectorizeSequence<Value>(
25922 Vals, CompareSorter, AreCompatibleCompares,
25923 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
25924 // Exclude possible reductions from other blocks.
25925 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
25926 return any_of(V->users(), [V](User *U) {
25927 auto *Select = dyn_cast<SelectInst>(U);
25928 return Select &&
25929 Select->getParent() != cast<Instruction>(V)->getParent();
25930 });
25931 });
25932 if (ArePossiblyReducedInOtherBlock)
25933 return false;
25934 return tryToVectorizeList(Candidates, R, MaxVFOnly);
25935 },
25936 /*MaxVFOnly=*/true, R);
25937 return Changed;
25938}
25939
25940bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
25941 BasicBlock *BB, BoUpSLP &R) {
25942 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
25943 "This function only accepts Insert instructions");
25944 bool OpsChanged = false;
25945 SmallVector<WeakTrackingVH> PostponedInsts;
25946 for (auto *I : reverse(Instructions)) {
25947 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
25948 if (R.isDeleted(I) || isa<CmpInst>(I))
25949 continue;
25950 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
25951 OpsChanged |=
25952 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
25953 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
25954 OpsChanged |=
25955 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
25956 }
25957 // pass2 - try to vectorize reductions only
25958 if (R.isDeleted(I))
25959 continue;
25960 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
25961 if (R.isDeleted(I) || isa<CmpInst>(I))
25962 continue;
25963 // pass3 - try to match and vectorize a buildvector sequence.
25964 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
25965 OpsChanged |=
25966 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
25967 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
25968 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
25969 /*MaxVFOnly=*/false);
25970 }
25971 }
25972 // Now try to vectorize postponed instructions.
25973 OpsChanged |= tryToVectorize(PostponedInsts, R);
25974
25975 Instructions.clear();
25976 return OpsChanged;
25977}
25978
25979bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
25980 bool Changed = false;
25981 SmallVector<Value *, 4> Incoming;
25982 SmallPtrSet<Value *, 16> VisitedInstrs;
25983 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
25984 // node. This helps to identify the chains that can be vectorized in a
25985 // better way.
25986 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
25987 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
25988 assert(isValidElementType(V1->getType()) &&
25989 isValidElementType(V2->getType()) &&
25990 "Expected vectorizable types only.");
25991 if (V1 == V2)
25992 return false;
25993 // It is fine to compare type IDs here, since we expect only vectorizable
25994 // types, like ints, floats and pointers; we don't care about other types.
25995 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
25996 return true;
25997 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
25998 return false;
25999 if (V1->getType()->getScalarSizeInBits() <
26000 V2->getType()->getScalarSizeInBits())
26001 return true;
26002 if (V1->getType()->getScalarSizeInBits() >
26003 V2->getType()->getScalarSizeInBits())
26004 return false;
26005 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
26006 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
26007 if (Opcodes1.size() < Opcodes2.size())
26008 return true;
26009 if (Opcodes1.size() > Opcodes2.size())
26010 return false;
26011 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
26012 {
26013 // Instructions come first.
26014 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
26015 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
26016 if (I1 && I2) {
26017 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
26018 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
26019 if (!NodeI1)
26020 return NodeI2 != nullptr;
26021 if (!NodeI2)
26022 return false;
26023 assert((NodeI1 == NodeI2) ==
26024 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26025 "Different nodes should have different DFS numbers");
26026 if (NodeI1 != NodeI2)
26027 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26028 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
26029 if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
26030 const auto *E1 = dyn_cast<ExtractElementInst>(I1);
26031 const auto *E2 = dyn_cast<ExtractElementInst>(I2);
26032 if (!E1 || !E2)
26033 continue;
26034
26035 // Sort on ExtractElementInsts primarily by vector operands. Prefer
26036 // program order of the vector operands.
26037 const auto *V1 = dyn_cast<Instruction>(E1->getVectorOperand());
26038 const auto *V2 = dyn_cast<Instruction>(E2->getVectorOperand());
26039 if (V1 != V2) {
26040 if (V1 && !V2)
26041 return true;
26042 if (!V1 && V2)
26043 return false;
26044 DomTreeNodeBase<BasicBlock> *NodeI1 =
26045 DT->getNode(V1->getParent());
26046 DomTreeNodeBase<BasicBlock> *NodeI2 =
26047 DT->getNode(V2->getParent());
26048 if (!NodeI1)
26049 return NodeI2 != nullptr;
26050 if (!NodeI2)
26051 return false;
26052 assert((NodeI1 == NodeI2) ==
26053 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26054 "Different nodes should have different DFS numbers");
26055 if (NodeI1 != NodeI2)
26056 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26057 return V1->comesBefore(V2);
26058 }
26059 // If we have the same vector operand, try to sort by constant
26060 // index.
26061 std::optional<unsigned> Id1 = getExtractIndex(E1);
26062 std::optional<unsigned> Id2 = getExtractIndex(E2);
26063 // Bring constants to the top
26064 if (Id1 && !Id2)
26065 return true;
26066 if (!Id1 && Id2)
26067 return false;
26068 // First elements come first.
26069 if (Id1 && Id2)
26070 return *Id1 < *Id2;
26071
26072 continue;
26073 }
26074 if (I1->getOpcode() == I2->getOpcode())
26075 continue;
26076 return I1->getOpcode() < I2->getOpcode();
26077 }
26078 if (I1)
26079 return true;
26080 if (I2)
26081 return false;
26082 }
26083 {
26084 // Non-undef constants come next.
26085 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
26086 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
26087 if (C1 && C2)
26088 continue;
26089 if (C1)
26090 return true;
26091 if (C2)
26092 return false;
26093 }
26094 bool U1 = isa<UndefValue>(Opcodes1[I]);
26095 bool U2 = isa<UndefValue>(Opcodes2[I]);
26096 {
26097 // Non-constant non-instructions come next.
26098 if (!U1 && !U2) {
26099 auto ValID1 = Opcodes1[I]->getValueID();
26100 auto ValID2 = Opcodes2[I]->getValueID();
26101 if (ValID1 == ValID2)
26102 continue;
26103 if (ValID1 < ValID2)
26104 return true;
26105 if (ValID1 > ValID2)
26106 return false;
26107 }
26108 if (!U1)
26109 return true;
26110 if (!U2)
26111 return false;
26112 }
26113 // Undefs come last.
26114 assert(U1 && U2 && "The only thing left should be undef & undef.");
26115 }
26116 return false;
26117 };
26118 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL,
26119 Value *V1) {
26120 if (VL.empty() || V1 == VL.back())
26121 return true;
26122 Value *V2 = VL.back();
26123 if (V1->getType() != V2->getType())
26124 return false;
26125 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
26126 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
26127 if (Opcodes1.size() != Opcodes2.size())
26128 return false;
26129 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
26130 // Undefs are compatible with any other value.
26131 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
26132 continue;
26133 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
26134 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
26135 if (R.isDeleted(I1) || R.isDeleted(I2))
26136 return false;
26137 if (I1->getParent() != I2->getParent())
26138 return false;
26139 if (getSameOpcode({I1, I2}, *TLI))
26140 continue;
26141 return false;
26142 }
26143 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
26144 continue;
26145 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
26146 return false;
26147 }
26148 return true;
26149 };
26150
26151 bool HaveVectorizedPhiNodes = false;
26152 do {
26153 // Collect the incoming values from the PHIs.
26154 Incoming.clear();
26155 for (Instruction &I : *BB) {
26156 auto *P = dyn_cast<PHINode>(&I);
26157 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
26158 break;
26159
26160 // No need to analyze deleted, vectorized and non-vectorizable
26161 // instructions.
26162 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
26163 isValidElementType(P->getType()))
26164 Incoming.push_back(P);
26165 }
26166
26167 if (Incoming.size() <= 1)
26168 break;
26169
26170 // Find the corresponding non-phi nodes for better matching when trying to
26171 // build the tree.
26172 for (Value *V : Incoming) {
26173 SmallVectorImpl<Value *> &Opcodes =
26174 PHIToOpcodes.try_emplace(V).first->getSecond();
26175 if (!Opcodes.empty())
26176 continue;
26177 SmallVector<Value *, 4> Nodes(1, V);
26178 SmallPtrSet<Value *, 4> Visited;
26179 while (!Nodes.empty()) {
26180 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
26181 if (!Visited.insert(PHI).second)
26182 continue;
26183 for (Value *V : PHI->incoming_values()) {
26184 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
26185 Nodes.push_back(PHI1);
26186 continue;
26187 }
26188 Opcodes.emplace_back(V);
26189 }
26190 }
26191 }
26192
26193 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
26194 Incoming, PHICompare, AreCompatiblePHIs,
26195 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
26196 return tryToVectorizeList(Candidates, R, MaxVFOnly);
26197 },
26198 /*MaxVFOnly=*/true, R);
26199 Changed |= HaveVectorizedPhiNodes;
26200 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
26201 auto *PHI = dyn_cast<PHINode>(P.first);
26202 return !PHI || R.isDeleted(PHI);
26203 }))
26204 PHIToOpcodes.clear();
26205 VisitedInstrs.insert_range(Incoming);
26206 } while (HaveVectorizedPhiNodes);
26207
26208 VisitedInstrs.clear();
26209
26210 InstSetVector PostProcessInserts;
26211 SmallSetVector<CmpInst *, 8> PostProcessCmps;
26212 // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
26213 // also vectorizes `PostProcessCmps`.
26214 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
26215 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
26216 if (VectorizeCmps) {
26217 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
26218 PostProcessCmps.clear();
26219 }
26220 PostProcessInserts.clear();
26221 return Changed;
26222 };
26223 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
26224 auto IsInPostProcessInstrs = [&](Instruction *I) {
26225 if (auto *Cmp = dyn_cast<CmpInst>(I))
26226 return PostProcessCmps.contains(Cmp);
26227 return isa<InsertElementInst, InsertValueInst>(I) &&
26228 PostProcessInserts.contains(I);
26229 };
26230 // Returns true if `I` is an instruction without users, like a terminator, a
26231 // store, or a function call with an ignored return value. Unused instructions
26232 // are detected based on the instruction type (except for CallInst and InvokeInst).
26233 auto HasNoUsers = [](Instruction *I) {
26234 return I->use_empty() &&
26235 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
26236 };
26237 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
26238 // Skip instructions with scalable type. The number of elements is unknown
26239 // at compile-time for scalable types.
26240 if (isa<ScalableVectorType>(It->getType()))
26241 continue;
26242
26243 // Skip instructions marked for the deletion.
26244 if (R.isDeleted(&*It))
26245 continue;
26246 // We may go through BB multiple times so skip the one we have checked.
26247 if (!VisitedInstrs.insert(&*It).second) {
26248 if (HasNoUsers(&*It) &&
26249 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
26250 // We would like to start over since some instructions are deleted
26251 // and the iterator may become invalid.
26252 Changed = true;
26253 It = BB->begin();
26254 E = BB->end();
26255 }
26256 continue;
26257 }
26258
26259 // Try to vectorize reductions that use PHINodes.
26260 if (PHINode *P = dyn_cast<PHINode>(It)) {
26261 // Check that the PHI is a reduction PHI.
26262 if (P->getNumIncomingValues() == 2) {
26263 // Try to match and vectorize a horizontal reduction.
26264 Instruction *Root = getReductionInstr(DT, P, BB, LI);
26265 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
26266 Changed = true;
26267 It = BB->begin();
26268 E = BB->end();
26269 continue;
26270 }
26271 }
26272 // Try to vectorize the incoming values of the PHI, to catch reductions
26273 // that feed into PHIs.
26274 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
26275 // Skip if the incoming block is the current BB for now. Also, bypass
26276 // unreachable IR for efficiency and to avoid crashing.
26277 // TODO: Collect the skipped incoming values and try to vectorize them
26278 // after processing BB.
26279 if (BB == P->getIncomingBlock(I) ||
26280 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
26281 continue;
26282
26283 // Postponed instructions should not be vectorized here, delay their
26284 // vectorization.
26285 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
26286 PI && !IsInPostProcessInstrs(PI)) {
26287 bool Res =
26288 vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
26289 Changed |= Res;
26290 if (Res && R.isDeleted(P)) {
26291 It = BB->begin();
26292 E = BB->end();
26293 break;
26294 }
26295 }
26296 }
26297 continue;
26298 }
26299
26300 if (HasNoUsers(&*It)) {
26301 bool OpsChanged = false;
26302 auto *SI = dyn_cast<StoreInst>(It);
26303 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
26304 if (SI) {
26305 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
26306 // Try to vectorize chain in store, if this is the only store to the
26307 // address in the block.
26308 // TODO: This is just a temporary solution to save compile time. Need
26309 // to investigate if we can safely turn on slp-vectorize-hor-store
26310 // instead to allow lookup for reduction chains in all non-vectorized
26311 // stores (need to check side effects and compile time).
26312 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
26313 SI->getValueOperand()->hasOneUse();
26314 }
26315 if (TryToVectorizeRoot) {
26316 for (auto *V : It->operand_values()) {
26317 // Postponed instructions should not be vectorized here, delay their
26318 // vectorization.
26319 if (auto *VI = dyn_cast<Instruction>(V);
26320 VI && !IsInPostProcessInstrs(VI))
26321 // Try to match and vectorize a horizontal reduction.
26322 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
26323 }
26324 }
26325 // Start vectorization of post-process list of instructions from the
26326 // top-tree instructions to try to vectorize as many instructions as
26327 // possible.
26328 OpsChanged |=
26329 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
26330 if (OpsChanged) {
26331 // We would like to start over since some instructions are deleted
26332 // and the iterator may become invalid.
26333 Changed = true;
26334 It = BB->begin();
26335 E = BB->end();
26336 continue;
26337 }
26338 }
26339
26340 if (isa<InsertElementInst, InsertValueInst>(It))
26341 PostProcessInserts.insert(&*It);
26342 else if (isa<CmpInst>(It))
26343 PostProcessCmps.insert(cast<CmpInst>(&*It));
26344 }
26345
26346 return Changed;
26347}
26348
26349bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
26350 auto Changed = false;
26351 for (auto &Entry : GEPs) {
26352 // If the getelementptr list has fewer than two elements, there's nothing
26353 // to do.
26354 if (Entry.second.size() < 2)
26355 continue;
26356
26357 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
26358 << Entry.second.size() << ".\n");
26359
26360 // Process the GEP list in chunks suitable for the target's supported
26361 // vector size. If a vector register can't hold 1 element, we are done. We
26362 // are trying to vectorize the index computations, so the maximum number of
26363 // elements is based on the size of the index expression, rather than the
26364 // size of the GEP itself (the target's pointer size).
26365 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
26366 return !R.isDeleted(GEP);
26367 });
26368 if (It == Entry.second.end())
26369 continue;
26370 unsigned MaxVecRegSize = R.getMaxVecRegSize();
26371 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
26372 if (MaxVecRegSize < EltSize)
26373 continue;
26374
26375 unsigned MaxElts = MaxVecRegSize / EltSize;
26376 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
26377 auto Len = std::min<unsigned>(BE - BI, MaxElts);
26378 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
26379
26380 // Initialize a set of candidate getelementptrs. Note that we use a
26381 // SetVector here to preserve program order. If the index computations
26382 // are vectorizable and begin with loads, we want to minimize the chance
26383 // of having to reorder them later.
26384 SetVector<Value *> Candidates(llvm::from_range, GEPList);
26385
26386 // Some of the candidates may have already been vectorized after we
26387 // initially collected them, or their index was optimized to a constant value.
26388 // If so, they are marked as deleted, so remove them from the set of
26389 // candidates.
26390 Candidates.remove_if([&R](Value *I) {
26391 return R.isDeleted(cast<Instruction>(I)) ||
26392 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
26393 });
26394
26395 // Remove from the set of candidates all pairs of getelementptrs with
26396 // constant differences. Such getelementptrs are likely not good
26397 // candidates for vectorization in a bottom-up phase since one can be
26398 // computed from the other. We also ensure all candidate getelementptr
26399 // indices are unique.
26400 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
26401 auto *GEPI = GEPList[I];
26402 if (!Candidates.count(GEPI))
26403 continue;
26404 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
26405 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
26406 auto *GEPJ = GEPList[J];
26407 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
26408 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
26409 Candidates.remove(GEPI);
26410 Candidates.remove(GEPJ);
26411 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
26412 Candidates.remove(GEPJ);
26413 }
26414 }
26415 }
26416
26417 // We break out of the above computation as soon as we know there are
26418 // fewer than two candidates remaining.
26419 if (Candidates.size() < 2)
26420 continue;
26421
26422 // Add the single, non-constant index of each candidate to the bundle. We
26423 // ensured the indices met these constraints when we originally collected
26424 // the getelementptrs.
26425 SmallVector<Value *, 16> Bundle(Candidates.size());
26426 auto BundleIndex = 0u;
26427 for (auto *V : Candidates) {
26428 auto *GEP = cast<GetElementPtrInst>(V);
26429 auto *GEPIdx = GEP->idx_begin()->get();
26430 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
26431 Bundle[BundleIndex++] = GEPIdx;
26432 }
26433
26434 // Try and vectorize the indices. We are currently only interested in
26435 // gather-like cases of the form:
26436 //
26437 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
26438 //
26439 // where the loads of "a", the loads of "b", and the subtractions can be
26440 // performed in parallel. It's likely that detecting this pattern in a
26441 // bottom-up phase will be simpler and less costly than building a
26442 // full-blown top-down phase beginning at the consecutive loads.
26443 Changed |= tryToVectorizeList(Bundle, R);
26444 }
26445 }
26446 return Changed;
26447}
26448
26449bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
26450 bool Changed = false;
26451 // Sort by type, base pointer and value operand. Value operands must be
26452 // compatible (have the same opcode, same parent), otherwise it is
26453 // definitely not profitable to try to vectorize them.
26454 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
26455 if (V->getValueOperand()->getType()->getTypeID() <
26456 V2->getValueOperand()->getType()->getTypeID())
26457 return true;
26458 if (V->getValueOperand()->getType()->getTypeID() >
26459 V2->getValueOperand()->getType()->getTypeID())
26460 return false;
26461 if (V->getPointerOperandType()->getTypeID() <
26462 V2->getPointerOperandType()->getTypeID())
26463 return true;
26464 if (V->getPointerOperandType()->getTypeID() >
26465 V2->getPointerOperandType()->getTypeID())
26466 return false;
26467 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
26468 V2->getValueOperand()->getType()->getScalarSizeInBits())
26469 return true;
26470 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
26471 V2->getValueOperand()->getType()->getScalarSizeInBits())
26472 return false;
26473 // UndefValues are compatible with all other values.
26474 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
26475 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
26476 DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
26477 DT->getNode(I1->getParent());
26478 DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
26479 DT->getNode(I2->getParent());
26480 assert(NodeI1 && "Should only process reachable instructions");
26481 assert(NodeI2 && "Should only process reachable instructions");
26482 assert((NodeI1 == NodeI2) ==
26483 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26484 "Different nodes should have different DFS numbers");
26485 if (NodeI1 != NodeI2)
26486 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26487 return I1->getOpcode() < I2->getOpcode();
26488 }
26489 return V->getValueOperand()->getValueID() <
26490 V2->getValueOperand()->getValueID();
26491 };
26492
26493 bool SameParent = true;
26494 auto AreCompatibleStores = [&](ArrayRef<StoreInst *> VL, StoreInst *V1) {
26495 if (VL.empty()) {
26496 SameParent = true;
26497 return true;
26498 }
26499 StoreInst *V2 = VL.back();
26500 if (V1 == V2)
26501 return true;
26502 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
26503 return false;
26504 if (V1->getPointerOperandType() != V2->getPointerOperandType())
26505 return false;
26506 // Undefs are compatible with any other value.
26507 if (isa<UndefValue>(V1->getValueOperand()) ||
26508 isa<UndefValue>(V2->getValueOperand()))
26509 return true;
26510 if (isa<Constant>(V1->getValueOperand()) &&
26511 isa<Constant>(V2->getValueOperand()))
26512 return true;
26513 // Check if the operands of the stores can be vectorized. They can be
26514 // vectorized, if they have compatible operands or have operands, which can
26515 // be vectorized as copyables.
26516 auto *I1 = dyn_cast<Instruction>(V1->getValueOperand());
26517 auto *I2 = dyn_cast<Instruction>(V2->getValueOperand());
26518 if (I1 || I2) {
26519 // Accept only tail-following non-compatible values for now.
26520 // TODO: investigate if it is possible to vectorize incompatible values,
26521 // if the copyables are first in the list.
26522 if (I1 && !I2)
26523 return false;
26524 SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
26525 SmallVector<Value *> NewVL(VL.size() + 1);
26526 for (auto [SI, V] : zip(VL, NewVL))
26527 V = SI->getValueOperand();
26528 NewVL.back() = V1->getValueOperand();
26529 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
26530 InstructionsState S = Analysis.buildInstructionsState(
26531 NewVL, R, VectorizeCopyableElements, /*WithProfitabilityCheck=*/true,
26532 /*SkipSameCodeCheck=*/!SameParent);
26533 if (S)
26534 return true;
26535 if (!SameParent)
26536 return false;
26537 }
26538 return V1->getValueOperand()->getValueID() ==
26539 V2->getValueOperand()->getValueID();
26540 };
26541
26542 // Attempt to sort and vectorize each of the store-groups.
26543 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
26544 for (auto &Pair : Stores) {
26545 if (Pair.second.size() < 2)
26546 continue;
26547
26548 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
26549 << Pair.second.size() << ".\n");
26550
26551 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
26552 continue;
26553
26554 // Reverse stores to do bottom-to-top analysis. This is important if the
26555 // values are stored to the same addresses several times; in this case we need
26556 // to follow the store order (reversed to meet the memory dependencies).
26557 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
26558 Pair.second.rend());
26559 Changed |= tryToVectorizeSequence<StoreInst>(
26560 ReversedStores, StoreSorter, AreCompatibleStores,
26561 [&](ArrayRef<StoreInst *> Candidates, bool) {
26562 return vectorizeStores(Candidates, R, Attempted);
26563 },
26564 /*MaxVFOnly=*/false, R);
26565 }
26566 return Changed;
26567}
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static const SCEV * calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static cl::opt< bool > ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden, cl::desc("Generate strided loads even if they are not " "profitable. Used for testing only."))
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is a main operation for the given MainOp and AltOp instruction...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
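As an illustration of what these uniformity predicates (isConstant, isSplat, allConstant) check, a simplified stand-alone version might look as follows; these are sketches only, not the file's actual definitions, which may handle additional cases such as undef lanes.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Value.h"

using namespace llvm;

// Simplified sketch: every value in the bundle is a Constant.
static bool allConstantSketch(ArrayRef<Value *> VL) {
  return all_of(VL, [](Value *V) { return isa<Constant>(V); });
}

// Simplified sketch: every value in the bundle is the same value.
static bool isSplatSketch(ArrayRef<Value *> VL) {
  return !VL.empty() &&
         all_of(VL, [&](Value *V) { return V == VL.front(); });
}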
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds compress-like mask for shuffles for the given PointerOps, ordered with Order.
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of vectorized intrinsic (if possible) and vectorized function (if possible) call...
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType,...
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find a subvector of loads and builds a new vector of only loads, if it can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(ArrayRef< T * >, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and if not, packs them, building ReuseShuffleI...
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates subvector extract using Generator or using default shuffle.
static cl::opt< bool > DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden, cl::desc("Disable tree reordering even if it is " "profitable. Used for testing only."))
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType,...
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
This file contains some templates that are useful if you are working with the STL at all.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
This pass exposes codegen information to IR-level passes.
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1406
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1330
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:371
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1666
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1111
void clearAllBits()
Set every bit to 0.
Definition APInt.h:1396
void negate()
Negate this APInt in place.
Definition APInt.h:1468
unsigned logBase2() const
Definition APInt.h:1761
void setAllBits()
Set every bit to 1.
Definition APInt.h:1319
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition APInt.h:1367
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:440
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:200
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:389
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:239
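A short usage sketch of the APInt operations listed above; the 16-bit width is an arbitrary choice for illustration.

#include "llvm/ADT/APInt.h"
#include <cassert>

using namespace llvm;

void apintDemo() {
  APInt Demanded = APInt::getAllOnes(16);    // all 16 bits set
  Demanded.clearBit(3);                      // drop one lane
  assert(!Demanded.isAllOnes() && !Demanded.isZero());

  APInt One = APInt::getOneBitSet(16, 7);    // only bit 7 set
  assert(One.isPowerOf2() && One.logBase2() == 7);

  APInt High = APInt::getBitsSetFrom(16, 8); // bits [8, 16) set
  assert(One.ult(High) && High.getBitWidth() == 16);
}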
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition ArrayRef.h:183
const T & back() const
back - Get the last element.
Definition ArrayRef.h:156
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition ArrayRef.h:224
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:200
const T & front() const
front - Get the first element.
Definition ArrayRef.h:150
iterator end() const
Definition ArrayRef.h:136
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
iterator begin() const
Definition ArrayRef.h:135
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:191
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
Definition ArrayRef.h:162
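A short usage sketch of the ArrayRef slicing operations listed above.

#include "llvm/ADT/ArrayRef.h"
#include <cassert>

using namespace llvm;

void arrayRefDemo() {
  int Data[] = {0, 1, 2, 3, 4, 5};
  ArrayRef<int> VL(Data);
  assert(VL.size() == 6 && VL.front() == 0 && VL.back() == 5);

  ArrayRef<int> Head = VL.take_front(2); // {0, 1}
  ArrayRef<int> Tail = VL.drop_front(4); // {4, 5}
  ArrayRef<int> Mid = VL.slice(2, 2);    // {2, 3}
  assert(Head.equals({0, 1}) && Tail.equals({4, 5}) && Mid.equals({2, 3}));
}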
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:472
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:459
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
reverse_iterator rend()
Definition BasicBlock.h:477
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
size_t size() const
Definition BasicBlock.h:480
InstListType::const_reverse_iterator const_reverse_iterator
Definition BasicBlock.h:173
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition BasicBlock.h:707
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition InstrTypes.h:448
This class is the base class for the comparison instructions.
Definition InstrTypes.h:666
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:984
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_SLT
signed less than
Definition InstrTypes.h:707
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:708
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:702
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:701
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:705
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:703
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:706
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:704
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition InstrTypes.h:829
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:791
Predicate getPredicate() const
Return the predicate for this instruction.
Definition InstrTypes.h:767
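For illustration, given an integer compare, the swapped and inverse predicates relate as in the sketch below (function name hypothetical; the signed-less-than case is used as an example).

#include "llvm/IR/InstrTypes.h"
#include <cassert>

using namespace llvm;

// Relate a predicate to its swapped (operands exchanged) and inverse forms.
static void predicateDemo(const CmpInst *CI) {
  if (CI->getPredicate() == CmpInst::ICMP_SLT) {
    assert(CI->getSwappedPredicate() == CmpInst::ICMP_SGT); // a < b  ->  b > a
    assert(CI->getInversePredicate() == CmpInst::ICMP_SGE); // !(a < b)
  }
}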
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:163
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:154
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
static bool shouldExecute(unsigned CounterName)
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getUnknown()
Definition DebugLoc.h:162
An analysis that produces DemandedBits for a function.
LLVM_ABI APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:187
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:165
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:229
bool erase(const KeyT &Val)
Definition DenseMap.h:303
unsigned size() const
Definition DenseMap.h:108
bool empty() const
Definition DenseMap.h:107
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition DenseMap.h:161
iterator end()
Definition DenseMap.h:81
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition DenseMap.h:205
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:156
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:214
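A short usage sketch of the DenseMap operations listed above.

#include "llvm/ADT/DenseMap.h"
#include <cassert>

using namespace llvm;

void denseMapDemo() {
  DenseMap<int, unsigned> LaneOf;
  LaneOf.try_emplace(42, 0u);                    // insert if absent
  auto [It, Inserted] = LaneOf.insert({7, 1u});
  (void)It;
  assert(Inserted && LaneOf.size() == 2);

  assert(LaneOf.lookup(42) == 0u);               // default value if missing
  assert(LaneOf.contains(7) && LaneOf.count(99) == 0);

  LaneOf.erase(7);
  assert(LaneOf.find(7) == LaneOf.end() && !LaneOf.empty());
}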
Implements a dense probed hash-table based set.
Definition DenseSet.h:269
Base class for the actual dominator tree node.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition Dominators.h:284
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:165
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
bool allowReassoc() const
Flag queries.
Definition FMF.h:64
bool allowContract() const
Definition FMF.h:69
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
ArrayRef< Type * > params() const
Type * getReturnType() const
bool empty() const
Definition Function.h:857
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
For the node iterator we just need to turn the TreeEntry iterator into a TreeEntry* iterator so that ...
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2571
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition IRBuilder.h:547
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:575
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2637
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:345
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition IRBuilder.h:247
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2204
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2593
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1708
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2277
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2439
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1651
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1437
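For illustration, the builder calls listed above compose into the usual build-vector-then-shuffle pattern; the sketch below inserts two scalars into a poison vector and broadcasts lane 0. The function and value names are hypothetical.

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

static Value *buildAndBroadcast(IRBuilderBase &Builder, Value *S0, Value *S1) {
  auto *VecTy = FixedVectorType::get(S0->getType(), 4);
  Value *Vec = PoisonValue::get(VecTy);
  Vec = Builder.CreateInsertElement(Vec, S0, Builder.getInt64(0));
  Vec = Builder.CreateInsertElement(Vec, S1, Builder.getInt64(1));
  // Broadcast lane 0 across all four lanes.
  return Builder.CreateShuffleVector(Vec, {0, 0, 0, 0});
}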
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
bool isSimple() const
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
iterator end()
Definition MapVector.h:67
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition MapVector.h:48
iterator find(const KeyT &Key)
Definition MapVector.h:141
bool empty() const
Definition MapVector.h:75
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:107
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:99
size_type size() const
Definition MapVector.h:56
std::pair< KeyT, ValueT > & front()
Definition MapVector.h:79
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:303
T & front() const
front - Get the first element.
Definition ArrayRef.h:354
iterator end() const
Definition ArrayRef.h:348
iterator begin() const
Definition ArrayRef.h:347
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition Pass.h:99
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:59
ArrayRef< value_type > getArrayRef() const
Definition SetVector.h:90
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:104
const value_type & front() const
Return the first element of the SetVector.
Definition SetVector.h:149
void insert_range(Range &&R)
Definition SetVector.h:193
Vector takeVector()
Clear the SetVector and return the underlying vector.
Definition SetVector.h:93
void clear()
Completely clear the SetVector.
Definition SetVector.h:284
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:99
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:168
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition SetVector.h:269
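A short usage sketch of the SetVector operations listed above, using the small-size-optimized variant.

#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>

using namespace llvm;

void setVectorDemo() {
  SmallSetVector<int, 8> Worklist;
  Worklist.insert(3);
  Worklist.insert(1);
  assert(!Worklist.insert(3));                            // duplicates rejected
  assert(Worklist.size() == 2 && Worklist.front() == 3);  // insertion order kept
  assert(Worklist.contains(1));

  SmallVector<int, 8> Flat = Worklist.takeVector();       // drain into a vector
  assert(Worklist.empty() && Flat.size() == 2);
}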
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
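A short sketch exercising the static mask-classification helpers listed above on hand-written masks.

#include "llvm/IR/Instructions.h"
#include <cassert>

using namespace llvm;

void maskQueryDemo() {
  int Reverse[] = {3, 2, 1, 0};
  assert(ShuffleVectorInst::isReverseMask(Reverse, /*NumSrcElts=*/4));

  int Splat[] = {0, 0, 0, 0};
  assert(ShuffleVectorInst::isZeroEltSplatMask(Splat, /*NumSrcElts=*/4));

  int Extract[] = {2, 3};
  int Index = 0;
  assert(ShuffleVectorInst::isExtractSubvectorMask(Extract, /*NumSrcElts=*/4,
                                                   Index) &&
         Index == 2);
}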
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
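A short usage sketch of the SmallBitVector operations listed above.

#include "llvm/ADT/SmallBitVector.h"
#include <cassert>

using namespace llvm;

void bitVectorDemo() {
  SmallBitVector Used(8);                 // eight bits, all clear
  assert(Used.none() && Used.size() == 8);

  Used.set(2);
  Used.set(5);
  assert(Used.any() && Used.count() == 2 && Used.test(5) && !Used.all());
  assert(Used.find_first() == 2 && Used.find_next(2) == 5);

  Used.reset();                           // clear all bits
  assert(Used.find_first() == -1);
}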
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:281
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
iterator end() const
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:356
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:175
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition SmallSet.h:226
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:181
size_type size() const
Definition SmallSet.h:170
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
OperandValueKind
Additional information about an operand's possible values.
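For illustration, the cost queries listed above are used roughly as in the sketch below, which compares the reciprocal-throughput cost of a vector add against a reverse shuffle of a hypothetical <4 x i32> type, following the getArithmeticInstrCost and getShuffleCost signatures shown here.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

static InstructionCost costDemo(const TargetTransformInfo &TTI,
                                LLVMContext &Ctx) {
  auto *VecTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
  const TargetTransformInfo::TargetCostKind Kind =
      TargetTransformInfo::TCK_RecipThroughput;

  InstructionCost AddCost =
      TTI.getArithmeticInstrCost(Instruction::Add, VecTy, Kind);
  InstructionCost RevCost = TTI.getShuffleCost(
      TargetTransformInfo::SK_Reverse, /*DstTy=*/VecTy, /*SrcTy=*/VecTy,
      /*Mask=*/{3, 2, 1, 0}, Kind);
  return AddCost + RevCost;
}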
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
Definition Type.cpp:181
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition Type.h:246
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition Type.h:296
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition Type.h:270
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
TypeID getTypeID() const
Return the type id for the type.
Definition Type.h:136
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:21
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Definition User.h:119
op_iterator op_begin()
Definition User.h:284
Value * getOperand(unsigned i) const
Definition User.h:232
unsigned getNumOperands() const
Definition User.h:254
iterator_range< value_op_iterator > operand_values()
Definition User.h:316
The Vector Function Database.
Definition VectorUtils.h:33
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:74
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
User * user_back()
Definition Value.h:412
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition Value.h:543
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition Value.cpp:158
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1101
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition Value.cpp:265
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
Type * getElementType() const
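For illustration, a simplified stand-alone counterpart to the getWidenedType helper listed earlier can be written with the factory methods above; this sketch may differ from the in-file helper, which can handle additional cases (for example when revectorization via -slp-revec is enabled).

#include "llvm/IR/DerivedTypes.h"
#include <cassert>

using namespace llvm;

// Widen a scalar type to a fixed vector of VF elements (simplified sketch).
static FixedVectorType *widenToVF(Type *ScalarTy, unsigned VF) {
  assert(VectorType::isValidElementType(ScalarTy) && "unsupported element type");
  return FixedVectorType::get(ScalarTy, VF);
}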
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:194
iterator find(const_arg_type_t< ValueT > V)
Definition DenseSet.h:163
void insert_range(Range &&R)
Definition DenseSet.h:220
size_type size() const
Definition DenseSet.h:87
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:169
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:174
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:166
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition Hashing.h:76
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state in which the loads in the given sequence can be represented.
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
SmallVector< StoreInst *, 8 > StoreList
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to the list of values already checked for vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
SmallVector< Instruction *, 16 > InstrList
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns its signedn...
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, StridedPtrInfo &SPtrInfo, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool isStridedLoad(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, const int64_t Diff, StridedPtrInfo &SPtrInfo) const
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order?
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns the reduction type after minbitwidth analysis.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates for the pair with the highest score...
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
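A rough, heavily simplified sketch of how the BoUpSLP entry points above are typically chained by a driver; reordering, node transformation, scheduling and the pass's real cost threshold are elided, and sketchVectorize is a hypothetical helper written in this file's context.
static bool sketchVectorize(BoUpSLP &R, ArrayRef<Value *> VL) {
  SmallDenseSet<Value *> NoIgnored;           // nothing is explicitly ignored here
  R.buildTree(VL, NoIgnored);                 // build the vectorizable use-def tree
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;                             // not worth costing
  R.buildExternalUses();                      // record scalars still used outside the tree
  R.computeMinimumValueSizes();               // shrink element bit widths where legal
  InstructionCost Cost = R.getTreeCost();
  if (!Cost.isValid() || Cost >= 0)           // only proceed on a modeled gain; the real
    return false;                             // pass compares against its threshold option
  R.vectorizeTree();                          // emit the vector code and extracts
  return true;
}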
CallInst * Call
Changed
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is a bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
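A minimal sketch of the PatternMatch combinators above: check whether V is a single-use shift-left by a constant and capture the operands; isOneUseShlByConst is a hypothetical helper.
#include "llvm/ADT/APInt.h"
#include "llvm/IR/PatternMatch.h"
static bool isOneUseShlByConst(llvm::Value *V, llvm::Value *&X,
                               const llvm::APInt *&ShAmt) {
  using namespace llvm::PatternMatch;
  // Matches (X << C) where the shift has exactly one use; X and C are captured.
  return match(V, m_OneUse(m_Shl(m_Value(X), m_APInt(ShAmt))));
}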
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
bool empty() const
Definition BasicBlock.h:101
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
LLVM_ABI iterator begin() const
LLVM_ABI Instruction & front() const
A private "module" namespace for types and utilities used by this pass.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:831
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void stable_sort(R &&Range)
Definition STLExtras.h:2038
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1731
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1698
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
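A minimal sketch of the range-based wrappers above, checking a whole bundle with a single call; allInstructions is a hypothetical helper.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Instruction.h"
static bool allInstructions(llvm::ArrayRef<llvm::Value *> VL) {
  // Equivalent to std::all_of over VL, without spelling out begin()/end().
  return llvm::all_of(VL,
                      [](llvm::Value *V) { return llvm::isa<llvm::Instruction>(V); });
}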
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1657
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:533
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
InstructionCost Cost
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition ScopeExit.h:59
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2452
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition Utils.cpp:1725
constexpr from_range_t from_range
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:738
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A is a subset of B
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2211
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
auto cast_or_null(const Y &Val)
Definition Casting.h:720
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition MathExtras.h:557
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:682
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicit...
Definition STLExtras.h:1961
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:314
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:396
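Both helpers above round a count up to a power of two; a minimal sketch of the expected results on hypothetical values.
#include <cassert>
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
static void roundUpToPowerOfTwoExample() {
  assert(llvm::bit_ceil(6u) == 8u);     // smallest power of two >= 6
  assert(llvm::PowerOf2Ceil(6) == 8);   // same rounding, uint64_t flavour
}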
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
DomTreeNodeBase< BasicBlock > DomTreeNode
Definition Dominators.h:95
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:759
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2108
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:1948
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1712
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:402
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1624
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
auto find_if_not(R &&Range, UnaryPredicate P)
Definition STLExtras.h:1743
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition Loads.cpp:431
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
Definition Casting.h:675
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
bool isPointerTy(const Type *T)
Definition SPIRVUtils.h:288
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1719
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
Definition STLExtras.h:1399
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition Local.cpp:421
bool isModOrRefSet(const ModRefInfo MRI)
Definition ModRef.h:43
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
Definition STLExtras.h:1900
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
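A minimal sketch using getPointersDiff above, close in spirit to how consecutive loads are recognized: two loads of the same element type are consecutive when the pointer distance, measured in elements, is exactly one. areConsecutiveLoads is a hypothetical helper.
#include <optional>
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
static bool areConsecutiveLoads(llvm::LoadInst *A, llvm::LoadInst *B,
                                const llvm::DataLayout &DL,
                                llvm::ScalarEvolution &SE) {
  std::optional<int64_t> Diff = llvm::getPointersDiff(
      A->getType(), A->getPointerOperand(), B->getType(), B->getPointerOperand(),
      DL, SE, /*StrictCheck=*/true);
  return Diff && *Diff == 1;   // B loads the element immediately after A
}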
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
Definition ModRef.h:32
@ LLVM_MARK_AS_BITMASK_ENUM
Definition ModRef.h:37
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405
@ Other
Any other memory.
Definition ModRef.h:68
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
TargetTransformInfo TTI
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ None
Not a recurrence.
@ Xor
Bitwise or logical XOR of integers.
@ FMul
Product of floats.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
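A minimal sketch tying the RecurKind values above to createSimpleReduction (listed earlier): emit an integer add-reduction of an existing vector value at the builder's insertion point; emitAddReduction is a hypothetical helper.
#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
static llvm::Value *emitAddReduction(llvm::IRBuilderBase &Builder, llvm::Value *Vec) {
  // Produces a vector.reduce.add-style reduction of Vec's lanes.
  return llvm::createSimpleReduction(Builder, Vec, llvm::RecurKind::Add);
}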
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:155
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1934
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2010
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1815
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition STLExtras.h:1409
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1941
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns the number of parts that the type VecTy will be split into at the codegen phase.
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1877
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
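A minimal sketch of the hashing helpers above, combining two fields into one hash_code; hashPair is a hypothetical helper.
#include "llvm/ADT/Hashing.h"
static llvm::hash_code hashPair(unsigned A, unsigned B) {
  // Order matters: hash_combine(A, B) and hash_combine(B, A) generally differ.
  return llvm::hash_combine(A, B);
}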
template class LLVM_TEMPLATE_ABI DomTreeNodeBase< BasicBlock >
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2068
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:299
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the give value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:836
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits(bool simple=false)
DenseMapInfo< BoUpSLP::TreeEntry * > FirstInfo
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given valu...
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
BoUpSLP::TreeEntry::VecTreeTy ContainerTy
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:117
ScalarEvolution * SE
TargetTransformInfo * TTI
AssumptionCache * AC
TargetLibraryInfo * TLI
const DataLayout * DL
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Definition MapVector.h:249
Describe known properties for a set of pointers.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
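A minimal sketch of the VFShape/VFDatabase API above, assuming the caller wants an 8-lane, unpredicated vector variant of a call's callee; findVec8Variant is a hypothetical helper.
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instructions.h"
static llvm::Function *findVec8Variant(llvm::CallInst &CI) {
  llvm::VFShape Shape = llvm::VFShape::get(CI.getFunctionType(),
                                           llvm::ElementCount::getFixed(8),
                                           /*HasGlobalPred=*/false);
  // Returns nullptr if no matching vector mapping is registered for the callee.
  return llvm::VFDatabase(CI).getVectorizedFunction(Shape);
}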
Function object to check whether the first component of a container supported by std::get (like std::...
Definition STLExtras.h:1427
Function object to check whether the second component of a container supported by std::get (like std:...
Definition STLExtras.h:1436
This structure holds any data we need about the edges being traversed during buildTreeRec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const