
LLVM 22.0.0git
SLPVectorizer.cpp
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct a vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
41#include "llvm/Analysis/Loads.h"
52#include "llvm/IR/Attributes.h"
53#include "llvm/IR/BasicBlock.h"
54#include "llvm/IR/Constant.h"
55#include "llvm/IR/Constants.h"
56#include "llvm/IR/DataLayout.h"
58#include "llvm/IR/Dominators.h"
59#include "llvm/IR/Function.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstrTypes.h"
62#include "llvm/IR/Instruction.h"
65#include "llvm/IR/Intrinsics.h"
66#include "llvm/IR/Module.h"
67#include "llvm/IR/Operator.h"
69#include "llvm/IR/Type.h"
70#include "llvm/IR/Use.h"
71#include "llvm/IR/User.h"
72#include "llvm/IR/Value.h"
73#include "llvm/IR/ValueHandle.h"
74#ifdef EXPENSIVE_CHECKS
75#include "llvm/IR/Verifier.h"
76#endif
77#include "llvm/Pass.h"
82#include "llvm/Support/Debug.h"
94#include <algorithm>
95#include <cassert>
96#include <cstdint>
97#include <iterator>
98#include <memory>
99#include <optional>
100#include <set>
101#include <string>
102#include <tuple>
103#include <utility>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107using namespace slpvectorizer;
108using namespace std::placeholders;
109
110#define SV_NAME "slp-vectorizer"
111#define DEBUG_TYPE "SLP"
112
113STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
114
115DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
116 "Controls which SLP graphs should be vectorized.");
117
118static cl::opt<bool>
119 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
120 cl::desc("Run the SLP vectorization passes"));
121
122static cl::opt<bool>
123 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
124 cl::desc("Enable vectorization for wider vector utilization"));
125
126static cl::opt<int>
128 cl::desc("Only vectorize if you gain more than this "
129 "number "));
130
132 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
133 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
134 "heuristics and makes vectorization decision via cost modeling."));
135
136static cl::opt<bool>
137ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
138 cl::desc("Attempt to vectorize horizontal reductions"));
139
141 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
142 cl::desc(
143 "Attempt to vectorize horizontal reductions feeding into a store"));
144
146 "slp-split-alternate-instructions", cl::init(true), cl::Hidden,
147 cl::desc("Improve the code quality by splitting alternate instructions"));
148
149static cl::opt<int>
151 cl::desc("Attempt to vectorize for this register size in bits"));
152
155 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
156
157/// Limits the size of scheduling regions in a block.
158/// It avoids long compile times for _very_ large blocks where vector
159/// instructions are spread over a wide range.
160/// This limit is way higher than needed by real-world functions.
161static cl::opt<int>
162ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
163 cl::desc("Limit the size of the SLP scheduling region per block"));
164
166 "slp-min-reg-size", cl::init(128), cl::Hidden,
167 cl::desc("Attempt to vectorize for this register size in bits"));
168
170 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
171 cl::desc("Limit the recursion depth when building a vectorizable tree"));
172
174 "slp-min-tree-size", cl::init(3), cl::Hidden,
175 cl::desc("Only vectorize small trees if they are fully vectorizable"));
176
177// The maximum depth that the look-ahead score heuristic will explore.
178// The higher this value, the higher the compilation time overhead.
180 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
181 cl::desc("The maximum look-ahead depth for operand reordering scores"));
182
183// The maximum depth that the look-ahead score heuristic will explore
184// when probing among candidates for vectorization tree roots.
185// The higher this value, the higher the compilation time overhead, but unlike the
186// similar limit for operand reordering this is used less frequently, hence the
187// impact of a higher value is less noticeable.
189 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
190 cl::desc("The maximum look-ahead depth for searching best rooting option"));
191
193 "slp-min-strided-loads", cl::init(2), cl::Hidden,
194 cl::desc("The minimum number of loads, which should be considered strided, "
195 "if the stride is > 1 or is runtime value"));
196
198 "slp-max-stride", cl::init(8), cl::Hidden,
199 cl::desc("The maximum stride, considered to be profitable."));
200
201static cl::opt<bool>
202 DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden,
203 cl::desc("Disable tree reordering even if it is "
204 "profitable. Used for testing only."));
205
206static cl::opt<bool>
207 ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden,
208 cl::desc("Generate strided loads even if they are not "
209 "profitable. Used for testing only."));
210
211static cl::opt<bool>
212 ViewSLPTree("view-slp-tree", cl::Hidden,
213 cl::desc("Display the SLP trees with Graphviz"));
214
216 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
217 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
218
219/// Enables vectorization of copyable elements.
221 "slp-copyable-elements", cl::init(true), cl::Hidden,
222 cl::desc("Try to replace values with the idempotent instructions for "
223 "better vectorization."));
224
225// Limit the number of alias checks. The limit is chosen so that
226// it has no negative effect on the llvm benchmarks.
227static const unsigned AliasedCheckLimit = 10;
228
229// Limit of the number of uses for potentially transformed instructions/values,
230// used in checks to avoid compile-time explosion.
231static constexpr int UsesLimit = 64;
232
233// Another limit for the alias checks: The maximum distance between load/store
234// instructions where alias checks are done.
235// This limit is useful for very large basic blocks.
236static const unsigned MaxMemDepDistance = 160;
237
238/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
239/// regions to be handled.
240static const int MinScheduleRegionSize = 16;
241
242/// Maximum allowed number of operands in the PHI nodes.
243static const unsigned MaxPHINumOperands = 128;
244
245/// Predicate for the element types that the SLP vectorizer supports.
246///
247/// The most important thing to filter here are types which are invalid in LLVM
248/// vectors. We also filter target specific types which have absolutely no
249/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
250/// avoids spending time checking the cost model and realizing that they will
251/// be inevitably scalarized.
252static bool isValidElementType(Type *Ty) {
253 // TODO: Support ScalableVectorType.
255 Ty = Ty->getScalarType();
256 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
257 !Ty->isPPC_FP128Ty();
258}
259
260/// Returns the type of the given value/instruction \p V. If it is a store,
261/// returns the type of its value operand; for a Cmp - the type of the compare
262/// operands; and for an insertelement - the type of the inserted operand.
263/// Otherwise, just the type of the value is returned.
265 if (auto *SI = dyn_cast<StoreInst>(V))
266 return SI->getValueOperand()->getType();
267 if (auto *CI = dyn_cast<CmpInst>(V))
268 return CI->getOperand(0)->getType();
269 if (auto *IE = dyn_cast<InsertElementInst>(V))
270 return IE->getOperand(1)->getType();
271 return V->getType();
272}
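// For illustration (a sketch of the mapping above; %v, %a, %b and %x are
// placeholder values):
//   store i32 %v, ptr %p                               -> i32   (stored value type)
//   %c = icmp slt i64 %a, %b                           -> i64   (compared operand type, not i1)
//   %i = insertelement <4 x float> %v, float %x, i32 0 -> float (inserted scalar type)
//   any other value                                    -> its own type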
273
274/// \returns the number of elements for Ty.
275static unsigned getNumElements(Type *Ty) {
277 "ScalableVectorType is not supported.");
278 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
279 return VecTy->getNumElements();
280 return 1;
281}
282
283/// \returns the vector type of ScalarTy based on vectorization factor.
284static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
285 return FixedVectorType::get(ScalarTy->getScalarType(),
286 VF * getNumElements(ScalarTy));
287}
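// For illustration (a sketch of the behavior above; the types are chosen
// arbitrarily):
//   getWidenedType(i32, 4)       -> <4 x i32>
//   getWidenedType(<2 x i16>, 4) -> <8 x i16>  (REVEC: each "scalar" is itself a vector)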
288
289/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
290/// which forms a type that \p TTI splits into whole vector types during
291/// legalization.
293 Type *Ty, unsigned Sz) {
294 if (!isValidElementType(Ty))
295 return bit_ceil(Sz);
296 // Find the number of elements, which forms full vectors.
297 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
298 if (NumParts == 0 || NumParts >= Sz)
299 return bit_ceil(Sz);
300 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
301}
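// Worked example (illustrative; assumes TTI reports 2 parts for <5 x i32>,
// e.g. a target with 128-bit vector registers):
//   getFullVectorNumberOfElements(TTI, i32, /*Sz=*/5)
//     NumParts = 2, result = bit_ceil(divideCeil(5, 2)) * 2 = 4 * 2 = 8
// i.e. 5 elements are rounded up to 8 so that the type splits into two whole
// <4 x i32> registers.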
302
303/// Returns the number of elements of the given type \p Ty, not greater than \p
304/// Sz, which forms a type that \p TTI splits into whole vector types during
305/// legalization.
306static unsigned
308 unsigned Sz) {
309 if (!isValidElementType(Ty))
310 return bit_floor(Sz);
311 // Find the number of elements, which forms full vectors.
312 unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
313 if (NumParts == 0 || NumParts >= Sz)
314 return bit_floor(Sz);
315 unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
316 if (RegVF > Sz)
317 return bit_floor(Sz);
318 return (Sz / RegVF) * RegVF;
319}
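// Worked example (illustrative; assumes TTI reports 2 parts for <7 x i32>):
//   getFloorFullVectorNumberOfElements(TTI, i32, /*Sz=*/7)
//     RegVF = bit_ceil(divideCeil(7, 2)) = 4, result = (7 / 4) * 4 = 4
// i.e. only 4 of the 7 elements form a whole <4 x i32> register.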
320
321static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
322 SmallVectorImpl<int> &Mask) {
323 // The ShuffleBuilder implementation uses shufflevector to splat an "element".
324 // But the element has a different meaning for SLP (scalar) and REVEC
325 // (vector). We need to expand Mask into masks which shufflevector can use
326 // directly.
327 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
328 for (unsigned I : seq<unsigned>(Mask.size()))
329 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
330 I * VecTyNumElements, VecTyNumElements)))
331 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
332 : Mask[I] * VecTyNumElements + J;
333 Mask.swap(NewMask);
334}
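// Worked example (illustrative): with VecTyNumElements == 2,
//   Mask = [1, 0]              -> NewMask = [2, 3, 0, 1]
//   Mask = [PoisonMaskElem, 0] -> NewMask = [-1, -1, 0, 1]
// i.e. each scalar mask element is expanded into a run of vector lanes.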
335
336/// \returns the number of groups of shufflevectors.
337/// A group has the following features:
338/// 1. All values in a group are shufflevectors.
339/// 2. The mask of every shufflevector is an extract-subvector mask.
340/// 3. Together, the masks of the shufflevectors use all of the elements of the source.
341/// e.g., it is 1 group (%0)
342/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
343/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
344/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
345/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
346/// it is 2 groups (%3 and %4)
347/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
348/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
349/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
350/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
351/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
352/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
353/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
354/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
355/// it is 0 groups
356/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
357/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
358/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
359/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
361 if (VL.empty())
362 return 0;
364 return 0;
365 auto *SV = cast<ShuffleVectorInst>(VL.front());
366 unsigned SVNumElements =
367 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
368 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
369 if (SVNumElements % ShuffleMaskSize != 0)
370 return 0;
371 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
372 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
373 return 0;
374 unsigned NumGroup = 0;
375 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
376 auto *SV = cast<ShuffleVectorInst>(VL[I]);
377 Value *Src = SV->getOperand(0);
378 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
379 SmallBitVector ExpectedIndex(GroupSize);
380 if (!all_of(Group, [&](Value *V) {
381 auto *SV = cast<ShuffleVectorInst>(V);
382 // From the same source.
383 if (SV->getOperand(0) != Src)
384 return false;
385 int Index;
386 if (!SV->isExtractSubvectorMask(Index))
387 return false;
388 ExpectedIndex.set(Index / ShuffleMaskSize);
389 return true;
390 }))
391 return 0;
392 if (!ExpectedIndex.all())
393 return 0;
394 ++NumGroup;
395 }
396 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
397 return NumGroup;
398}
399
400/// \returns a shufflevector mask which is used to vectorize shufflevectors
401/// e.g.,
402/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
403/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
404/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
405/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
406/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
407/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
408/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
409/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
410/// the result is
411/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
413 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
414 auto *SV = cast<ShuffleVectorInst>(VL.front());
415 unsigned SVNumElements =
416 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
417 SmallVector<int> Mask;
418 unsigned AccumulateLength = 0;
419 for (Value *V : VL) {
420 auto *SV = cast<ShuffleVectorInst>(V);
421 for (int M : SV->getShuffleMask())
422 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
423 : AccumulateLength + M);
424 AccumulateLength += SVNumElements;
425 }
426 return Mask;
427}
428
429/// \returns True if the value is a constant (but not globals/constant
430/// expressions).
431static bool isConstant(Value *V) {
433}
434
435/// Checks if \p V is one of vector-like instructions, i.e. undef,
436/// insertelement/extractelement with constant indices for fixed vector type or
437/// extractvalue instruction.
441 return false;
442 auto *I = dyn_cast<Instruction>(V);
443 if (!I || isa<ExtractValueInst>(I))
444 return true;
445 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
446 return false;
448 return isConstant(I->getOperand(1));
449 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
450 return isConstant(I->getOperand(2));
451}
452
453/// Returns the power-of-2 number of elements in a single register (part), given the
454/// total number of elements \p Size and number of registers (parts) \p
455/// NumParts.
456static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
457 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
458}
459
460/// Returns the correct remaining number of elements, considering the total amount \p
461/// Size, the (power-of-2) number of elements in a single register \p PartNumElems
462/// and the current register (part) \p Part.
463static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
464 unsigned Part) {
465 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
466}
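// Worked example (illustrative): for Size = 6 scalars split into NumParts = 2
// registers, getPartNumElems(6, 2) == 4 (power-of-2 per-register VF), and then
// getNumElems(6, 4, /*Part=*/0) == 4 and getNumElems(6, 4, /*Part=*/1) == 2
// (the tail register is only partially filled).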
467
468#if !defined(NDEBUG)
469/// Print a short descriptor of the instruction bundle suitable for debug output.
470static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
471 std::string Result;
472 raw_string_ostream OS(Result);
473 if (Idx >= 0)
474 OS << "Idx: " << Idx << ", ";
475 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
476 return Result;
477}
478#endif
479
480/// \returns true if all of the instructions in \p VL are in the same block or
481/// false otherwise.
483 auto *It = find_if(VL, IsaPred<Instruction>);
484 if (It == VL.end())
485 return false;
488 return true;
489
490 BasicBlock *BB = I0->getParent();
491 for (Value *V : iterator_range(It, VL.end())) {
492 if (isa<PoisonValue>(V))
493 continue;
494 auto *II = dyn_cast<Instruction>(V);
495 if (!II)
496 return false;
497
498 if (BB != II->getParent())
499 return false;
500 }
501 return true;
502}
503
504/// \returns True if all of the values in \p VL are constants (but not
505/// globals/constant expressions).
507 // Constant expressions and globals can't be vectorized like normal integer/FP
508 // constants.
509 return all_of(VL, isConstant);
510}
511
512/// \returns True if all of the values in \p VL are identical or some of them
513/// are UndefValue.
514static bool isSplat(ArrayRef<Value *> VL) {
515 Value *FirstNonUndef = nullptr;
516 for (Value *V : VL) {
517 if (isa<UndefValue>(V))
518 continue;
519 if (!FirstNonUndef) {
520 FirstNonUndef = V;
521 continue;
522 }
523 if (V != FirstNonUndef)
524 return false;
525 }
526 return FirstNonUndef != nullptr;
527}
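// For illustration (%x and %y are placeholder values):
//   [%x, undef, %x] -> true   (all non-undef values are the same)
//   [%x, %y]        -> false
//   [undef, undef]  -> false  (no non-undef value at all)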
528
529/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
530/// For BinaryOperator, it also checks if \p ValWithUses is used in specific
531/// patterns that make it effectively commutative (like equality comparisons
532/// with zero).
533/// In most cases, users should not call this function directly (since \p I and
534/// \p ValWithUses are the same). However, when analyzing interchangeable
535/// instructions, we need to use the converted opcode along with the original
536/// uses.
537/// \param I The instruction to check for commutativity
538/// \param ValWithUses The value whose uses are analyzed for special
539/// patterns
540static bool isCommutative(Instruction *I, Value *ValWithUses) {
541 if (auto *Cmp = dyn_cast<CmpInst>(I))
542 return Cmp->isCommutative();
543 if (auto *BO = dyn_cast<BinaryOperator>(I))
544 return BO->isCommutative() ||
545 (BO->getOpcode() == Instruction::Sub &&
546 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
547 all_of(
548 ValWithUses->uses(),
549 [](const Use &U) {
550 // Commutative, if icmp eq/ne sub, 0
551 CmpPredicate Pred;
552 if (match(U.getUser(),
553 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
554 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
555 return true;
556 // Commutative, if abs(sub nsw, true) or abs(sub, false).
557 ConstantInt *Flag;
558 return match(U.getUser(),
559 m_Intrinsic<Intrinsic::abs>(
560 m_Specific(U.get()), m_ConstantInt(Flag))) &&
561 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
562 Flag->isOne());
563 })) ||
564 (BO->getOpcode() == Instruction::FSub &&
565 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
566 all_of(ValWithUses->uses(), [](const Use &U) {
567 return match(U.getUser(),
568 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
569 }));
570 return I->isCommutative();
571}
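// Illustrative IR example of the special "sub" case above (placeholder values):
//   %d = sub i32 %a, %b
//   %c = icmp eq i32 %d, 0
// If %c is the only user of %d, swapping %a and %b does not change %c, so the
// sub is treated as commutative here. The same reasoning applies when the only
// users are suitable @llvm.abs calls (see the predicate above).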
572
573/// This is a helper function to check whether \p I is commutative.
574/// This is a convenience wrapper that calls the two-parameter version of
575/// isCommutative with the same instruction for both parameters. This is
576/// the common case where the instruction being checked for commutativity
577/// is the same as the instruction whose uses are analyzed for special
578/// patterns (see the two-parameter version above for details).
579/// \param I The instruction to check for commutativity
580/// \returns true if the instruction is commutative, false otherwise
581static bool isCommutative(Instruction *I) { return isCommutative(I, I); }
582
583/// \returns number of operands of \p I, considering commutativity. Returns 2
584/// for commutative intrinsics.
585/// \param I The instruction to check for commutativity
588 // IntrinsicInst::isCommutative returns true if swapping the first "two"
589 // arguments to the intrinsic produces the same result.
590 constexpr unsigned IntrinsicNumOperands = 2;
591 return IntrinsicNumOperands;
592 }
593 return I->getNumOperands();
594}
595
596template <typename T>
597static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
598 unsigned Offset) {
599 static_assert(std::is_same_v<T, InsertElementInst> ||
600 std::is_same_v<T, ExtractElementInst>,
601 "unsupported T");
602 int Index = Offset;
603 if (const auto *IE = dyn_cast<T>(Inst)) {
604 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
605 if (!VT)
606 return std::nullopt;
607 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
608 if (!CI)
609 return std::nullopt;
610 if (CI->getValue().uge(VT->getNumElements()))
611 return std::nullopt;
612 Index *= VT->getNumElements();
613 Index += CI->getZExtValue();
614 return Index;
615 }
616 return std::nullopt;
617}
618
619/// \returns inserting or extracting index of InsertElement, ExtractElement or
620/// InsertValue instruction, using Offset as base offset for index.
621/// \returns std::nullopt if the index is not an immediate.
622static std::optional<unsigned> getElementIndex(const Value *Inst,
623 unsigned Offset = 0) {
624 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
625 return Index;
627 return Index;
628
629 int Index = Offset;
630
631 const auto *IV = dyn_cast<InsertValueInst>(Inst);
632 if (!IV)
633 return std::nullopt;
634
635 Type *CurrentType = IV->getType();
636 for (unsigned I : IV->indices()) {
637 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
638 Index *= ST->getNumElements();
639 CurrentType = ST->getElementType(I);
640 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
641 Index *= AT->getNumElements();
642 CurrentType = AT->getElementType();
643 } else {
644 return std::nullopt;
645 }
646 Index += I;
647 }
648 return Index;
649}
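// Worked example for the insertvalue path (illustrative, placeholder values):
//   %r = insertvalue [2 x [3 x i32]] %agg, i32 %v, 1, 2
// gives Index = (0 * 2 + 1) * 3 + 2 = 5, i.e. the row-major position of
// element [1][2]. An insertelement/extractelement with a constant index
// returns Offset * NumElements + that index; a non-constant index yields
// std::nullopt.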
650
651/// \returns true if all of the values in \p VL use the same opcode.
652/// For comparison instructions, also checks if predicates match.
653/// PoisonValues are considered matching.
654/// Interchangeable instructions are not considered.
656 auto *It = find_if(VL, IsaPred<Instruction>);
657 if (It == VL.end())
658 return true;
659 Instruction *MainOp = cast<Instruction>(*It);
660 unsigned Opcode = MainOp->getOpcode();
661 bool IsCmpOp = isa<CmpInst>(MainOp);
662 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
664 return std::all_of(It, VL.end(), [&](Value *V) {
665 if (auto *CI = dyn_cast<CmpInst>(V))
666 return BasePred == CI->getPredicate();
667 if (auto *I = dyn_cast<Instruction>(V))
668 return I->getOpcode() == Opcode;
669 return isa<PoisonValue>(V);
670 });
671}
672
673namespace {
674/// Specifies the way the mask should be analyzed for undefs/poisonous elements
675/// in the shuffle mask.
676enum class UseMask {
677 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
678 ///< check for the mask elements for the first argument (mask
679 ///< indices are in range [0:VF)).
680 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
681 ///< for the mask elements for the second argument (mask indices
682 ///< are in range [VF:2*VF))
683 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
684 ///< future shuffle elements and mark them as ones as being used
685 ///< in future. Non-undef elements are considered as unused since
686 ///< they're already marked as used in the mask.
687};
688} // namespace
689
690/// Prepares a use bitset for the given mask either for the first argument or
691/// for the second.
693 UseMask MaskArg) {
694 SmallBitVector UseMask(VF, true);
695 for (auto [Idx, Value] : enumerate(Mask)) {
696 if (Value == PoisonMaskElem) {
697 if (MaskArg == UseMask::UndefsAsMask)
698 UseMask.reset(Idx);
699 continue;
700 }
701 if (MaskArg == UseMask::FirstArg && Value < VF)
702 UseMask.reset(Value);
703 else if (MaskArg == UseMask::SecondArg && Value >= VF)
704 UseMask.reset(Value - VF);
705 }
706 return UseMask;
707}
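// Worked example (illustrative): for VF = 4 and Mask = [0, 5, poison, 2],
//   UseMask::FirstArg  -> bits {1, 3} remain set (lanes 0 and 2 of the first
//                         vector are consumed by the mask),
//   UseMask::SecondArg -> bit 1 is cleared (mask element 5 maps to lane 5 - VF).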
708
709/// Checks if the given value is actually an undefined constant vector.
710/// Also, if the \p UseMask is not empty, tries to check if the non-masked
711/// elements actually mask the insertelement buildvector, if any.
712template <bool IsPoisonOnly = false>
714 const SmallBitVector &UseMask = {}) {
715 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
716 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
717 if (isa<T>(V))
718 return Res;
719 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
720 if (!VecTy)
721 return Res.reset();
722 auto *C = dyn_cast<Constant>(V);
723 if (!C) {
724 if (!UseMask.empty()) {
725 const Value *Base = V;
726 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
727 Base = II->getOperand(0);
728 if (isa<T>(II->getOperand(1)))
729 continue;
730 std::optional<unsigned> Idx = getElementIndex(II);
731 if (!Idx) {
732 Res.reset();
733 return Res;
734 }
735 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
736 Res.reset(*Idx);
737 }
738 // TODO: Add analysis for shuffles here too.
739 if (V == Base) {
740 Res.reset();
741 } else {
742 SmallBitVector SubMask(UseMask.size(), false);
743 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
744 }
745 } else {
746 Res.reset();
747 }
748 return Res;
749 }
750 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
751 if (Constant *Elem = C->getAggregateElement(I))
752 if (!isa<T>(Elem) &&
753 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
754 Res.reset(I);
755 }
756 return Res;
757}
758
759/// Checks if the vector of instructions can be represented as a shuffle, like:
760/// %x0 = extractelement <4 x i8> %x, i32 0
761/// %x3 = extractelement <4 x i8> %x, i32 3
762/// %y1 = extractelement <4 x i8> %y, i32 1
763/// %y2 = extractelement <4 x i8> %y, i32 2
764/// %x0x0 = mul i8 %x0, %x0
765/// %x3x3 = mul i8 %x3, %x3
766/// %y1y1 = mul i8 %y1, %y1
767/// %y2y2 = mul i8 %y2, %y2
768/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
769/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
770/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
771/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
772/// ret <4 x i8> %ins4
773/// can be transformed into:
774/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
775/// i32 6>
776/// %2 = mul <4 x i8> %1, %1
777/// ret <4 x i8> %2
778/// Mask will return the Shuffle Mask equivalent to the extracted elements.
779/// TODO: Can we split off and reuse the shuffle mask detection from
780/// ShuffleVectorInst/getShuffleCost?
781static std::optional<TargetTransformInfo::ShuffleKind>
783 AssumptionCache *AC) {
784 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
785 if (It == VL.end())
786 return std::nullopt;
787 unsigned Size =
788 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
789 auto *EI = dyn_cast<ExtractElementInst>(V);
790 if (!EI)
791 return S;
792 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
793 if (!VTy)
794 return S;
795 return std::max(S, VTy->getNumElements());
796 });
797
798 Value *Vec1 = nullptr;
799 Value *Vec2 = nullptr;
800 bool HasNonUndefVec = any_of(VL, [&](Value *V) {
801 auto *EE = dyn_cast<ExtractElementInst>(V);
802 if (!EE)
803 return false;
804 Value *Vec = EE->getVectorOperand();
805 if (isa<UndefValue>(Vec))
806 return false;
807 return isGuaranteedNotToBePoison(Vec, AC);
808 });
809 enum ShuffleMode { Unknown, Select, Permute };
810 ShuffleMode CommonShuffleMode = Unknown;
811 Mask.assign(VL.size(), PoisonMaskElem);
812 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
813 // Undef can be represented as an undef element in a vector.
814 if (isa<UndefValue>(VL[I]))
815 continue;
816 auto *EI = cast<ExtractElementInst>(VL[I]);
817 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
818 return std::nullopt;
819 auto *Vec = EI->getVectorOperand();
820 // We can extractelement from undef or poison vector.
822 continue;
823 // All vector operands must have the same number of vector elements.
824 if (isa<UndefValue>(Vec)) {
825 Mask[I] = I;
826 } else {
827 if (isa<UndefValue>(EI->getIndexOperand()))
828 continue;
829 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
830 if (!Idx)
831 return std::nullopt;
832 // Undefined behavior if Idx is negative or >= Size.
833 if (Idx->getValue().uge(Size))
834 continue;
835 unsigned IntIdx = Idx->getValue().getZExtValue();
836 Mask[I] = IntIdx;
837 }
838 if (isUndefVector(Vec).all() && HasNonUndefVec)
839 continue;
840 // For correct shuffling we have to have at most 2 different vector operands
841 // in all extractelement instructions.
842 if (!Vec1 || Vec1 == Vec) {
843 Vec1 = Vec;
844 } else if (!Vec2 || Vec2 == Vec) {
845 Vec2 = Vec;
846 Mask[I] += Size;
847 } else {
848 return std::nullopt;
849 }
850 if (CommonShuffleMode == Permute)
851 continue;
852 // If the extract index is not the same as the operation number, it is a
853 // permutation.
854 if (Mask[I] % Size != I) {
855 CommonShuffleMode = Permute;
856 continue;
857 }
858 CommonShuffleMode = Select;
859 }
860 // If we're not crossing lanes in different vectors, consider it as blending.
861 if (CommonShuffleMode == Select && Vec2)
863 // If Vec2 was never used, we have a permutation of a single vector, otherwise
864 // we have permutation of 2 vectors.
867}
868
869/// \returns True if Extract{Value,Element} instruction extracts element Idx.
870static std::optional<unsigned> getExtractIndex(const Instruction *E) {
871 unsigned Opcode = E->getOpcode();
872 assert((Opcode == Instruction::ExtractElement ||
873 Opcode == Instruction::ExtractValue) &&
874 "Expected extractelement or extractvalue instruction.");
875 if (Opcode == Instruction::ExtractElement) {
876 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
877 if (!CI)
878 return std::nullopt;
879 return CI->getZExtValue();
880 }
881 auto *EI = cast<ExtractValueInst>(E);
882 if (EI->getNumIndices() != 1)
883 return std::nullopt;
884 return *EI->idx_begin();
885}
886
887namespace llvm {
888/// Checks if the provided value does not require scheduling. It does not
889/// require scheduling if this is not an instruction or it is an instruction
890/// that does not read/write memory and all operands are either not instructions
891/// or phi nodes or instructions from different blocks.
892static bool areAllOperandsNonInsts(Value *V);
893/// Checks if the provided value does not require scheduling. It does not
894/// require scheduling if this is not an instruction or it is an instruction
895/// that does not read/write memory and all users are phi nodes or instructions
896/// from the different blocks.
897static bool isUsedOutsideBlock(Value *V);
898/// Checks if the specified value does not require scheduling. It does not
899/// require scheduling if all operands and all users do not need to be scheduled
900/// in the current basic block.
901static bool doesNotNeedToBeScheduled(Value *V);
902} // namespace llvm
903
904namespace {
905/// \returns true if \p Opcode is allowed as part of the main/alternate
906/// instruction for SLP vectorization.
907///
908/// Example of unsupported opcode is SDIV that can potentially cause UB if the
909/// "shuffled out" lane would result in division by zero.
910bool isValidForAlternation(unsigned Opcode) {
911 return !Instruction::isIntDivRem(Opcode);
912}
913
914/// Helper class that determines whether VL can use the same opcode.
915/// Alternate instructions are supported. In addition, it supports interchangeable
916/// instructions. An interchangeable instruction is an instruction that can be
917/// converted to another instruction with the same semantics. For example, x << 1 is
918/// equal to x * 2, and x * 1 is equal to x | 0.
919class BinOpSameOpcodeHelper {
920 using MaskType = std::uint_fast16_t;
921 /// Sort SupportedOp because it is used by binary_search.
922 constexpr static std::initializer_list<unsigned> SupportedOp = {
923 Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
924 Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
925 enum : MaskType {
926 ShlBIT = 0b1,
927 AShrBIT = 0b10,
928 MulBIT = 0b100,
929 AddBIT = 0b1000,
930 SubBIT = 0b10000,
931 AndBIT = 0b100000,
932 OrBIT = 0b1000000,
933 XorBIT = 0b10000000,
934 MainOpBIT = 0b100000000,
936 };
937 /// Return a non-nullptr if either operand of I is a ConstantInt.
938 /// The second return value represents the operand position. We check the
939 /// right-hand side first (1). If the right hand side is not a ConstantInt and
940 /// the instruction is neither Sub, Shl, nor AShr, we then check the left hand
941 /// side (0).
942 static std::pair<ConstantInt *, unsigned>
943 isBinOpWithConstantInt(const Instruction *I) {
944 unsigned Opcode = I->getOpcode();
945 assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode.");
946 (void)SupportedOp;
947 auto *BinOp = cast<BinaryOperator>(I);
948 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)))
949 return {CI, 1};
950 if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
951 Opcode == Instruction::AShr)
952 return {nullptr, 0};
953 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(0)))
954 return {CI, 0};
955 return {nullptr, 0};
956 }
957 struct InterchangeableInfo {
958 const Instruction *I = nullptr;
959 /// Each set bit represents an opcode that MainOp can be converted to.
960 MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
961 MulBIT | AShrBIT | ShlBIT;
962 /// We cannot create an interchangeable instruction that does not exist in
963 /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
964 /// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
965 /// 1]. SeenBefore is used to know what operations have been seen before.
966 MaskType SeenBefore = 0;
967 InterchangeableInfo(const Instruction *I) : I(I) {}
968 /// Returning false allows BinOpSameOpcodeHelper to find an alternate
969 /// instruction. Directly setting the mask will destroy the mask state,
970 /// preventing us from determining which instruction it should convert to.
971 bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
972 if (Mask & InterchangeableMask) {
973 SeenBefore |= OpcodeInMaskForm;
974 Mask &= InterchangeableMask;
975 return true;
976 }
977 return false;
978 }
979 bool equal(unsigned Opcode) {
980 return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
981 }
982 unsigned getOpcode() const {
983 MaskType Candidate = Mask & SeenBefore;
984 if (Candidate & MainOpBIT)
985 return I->getOpcode();
986 if (Candidate & ShlBIT)
987 return Instruction::Shl;
988 if (Candidate & AShrBIT)
989 return Instruction::AShr;
990 if (Candidate & MulBIT)
991 return Instruction::Mul;
992 if (Candidate & AddBIT)
993 return Instruction::Add;
994 if (Candidate & SubBIT)
995 return Instruction::Sub;
996 if (Candidate & AndBIT)
997 return Instruction::And;
998 if (Candidate & OrBIT)
999 return Instruction::Or;
1000 if (Candidate & XorBIT)
1001 return Instruction::Xor;
1002 llvm_unreachable("Cannot find interchangeable instruction.");
1003 }
1004
1005 /// Return true if the instruction can be converted to \p Opcode.
1006 bool hasCandidateOpcode(unsigned Opcode) const {
1007 MaskType Candidate = Mask & SeenBefore;
1008 switch (Opcode) {
1009 case Instruction::Shl:
1010 return Candidate & ShlBIT;
1011 case Instruction::AShr:
1012 return Candidate & AShrBIT;
1013 case Instruction::Mul:
1014 return Candidate & MulBIT;
1015 case Instruction::Add:
1016 return Candidate & AddBIT;
1017 case Instruction::Sub:
1018 return Candidate & SubBIT;
1019 case Instruction::And:
1020 return Candidate & AndBIT;
1021 case Instruction::Or:
1022 return Candidate & OrBIT;
1023 case Instruction::Xor:
1024 return Candidate & XorBIT;
1025 case Instruction::LShr:
1026 case Instruction::FAdd:
1027 case Instruction::FSub:
1028 case Instruction::FMul:
1029 case Instruction::SDiv:
1030 case Instruction::UDiv:
1031 case Instruction::FDiv:
1032 case Instruction::SRem:
1033 case Instruction::URem:
1034 case Instruction::FRem:
1035 return false;
1036 default:
1037 break;
1038 }
1039 llvm_unreachable("Cannot find interchangeable instruction.");
1040 }
1041
1042 SmallVector<Value *> getOperand(const Instruction *To) const {
1043 unsigned ToOpcode = To->getOpcode();
1044 unsigned FromOpcode = I->getOpcode();
1045 if (FromOpcode == ToOpcode)
1046 return SmallVector<Value *>(I->operands());
1047 assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode.");
1048 auto [CI, Pos] = isBinOpWithConstantInt(I);
1049 const APInt &FromCIValue = CI->getValue();
1050 unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
1051 APInt ToCIValue;
1052 switch (FromOpcode) {
1053 case Instruction::Shl:
1054 if (ToOpcode == Instruction::Mul) {
1055 ToCIValue = APInt::getOneBitSet(FromCIValueBitWidth,
1056 FromCIValue.getZExtValue());
1057 } else {
1058 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1059 ToCIValue = ToOpcode == Instruction::And
1060 ? APInt::getAllOnes(FromCIValueBitWidth)
1061 : APInt::getZero(FromCIValueBitWidth);
1062 }
1063 break;
1064 case Instruction::Mul:
1065 assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
1066 if (ToOpcode == Instruction::Shl) {
1067 ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
1068 } else {
1069 assert(FromCIValue.isOne() && "Cannot convert the instruction.");
1070 ToCIValue = ToOpcode == Instruction::And
1071 ? APInt::getAllOnes(FromCIValueBitWidth)
1072 : APInt::getZero(FromCIValueBitWidth);
1073 }
1074 break;
1075 case Instruction::Add:
1076 case Instruction::Sub:
1077 if (FromCIValue.isZero()) {
1078 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1079 } else {
1080 assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
1081 "Cannot convert the instruction.");
1082 ToCIValue = FromCIValue;
1083 ToCIValue.negate();
1084 }
1085 break;
1086 case Instruction::And:
1087 assert(FromCIValue.isAllOnes() && "Cannot convert the instruction.");
1088 ToCIValue = ToOpcode == Instruction::Mul
1089 ? APInt::getOneBitSet(FromCIValueBitWidth, 0)
1090 : APInt::getZero(FromCIValueBitWidth);
1091 break;
1092 default:
1093 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1094 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1095 break;
1096 }
1097 Value *LHS = I->getOperand(1 - Pos);
1098 Constant *RHS =
1099 ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
1100 // constant + x cannot be -constant - x
1101 // instead, it should be x - -constant
1102 if (Pos == 1 ||
1103 ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or ||
1104 FromOpcode == Instruction::Xor) &&
1105 ToOpcode == Instruction::Sub))
1106 return SmallVector<Value *>({LHS, RHS});
1107 return SmallVector<Value *>({RHS, LHS});
1108 }
1109 };
1110 InterchangeableInfo MainOp;
1111 InterchangeableInfo AltOp;
1112 bool isValidForAlternation(const Instruction *I) const {
1113 return ::isValidForAlternation(MainOp.I->getOpcode()) &&
1114 ::isValidForAlternation(I->getOpcode());
1115 }
1116 bool initializeAltOp(const Instruction *I) {
1117 if (AltOp.I)
1118 return true;
1119 if (!isValidForAlternation(I))
1120 return false;
1121 AltOp.I = I;
1122 return true;
1123 }
1124
1125public:
1126 BinOpSameOpcodeHelper(const Instruction *MainOp,
1127 const Instruction *AltOp = nullptr)
1128 : MainOp(MainOp), AltOp(AltOp) {
1129 assert(is_sorted(SupportedOp) && "SupportedOp is not sorted.");
1130 }
1131 bool add(const Instruction *I) {
1133 "BinOpSameOpcodeHelper only accepts BinaryOperator.");
1134 unsigned Opcode = I->getOpcode();
1135 MaskType OpcodeInMaskForm;
1136 // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp.
1137 switch (Opcode) {
1138 case Instruction::Shl:
1139 OpcodeInMaskForm = ShlBIT;
1140 break;
1141 case Instruction::AShr:
1142 OpcodeInMaskForm = AShrBIT;
1143 break;
1144 case Instruction::Mul:
1145 OpcodeInMaskForm = MulBIT;
1146 break;
1147 case Instruction::Add:
1148 OpcodeInMaskForm = AddBIT;
1149 break;
1150 case Instruction::Sub:
1151 OpcodeInMaskForm = SubBIT;
1152 break;
1153 case Instruction::And:
1154 OpcodeInMaskForm = AndBIT;
1155 break;
1156 case Instruction::Or:
1157 OpcodeInMaskForm = OrBIT;
1158 break;
1159 case Instruction::Xor:
1160 OpcodeInMaskForm = XorBIT;
1161 break;
1162 default:
1163 return MainOp.equal(Opcode) ||
1164 (initializeAltOp(I) && AltOp.equal(Opcode));
1165 }
1166 MaskType InterchangeableMask = OpcodeInMaskForm;
1167 ConstantInt *CI = isBinOpWithConstantInt(I).first;
1168 if (CI) {
1169 constexpr MaskType CanBeAll =
1170 XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
1171 const APInt &CIValue = CI->getValue();
1172 switch (Opcode) {
1173 case Instruction::Shl:
1174 if (CIValue.ult(CIValue.getBitWidth()))
1175 InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
1176 break;
1177 case Instruction::Mul:
1178 if (CIValue.isOne()) {
1179 InterchangeableMask = CanBeAll;
1180 break;
1181 }
1182 if (CIValue.isPowerOf2())
1183 InterchangeableMask = MulBIT | ShlBIT;
1184 break;
1185 case Instruction::Add:
1186 case Instruction::Sub:
1187 InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
1188 break;
1189 case Instruction::And:
1190 if (CIValue.isAllOnes())
1191 InterchangeableMask = CanBeAll;
1192 break;
1193 case Instruction::Xor:
1194 if (CIValue.isZero())
1195 InterchangeableMask = XorBIT | OrBIT | AndBIT | SubBIT | AddBIT;
1196 break;
1197 default:
1198 if (CIValue.isZero())
1199 InterchangeableMask = CanBeAll;
1200 break;
1201 }
1202 }
1203 return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
1204 (initializeAltOp(I) &&
1205 AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
1206 }
1207 unsigned getMainOpcode() const { return MainOp.getOpcode(); }
1208 /// Checks if the list of potential opcodes includes \p Opcode.
1209 bool hasCandidateOpcode(unsigned Opcode) const {
1210 return MainOp.hasCandidateOpcode(Opcode);
1211 }
1212 bool hasAltOp() const { return AltOp.I; }
1213 unsigned getAltOpcode() const {
1214 return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
1215 }
1216 SmallVector<Value *> getOperand(const Instruction *I) const {
1217 return MainOp.getOperand(I);
1218 }
1219};
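// Illustrative use of the helper (a sketch; x and y are placeholder values):
// for VL = [x << 1, y * 2], add() records that both lanes fit the Shl|Mul
// interchangeable mask, so getMainOpcode() resolves to Instruction::Shl with
// no alternate opcode, and a helper seeded with y * 2 produces via getOperand()
// the operands (y, 1) for the equivalent y << 1 form.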
1220
1221/// Main data required for vectorization of instructions.
1222class InstructionsState {
1223 /// MainOp and AltOp are primarily determined by getSameOpcode. Currently,
1224 /// only BinaryOperator, CastInst, and CmpInst support alternate instructions
1225 /// (i.e., AltOp is not equal to MainOp; this can be checked using
1226 /// isAltShuffle).
1227 /// A rare exception is TrySplitNode, where the InstructionsState is derived
1228 /// from getMainAltOpsNoStateVL.
1229 /// For those InstructionsState that use alternate instructions, the resulting
1230 /// vectorized output ultimately comes from a shufflevector. For example,
1231 /// given a vector list (VL):
1232 /// VL[0] = add i32 a, e
1233 /// VL[1] = sub i32 b, f
1234 /// VL[2] = add i32 c, g
1235 /// VL[3] = sub i32 d, h
1236 /// The vectorized result would be:
1237 /// intermediated_0 = add <4 x i32> <a, b, c, d>, <e, f, g, h>
1238 /// intermediated_1 = sub <4 x i32> <a, b, c, d>, <e, f, g, h>
1239 /// result = shufflevector <4 x i32> intermediated_0,
1240 /// <4 x i32> intermediated_1,
1241 /// <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1242 /// Since shufflevector is used in the final result, when calculating the cost
1243 /// (getEntryCost), we must account for the usage of shufflevector in
1244 /// GetVectorCost.
1245 Instruction *MainOp = nullptr;
1246 Instruction *AltOp = nullptr;
1247 /// Whether the instruction state represents copyable instructions.
1248 bool HasCopyables = false;
1249
1250public:
1251 Instruction *getMainOp() const {
1252 assert(valid() && "InstructionsState is invalid.");
1253 return MainOp;
1254 }
1255
1256 Instruction *getAltOp() const {
1257 assert(valid() && "InstructionsState is invalid.");
1258 return AltOp;
1259 }
1260
1261 /// The main/alternate opcodes for the list of instructions.
1262 unsigned getOpcode() const { return getMainOp()->getOpcode(); }
1263
1264 unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
1265
1266 /// Some of the instructions in the list have alternate opcodes.
1267 bool isAltShuffle() const { return getMainOp() != getAltOp(); }
1268
1269 /// Checks if the instruction matches either the main or alternate opcode.
1270 /// \returns
1271 /// - MainOp if \param I matches MainOp's opcode directly or can be converted
1272 /// to it
1273 /// - AltOp if \param I matches AltOp's opcode directly or can be converted to
1274 /// it
1275 /// - nullptr if \param I cannot be matched or converted to either opcode
1276 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
1277 assert(MainOp && "MainOp cannot be nullptr.");
1278 if (I->getOpcode() == MainOp->getOpcode())
1279 return MainOp;
1280 // Prefer AltOp instead of interchangeable instruction of MainOp.
1281 assert(AltOp && "AltOp cannot be nullptr.");
1282 if (I->getOpcode() == AltOp->getOpcode())
1283 return AltOp;
1284 if (!I->isBinaryOp())
1285 return nullptr;
1286 BinOpSameOpcodeHelper Converter(MainOp);
1287 if (!Converter.add(I) || !Converter.add(MainOp))
1288 return nullptr;
1289 if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
1290 BinOpSameOpcodeHelper AltConverter(AltOp);
1291 if (AltConverter.add(I) && AltConverter.add(AltOp) &&
1292 AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
1293 return AltOp;
1294 }
1295 if (Converter.hasAltOp() && !isAltShuffle())
1296 return nullptr;
1297 return Converter.hasAltOp() ? AltOp : MainOp;
1298 }
1299
1300 /// Checks if main/alt instructions are shift operations.
1301 bool isShiftOp() const {
1302 return getMainOp()->isShift() && getAltOp()->isShift();
1303 }
1304
1305 /// Checks if main/alt instructions are bitwise logic operations.
1306 bool isBitwiseLogicOp() const {
1307 return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
1308 }
1309
1310 /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations.
1311 bool isMulDivLikeOp() const {
1312 constexpr std::array<unsigned, 8> MulDiv = {
1313 Instruction::Mul, Instruction::FMul, Instruction::SDiv,
1314 Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
1315 Instruction::URem, Instruction::FRem};
1316 return is_contained(MulDiv, getOpcode()) &&
1317 is_contained(MulDiv, getAltOpcode());
1318 }
1319
1320 /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
1321 bool isAddSubLikeOp() const {
1322 constexpr std::array<unsigned, 4> AddSub = {
1323 Instruction::Add, Instruction::Sub, Instruction::FAdd,
1324 Instruction::FSub};
1325 return is_contained(AddSub, getOpcode()) &&
1326 is_contained(AddSub, getAltOpcode());
1327 }
1328
1329 /// Checks if main/alt instructions are cmp operations.
1330 bool isCmpOp() const {
1331 return (getOpcode() == Instruction::ICmp ||
1332 getOpcode() == Instruction::FCmp) &&
1333 getAltOpcode() == getOpcode();
1334 }
1335
1336 /// Checks if the current state is valid, i.e. has non-null MainOp and AltOp.
1337 bool valid() const { return MainOp && AltOp; }
1338
1339 explicit operator bool() const { return valid(); }
1340
1341 InstructionsState() = delete;
1342 InstructionsState(Instruction *MainOp, Instruction *AltOp,
1343 bool HasCopyables = false)
1344 : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
1345 static InstructionsState invalid() { return {nullptr, nullptr}; }
1346
1347 /// Checks if the value is a copyable element.
1348 bool isCopyableElement(Value *V) const {
1349 assert(valid() && "InstructionsState is invalid.");
1350 if (!HasCopyables)
1351 return false;
1352 if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
1353 return false;
1354 auto *I = dyn_cast<Instruction>(V);
1355 if (!I)
1356 return !isa<PoisonValue>(V);
1357 if (I->getParent() != MainOp->getParent() &&
1360 return true;
1361 if (I->getOpcode() == MainOp->getOpcode())
1362 return false;
1363 if (!I->isBinaryOp())
1364 return true;
1365 BinOpSameOpcodeHelper Converter(MainOp);
1366 return !Converter.add(I) || !Converter.add(MainOp) ||
1367 Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode());
1368 }
1369
1370 /// Checks if the value is non-schedulable.
1371 bool isNonSchedulable(Value *V) const {
1372 assert(valid() && "InstructionsState is invalid.");
1373 auto *I = dyn_cast<Instruction>(V);
1374 if (!HasCopyables)
1375 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1377 // MainOp for copyables is always schedulable to correctly identify
1378 // non-schedulable copyables.
1379 if (getMainOp() == V)
1380 return false;
1381 if (isCopyableElement(V)) {
1382 auto IsNonSchedulableCopyableElement = [this](Value *V) {
1383 auto *I = dyn_cast<Instruction>(V);
1384 return !I || isa<PHINode>(I) || I->getParent() != MainOp->getParent() ||
1386 // If the copyable instruction comes after MainOp
1387 // (non-schedulable, but used in the block) - cannot vectorize
1388 // it, will possibly generate use before def.
1389 !MainOp->comesBefore(I));
1390 };
1391
1392 return IsNonSchedulableCopyableElement(V);
1393 }
1394 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1396 }
1397
1398 /// Checks if the state represents copyable instructions.
1399 bool areInstructionsWithCopyableElements() const {
1400 assert(valid() && "InstructionsState is invalid.");
1401 return HasCopyables;
1402 }
1403};
1404
1405std::pair<Instruction *, SmallVector<Value *>>
1406convertTo(Instruction *I, const InstructionsState &S) {
1407 Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
1408 assert(SelectedOp && "Cannot convert the instruction.");
1409 if (I->isBinaryOp()) {
1410 BinOpSameOpcodeHelper Converter(I);
1411 return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
1412 }
1413 return std::make_pair(SelectedOp, SmallVector<Value *>(I->operands()));
1414}
1415
1416} // end anonymous namespace
1417
1418static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1419 const TargetLibraryInfo &TLI);
1420
1421/// Find an instruction with a specific opcode in VL.
1422/// \param VL Array of values to search through. Must contain only Instructions
1423/// and PoisonValues.
1424/// \param Opcode The instruction opcode to search for
1425/// \returns
1426/// - The first instruction found with matching opcode
1427/// - nullptr if no matching instruction is found
1429 unsigned Opcode) {
1430 for (Value *V : VL) {
1431 if (isa<PoisonValue>(V))
1432 continue;
1433 assert(isa<Instruction>(V) && "Only accepts PoisonValue and Instruction.");
1434 auto *Inst = cast<Instruction>(V);
1435 if (Inst->getOpcode() == Opcode)
1436 return Inst;
1437 }
1438 return nullptr;
1439}
1440
1441/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
1442/// compatible instructions or constants, or just some other regular values.
1443static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
1444 Value *Op1, const TargetLibraryInfo &TLI) {
1445 return (isConstant(BaseOp0) && isConstant(Op0)) ||
1446 (isConstant(BaseOp1) && isConstant(Op1)) ||
1447 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
1448 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
1449 BaseOp0 == Op0 || BaseOp1 == Op1 ||
1450 getSameOpcode({BaseOp0, Op0}, TLI) ||
1451 getSameOpcode({BaseOp1, Op1}, TLI);
1452}
1453
1454/// \returns true if a compare instruction \p CI has similar "look" and
1455/// same predicate as \p BaseCI, "as is" or with its operands and predicate
1456/// swapped, false otherwise.
1457static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
1458 const TargetLibraryInfo &TLI) {
1459 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
1460 "Assessing comparisons of different types?");
1461 CmpInst::Predicate BasePred = BaseCI->getPredicate();
1462 CmpInst::Predicate Pred = CI->getPredicate();
1464
1465 Value *BaseOp0 = BaseCI->getOperand(0);
1466 Value *BaseOp1 = BaseCI->getOperand(1);
1467 Value *Op0 = CI->getOperand(0);
1468 Value *Op1 = CI->getOperand(1);
1469
1470 return (BasePred == Pred &&
1471 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
1472 (BasePred == SwappedPred &&
1473 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
1474}
1475
1476/// \returns analysis of the Instructions in \p VL described in
1477/// InstructionsState, i.e. the Opcode with which we suppose the whole list
1478/// could be vectorized even if its structure is diverse.
1479static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1480 const TargetLibraryInfo &TLI) {
1481 // Make sure these are all Instructions.
1483 return InstructionsState::invalid();
1484
1485 auto *It = find_if(VL, IsaPred<Instruction>);
1486 if (It == VL.end())
1487 return InstructionsState::invalid();
1488
1489 Instruction *MainOp = cast<Instruction>(*It);
1490 unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
1491 if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
1492 (VL.size() == 2 && InstCnt < 2))
1493 return InstructionsState::invalid();
1494
1495 bool IsCastOp = isa<CastInst>(MainOp);
1496 bool IsBinOp = isa<BinaryOperator>(MainOp);
1497 bool IsCmpOp = isa<CmpInst>(MainOp);
1498 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
1500 Instruction *AltOp = MainOp;
1501 unsigned Opcode = MainOp->getOpcode();
1502 unsigned AltOpcode = Opcode;
1503
1504 BinOpSameOpcodeHelper BinOpHelper(MainOp);
1505 bool SwappedPredsCompatible = IsCmpOp && [&]() {
1506 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
1507 UniquePreds.insert(BasePred);
1508 UniqueNonSwappedPreds.insert(BasePred);
1509 for (Value *V : VL) {
1510 auto *I = dyn_cast<CmpInst>(V);
1511 if (!I)
1512 return false;
1513 CmpInst::Predicate CurrentPred = I->getPredicate();
1514 CmpInst::Predicate SwappedCurrentPred =
1515 CmpInst::getSwappedPredicate(CurrentPred);
1516 UniqueNonSwappedPreds.insert(CurrentPred);
1517 if (!UniquePreds.contains(CurrentPred) &&
1518 !UniquePreds.contains(SwappedCurrentPred))
1519 UniquePreds.insert(CurrentPred);
1520 }
1521 // If the total number of predicates is > 2, but only 2 remain once swapped
1522 // predicates are treated as compatible, consider swappable predicates as
1523 // compatible opcodes, not alternate.
1524 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
1525 }();
1526 // Check for one alternate opcode from another BinaryOperator.
1527 // TODO - generalize to support all operators (types, calls etc.).
1528 Intrinsic::ID BaseID = 0;
1529 SmallVector<VFInfo> BaseMappings;
1530 if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
1531 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
1532 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
1533 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
1534 return InstructionsState::invalid();
1535 }
1536 bool AnyPoison = InstCnt != VL.size();
1537 // Check MainOp too to be sure that it matches the requirements for the
1538 // instructions.
1539 for (Value *V : iterator_range(It, VL.end())) {
1540 auto *I = dyn_cast<Instruction>(V);
1541 if (!I)
1542 continue;
1543
1544 // Cannot combine poison and divisions.
1545 // TODO: do some smart analysis of the CallInsts to exclude divide-like
1546 // intrinsics/functions only.
1547 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
1548 return InstructionsState::invalid();
1549 unsigned InstOpcode = I->getOpcode();
1550 if (IsBinOp && isa<BinaryOperator>(I)) {
1551 if (BinOpHelper.add(I))
1552 continue;
1553 } else if (IsCastOp && isa<CastInst>(I)) {
1554 Value *Op0 = MainOp->getOperand(0);
1555 Type *Ty0 = Op0->getType();
1556 Value *Op1 = I->getOperand(0);
1557 Type *Ty1 = Op1->getType();
1558 if (Ty0 == Ty1) {
1559 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
1560 continue;
1561 if (Opcode == AltOpcode) {
1562 assert(isValidForAlternation(Opcode) &&
1563 isValidForAlternation(InstOpcode) &&
1564 "Cast isn't safe for alternation, logic needs to be updated!");
1565 AltOpcode = InstOpcode;
1566 AltOp = I;
1567 continue;
1568 }
1569 }
1570 } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
1571 auto *BaseInst = cast<CmpInst>(MainOp);
1572 Type *Ty0 = BaseInst->getOperand(0)->getType();
1573 Type *Ty1 = Inst->getOperand(0)->getType();
1574 if (Ty0 == Ty1) {
1575 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1576 assert(InstOpcode == AltOpcode &&
1577 "Alternate instructions are only supported by BinaryOperator "
1578 "and CastInst.");
1579 // Check for compatible operands. If the corresponding operands are not
1580 // compatible - need to perform alternate vectorization.
1581 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1582 CmpInst::Predicate SwappedCurrentPred =
1583 CmpInst::getSwappedPredicate(CurrentPred);
1584
1585 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1586 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1587 continue;
1588
1589 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1590 continue;
1591 auto *AltInst = cast<CmpInst>(AltOp);
1592 if (MainOp != AltOp) {
1593 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1594 continue;
1595 } else if (BasePred != CurrentPred) {
1596 assert(
1597 isValidForAlternation(InstOpcode) &&
1598 "CmpInst isn't safe for alternation, logic needs to be updated!");
1599 AltOp = I;
1600 continue;
1601 }
1602 CmpInst::Predicate AltPred = AltInst->getPredicate();
1603 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1604 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1605 continue;
1606 }
1607 } else if (InstOpcode == Opcode) {
1608 assert(InstOpcode == AltOpcode &&
1609 "Alternate instructions are only supported by BinaryOperator and "
1610 "CastInst.");
1611 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1612 if (Gep->getNumOperands() != 2 ||
1613 Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
1614 return InstructionsState::invalid();
1615 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1617 return InstructionsState::invalid();
1618 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
1619 auto *BaseLI = cast<LoadInst>(MainOp);
1620 if (!LI->isSimple() || !BaseLI->isSimple())
1621 return InstructionsState::invalid();
1622 } else if (auto *Call = dyn_cast<CallInst>(I)) {
1623 auto *CallBase = cast<CallInst>(MainOp);
1624 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1625 return InstructionsState::invalid();
1626 if (Call->hasOperandBundles() &&
1627 (!CallBase->hasOperandBundles() ||
1628 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1629 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1630 CallBase->op_begin() +
1631 CallBase->getBundleOperandsStartIndex())))
1632 return InstructionsState::invalid();
1633 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
1634 if (ID != BaseID)
1635 return InstructionsState::invalid();
1636 if (!ID) {
1637 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
1638 if (Mappings.size() != BaseMappings.size() ||
1639 Mappings.front().ISA != BaseMappings.front().ISA ||
1640 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1641 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1642 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1643 Mappings.front().Shape.Parameters !=
1644 BaseMappings.front().Shape.Parameters)
1645 return InstructionsState::invalid();
1646 }
1647 }
1648 continue;
1649 }
1650 return InstructionsState::invalid();
1651 }
1652
1653 if (IsBinOp) {
1654 MainOp = findInstructionWithOpcode(VL, BinOpHelper.getMainOpcode());
1655 assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
1656 AltOp = findInstructionWithOpcode(VL, BinOpHelper.getAltOpcode());
1657 assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
1658 }
1659 assert((MainOp == AltOp || !allSameOpcode(VL)) &&
1660 "Incorrect implementation of allSameOpcode.");
1661 InstructionsState S(MainOp, AltOp);
1662 assert(all_of(VL,
1663 [&](Value *V) {
1664 return isa<PoisonValue>(V) ||
1665 S.getMatchingMainOpOrAltOp(cast<Instruction>(V));
1666 }) &&
1667 "Invalid InstructionsState.");
1668 return S;
1669}
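// Illustrative note (a sketch, not part of the original file): for a list such
// as VL = {a + b, c - d, e + f, g - h}, the returned InstructionsState pairs an
// Add MainOp with a Sub AltOp, i.e. an alternate-shuffle group.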
1670
1671/// \returns true if all of the values in \p VL have the same type or false
1672/// otherwise.
1673 static bool allSameType(ArrayRef<Value *> VL) {
1674 Type *Ty = VL.consume_front()->getType();
1675 return all_of(VL, [&](Value *V) { return V->getType() == Ty; });
1676}
1677
1678/// \returns True if in-tree use also needs extract. This refers to
1679/// possible scalar operand in vectorized instruction.
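/// For illustration (hypothetical IR): if %p is part of a vectorized tree but a
/// user `store i32 %x, ptr %p` keeps %p as its scalar pointer operand, the
/// value must be re-extracted from the vector.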
1680static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1681 TargetLibraryInfo *TLI,
1682 const TargetTransformInfo *TTI) {
1683 if (!UserInst)
1684 return false;
1685 unsigned Opcode = UserInst->getOpcode();
1686 switch (Opcode) {
1687 case Instruction::Load: {
1688 LoadInst *LI = cast<LoadInst>(UserInst);
1689 return (LI->getPointerOperand() == Scalar);
1690 }
1691 case Instruction::Store: {
1692 StoreInst *SI = cast<StoreInst>(UserInst);
1693 return (SI->getPointerOperand() == Scalar);
1694 }
1695 case Instruction::Call: {
1696 CallInst *CI = cast<CallInst>(UserInst);
1697 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1698 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1699 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1700 Arg.value().get() == Scalar;
1701 });
1702 }
1703 default:
1704 return false;
1705 }
1706}
1707
1708 /// \returns the AA location that is being accessed by the instruction.
1709 static MemoryLocation getLocation(Instruction *I) {
1710 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1711 return MemoryLocation::get(SI);
1712 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1713 return MemoryLocation::get(LI);
1714 return MemoryLocation();
1715}
1716
1717/// \returns True if the instruction is not a volatile or atomic load/store.
1718static bool isSimple(Instruction *I) {
1719 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1720 return LI->isSimple();
1721 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1722 return SI->isSimple();
1723 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1724 return !MI->isVolatile();
1725 return true;
1726}
1727
1728/// Shuffles \p Mask in accordance with the given \p SubMask.
1729/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1730/// one but two input vectors.
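/// A worked illustration of the composition computed below
/// (Result[I] == Mask[SubMask[I]]):
/// \verbatim
///   Mask:    1 0 3 2
///   SubMask: 2 3 0 1
///   Result:  3 2 1 0
/// \endverbatim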
1731static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1732 bool ExtendingManyInputs = false) {
1733 if (SubMask.empty())
1734 return;
1735 assert(
1736 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1737 // Check if input scalars were extended to match the size of other node.
1738 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1739 "SubMask with many inputs support must be larger than the mask.");
1740 if (Mask.empty()) {
1741 Mask.append(SubMask.begin(), SubMask.end());
1742 return;
1743 }
1744 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1745 int TermValue = std::min(Mask.size(), SubMask.size());
1746 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1747 if (SubMask[I] == PoisonMaskElem ||
1748 (!ExtendingManyInputs &&
1749 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1750 continue;
1751 NewMask[I] = Mask[SubMask[I]];
1752 }
1753 Mask.swap(NewMask);
1754}
1755
1756 /// Order may have elements assigned the special value (size), which is out of
1757 /// bounds. Such indices only appear at positions that correspond to undef
1758 /// values (see canReuseExtract for details) and are used to keep undef values
1759 /// from affecting the operand ordering.
1760/// The first loop below simply finds all unused indices and then the next loop
1761/// nest assigns these indices for undef values positions.
1762/// As an example below Order has two undef positions and they have assigned
1763/// values 3 and 7 respectively:
1764/// before: 6 9 5 4 9 2 1 0
1765/// after: 6 3 5 4 7 2 1 0
1766 static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1767 const size_t Sz = Order.size();
1768 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1769 SmallBitVector MaskedIndices(Sz);
1770 for (unsigned I = 0; I < Sz; ++I) {
1771 if (Order[I] < Sz)
1772 UnusedIndices.reset(Order[I]);
1773 else
1774 MaskedIndices.set(I);
1775 }
1776 if (MaskedIndices.none())
1777 return;
1778 assert(UnusedIndices.count() == MaskedIndices.count() &&
1779 "Non-synced masked/available indices.");
1780 int Idx = UnusedIndices.find_first();
1781 int MIdx = MaskedIndices.find_first();
1782 while (MIdx >= 0) {
1783 assert(Idx >= 0 && "Indices must be synced.");
1784 Order[MIdx] = Idx;
1785 Idx = UnusedIndices.find_next(Idx);
1786 MIdx = MaskedIndices.find_next(MIdx);
1787 }
1788}
1789
1790/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1791/// Opcode1.
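/// Illustration (assuming a scalar element type, i.e. one mask bit per lane):
/// \verbatim
///   VL = {add, sub, add, sub}, Opcode0 = Add, Opcode1 = Sub
///   -> bits 1 and 3 of the returned bitset are set (the Sub lanes)
/// \endverbatim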
1792 static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, Type *ScalarTy,
1793 unsigned Opcode0, unsigned Opcode1) {
1794 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1795 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1796 for (unsigned Lane : seq<unsigned>(VL.size())) {
1797 if (isa<PoisonValue>(VL[Lane]))
1798 continue;
1799 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1800 OpcodeMask.set(Lane * ScalarTyNumElements,
1801 Lane * ScalarTyNumElements + ScalarTyNumElements);
1802 }
1803 return OpcodeMask;
1804}
1805
1806/// Replicates the given \p Val \p VF times.
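/// Illustration of the element-wise replication performed below:
/// \verbatim
///   Val = {C0, C1}, VF = 2  ->  {C0, C0, C1, C1}
/// \endverbatim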
1808 unsigned VF) {
1809 assert(none_of(Val, [](Constant *C) { return C->getType()->isVectorTy(); }) &&
1810 "Expected scalar constants.");
1811 SmallVector<Constant *> NewVal(Val.size() * VF);
1812 for (auto [I, V] : enumerate(Val))
1813 std::fill_n(NewVal.begin() + I * VF, VF, V);
1814 return NewVal;
1815}
1816
1817namespace llvm {
1818
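// Illustration of the inversion computed below (Mask[Indices[I]] = I):
//   Indices = {2, 0, 1}  ->  Mask = {1, 2, 0}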
1819 void inversePermutation(ArrayRef<unsigned> Indices,
1820 SmallVectorImpl<int> &Mask) {
1821 Mask.clear();
1822 const unsigned E = Indices.size();
1823 Mask.resize(E, PoisonMaskElem);
1824 for (unsigned I = 0; I < E; ++I)
1825 Mask[Indices[I]] = I;
1826}
1827
1828/// Reorders the list of scalars in accordance with the given \p Mask.
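/// Illustration (Scalars[Mask[I]] = Prev[I], per the loop below):
/// \verbatim
///   Scalars = {a, b, c}, Mask = {2, 0, 1}  ->  Scalars = {b, c, a}
/// \endverbatim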
1829 void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1830 ArrayRef<int> Mask) {
1831 assert(!Mask.empty() && "Expected non-empty mask.");
1832 SmallVector<Value *> Prev(Scalars.size(),
1833 PoisonValue::get(Scalars.front()->getType()));
1834 Prev.swap(Scalars);
1835 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1836 if (Mask[I] != PoisonMaskElem)
1837 Scalars[Mask[I]] = Prev[I];
1838}
1839
1840/// Checks if the provided value does not require scheduling. It does not
1841/// require scheduling if this is not an instruction or it is an instruction
1842/// that does not read/write memory and all operands are either not instructions
1843/// or phi nodes or instructions from different blocks.
1844 static bool areAllOperandsNonInsts(Value *V) {
1845 auto *I = dyn_cast<Instruction>(V);
1846 if (!I)
1847 return true;
1848 return !mayHaveNonDefUseDependency(*I) &&
1849 all_of(I->operands(), [I](Value *V) {
1850 auto *IO = dyn_cast<Instruction>(V);
1851 if (!IO)
1852 return true;
1853 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1854 });
1855}
1856
1857/// Checks if the provided value does not require scheduling. It does not
1858/// require scheduling if this is not an instruction or it is an instruction
1859/// that does not read/write memory and all users are phi nodes or instructions
1860/// from the different blocks.
1861static bool isUsedOutsideBlock(Value *V) {
1862 auto *I = dyn_cast<Instruction>(V);
1863 if (!I)
1864 return true;
1865 // Limits the number of uses to save compile time.
1866 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1867 all_of(I->users(), [I](User *U) {
1868 auto *IU = dyn_cast<Instruction>(U);
1869 if (!IU)
1870 return true;
1871 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1872 });
1873}
1874
1875/// Checks if the specified value does not require scheduling. It does not
1876/// require scheduling if all operands and all users do not need to be scheduled
1877/// in the current basic block.
1878 static bool doesNotNeedToBeScheduled(Value *V) {
1879 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1880 }
1881
1882/// Checks if the specified array of instructions does not require scheduling.
1883 /// It is so if either all instructions have operands that do not require
1884 /// scheduling, or all their users do not require scheduling because they are
1885 /// phis or reside in other basic blocks.
1886 static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1887 return !VL.empty() &&
1888 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1889 }
1890
1891/// Returns true if widened type of \p Ty elements with size \p Sz represents
1892/// full vector type, i.e. adding extra element results in extra parts upon type
1893/// legalization.
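/// Illustration (assuming a target with 128-bit vector registers, so the result
/// depends on what TTI reports): for i32 elements, Sz == 8 is a power of two and
/// is accepted directly; Sz == 12 widens to 3 full parts of 4 elements each and
/// is also accepted; Sz == 6 is rejected.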
1894 static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
1895 unsigned Sz) {
1896 if (Sz <= 1)
1897 return false;
1898 if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
1899 return false;
1900 if (has_single_bit(Sz))
1901 return true;
1902 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1903 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1904 Sz % NumParts == 0;
1905}
1906
1907 /// Returns the number of parts the type \p VecTy will be split into at the
1908 /// codegen phase. If the type is going to be scalarized or does not use whole
1909 /// registers, returns 1.
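/// Illustration (assuming a target with 128-bit vector registers and TTI
/// reporting parts accordingly): an <8 x i32> splits into 2 full parts, so 2 is
/// returned; a <3 x i32> does not split into whole power-of-two parts, so 1 is
/// returned.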
1910static unsigned
1911 getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
1912 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1913 unsigned NumParts = TTI.getNumberOfParts(VecTy);
1914 if (NumParts == 0 || NumParts >= Limit)
1915 return 1;
1916 unsigned Sz = getNumElements(VecTy);
1917 if (NumParts >= Sz || Sz % NumParts != 0 ||
1918 !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
1919 return 1;
1920 return NumParts;
1921}
1922
1923namespace slpvectorizer {
1924
1925/// Bottom Up SLP Vectorizer.
1926class BoUpSLP {
1927 class TreeEntry;
1928 class ScheduleEntity;
1929 class ScheduleData;
1930 class ScheduleCopyableData;
1931 class ScheduleBundle;
1934
1935 /// If we decide to generate strided load / store, this struct contains all
1936 /// the necessary info. Its fields are calculated by analyzeRtStrideCandidate
1937 /// and analyzeConstantStrideCandidate. Note that Stride can be given either
1938 /// as a SCEV or as a Value if it already exists. To get the stride in bytes,
1939 /// StrideVal (or the value obtained from StrideSCEV) has to be multiplied by
1940 /// the element size of the FixedVectorType.
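/// For example (illustrative): with Ty == <4 x i32> and StrideVal == 2, the
/// effective stride between consecutive lanes is 2 * 4 = 8 bytes.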
1941 struct StridedPtrInfo {
1942 Value *StrideVal = nullptr;
1943 const SCEV *StrideSCEV = nullptr;
1944 FixedVectorType *Ty = nullptr;
1945 };
1946 SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
1947
1948public:
1949 /// Tracks the state we can represent the loads in the given sequence.
1957
1964
1966 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1968 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1969 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1970 AC(AC), DB(DB), DL(DL), ORE(ORE),
1971 Builder(Se->getContext(), TargetFolder(*DL)) {
1972 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1973 // Use the vector register size specified by the target unless overridden
1974 // by a command-line option.
1975 // TODO: It would be better to limit the vectorization factor based on
1976 // data type rather than just register size. For example, x86 AVX has
1977 // 256-bit registers, but it does not support integer operations
1978 // at that width (that requires AVX2).
1979 if (MaxVectorRegSizeOption.getNumOccurrences())
1980 MaxVecRegSize = MaxVectorRegSizeOption;
1981 else
1982 MaxVecRegSize =
1983 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1984 .getFixedValue();
1985
1986 if (MinVectorRegSizeOption.getNumOccurrences())
1987 MinVecRegSize = MinVectorRegSizeOption;
1988 else
1989 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1990 }
1991
1992 /// Vectorize the tree that starts with the elements in \p VL.
1993 /// Returns the vectorized root.
1995
1996 /// Vectorize the tree but with the list of externally used values \p
1997 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1998 /// generated extractvalue instructions.
2000 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
2001 Instruction *ReductionRoot = nullptr,
2002 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
2003
2004 /// \returns the cost incurred by unwanted spills and fills, caused by
2005 /// holding live values over call sites.
2007
2008 /// \returns the vectorization cost of the subtree that starts at \p VL.
2009 /// A negative number means that this is profitable.
2010 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {},
2011 InstructionCost ReductionCost = TTI::TCC_Free);
2012
2013 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
2014 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
2015 void buildTree(ArrayRef<Value *> Roots,
2016 const SmallDenseSet<Value *> &UserIgnoreLst);
2017
2018 /// Construct a vectorizable tree that starts at \p Roots.
2019 void buildTree(ArrayRef<Value *> Roots);
2020
2021 /// Return the scalars of the root node.
2023 assert(!VectorizableTree.empty() && "No graph to get the first node from");
2024 return VectorizableTree.front()->Scalars;
2025 }
2026
2027 /// Returns the type/is-signed info for the root node in the graph without
2028 /// casting.
2029 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
2030 const TreeEntry &Root = *VectorizableTree.front();
2031 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
2032 !Root.Scalars.front()->getType()->isIntegerTy())
2033 return std::nullopt;
2034 auto It = MinBWs.find(&Root);
2035 if (It != MinBWs.end())
2036 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
2037 It->second.first),
2038 It->second.second);
2039 if (Root.getOpcode() == Instruction::ZExt ||
2040 Root.getOpcode() == Instruction::SExt)
2041 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
2042 Root.getOpcode() == Instruction::SExt);
2043 return std::nullopt;
2044 }
2045
2046 /// Checks if the root graph node can be emitted with narrower bitwidth at
2047 /// codegen and returns it signedness, if so.
2049 return MinBWs.at(VectorizableTree.front().get()).second;
2050 }
2051
2052 /// Returns reduction type after minbitdth analysis.
2054 if (ReductionBitWidth == 0 ||
2055 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
2056 ReductionBitWidth >=
2057 DL->getTypeSizeInBits(
2058 VectorizableTree.front()->Scalars.front()->getType()))
2059 return getWidenedType(
2060 VectorizableTree.front()->Scalars.front()->getType(),
2061 VectorizableTree.front()->getVectorFactor());
2062 return getWidenedType(
2064 VectorizableTree.front()->Scalars.front()->getContext(),
2065 ReductionBitWidth),
2066 VectorizableTree.front()->getVectorFactor());
2067 }
2068
2069 /// Builds external uses of the vectorized scalars, i.e. the list of
2070 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
2071 /// ExternallyUsedValues contains additional list of external uses to handle
2072 /// vectorization of reductions.
2073 void
2074 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
2075
2076 /// Transforms graph nodes to target specific representations, if profitable.
2077 void transformNodes();
2078
2079 /// Clear the internal data structures that are created by 'buildTree'.
2080 void deleteTree() {
2081 VectorizableTree.clear();
2082 ScalarToTreeEntries.clear();
2083 OperandsToTreeEntry.clear();
2084 ScalarsInSplitNodes.clear();
2085 MustGather.clear();
2086 NonScheduledFirst.clear();
2087 EntryToLastInstruction.clear();
2088 LoadEntriesToVectorize.clear();
2089 IsGraphTransformMode = false;
2090 GatheredLoadsEntriesFirst.reset();
2091 CompressEntryToData.clear();
2092 ExternalUses.clear();
2093 ExternalUsesAsOriginalScalar.clear();
2094 ExternalUsesWithNonUsers.clear();
2095 for (auto &Iter : BlocksSchedules) {
2096 BlockScheduling *BS = Iter.second.get();
2097 BS->clear();
2098 }
2099 MinBWs.clear();
2100 ReductionBitWidth = 0;
2101 BaseGraphSize = 1;
2102 CastMaxMinBWSizes.reset();
2103 ExtraBitWidthNodes.clear();
2104 InstrElementSize.clear();
2105 UserIgnoreList = nullptr;
2106 PostponedGathers.clear();
2107 ValueToGatherNodes.clear();
2108 TreeEntryToStridedPtrInfoMap.clear();
2109 }
2110
2111 unsigned getTreeSize() const { return VectorizableTree.size(); }
2112
2113 /// Returns the base graph size, before any transformations.
2114 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
2115
2116 /// Perform LICM and CSE on the newly generated gather sequences.
2118
2119 /// Does this non-empty order represent an identity order? Identity
2120 /// should be represented as an empty order, so this is used to
2121 /// decide if we can canonicalize a computed order. Undef elements
2122 /// (represented as size) are ignored.
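/// Illustration: for Sz == 4, {0, 1, 2, 3} and {0, 4, 2, 4} are both treated as
/// identity orders (elements equal to Sz are ignored), while {1, 0, 2, 3} is
/// not.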
2123 static bool isIdentityOrder(ArrayRef<unsigned> Order) {
2124 assert(!Order.empty() && "expected non-empty order");
2125 const unsigned Sz = Order.size();
2126 return all_of(enumerate(Order), [&](const auto &P) {
2127 return P.value() == P.index() || P.value() == Sz;
2128 });
2129 }
2130
2131 /// Checks if the specified gather tree entry \p TE can be represented as a
2132 /// shuffled vector entry + (possibly) permutation with other gathers. It
2133 /// implements the checks only for possibly ordered scalars (Loads,
2134 /// ExtractElement, ExtractValue), which can be part of the graph.
2135 /// \param TopToBottom If true, used for the whole tree rotation, false - for
2136 /// sub-tree rotations. \param IgnoreReorder true, if the order of the root
2137 /// node might be ignored.
2138 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
2139 bool TopToBottom,
2140 bool IgnoreReorder);
2141
2142 /// Sort loads into increasing pointers offsets to allow greater clustering.
2143 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
2144
2145 /// Gets reordering data for the given tree entry. If the entry is vectorized
2146 /// - just return ReorderIndices, otherwise check if the scalars can be
2147 /// reordered and return the most optimal order.
2148 /// \return std::nullopt if ordering is not important, empty order, if
2149 /// identity order is important, or the actual order.
2150 /// \param TopToBottom If true, include the order of vectorized stores and
2151 /// insertelement nodes, otherwise skip them.
2152 /// \param IgnoreReorder true, if the root node order can be ignored.
2153 std::optional<OrdersType>
2154 getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
2155
2156 /// Checks if it is profitable to reorder the current tree.
2157 /// If the tree does not contain many profitable reordable nodes, better to
2158 /// skip it to save compile time.
2159 bool isProfitableToReorder() const;
2160
2161 /// Reorders the current graph to the most profitable order starting from the
2162 /// root node to the leaf nodes. The best order is chosen only from the nodes
2163 /// of the same size (vectorization factor). Smaller nodes are considered
2164 /// parts of subgraph with smaller VF and they are reordered independently. We
2165 /// can make it because we still need to extend smaller nodes to the wider VF
2166 /// and we can merge reordering shuffles with the widening shuffles.
2167 void reorderTopToBottom();
2168
2169 /// Reorders the current graph to the most profitable order starting from
2170 /// leaves to the root. It allows to rotate small subgraphs and reduce the
2171 /// number of reshuffles if the leaf nodes use the same order. In this case we
2172 /// can merge the orders and just shuffle user node instead of shuffling its
2173 /// operands. Plus, even the leaf nodes have different orders, it allows to
2174 /// sink reordering in the graph closer to the root node and merge it later
2175 /// during analysis.
2176 void reorderBottomToTop(bool IgnoreReorder = false);
2177
2178 /// \return The vector element size in bits to use when vectorizing the
2179 /// expression tree ending at \p V. If V is a store, the size is the width of
2180 /// the stored value. Otherwise, the size is the width of the largest loaded
2181 /// value reaching V. This method is used by the vectorizer to calculate
2182 /// vectorization factors.
2183 unsigned getVectorElementSize(Value *V);
2184
2185 /// Compute the minimum type sizes required to represent the entries in a
2186 /// vectorizable tree.
2188
2189 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
2190 unsigned getMaxVecRegSize() const {
2191 return MaxVecRegSize;
2192 }
2193
2194 // \returns minimum vector register size as set by cl::opt.
2195 unsigned getMinVecRegSize() const {
2196 return MinVecRegSize;
2197 }
2198
2199 unsigned getMinVF(unsigned Sz) const {
2200 return std::max(2U, getMinVecRegSize() / Sz);
2201 }
2202
2203 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2204 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
2205 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2206 return MaxVF ? MaxVF : UINT_MAX;
2207 }
2208
2209 /// Check if homogeneous aggregate is isomorphic to some VectorType.
2210 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
2211 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
2212 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
2213 ///
2214 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
2215 unsigned canMapToVector(Type *T) const;
2216
2217 /// \returns True if the VectorizableTree is both tiny and not fully
2218 /// vectorizable. We do not vectorize such trees.
2219 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
2220
2221 /// Checks if the graph and all its subgraphs cannot be better vectorized.
2222 /// It may happen, if all gather nodes are loads and they cannot be
2223 /// "clusterized". In this case even subgraphs cannot be vectorized more
2224 /// effectively than the base graph.
2225 bool isTreeNotExtendable() const;
2226
2227 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
2228 /// can be load combined in the backend. Load combining may not be allowed in
2229 /// the IR optimizer, so we do not want to alter the pattern. For example,
2230 /// partially transforming a scalar bswap() pattern into vector code is
2231 /// effectively impossible for the backend to undo.
2232 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2233 /// may not be necessary.
2234 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
2235
2236 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
2237 /// can be load combined in the backend. Load combining may not be allowed in
2238 /// the IR optimizer, so we do not want to alter the pattern. For example,
2239 /// partially transforming a scalar bswap() pattern into vector code is
2240 /// effectively impossible for the backend to undo.
2241 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2242 /// may not be necessary.
2243 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
2244 bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2245 Align Alignment, const int64_t Diff, Value *Ptr0,
2246 Value *PtrN, StridedPtrInfo &SPtrInfo) const;
2247
2248 /// Checks if the given array of loads can be represented as a vectorized,
2249 /// scatter or just simple gather.
2250 /// \param VL list of loads.
2251 /// \param VL0 main load value.
2252 /// \param Order returned order of load instructions.
2253 /// \param PointerOps returned list of pointer operands.
2254 /// \param BestVF return best vector factor, if recursive check found better
2255 /// vectorization sequences rather than masked gather.
2256 /// \param TryRecursiveCheck used to check if long masked gather can be
2257 /// represented as a series of loads/insert-subvector, if profitable.
2260 SmallVectorImpl<Value *> &PointerOps,
2261 StridedPtrInfo &SPtrInfo,
2262 unsigned *BestVF = nullptr,
2263 bool TryRecursiveCheck = true) const;
2264
2265 /// Registers non-vectorizable sequence of loads
2266 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
2267 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
2268 }
2269
2270 /// Checks if the given loads sequence is known as not vectorizable
2271 template <typename T>
2273 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
2274 }
2275
2277
2278 /// This structure holds any data we need about the edges being traversed
2279 /// during buildTreeRec(). We keep track of:
2280 /// (i) the user TreeEntry index, and
2281 /// (ii) the index of the edge.
2282 struct EdgeInfo {
2283 EdgeInfo() = default;
2284 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
2285 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
2286 /// The user TreeEntry.
2287 TreeEntry *UserTE = nullptr;
2288 /// The operand index of the use.
2289 unsigned EdgeIdx = UINT_MAX;
2290#ifndef NDEBUG
2291 friend inline raw_ostream &operator<<(raw_ostream &OS,
2292 const BoUpSLP::EdgeInfo &EI) {
2293 EI.dump(OS);
2294 return OS;
2295 }
2296 /// Debug print.
2297 void dump(raw_ostream &OS) const {
2298 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
2299 << " EdgeIdx:" << EdgeIdx << "}";
2300 }
2301 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
2302#endif
2303 bool operator == (const EdgeInfo &Other) const {
2304 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
2305 }
2306
2307 operator bool() const { return UserTE != nullptr; }
2308 };
2309 friend struct DenseMapInfo<EdgeInfo>;
2310
2311 /// A helper class used for scoring candidates for two consecutive lanes.
2312 class LookAheadHeuristics {
2313 const TargetLibraryInfo &TLI;
2314 const DataLayout &DL;
2315 ScalarEvolution &SE;
2316 const BoUpSLP &R;
2317 int NumLanes; // Total number of lanes (aka vectorization factor).
2318 int MaxLevel; // The maximum recursion depth for accumulating score.
2319
2320 public:
2321 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
2322 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
2323 int MaxLevel)
2324 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
2325 MaxLevel(MaxLevel) {}
2326
2327 // The hard-coded scores listed here are not very important, though it shall
2328 // be higher for better matches to improve the resulting cost. When
2329 // computing the scores of matching one sub-tree with another, we are
2330 // basically counting the number of values that are matching. So even if all
2331 // scores are set to 1, we would still get a decent matching result.
2332 // However, sometimes we have to break ties. For example we may have to
2333 // choose between matching loads vs matching opcodes. This is what these
2334 // scores are helping us with: they provide the order of preference. Also,
2335 // this is important if the scalar is externally used or used in another
2336 // tree entry node in the different lane.
2337
2338 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
2339 static const int ScoreConsecutiveLoads = 4;
2340 /// The same load multiple times. This should have a better score than
2341 /// `ScoreSplat` because, on x86, a 2-lane splat load can be lowered to
2342 /// `movddup (%reg), xmm0` with a throughput of 0.5, versus 0.5 for a vector
2343 /// load plus 1.0 for a separate broadcast.
2344 static const int ScoreSplatLoads = 3;
2345 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
2346 static const int ScoreReversedLoads = 3;
2347 /// A load candidate for masked gather.
2348 static const int ScoreMaskedGatherCandidate = 1;
2349 /// ExtractElementInst from same vector and consecutive indexes.
2350 static const int ScoreConsecutiveExtracts = 4;
2351 /// ExtractElementInst from same vector and reversed indices.
2352 static const int ScoreReversedExtracts = 3;
2353 /// Constants.
2354 static const int ScoreConstants = 2;
2355 /// Instructions with the same opcode.
2356 static const int ScoreSameOpcode = 2;
2357 /// Instructions with alt opcodes (e.g, add + sub).
2358 static const int ScoreAltOpcodes = 1;
2359 /// Identical instructions (a.k.a. splat or broadcast).
2360 static const int ScoreSplat = 1;
2361 /// Matching with an undef is preferable to failing.
2362 static const int ScoreUndef = 1;
2363 /// Score for failing to find a decent match.
2364 static const int ScoreFail = 0;
2365 /// Score if all users are vectorized.
2366 static const int ScoreAllUserVectorized = 1;
2367
2368 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
2369 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
2370 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
2371 /// MainAltOps.
2372 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
2373 ArrayRef<Value *> MainAltOps) const {
2374 if (!isValidElementType(V1->getType()) ||
2375 !isValidElementType(V2->getType()))
2376 return LookAheadHeuristics::ScoreFail;
2377
2378 if (V1 == V2) {
2379 if (isa<LoadInst>(V1)) {
2380 // Returns true if the users of V1 and V2 won't need to be extracted.
2381 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
2382 // Bail out if we have too many uses to save compilation time.
2383 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
2384 return false;
2385
2386 auto AllUsersVectorized = [U1, U2, this](Value *V) {
2387 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
2388 return U == U1 || U == U2 || R.isVectorized(U);
2389 });
2390 };
2391 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
2392 };
2393 // A broadcast of a load can be cheaper on some targets.
2394 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
2395 ElementCount::getFixed(NumLanes)) &&
2396 ((int)V1->getNumUses() == NumLanes ||
2397 AllUsersAreInternal(V1, V2)))
2398 return LookAheadHeuristics::ScoreSplatLoads;
2399 }
2400 return LookAheadHeuristics::ScoreSplat;
2401 }
2402
2403 auto CheckSameEntryOrFail = [&]() {
2404 if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V1); !TEs1.empty()) {
2406 if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V2);
2407 !TEs2.empty() &&
2408 any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
2409 return LookAheadHeuristics::ScoreSplatLoads;
2410 }
2411 return LookAheadHeuristics::ScoreFail;
2412 };
2413
2414 auto *LI1 = dyn_cast<LoadInst>(V1);
2415 auto *LI2 = dyn_cast<LoadInst>(V2);
2416 if (LI1 && LI2) {
2417 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
2418 !LI2->isSimple())
2419 return CheckSameEntryOrFail();
2420
2421 std::optional<int64_t> Dist = getPointersDiff(
2422 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
2423 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
2424 if (!Dist || *Dist == 0) {
2425 if (getUnderlyingObject(LI1->getPointerOperand()) ==
2426 getUnderlyingObject(LI2->getPointerOperand()) &&
2427 R.TTI->isLegalMaskedGather(
2428 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
2429 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2430 return CheckSameEntryOrFail();
2431 }
2432 // The distance is too large - still may be profitable to use masked
2433 // loads/gathers.
2434 if (std::abs(*Dist) > NumLanes / 2)
2435 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2436 // This still will detect consecutive loads, but we might have "holes"
2437 // in some cases. It is ok for non-power-2 vectorization and may produce
2438 // better results. It should not affect current vectorization.
2439 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
2440 : LookAheadHeuristics::ScoreReversedLoads;
2441 }
2442
2443 auto *C1 = dyn_cast<Constant>(V1);
2444 auto *C2 = dyn_cast<Constant>(V2);
2445 if (C1 && C2)
2446 return LookAheadHeuristics::ScoreConstants;
2447
2448 // Consider constants and buildvector compatible.
2449 if ((C1 && isa<InsertElementInst>(V2)) ||
2450 (C2 && isa<InsertElementInst>(V1)))
2451 return LookAheadHeuristics::ScoreConstants;
2452
2453 // Extracts from consecutive indexes of the same vector better score as
2454 // the extracts could be optimized away.
2455 Value *EV1;
2456 ConstantInt *Ex1Idx;
2457 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
2458 // Undefs are always profitable for extractelements.
2459 // Compiler can easily combine poison and extractelement <non-poison> or
2460 // undef and extractelement <poison>. But combining undef +
2461 // extractelement <non-poison-but-may-produce-poison> requires some
2462 // extra operations.
2463 if (isa<UndefValue>(V2))
2464 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
2467 Value *EV2 = nullptr;
2468 ConstantInt *Ex2Idx = nullptr;
2469 if (match(V2,
2471 m_Undef())))) {
2472 // Undefs are always profitable for extractelements.
2473 if (!Ex2Idx)
2475 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
2477 if (EV2 == EV1) {
2478 int Idx1 = Ex1Idx->getZExtValue();
2479 int Idx2 = Ex2Idx->getZExtValue();
2480 int Dist = Idx2 - Idx1;
2481 // The distance is too large - still may be profitable to use
2482 // shuffles.
2483 if (std::abs(Dist) == 0)
2485 if (std::abs(Dist) > NumLanes / 2)
2489 }
2491 }
2492 return CheckSameEntryOrFail();
2493 }
2494
2495 auto *I1 = dyn_cast<Instruction>(V1);
2496 auto *I2 = dyn_cast<Instruction>(V2);
2497 if (I1 && I2) {
2498 if (I1->getParent() != I2->getParent())
2499 return CheckSameEntryOrFail();
2500 SmallVector<Value *, 4> Ops(MainAltOps);
2501 Ops.push_back(I1);
2502 Ops.push_back(I2);
2503 InstructionsState S = getSameOpcode(Ops, TLI);
2504 // Note: Only consider instructions with <= 2 operands to avoid
2505 // complexity explosion.
2506 if (S &&
2507 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
2508 !S.isAltShuffle()) &&
2509 all_of(Ops, [&S](Value *V) {
2510 return isa<PoisonValue>(V) ||
2511 cast<Instruction>(V)->getNumOperands() ==
2512 S.getMainOp()->getNumOperands();
2513 }))
2514 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
2515 : LookAheadHeuristics::ScoreSameOpcode;
2516
2517
2518 if (I1 && isa<PoisonValue>(V2))
2519 return LookAheadHeuristics::ScoreSameOpcode;
2520
2521 if (isa<UndefValue>(V2))
2522 return LookAheadHeuristics::ScoreUndef;
2523
2524 return CheckSameEntryOrFail();
2525 }
2526
2527 /// Go through the operands of \p LHS and \p RHS recursively until
2528 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
2529 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
2530 /// of \p U1 and \p U2), except at the beginning of the recursion where
2531 /// these are set to nullptr.
2532 ///
2533 /// For example:
2534 /// \verbatim
2535 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
2536 /// \ / \ / \ / \ /
2537 /// + + + +
2538 /// G1 G2 G3 G4
2539 /// \endverbatim
2540 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
2541 /// each level recursively, accumulating the score. It starts from matching
2542 /// the additions at level 0, then moves on to the loads (level 1). The
2543 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
2544 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
2545 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
2546 /// Please note that the order of the operands does not matter, as we
2547 /// evaluate the score of all profitable combinations of operands. In
2548 /// other words the score of G1 and G4 is the same as G1 and G2. This
2549 /// heuristic is based on ideas described in:
2550 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
2551 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
2552 /// Luís F. W. Góes
2553 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
2554 Instruction *U2, int CurrLevel,
2555 ArrayRef<Value *> MainAltOps) const {
2556
2557 // Get the shallow score of V1 and V2.
2558 int ShallowScoreAtThisLevel =
2559 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
2560
2561 // If reached MaxLevel,
2562 // or if V1 and V2 are not instructions,
2563 // or if they are SPLAT,
2564 // or if they are not consecutive,
2565 // or if profitable to vectorize loads or extractelements, early return
2566 // the current cost.
2567 auto *I1 = dyn_cast<Instruction>(LHS);
2568 auto *I2 = dyn_cast<Instruction>(RHS);
2569 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
2570 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
2571 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
2572 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
2574 ShallowScoreAtThisLevel))
2575 return ShallowScoreAtThisLevel;
2576 assert(I1 && I2 && "Should have early exited.");
2577
2578 // Contains the I2 operand indexes that got matched with I1 operands.
2579 SmallSet<unsigned, 4> Op2Used;
2580
2581 // Recursion towards the operands of I1 and I2. We are trying all possible
2582 // operand pairs, and keeping track of the best score.
2583 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
2584 OpIdx1 != NumOperands1; ++OpIdx1) {
2585 // Try to pair op1I with the best operand of I2.
2586 int MaxTmpScore = 0;
2587 unsigned MaxOpIdx2 = 0;
2588 bool FoundBest = false;
2589 // If I2 is commutative try all combinations.
2590 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
2591 unsigned ToIdx = isCommutative(I2)
2592 ? I2->getNumOperands()
2593 : std::min(I2->getNumOperands(), OpIdx1 + 1);
2594 assert(FromIdx <= ToIdx && "Bad index");
2595 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
2596 // Skip operands already paired with OpIdx1.
2597 if (Op2Used.count(OpIdx2))
2598 continue;
2599 // Recursively calculate the cost at each level
2600 int TmpScore =
2601 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
2602 I1, I2, CurrLevel + 1, {});
2603 // Look for the best score.
2604 if (TmpScore > LookAheadHeuristics::ScoreFail &&
2605 TmpScore > MaxTmpScore) {
2606 MaxTmpScore = TmpScore;
2607 MaxOpIdx2 = OpIdx2;
2608 FoundBest = true;
2609 }
2610 }
2611 if (FoundBest) {
2612 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
2613 Op2Used.insert(MaxOpIdx2);
2614 ShallowScoreAtThisLevel += MaxTmpScore;
2615 }
2616 }
2617 return ShallowScoreAtThisLevel;
2618 }
2619 };
2620 /// A helper data structure to hold the operands of a vector of instructions.
2621 /// This supports a fixed vector length for all operand vectors.
2622 class VLOperands {
2623 /// For each operand we need (i) the value, and (ii) the opcode that it
2624 /// would be attached to if the expression was in a left-linearized form.
2625 /// This is required to avoid illegal operand reordering.
2626 /// For example:
2627 /// \verbatim
2628 /// 0 Op1
2629 /// |/
2630 /// Op1 Op2 Linearized + Op2
2631 /// \ / ----------> |/
2632 /// - -
2633 ///
2634 /// Op1 - Op2 (0 + Op1) - Op2
2635 /// \endverbatim
2636 ///
2637 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
2638 ///
2639 /// Another way to think of this is to track all the operations across the
2640 /// path from the operand all the way to the root of the tree and to
2641 /// calculate the operation that corresponds to this path. For example, the
2642 /// path from Op2 to the root crosses the RHS of the '-', therefore the
2643 /// corresponding operation is a '-' (which matches the one in the
2644 /// linearized tree, as shown above).
2645 ///
2646 /// For lack of a better term, we refer to this operation as Accumulated
2647 /// Path Operation (APO).
2648 struct OperandData {
2649 OperandData() = default;
2650 OperandData(Value *V, bool APO, bool IsUsed)
2651 : V(V), APO(APO), IsUsed(IsUsed) {}
2652 /// The operand value.
2653 Value *V = nullptr;
2654 /// TreeEntries only allow a single opcode, or an alternate sequence of
2655 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
2656 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2657 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2658 /// (e.g., Add/Mul)
2659 bool APO = false;
2660 /// Helper data for the reordering function.
2661 bool IsUsed = false;
2662 };
2663
2664 /// During operand reordering, we are trying to select the operand at lane
2665 /// that matches best with the operand at the neighboring lane. Our
2666 /// selection is based on the type of value we are looking for. For example,
2667 /// if the neighboring lane has a load, we need to look for a load that is
2668 /// accessing a consecutive address. These strategies are summarized in the
2669 /// 'ReorderingMode' enumerator.
2670 enum class ReorderingMode {
2671 Load, ///< Matching loads to consecutive memory addresses
2672 Opcode, ///< Matching instructions based on opcode (same or alternate)
2673 Constant, ///< Matching constants
2674 Splat, ///< Matching the same instruction multiple times (broadcast)
2675 Failed, ///< We failed to create a vectorizable group
2676 };
2677
2678 using OperandDataVec = SmallVector<OperandData, 2>;
2679
2680 /// A vector of operand vectors.
2682 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2683 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2684 unsigned ArgSize = 0;
2685
2686 const TargetLibraryInfo &TLI;
2687 const DataLayout &DL;
2688 ScalarEvolution &SE;
2689 const BoUpSLP &R;
2690 const Loop *L = nullptr;
2691
2692 /// \returns the operand data at \p OpIdx and \p Lane.
2693 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2694 return OpsVec[OpIdx][Lane];
2695 }
2696
2697 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2698 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2699 return OpsVec[OpIdx][Lane];
2700 }
2701
2702 /// Clears the used flag for all entries.
2703 void clearUsed() {
2704 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2705 OpIdx != NumOperands; ++OpIdx)
2706 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2707 ++Lane)
2708 OpsVec[OpIdx][Lane].IsUsed = false;
2709 }
2710
2711 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2712 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2713 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2714 }
2715
2716 /// \param Lane lane of the operands under analysis.
2717 /// \param OpIdx operand index in \p Lane lane we're looking the best
2718 /// candidate for.
2719 /// \param Idx operand index of the current candidate value.
2720 /// \returns The additional score due to possible broadcasting of the
2721 /// elements in the lane. It is more profitable to have power-of-2 unique
2721 /// elements in the lane. It is more profitable to have a power-of-2 number of
2722 /// unique elements in the lane, as they will be vectorized with higher probability
2724 /// vectorization of the power-of-2 number of unique scalars.
2725 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2726 const SmallBitVector &UsedLanes) const {
2727 Value *IdxLaneV = getData(Idx, Lane).V;
2728 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2729 isa<ExtractElementInst>(IdxLaneV))
2730 return 0;
2732 for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2733 if (Ln == Lane)
2734 continue;
2735 Value *OpIdxLnV = getData(OpIdx, Ln).V;
2736 if (!isa<Instruction>(OpIdxLnV))
2737 return 0;
2738 Uniques.try_emplace(OpIdxLnV, Ln);
2739 }
2740 unsigned UniquesCount = Uniques.size();
2741 auto IdxIt = Uniques.find(IdxLaneV);
2742 unsigned UniquesCntWithIdxLaneV =
2743 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2744 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2745 auto OpIdxIt = Uniques.find(OpIdxLaneV);
2746 unsigned UniquesCntWithOpIdxLaneV =
2747 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2748 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2749 return 0;
2750 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2751 UniquesCntWithOpIdxLaneV,
2752 UniquesCntWithOpIdxLaneV -
2753 bit_floor(UniquesCntWithOpIdxLaneV)) -
2754 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2755 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
2756 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2757 }
2758
2759 /// \param Lane lane of the operands under analysis.
2760 /// \param OpIdx operand index in \p Lane lane we're looking the best
2761 /// candidate for.
2762 /// \param Idx operand index of the current candidate value.
2763 /// \returns The additional score for the scalar which users are all
2764 /// vectorized.
2765 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2766 Value *IdxLaneV = getData(Idx, Lane).V;
2767 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2768 // Do not care about number of uses for vector-like instructions
2769 // (extractelement/extractvalue with constant indices), they are extracts
2770 // themselves and already externally used. Vectorization of such
2771 // instructions does not add extra extractelement instruction, just may
2772 // remove it.
2773 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2774 isVectorLikeInstWithConstOps(OpIdxLaneV))
2775 return LookAheadHeuristics::ScoreAllUserVectorized;
2776 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2777 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2778 return 0;
2779 return R.areAllUsersVectorized(IdxLaneI)
2781 : 0;
2782 }
2783
2784 /// Score scaling factor for fully compatible instructions but with
2785 /// different number of external uses. Allows better selection of the
2786 /// instructions with less external uses.
2787 static const int ScoreScaleFactor = 10;
2788
2789 /// \Returns the look-ahead score, which tells us how much the sub-trees
2790 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2791 /// score. This helps break ties in an informed way when we cannot decide on
2792 /// the order of the operands by just considering the immediate
2793 /// predecessors.
2794 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2795 int Lane, unsigned OpIdx, unsigned Idx,
2796 bool &IsUsed, const SmallBitVector &UsedLanes) {
2797 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2798 LookAheadMaxDepth);
2799 // Keep track of the instruction stack as we recurse into the operands
2800 // during the look-ahead score exploration.
2801 int Score =
2802 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2803 /*CurrLevel=*/1, MainAltOps);
2804 if (Score) {
2805 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2806 if (Score <= -SplatScore) {
2807 // Failed score.
2808 Score = 0;
2809 } else {
2810 Score += SplatScore;
2811 // Scale score to see the difference between different operands
2812 // and similar operands but all vectorized/not all vectorized
2813 // uses. It does not affect actual selection of the best
2814 // compatible operand in general, just allows to select the
2815 // operand with all vectorized uses.
2816 Score *= ScoreScaleFactor;
2817 Score += getExternalUseScore(Lane, OpIdx, Idx);
2818 IsUsed = true;
2819 }
2820 }
2821 return Score;
2822 }
2823
2824 /// Best defined scores per lanes between the passes. Used to choose the
2825 /// best operand (with the highest score) between the passes.
2826 /// The key - {Operand Index, Lane}.
2827 /// The value - the best score between the passes for the lane and the
2828 /// operand.
2830 BestScoresPerLanes;
2831
2832 // Search all operands in Ops[*][Lane] for the one that matches best
2833 // Ops[OpIdx][LastLane] and return its operand index.
2834 // If no good match can be found, return std::nullopt.
2835 std::optional<unsigned>
2836 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2837 ArrayRef<ReorderingMode> ReorderingModes,
2838 ArrayRef<Value *> MainAltOps,
2839 const SmallBitVector &UsedLanes) {
2840 unsigned NumOperands = getNumOperands();
2841
2842 // The operand of the previous lane at OpIdx.
2843 Value *OpLastLane = getData(OpIdx, LastLane).V;
2844
2845 // Our strategy mode for OpIdx.
2846 ReorderingMode RMode = ReorderingModes[OpIdx];
2847 if (RMode == ReorderingMode::Failed)
2848 return std::nullopt;
2849
2850 // The linearized opcode of the operand at OpIdx, Lane.
2851 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2852
2853 // The best operand index and its score.
2854 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2855 // are using the score to differentiate between the two.
2856 struct BestOpData {
2857 std::optional<unsigned> Idx;
2858 unsigned Score = 0;
2859 } BestOp;
2860 BestOp.Score =
2861 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2862 .first->second;
2863
2864 // Track if the operand must be marked as used. If the operand is set to
2865 // Score 1 explicitly (because of non power-of-2 unique scalars, we may
2866 // want to reestimate the operands again on the following iterations).
2867 bool IsUsed = RMode == ReorderingMode::Splat ||
2868 RMode == ReorderingMode::Constant ||
2869 RMode == ReorderingMode::Load;
2870 // Iterate through all unused operands and look for the best.
2871 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2872 // Get the operand at Idx and Lane.
2873 OperandData &OpData = getData(Idx, Lane);
2874 Value *Op = OpData.V;
2875 bool OpAPO = OpData.APO;
2876
2877 // Skip already selected operands.
2878 if (OpData.IsUsed)
2879 continue;
2880
2881 // Skip if we are trying to move the operand to a position with a
2882 // different opcode in the linearized tree form. This would break the
2883 // semantics.
2884 if (OpAPO != OpIdxAPO)
2885 continue;
2886
2887 // Look for an operand that matches the current mode.
2888 switch (RMode) {
2889 case ReorderingMode::Load:
2890 case ReorderingMode::Opcode: {
2891 bool LeftToRight = Lane > LastLane;
2892 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2893 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2894 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2895 OpIdx, Idx, IsUsed, UsedLanes);
2896 if (Score > static_cast<int>(BestOp.Score) ||
2897 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2898 Idx == OpIdx)) {
2899 BestOp.Idx = Idx;
2900 BestOp.Score = Score;
2901 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2902 }
2903 break;
2904 }
2905 case ReorderingMode::Constant:
2906 if (isa<Constant>(Op) ||
2907 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2908 BestOp.Idx = Idx;
2909 if (isa<Constant>(Op)) {
2910 BestOp.Score = LookAheadHeuristics::ScoreConstants;
2911 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2912 LookAheadHeuristics::ScoreConstants;
2913 }
2915 IsUsed = false;
2916 }
2917 break;
2918 case ReorderingMode::Splat:
2919 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
2920 IsUsed = Op == OpLastLane;
2921 if (Op == OpLastLane) {
2922 BestOp.Score = LookAheadHeuristics::ScoreSplat;
2923 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2924 LookAheadHeuristics::ScoreSplat;
2925 }
2926 BestOp.Idx = Idx;
2927 }
2928 break;
2929 case ReorderingMode::Failed:
2930 llvm_unreachable("Not expected Failed reordering mode.");
2931 }
2932 }
2933
2934 if (BestOp.Idx) {
2935 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2936 return BestOp.Idx;
2937 }
2938 // If we could not find a good match return std::nullopt.
2939 return std::nullopt;
2940 }
2941
2942 /// Helper for reorderOperandVecs.
2943 /// \returns the lane that we should start reordering from. This is the one
2944 /// which has the least number of operands that can freely move about, or is
2945 /// less profitable because it already has the most optimal set of operands.
2946 unsigned getBestLaneToStartReordering() const {
2947 unsigned Min = UINT_MAX;
2948 unsigned SameOpNumber = 0;
2949 // std::pair<unsigned, unsigned> is used to implement a simple voting
2950 // algorithm and choose the lane with the least number of operands that
2951 // can freely move about or less profitable because it already has the
2952 // most optimal set of operands. The first unsigned is a counter for
2953 // voting, the second unsigned is the counter of lanes with instructions
2954 // with same/alternate opcodes and same parent basic block.
2956 // Try to be closer to the original results, if we have multiple lanes
2957 // with same cost. If 2 lanes have the same cost, use the one with the
2958 // highest index.
2959 for (int I = getNumLanes(); I > 0; --I) {
2960 unsigned Lane = I - 1;
2961 OperandsOrderData NumFreeOpsHash =
2962 getMaxNumOperandsThatCanBeReordered(Lane);
2963 // Compare the number of operands that can move and choose the one with
2964 // the least number.
2965 if (NumFreeOpsHash.NumOfAPOs < Min) {
2966 Min = NumFreeOpsHash.NumOfAPOs;
2967 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2968 HashMap.clear();
2969 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2970 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2971 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2972 // Select the most optimal lane in terms of number of operands that
2973 // should be moved around.
2974 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2975 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2976 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2977 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2978 auto [It, Inserted] =
2979 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
2980 if (!Inserted)
2981 ++It->second.first;
2982 }
2983 }
2984 // Select the lane with the minimum counter.
2985 unsigned BestLane = 0;
2986 unsigned CntMin = UINT_MAX;
2987 for (const auto &Data : reverse(HashMap)) {
2988 if (Data.second.first < CntMin) {
2989 CntMin = Data.second.first;
2990 BestLane = Data.second.second;
2991 }
2992 }
2993 return BestLane;
2994 }
2995
2996 /// Data structure that helps to reorder operands.
2997 struct OperandsOrderData {
2998 /// The best number of operands with the same APOs, which can be
2999 /// reordered.
3000 unsigned NumOfAPOs = UINT_MAX;
3001 /// Number of operands with the same/alternate instruction opcode and
3002 /// parent.
3003 unsigned NumOpsWithSameOpcodeParent = 0;
3004 /// Hash for the actual operands ordering.
3005 /// Used to count operands, actually their position id and opcode
3006 /// value. It is used in the voting mechanism to find the lane with the
3007 /// least number of operands that can freely move about or less profitable
3008 /// because it already has the most optimal set of operands. Can be
3009 /// replaced with SmallVector<unsigned> instead but hash code is faster
3010 /// and requires less memory.
3011 unsigned Hash = 0;
3012 };
3013 /// \returns the maximum number of operands that are allowed to be reordered
3014 /// for \p Lane and the number of compatible instructions (with the same
3015 /// parent/opcode). This is used as a heuristic for selecting the first lane
3016 /// to start operand reordering.
3017 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
3018 unsigned CntTrue = 0;
3019 unsigned NumOperands = getNumOperands();
3020 // Operands with the same APO can be reordered. We therefore need to count
3021 // how many of them we have for each APO, like this: Cnt[APO] = x.
3022 // Since we only have two APOs, namely true and false, we can avoid using
3023 // a map. Instead we can simply count the number of operands that
3024 // correspond to one of them (in this case the 'true' APO), and calculate
3025 // the other by subtracting it from the total number of operands.
3026 // Operands with the same instruction opcode and parent are more
3027 // profitable since we don't need to move them in many cases, with a high
3028 // probability such lane already can be vectorized effectively.
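      // E.g. (illustrative), for a two-operand lane where only the RHS has
      // APO == true, CntTrue == 1 and NumOfAPOs == max(1, 2 - 1) == 1.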
3029 bool AllUndefs = true;
3030 unsigned NumOpsWithSameOpcodeParent = 0;
3031 Instruction *OpcodeI = nullptr;
3032 BasicBlock *Parent = nullptr;
3033 unsigned Hash = 0;
3034 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3035 const OperandData &OpData = getData(OpIdx, Lane);
3036 if (OpData.APO)
3037 ++CntTrue;
3038 // Use Boyer-Moore majority voting for finding the majority opcode and
3039 // the number of times it occurs.
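        // For example (illustrative only), if a lane has three instruction
        // operands with opcodes {load, load, add} from the same block, the
        // counter evolves 1 -> 2 -> 1, leaving 'load' as the majority opcode
        // with a final count of 1.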
3040 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
3041 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
3042 I->getParent() != Parent) {
3043 if (NumOpsWithSameOpcodeParent == 0) {
3044 NumOpsWithSameOpcodeParent = 1;
3045 OpcodeI = I;
3046 Parent = I->getParent();
3047 } else {
3048 --NumOpsWithSameOpcodeParent;
3049 }
3050 } else {
3051 ++NumOpsWithSameOpcodeParent;
3052 }
3053 }
3054 Hash = hash_combine(
3055 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
3056 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
3057 }
3058 if (AllUndefs)
3059 return {};
3060 OperandsOrderData Data;
3061 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
3062 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
3063 Data.Hash = Hash;
3064 return Data;
3065 }
3066
3067 /// Go through the instructions in VL and append their operands.
3068 void appendOperands(ArrayRef<Value *> VL, ArrayRef<ValueList> Operands,
3069 const InstructionsState &S) {
3070 assert(!Operands.empty() && !VL.empty() && "Bad list of operands");
3071 assert((empty() || all_of(Operands,
3072 [this](const ValueList &VL) {
3073 return VL.size() == getNumLanes();
3074 })) &&
3075 "Expected same number of lanes");
3076 assert(S.valid() && "InstructionsState is invalid.");
3077 // IntrinsicInst::isCommutative returns true if swapping the first "two"
3078 // arguments to the intrinsic produces the same result.
3079 Instruction *MainOp = S.getMainOp();
 3080 unsigned NumOperands = MainOp->getNumOperands();
 3081 ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
3082 OpsVec.resize(ArgSize);
3083 unsigned NumLanes = VL.size();
3084 for (OperandDataVec &Ops : OpsVec)
3085 Ops.resize(NumLanes);
3086 for (unsigned Lane : seq<unsigned>(NumLanes)) {
3087 // Our tree has just 3 nodes: the root and two operands.
3088 // It is therefore trivial to get the APO. We only need to check the
3089 // opcode of V and whether the operand at OpIdx is the LHS or RHS
3090 // operand. The LHS operand of both add and sub is never attached to an
 3091 // inverse operation in the linearized form, therefore its APO is
 3092 // false. The APO of the RHS is true only if V is an inverse operation.
3093
3094 // Since operand reordering is performed on groups of commutative
3095 // operations or alternating sequences (e.g., +, -), we can safely tell
3096 // the inverse operations by checking commutativity.
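      // For example (mirroring the lanes shown in reorder() below):
      //   Lane 0: A[0] = B[0] + C[0]  -> APO(B[0]) = false, APO(C[0]) = false
      //   Lane 1: A[1] = B[1] - C[1]  -> APO(B[1]) = false, APO(C[1]) = true
      // because only the RHS of the non-commutative subtraction is attached
      // to an inverse operation in the linearized form.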
3097 auto *I = dyn_cast<Instruction>(VL[Lane]);
3098 if (!I && isa<PoisonValue>(VL[Lane])) {
3099 for (unsigned OpIdx : seq<unsigned>(NumOperands))
3100 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
3101 continue;
3102 }
3103 bool IsInverseOperation = false;
3104 if (S.isCopyableElement(VL[Lane])) {
3105 // The value is a copyable element.
3106 IsInverseOperation = !isCommutative(MainOp, VL[Lane]);
3107 } else {
3108 assert(I && "Expected instruction");
3109 auto [SelectedOp, Ops] = convertTo(I, S);
3110 // We cannot check commutativity by the converted instruction
3111 // (SelectedOp) because isCommutative also examines def-use
3112 // relationships.
3113 IsInverseOperation = !isCommutative(SelectedOp, I);
3114 }
3115 for (unsigned OpIdx : seq<unsigned>(ArgSize)) {
3116 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
3117 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
3118 }
3119 }
3120 }
3121
3122 /// \returns the number of operands.
3123 unsigned getNumOperands() const { return ArgSize; }
3124
3125 /// \returns the number of lanes.
3126 unsigned getNumLanes() const { return OpsVec[0].size(); }
3127
3128 /// \returns the operand value at \p OpIdx and \p Lane.
3129 Value *getValue(unsigned OpIdx, unsigned Lane) const {
3130 return getData(OpIdx, Lane).V;
3131 }
3132
3133 /// \returns true if the data structure is empty.
3134 bool empty() const { return OpsVec.empty(); }
3135
3136 /// Clears the data.
3137 void clear() { OpsVec.clear(); }
3138
 3139 /// \returns true if there are enough operands identical to \p Op to fill
 3140 /// the whole vector (possibly mixed with constants or loop-invariant values).
3141 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
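 /// For example (an illustrative sketch), with 4 lanes whose operands are
 ///   {X, a}, {b, X}, {X, c}, {d, X}
 /// the value X is found in every lane, so broadcasting (splatting) X and
 /// gathering {a, b, c, d} into the other operand is preferred.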
3142 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
3143 assert(Op == getValue(OpIdx, Lane) &&
3144 "Op is expected to be getValue(OpIdx, Lane).");
3145 // Small number of loads - try load matching.
3146 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
3147 return false;
3148 bool OpAPO = getData(OpIdx, Lane).APO;
3149 bool IsInvariant = L && L->isLoopInvariant(Op);
3150 unsigned Cnt = 0;
3151 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3152 if (Ln == Lane)
3153 continue;
3154 // This is set to true if we found a candidate for broadcast at Lane.
3155 bool FoundCandidate = false;
3156 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
3157 OperandData &Data = getData(OpI, Ln);
3158 if (Data.APO != OpAPO || Data.IsUsed)
3159 continue;
3160 Value *OpILane = getValue(OpI, Lane);
3161 bool IsConstantOp = isa<Constant>(OpILane);
3162 // Consider the broadcast candidate if:
3163 // 1. Same value is found in one of the operands.
3164 if (Data.V == Op ||
3165 // 2. The operand in the given lane is not constant but there is a
3166 // constant operand in another lane (which can be moved to the
3167 // given lane). In this case we can represent it as a simple
3168 // permutation of constant and broadcast.
3169 (!IsConstantOp &&
3170 ((Lns > 2 && isa<Constant>(Data.V)) ||
3171 // 2.1. If we have only 2 lanes, need to check that value in the
3172 // next lane does not build same opcode sequence.
3173 (Lns == 2 &&
3174 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
3175 isa<Constant>(Data.V)))) ||
3176 // 3. The operand in the current lane is loop invariant (can be
3177 // hoisted out) and another operand is also a loop invariant
3178 // (though not a constant). In this case the whole vector can be
3179 // hoisted out.
3180 // FIXME: need to teach the cost model about this case for better
3181 // estimation.
3182 (IsInvariant && !isa<Constant>(Data.V) &&
3183 !getSameOpcode({Op, Data.V}, TLI) &&
3184 L->isLoopInvariant(Data.V))) {
3185 FoundCandidate = true;
3186 Data.IsUsed = Data.V == Op;
3187 if (Data.V == Op)
3188 ++Cnt;
3189 break;
3190 }
3191 }
3192 if (!FoundCandidate)
3193 return false;
3194 }
3195 return getNumLanes() == 2 || Cnt > 1;
3196 }
3197
 3198 /// Checks if there is at least one operand in a lane other than \p Lane
 3199 /// that is compatible with the operand \p Op.
3200 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
3201 assert(Op == getValue(OpIdx, Lane) &&
3202 "Op is expected to be getValue(OpIdx, Lane).");
3203 bool OpAPO = getData(OpIdx, Lane).APO;
3204 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3205 if (Ln == Lane)
3206 continue;
3207 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
3208 const OperandData &Data = getData(OpI, Ln);
3209 if (Data.APO != OpAPO || Data.IsUsed)
3210 return true;
3211 Value *OpILn = getValue(OpI, Ln);
3212 return (L && L->isLoopInvariant(OpILn)) ||
3213 (getSameOpcode({Op, OpILn}, TLI) &&
3214 allSameBlock({Op, OpILn}));
3215 }))
3216 return true;
3217 }
3218 return false;
3219 }
3220
3221 public:
 3222 /// Initialize with all the operands of the instruction vector \p RootVL.
 3223 VLOperands(ArrayRef<Value *> RootVL, ArrayRef<ValueList> Operands,
3224 const InstructionsState &S, const BoUpSLP &R)
3225 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3226 L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
3227 // Append all the operands of RootVL.
3228 appendOperands(RootVL, Operands, S);
3229 }
3230
 3231 /// \returns a value vector with the operands across all lanes for the
 3232 /// operand at \p OpIdx.
3233 ValueList getVL(unsigned OpIdx) const {
3234 ValueList OpVL(OpsVec[OpIdx].size());
3235 assert(OpsVec[OpIdx].size() == getNumLanes() &&
3236 "Expected same num of lanes across all operands");
3237 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3238 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
3239 return OpVL;
3240 }
3241
3242 // Performs operand reordering for 2 or more operands.
3243 // The original operands are in OrigOps[OpIdx][Lane].
3244 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
3245 void reorder() {
3246 unsigned NumOperands = getNumOperands();
3247 unsigned NumLanes = getNumLanes();
3248 // Each operand has its own mode. We are using this mode to help us select
3249 // the instructions for each lane, so that they match best with the ones
3250 // we have selected so far.
3251 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
3252
3253 // This is a greedy single-pass algorithm. We are going over each lane
3254 // once and deciding on the best order right away with no back-tracking.
3255 // However, in order to increase its effectiveness, we start with the lane
3256 // that has operands that can move the least. For example, given the
3257 // following lanes:
3258 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
3259 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
3260 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
3261 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
3262 // we will start at Lane 1, since the operands of the subtraction cannot
3263 // be reordered. Then we will visit the rest of the lanes in a circular
3264 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
3265
3266 // Find the first lane that we will start our search from.
3267 unsigned FirstLane = getBestLaneToStartReordering();
3268
3269 // Initialize the modes.
3270 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3271 Value *OpLane0 = getValue(OpIdx, FirstLane);
3272 // Keep track if we have instructions with all the same opcode on one
3273 // side.
3274 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
3275 // Check if OpLane0 should be broadcast.
3276 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
3277 !canBeVectorized(OpILane0, OpIdx, FirstLane))
3278 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3279 else if (isa<LoadInst>(OpILane0))
3280 ReorderingModes[OpIdx] = ReorderingMode::Load;
3281 else
3282 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
3283 } else if (isa<Constant>(OpLane0)) {
3284 ReorderingModes[OpIdx] = ReorderingMode::Constant;
3285 } else if (isa<Argument>(OpLane0)) {
3286 // Our best hope is a Splat. It may save some cost in some cases.
3287 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3288 } else {
3289 llvm_unreachable("Unexpected value kind.");
3290 }
3291 }
3292
 3293 // Check that we don't have the same operands. There is no need to reorder
 3294 // if the operands are just a perfect diamond or shuffled diamond match. Do
 3295 // not do this only for possible broadcasts or a non-power-of-2 number of
 3296 // scalars (just for now).
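      // For example (illustrative), with the lanes {a + b, b + a, c + d, d + c}
      // the operand vectors are {a, b, c, d} and {b, a, d, c}: both draw from
      // the same four values, so this is a shuffled diamond match and the
      // reordering below is skipped.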
3297 auto &&SkipReordering = [this]() {
3298 SmallPtrSet<Value *, 4> UniqueValues;
3299 ArrayRef<OperandData> Op0 = OpsVec.front();
3300 for (const OperandData &Data : Op0)
 3301 UniqueValues.insert(Data.V);
 3302 for (ArrayRef<OperandData> Op :
3303 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
3304 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
3305 return !UniqueValues.contains(Data.V);
3306 }))
3307 return false;
3308 }
3309 // TODO: Check if we can remove a check for non-power-2 number of
3310 // scalars after full support of non-power-2 vectorization.
3311 return UniqueValues.size() != 2 &&
3312 hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
3313 UniqueValues.size());
3314 };
3315
3316 // If the initial strategy fails for any of the operand indexes, then we
3317 // perform reordering again in a second pass. This helps avoid assigning
3318 // high priority to the failed strategy, and should improve reordering for
3319 // the non-failed operand indexes.
3320 for (int Pass = 0; Pass != 2; ++Pass) {
 3321 // Check if there is no need to reorder the operands, since they are a
 3322 // perfect or shuffled diamond match.
3323 // Need to do it to avoid extra external use cost counting for
3324 // shuffled matches, which may cause regressions.
3325 if (SkipReordering())
3326 break;
3327 // Skip the second pass if the first pass did not fail.
3328 bool StrategyFailed = false;
3329 // Mark all operand data as free to use.
3330 clearUsed();
3331 // We keep the original operand order for the FirstLane, so reorder the
3332 // rest of the lanes. We are visiting the nodes in a circular fashion,
3333 // using FirstLane as the center point and increasing the radius
3334 // distance.
3335 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
3336 for (unsigned I = 0; I < NumOperands; ++I)
3337 MainAltOps[I].push_back(getData(I, FirstLane).V);
3338
3339 SmallBitVector UsedLanes(NumLanes);
3340 UsedLanes.set(FirstLane);
3341 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3342 // Visit the lane on the right and then the lane on the left.
3343 for (int Direction : {+1, -1}) {
3344 int Lane = FirstLane + Direction * Distance;
3345 if (Lane < 0 || Lane >= (int)NumLanes)
3346 continue;
3347 UsedLanes.set(Lane);
3348 int LastLane = Lane - Direction;
3349 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
3350 "Out of bounds");
3351 // Look for a good match for each operand.
3352 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3353 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
3354 std::optional<unsigned> BestIdx =
3355 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
3356 MainAltOps[OpIdx], UsedLanes);
3357 // By not selecting a value, we allow the operands that follow to
3358 // select a better matching value. We will get a non-null value in
3359 // the next run of getBestOperand().
3360 if (BestIdx) {
3361 // Swap the current operand with the one returned by
3362 // getBestOperand().
3363 swap(OpIdx, *BestIdx, Lane);
3364 } else {
3365 // Enable the second pass.
3366 StrategyFailed = true;
3367 }
3368 // Try to get the alternate opcode and follow it during analysis.
3369 if (MainAltOps[OpIdx].size() != 2) {
3370 OperandData &AltOp = getData(OpIdx, Lane);
3371 InstructionsState OpS =
3372 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
3373 if (OpS && OpS.isAltShuffle())
3374 MainAltOps[OpIdx].push_back(AltOp.V);
3375 }
3376 }
3377 }
3378 }
3379 // Skip second pass if the strategy did not fail.
3380 if (!StrategyFailed)
3381 break;
3382 }
3383 }
3384
3385#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3386 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
3387 switch (RMode) {
3388 case ReorderingMode::Load:
3389 return "Load";
3390 case ReorderingMode::Opcode:
3391 return "Opcode";
3392 case ReorderingMode::Constant:
3393 return "Constant";
3394 case ReorderingMode::Splat:
3395 return "Splat";
3396 case ReorderingMode::Failed:
3397 return "Failed";
3398 }
3399 llvm_unreachable("Unimplemented Reordering Type");
3400 }
3401
3402 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
3403 raw_ostream &OS) {
3404 return OS << getModeStr(RMode);
3405 }
3406
3407 /// Debug print.
3408 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
3409 printMode(RMode, dbgs());
3410 }
3411
3412 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
3413 return printMode(RMode, OS);
3414 }
3415
 3416 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
 3417 const unsigned Indent = 2;
3418 unsigned Cnt = 0;
3419 for (const OperandDataVec &OpDataVec : OpsVec) {
3420 OS << "Operand " << Cnt++ << "\n";
3421 for (const OperandData &OpData : OpDataVec) {
3422 OS.indent(Indent) << "{";
3423 if (Value *V = OpData.V)
3424 OS << *V;
3425 else
3426 OS << "null";
3427 OS << ", APO:" << OpData.APO << "}\n";
3428 }
3429 OS << "\n";
3430 }
3431 return OS;
3432 }
3433
3434 /// Debug print.
3435 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
3436#endif
3437 };
3438
 3439 /// Evaluate each pair in \p Candidates and return the index into \p
 3440 /// Candidates of the pair with the highest score, deemed to have the best
 3441 /// chance to form the root of a profitable tree to vectorize. Return
 3442 /// std::nullopt if no candidate scored above LookAheadHeuristics::ScoreFail.
 3443 /// \param Limit The lower limit of the score considered to be good enough.
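 /// A minimal usage sketch (candidate values and the follow-up step are
 /// illustrative only):
 /// \code
 ///   SmallVector<std::pair<Value *, Value *>> Candidates = {{A0, B0},
 ///                                                          {A1, B1}};
 ///   if (std::optional<int> Best = R.findBestRootPair(Candidates)) {
 ///     // Try to vectorize starting from the winning pair.
 ///     auto [Root0, Root1] = Candidates[*Best];
 ///   }
 /// \endcode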
3444 std::optional<int>
3445 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
3446 int Limit = LookAheadHeuristics::ScoreFail) const {
 3447 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
 3448 RootLookAheadMaxDepth);
3449 int BestScore = Limit;
3450 std::optional<int> Index;
3451 for (int I : seq<int>(0, Candidates.size())) {
3452 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
3453 Candidates[I].second,
3454 /*U1=*/nullptr, /*U2=*/nullptr,
3455 /*CurrLevel=*/1, {});
3456 if (Score > BestScore) {
3457 BestScore = Score;
3458 Index = I;
3459 }
3460 }
3461 return Index;
3462 }
3463
3464 /// Checks if the instruction is marked for deletion.
3465 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
3466
3467 /// Removes an instruction from its block and eventually deletes it.
3468 /// It's like Instruction::eraseFromParent() except that the actual deletion
 3469 /// is delayed until BoUpSLP is destructed.
 3470 void eraseInstruction(Instruction *I) {
3471 DeletedInstructions.insert(I);
3472 }
3473
3474 /// Remove instructions from the parent function and clear the operands of \p
 3475 /// DeadVals instructions, marking trivially dead operands for deletion.
 3476 template <typename T>
 3477 void removeInstructionsAndOperands(
 3478 ArrayRef<T *> DeadVals,
 3479 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
 3480 SmallVector<WeakTrackingVH> DeadInsts;
 3481 for (T *V : DeadVals) {
 3482 auto *I = cast<Instruction>(V);
 3483 DeletedInstructions.insert(I);
 3484 }
3485 DenseSet<Value *> Processed;
3486 for (T *V : DeadVals) {
3487 if (!V || !Processed.insert(V).second)
3488 continue;
3489 auto *I = cast<Instruction>(V);
3491 ArrayRef<TreeEntry *> Entries = getTreeEntries(I);
3492 for (Use &U : I->operands()) {
3493 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
3494 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
3496 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
3497 return Entry->VectorizedValue == OpI;
3498 })))
3499 DeadInsts.push_back(OpI);
3500 }
3501 I->dropAllReferences();
3502 }
3503 for (T *V : DeadVals) {
3504 auto *I = cast<Instruction>(V);
3505 if (!I->getParent())
3506 continue;
3507 assert((I->use_empty() || all_of(I->uses(),
3508 [&](Use &U) {
3509 return isDeleted(
3510 cast<Instruction>(U.getUser()));
3511 })) &&
3512 "trying to erase instruction with users.");
3513 I->removeFromParent();
3514 SE->forgetValue(I);
3515 }
3516 // Process the dead instruction list until empty.
3517 while (!DeadInsts.empty()) {
 3518 Value *V = DeadInsts.pop_back_val();
 3519 Instruction *VI = cast_or_null<Instruction>(V);
 3520 if (!VI || !VI->getParent())
 3521 continue;
 3522 assert(isInstructionTriviallyDead(VI, TLI) &&
 3523 "Live instruction found in dead worklist!");
3524 assert(VI->use_empty() && "Instructions with uses are not dead.");
3525
3526 // Don't lose the debug info while deleting the instructions.
3527 salvageDebugInfo(*VI);
3528
3529 // Null out all of the instruction's operands to see if any operand
3530 // becomes dead as we go.
3531 for (Use &OpU : VI->operands()) {
3532 Value *OpV = OpU.get();
3533 if (!OpV)
3534 continue;
3535 OpU.set(nullptr);
3536
3537 if (!OpV->use_empty())
3538 continue;
3539
3540 // If the operand is an instruction that became dead as we nulled out
3541 // the operand, and if it is 'trivially' dead, delete it in a future
3542 // loop iteration.
3543 if (auto *OpI = dyn_cast<Instruction>(OpV))
3544 if (!DeletedInstructions.contains(OpI) &&
3545 (!OpI->getType()->isVectorTy() ||
3546 none_of(VectorValuesAndScales,
3547 [&](const std::tuple<Value *, unsigned, bool> &V) {
3548 return std::get<0>(V) == OpI;
3549 })) &&
3551 DeadInsts.push_back(OpI);
3552 }
3553
3554 VI->removeFromParent();
3555 eraseInstruction(VI);
3556 SE->forgetValue(VI);
3557 }
3558 }
3559
 3560 /// Checks if the instruction was already analyzed for being a possible
 3561 /// reduction root.
 3562 bool isAnalyzedReductionRoot(Instruction *I) const {
 3563 return AnalyzedReductionsRoots.count(I);
3564 }
 3565 /// Registers the given instruction as already analyzed for being a possible
 3566 /// reduction root.
 3567 void analyzedReductionRoot(Instruction *I) {
 3568 AnalyzedReductionsRoots.insert(I);
3569 }
3570 /// Checks if the provided list of reduced values was checked already for
 3571 /// vectorization.
 3572 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
 3573 return AnalyzedReductionVals.contains(hash_value(VL));
3574 }
 3575 /// Adds the list of reduced values to the list of values already checked
 3576 /// for vectorization.
 3577 void analyzedReductionVals(ArrayRef<Value *> VL) {
 3578 AnalyzedReductionVals.insert(hash_value(VL));
3579 }
 3580 /// Clear the list of the analyzed reduction root instructions.
 3581 void clearReductionData() {
 3582 AnalyzedReductionsRoots.clear();
3583 AnalyzedReductionVals.clear();
3584 AnalyzedMinBWVals.clear();
3585 }
3586 /// Checks if the given value is gathered in one of the nodes.
3587 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
3588 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
3589 }
3590 /// Checks if the given value is gathered in one of the nodes.
3591 bool isGathered(const Value *V) const {
3592 return MustGather.contains(V);
3593 }
 3594 /// Checks if the specified value was not scheduled.
3595 bool isNotScheduled(const Value *V) const {
3596 return NonScheduledFirst.contains(V);
3597 }
3598
3599 /// Check if the value is vectorized in the tree.
3600 bool isVectorized(const Value *V) const {
3601 assert(V && "V cannot be nullptr.");
3602 return ScalarToTreeEntries.contains(V);
3603 }
3604
3605 ~BoUpSLP();
3606
3607private:
 3608 /// Determine if a node \p E can be demoted to a smaller type with a
3609 /// truncation. We collect the entries that will be demoted in ToDemote.
3610 /// \param E Node for analysis
3611 /// \param ToDemote indices of the nodes to be demoted.
3612 bool collectValuesToDemote(
3613 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
3615 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
3616 bool &IsProfitableToDemote, bool IsTruncRoot) const;
3617
3618 /// Builds the list of reorderable operands on the edges \p Edges of the \p
3619 /// UserTE, which allow reordering (i.e. the operands can be reordered because
 3620 /// they have only one user and are reorderable).
 3621 /// \param ReorderableGathers List of all gather nodes that require reordering
 3622 /// (e.g., gathers of extractelements or partially vectorizable loads).
3623 /// \param GatherOps List of gather operand nodes for \p UserTE that require
3624 /// reordering, subset of \p NonVectorized.
3625 void buildReorderableOperands(
3626 TreeEntry *UserTE,
3627 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
3628 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
3629 SmallVectorImpl<TreeEntry *> &GatherOps);
3630
3631 /// Checks if the given \p TE is a gather node with clustered reused scalars
3632 /// and reorders it per given \p Mask.
3633 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
3634
 3635 /// Checks if all users of \p I are part of the vectorization tree.
3636 bool areAllUsersVectorized(
3637 Instruction *I,
3638 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3639
3640 /// Return information about the vector formed for the specified index
3641 /// of a vector of (the same) instruction.
3643
3644 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3645 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3646 TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
3647 return const_cast<TreeEntry *>(
3648 getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
3649 }
3650
3651 /// Gets the root instruction for the given node. If the node is a strided
3652 /// load/store node with the reverse order, the root instruction is the last
3653 /// one.
3654 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3655
 3656 /// \returns Cast context for the given graph node.
 3657 TargetTransformInfo::CastContextHint
3658 getCastContextHint(const TreeEntry &TE) const;
3659
3660 /// \returns the cost of the vectorizable entry.
3661 InstructionCost getEntryCost(const TreeEntry *E,
3662 ArrayRef<Value *> VectorizedVals,
3663 SmallPtrSetImpl<Value *> &CheckedExtracts);
3664
3665 /// Checks if it is legal and profitable to build SplitVectorize node for the
3666 /// given \p VL.
3667 /// \param Op1 first homogeneous scalars.
3668 /// \param Op2 second homogeneous scalars.
3669 /// \param ReorderIndices indices to reorder the scalars.
3670 /// \returns true if the node was successfully built.
3671 bool canBuildSplitNode(ArrayRef<Value *> VL,
 3672 const InstructionsState &LocalState,
 3673 SmallVectorImpl<Value *> &Op1,
 3674 SmallVectorImpl<Value *> &Op2,
3675 OrdersType &ReorderIndices) const;
3676
3677 /// This is the recursive part of buildTree.
3678 void buildTreeRec(ArrayRef<Value *> Roots, unsigned Depth, const EdgeInfo &EI,
3679 unsigned InterleaveFactor = 0);
3680
3681 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3682 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3683 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3684 /// returns false, setting \p CurrentOrder to either an empty vector or a
 3685 /// non-identity permutation that allows reusing the extract instructions.
3686 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3687 /// extract order.
3688 bool canReuseExtract(ArrayRef<Value *> VL,
3689 SmallVectorImpl<unsigned> &CurrentOrder,
3690 bool ResizeAllowed = false) const;
3691
3692 /// Vectorize a single entry in the tree.
3693 Value *vectorizeTree(TreeEntry *E);
3694
3695 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3696 /// \p E.
3697 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
3698
3699 /// Create a new vector from a list of scalar values. Produces a sequence
3700 /// which exploits values reused across lanes, and arranges the inserts
3701 /// for ease of later optimization.
3702 template <typename BVTy, typename ResTy, typename... Args>
3703 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3704
3705 /// Create a new vector from a list of scalar values. Produces a sequence
3706 /// which exploits values reused across lanes, and arranges the inserts
3707 /// for ease of later optimization.
3708 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
3709
3710 /// Returns the instruction in the bundle, which can be used as a base point
3711 /// for scheduling. Usually it is the last instruction in the bundle, except
3712 /// for the case when all operands are external (in this case, it is the first
3713 /// instruction in the list).
3714 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3715
3716 /// Tries to find extractelement instructions with constant indices from fixed
 3717 /// vector type and gather such instructions into a group, which can most
 3718 /// likely be detected as a shuffle of 1 or 2 input vectors. If this attempt
3719 /// was successful, the matched scalars are replaced by poison values in \p VL
3720 /// for future analysis.
3721 std::optional<TargetTransformInfo::ShuffleKind>
3722 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3723 SmallVectorImpl<int> &Mask) const;
3724
3725 /// Tries to find extractelement instructions with constant indices from fixed
 3726 /// vector type and gather such instructions into a group, which can most
 3727 /// likely be detected as a shuffle of 1 or 2 input vectors. If this attempt
3728 /// was successful, the matched scalars are replaced by poison values in \p VL
3729 /// for future analysis.
3731 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3733 unsigned NumParts) const;
3734
3735 /// Checks if the gathered \p VL can be represented as a single register
3736 /// shuffle(s) of previous tree entries.
3737 /// \param TE Tree entry checked for permutation.
3738 /// \param VL List of scalars (a subset of the TE scalar), checked for
3739 /// permutations. Must form single-register vector.
 3740 /// \param ForOrder Tries to fetch the best candidates for ordering info. It
 3741 /// also forces the mask to be built using the original vector values, without
 3742 /// relying on the potential reordering.
3743 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3744 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3745 std::optional<TargetTransformInfo::ShuffleKind>
3746 isGatherShuffledSingleRegisterEntry(
3747 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3748 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3749 bool ForOrder);
3750
3751 /// Checks if the gathered \p VL can be represented as multi-register
3752 /// shuffle(s) of previous tree entries.
3753 /// \param TE Tree entry checked for permutation.
3754 /// \param VL List of scalars (a subset of the TE scalar), checked for
3755 /// permutations.
 3756 /// \param ForOrder Tries to fetch the best candidates for ordering info. It
 3757 /// also forces the mask to be built using the original vector values, without
 3758 /// relying on the potential reordering.
3759 /// \returns per-register series of ShuffleKind, if gathered values can be
3760 /// represented as shuffles of previous tree entries. \p Mask is filled with
3761 /// the shuffle mask (also on per-register base).
3763 isGatherShuffledEntry(
3764 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3766 unsigned NumParts, bool ForOrder = false);
3767
3768 /// \returns the cost of gathering (inserting) the values in \p VL into a
3769 /// vector.
3770 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3771 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3772 Type *ScalarTy) const;
3773
3774 /// Set the Builder insert point to one after the last instruction in
3775 /// the bundle
3776 void setInsertPointAfterBundle(const TreeEntry *E);
3777
3778 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
3779 /// specified, the starting vector value is poison.
3780 Value *
3781 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3782 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3783
3784 /// \returns whether the VectorizableTree is fully vectorizable and will
 3785 /// be beneficial even if the tree height is tiny.
3786 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3787
3788 /// Run through the list of all gathered loads in the graph and try to find
3789 /// vector loads/masked gathers instead of regular gathers. Later these loads
 3790 /// are reshuffled to build the final gathered nodes.
3791 void tryToVectorizeGatheredLoads(
3792 const SmallMapVector<
3793 std::tuple<BasicBlock *, Value *, Type *>,
3794 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
3795 &GatheredLoads);
3796
3797 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3798 /// users of \p TE and collects the stores. It returns the map from the store
3799 /// pointers to the collected stores.
3801 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3802
3803 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3804 /// stores in \p StoresVec can form a vector instruction. If so it returns
3805 /// true and populates \p ReorderIndices with the shuffle indices of the
3806 /// stores when compared to the sorted vector.
3807 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3808 OrdersType &ReorderIndices) const;
3809
3810 /// Iterates through the users of \p TE, looking for scalar stores that can be
3811 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3812 /// their order and builds an order index vector for each store bundle. It
3813 /// returns all these order vectors found.
3814 /// We run this after the tree has formed, otherwise we may come across user
3815 /// instructions that are not yet in the tree.
3817 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3818
3819 /// Tries to reorder the gathering node for better vectorization
3820 /// opportunities.
3821 void reorderGatherNode(TreeEntry &TE);
3822
3823 class TreeEntry {
3824 public:
3825 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3826 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3827
3828 /// \returns Common mask for reorder indices and reused scalars.
3829 SmallVector<int> getCommonMask() const {
3830 if (State == TreeEntry::SplitVectorize)
3831 return {};
3832 SmallVector<int> Mask;
3833 inversePermutation(ReorderIndices, Mask);
3834 ::addMask(Mask, ReuseShuffleIndices);
3835 return Mask;
3836 }
3837
3838 /// \returns The mask for split nodes.
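 /// For example (illustrative), with 6 scalars split at offset 2 (so the
 /// second subnode holds 4 scalars), CommonVF = max(2, 4) = 4 and, for an
 /// identity ReorderIndices, the resulting mask is {0, 1, 4, 5, 6, 7}: the
 /// tail elements are taken from the second subvector placed at offset
 /// CommonVF.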
3839 SmallVector<int> getSplitMask() const {
3840 assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
3841 "Expected only split vectorize node.");
3842 SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem);
3843 unsigned CommonVF = std::max<unsigned>(
3844 CombinedEntriesWithIndices.back().second,
3845 Scalars.size() - CombinedEntriesWithIndices.back().second);
3846 for (auto [Idx, I] : enumerate(ReorderIndices))
3847 Mask[I] =
3848 Idx + (Idx >= CombinedEntriesWithIndices.back().second
3849 ? CommonVF - CombinedEntriesWithIndices.back().second
3850 : 0);
3851 return Mask;
3852 }
3853
3854 /// Updates (reorders) SplitVectorize node according to the given mask \p
3855 /// Mask and order \p MaskOrder.
3856 void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
3857 ArrayRef<int> MaskOrder);
3858
3859 /// \returns true if the scalars in VL are equal to this entry.
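 /// For example (illustrative), if Scalars = {A, B} and
 /// ReuseShuffleIndices = {0, 1, 0, 1}, then
 /// \code
 ///   isSame({A, B, A, B}); // true: every lane maps onto the reused scalars.
 /// \endcode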
3860 bool isSame(ArrayRef<Value *> VL) const {
3861 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3862 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3863 return std::equal(VL.begin(), VL.end(), Scalars.begin());
3864 return VL.size() == Mask.size() &&
3865 std::equal(VL.begin(), VL.end(), Mask.begin(),
3866 [Scalars](Value *V, int Idx) {
3867 return (isa<UndefValue>(V) &&
3868 Idx == PoisonMaskElem) ||
3869 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3870 });
3871 };
3872 if (!ReorderIndices.empty()) {
3873 // TODO: implement matching if the nodes are just reordered, still can
3874 // treat the vector as the same if the list of scalars matches VL
3875 // directly, without reordering.
3876 SmallVector<int> Mask;
3877 inversePermutation(ReorderIndices, Mask);
3878 if (VL.size() == Scalars.size())
3879 return IsSame(Scalars, Mask);
3880 if (VL.size() == ReuseShuffleIndices.size()) {
3881 ::addMask(Mask, ReuseShuffleIndices);
3882 return IsSame(Scalars, Mask);
3883 }
3884 return false;
3885 }
3886 return IsSame(Scalars, ReuseShuffleIndices);
3887 }
3888
3889 /// \returns true if current entry has same operands as \p TE.
3890 bool hasEqualOperands(const TreeEntry &TE) const {
3891 if (TE.getNumOperands() != getNumOperands())
3892 return false;
3893 SmallBitVector Used(getNumOperands());
3894 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3895 unsigned PrevCount = Used.count();
3896 for (unsigned K = 0; K < E; ++K) {
3897 if (Used.test(K))
3898 continue;
3899 if (getOperand(K) == TE.getOperand(I)) {
3900 Used.set(K);
3901 break;
3902 }
3903 }
3904 // Check if we actually found the matching operand.
3905 if (PrevCount == Used.count())
3906 return false;
3907 }
3908 return true;
3909 }
3910
3911 /// \return Final vectorization factor for the node. Defined by the total
 3912 /// number of vectorized scalars, including those used several times in the
 3913 /// entry and counted in the \a ReuseShuffleIndices, if any.
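 /// For example (illustrative), a node with Scalars = {A, B} and
 /// ReuseShuffleIndices = {0, 1, 0, 1} has a vectorization factor of 4.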
3914 unsigned getVectorFactor() const {
3915 if (!ReuseShuffleIndices.empty())
3916 return ReuseShuffleIndices.size();
3917 return Scalars.size();
3918 };
3919
3920 /// Checks if the current node is a gather node.
3921 bool isGather() const { return State == NeedToGather; }
3922
3923 /// A vector of scalars.
3924 ValueList Scalars;
3925
3926 /// The Scalars are vectorized into this value. It is initialized to Null.
3927 WeakTrackingVH VectorizedValue = nullptr;
3928
3929 /// Do we need to gather this sequence or vectorize it
3930 /// (either with vector instruction or with scatter/gather
3931 /// intrinsics for store/load)?
3932 enum EntryState {
3933 Vectorize, ///< The node is regularly vectorized.
3934 ScatterVectorize, ///< Masked scatter/gather node.
3935 StridedVectorize, ///< Strided loads (and stores)
3936 CompressVectorize, ///< (Masked) load with compress.
3937 NeedToGather, ///< Gather/buildvector node.
3938 CombinedVectorize, ///< Vectorized node, combined with its user into more
3939 ///< complex node like select/cmp to minmax, mul/add to
3940 ///< fma, etc. Must be used for the following nodes in
3941 ///< the pattern, not the very first one.
3942 SplitVectorize, ///< Splits the node into 2 subnodes, vectorizes them
3943 ///< independently and then combines back.
3944 };
3945 EntryState State;
3946
3947 /// List of combined opcodes supported by the vectorizer.
3948 enum CombinedOpcode {
3949 NotCombinedOp = -1,
3950 MinMax = Instruction::OtherOpsEnd + 1,
3951 FMulAdd,
3952 };
3953 CombinedOpcode CombinedOp = NotCombinedOp;
3954
3955 /// Does this sequence require some shuffling?
3956 SmallVector<int, 4> ReuseShuffleIndices;
3957
3958 /// Does this entry require reordering?
3959 SmallVector<unsigned, 4> ReorderIndices;
3960
3961 /// Points back to the VectorizableTree.
3962 ///
3963 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
3964 /// to be a pointer and needs to be able to initialize the child iterator.
3965 /// Thus we need a reference back to the container to translate the indices
3966 /// to entries.
3967 VecTreeTy &Container;
3968
3969 /// The TreeEntry index containing the user of this entry.
3970 EdgeInfo UserTreeIndex;
3971
3972 /// The index of this treeEntry in VectorizableTree.
3973 unsigned Idx = 0;
3974
3975 /// For gather/buildvector/alt opcode nodes, which are combined from
3976 /// other nodes as a series of insertvector instructions.
3977 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
3978
3979 private:
3980 /// The operands of each instruction in each lane Operands[op_index][lane].
3981 /// Note: This helps avoid the replication of the code that performs the
 3982 /// reordering of operands during buildTreeRec() and vectorizeTree().
 3983 SmallVector<ValueList, 2> Operands;
3984
3985 /// Copyable elements of the entry node.
3986 SmallPtrSet<const Value *, 4> CopyableElements;
3987
3988 /// MainOp and AltOp are recorded inside. S should be obtained from
3989 /// newTreeEntry.
3990 InstructionsState S = InstructionsState::invalid();
3991
3992 /// Interleaving factor for interleaved loads Vectorize nodes.
3993 unsigned InterleaveFactor = 0;
3994
3995 /// True if the node does not require scheduling.
3996 bool DoesNotNeedToSchedule = false;
3997
3998 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
3999 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
4000 if (Operands.size() < OpIdx + 1)
4001 Operands.resize(OpIdx + 1);
4002 assert(Operands[OpIdx].empty() && "Already resized?");
4003 assert(OpVL.size() <= Scalars.size() &&
4004 "Number of operands is greater than the number of scalars.");
4005 Operands[OpIdx].resize(OpVL.size());
4006 copy(OpVL, Operands[OpIdx].begin());
4007 }
4008
4009 public:
4010 /// Returns interleave factor for interleave nodes.
4011 unsigned getInterleaveFactor() const { return InterleaveFactor; }
4012 /// Sets interleaving factor for the interleaving nodes.
4013 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
4014
4015 /// Marks the node as one that does not require scheduling.
4016 void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
4017 /// Returns true if the node is marked as one that does not require
4018 /// scheduling.
4019 bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
4020
4021 /// Set this bundle's operands from \p Operands.
4022 void setOperands(ArrayRef<ValueList> Operands) {
4023 for (unsigned I : seq<unsigned>(Operands.size()))
4024 setOperand(I, Operands[I]);
4025 }
4026
4027 /// Reorders operands of the node to the given mask \p Mask.
4028 void reorderOperands(ArrayRef<int> Mask) {
4029 for (ValueList &Operand : Operands)
4030 reorderScalars(Operand, Mask);
4031 }
4032
4033 /// \returns the \p OpIdx operand of this TreeEntry.
4034 ValueList &getOperand(unsigned OpIdx) {
4035 assert(OpIdx < Operands.size() && "Off bounds");
4036 return Operands[OpIdx];
4037 }
4038
4039 /// \returns the \p OpIdx operand of this TreeEntry.
4040 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
4041 assert(OpIdx < Operands.size() && "Off bounds");
4042 return Operands[OpIdx];
4043 }
4044
4045 /// \returns the number of operands.
4046 unsigned getNumOperands() const { return Operands.size(); }
4047
4048 /// \return the single \p OpIdx operand.
4049 Value *getSingleOperand(unsigned OpIdx) const {
4050 assert(OpIdx < Operands.size() && "Off bounds");
4051 assert(!Operands[OpIdx].empty() && "No operand available");
4052 return Operands[OpIdx][0];
4053 }
4054
4055 /// Some of the instructions in the list have alternate opcodes.
4056 bool isAltShuffle() const { return S.isAltShuffle(); }
4057
4058 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
4059 return S.getMatchingMainOpOrAltOp(I);
4060 }
4061
 4062 /// Chooses the correct key for scheduling data. If \p Op has the same (or
 4063 /// alternate) opcode as the main operation, the key is \p Op. Otherwise the
 4064 /// key is the main operation of the bundle.
4065 Value *isOneOf(Value *Op) const {
4066 auto *I = dyn_cast<Instruction>(Op);
4067 if (I && getMatchingMainOpOrAltOp(I))
4068 return Op;
4069 return S.getMainOp();
4070 }
4071
4072 void setOperations(const InstructionsState &S) {
4073 assert(S && "InstructionsState is invalid.");
4074 this->S = S;
4075 }
4076
4077 Instruction *getMainOp() const { return S.getMainOp(); }
4078
4079 Instruction *getAltOp() const { return S.getAltOp(); }
4080
4081 /// The main/alternate opcodes for the list of instructions.
4082 unsigned getOpcode() const { return S.getOpcode(); }
4083
4084 unsigned getAltOpcode() const { return S.getAltOpcode(); }
4085
4086 bool hasState() const { return S.valid(); }
4087
4088 /// Add \p V to the list of copyable elements.
4089 void addCopyableElement(Value *V) {
4090 assert(S.isCopyableElement(V) && "Not a copyable element.");
4091 CopyableElements.insert(V);
4092 }
4093
4094 /// Returns true if \p V is a copyable element.
4095 bool isCopyableElement(Value *V) const {
4096 return CopyableElements.contains(V);
4097 }
4098
4099 /// Returns true if any scalar in the list is a copyable element.
4100 bool hasCopyableElements() const { return !CopyableElements.empty(); }
4101
4102 /// Returns the state of the operations.
4103 const InstructionsState &getOperations() const { return S; }
4104
 4105 /// When ReuseShuffleIndices is empty, it just returns the position of \p V
 4106 /// within the vector of Scalars. Otherwise, it tries to remap via the reuse index.
4107 unsigned findLaneForValue(Value *V) const {
4108 unsigned FoundLane = getVectorFactor();
4109 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
4110 std::advance(It, 1)) {
4111 if (*It != V)
4112 continue;
4113 FoundLane = std::distance(Scalars.begin(), It);
4114 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4115 if (!ReorderIndices.empty())
4116 FoundLane = ReorderIndices[FoundLane];
4117 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4118 if (ReuseShuffleIndices.empty())
4119 break;
4120 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
4121 RIt != ReuseShuffleIndices.end()) {
4122 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
4123 break;
4124 }
4125 }
4126 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
4127 return FoundLane;
4128 }
4129
4130 /// Build a shuffle mask for graph entry which represents a merge of main
4131 /// and alternate operations.
4132 void
4133 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
4134 SmallVectorImpl<int> &Mask,
4135 SmallVectorImpl<Value *> *OpScalars = nullptr,
4136 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
4137
4138 /// Return true if this is a non-power-of-2 node.
4139 bool isNonPowOf2Vec() const {
4140 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
4141 return IsNonPowerOf2;
4142 }
4143
4144 /// Return true if this is a node, which tries to vectorize number of
4145 /// elements, forming whole vectors.
4146 bool
4147 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
4148 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
4149 TTI, getValueType(Scalars.front()), Scalars.size());
4150 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
4151 "Reshuffling not supported with non-power-of-2 vectors yet.");
4152 return IsNonPowerOf2;
4153 }
4154
4155 Value *getOrdered(unsigned Idx) const {
4156 assert(isGather() && "Must be used only for buildvectors/gathers.");
4157 if (ReorderIndices.empty())
4158 return Scalars[Idx];
4159 SmallVector<int> Mask;
4160 inversePermutation(ReorderIndices, Mask);
4161 return Scalars[Mask[Idx]];
4162 }
4163
4164#ifndef NDEBUG
4165 /// Debug printer.
4166 LLVM_DUMP_METHOD void dump() const {
4167 dbgs() << Idx << ".\n";
4168 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
4169 dbgs() << "Operand " << OpI << ":\n";
4170 for (const Value *V : Operands[OpI])
4171 dbgs().indent(2) << *V << "\n";
4172 }
4173 dbgs() << "Scalars: \n";
4174 for (Value *V : Scalars)
4175 dbgs().indent(2) << *V << "\n";
4176 dbgs() << "State: ";
4177 if (S && hasCopyableElements())
4178 dbgs() << "[[Copyable]] ";
4179 switch (State) {
4180 case Vectorize:
4181 if (InterleaveFactor > 0) {
4182 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
4183 << "\n";
4184 } else {
4185 dbgs() << "Vectorize\n";
4186 }
4187 break;
4188 case ScatterVectorize:
4189 dbgs() << "ScatterVectorize\n";
4190 break;
4191 case StridedVectorize:
4192 dbgs() << "StridedVectorize\n";
4193 break;
4194 case CompressVectorize:
4195 dbgs() << "CompressVectorize\n";
4196 break;
4197 case NeedToGather:
4198 dbgs() << "NeedToGather\n";
4199 break;
4200 case CombinedVectorize:
4201 dbgs() << "CombinedVectorize\n";
4202 break;
4203 case SplitVectorize:
4204 dbgs() << "SplitVectorize\n";
4205 break;
4206 }
4207 if (S) {
4208 dbgs() << "MainOp: " << *S.getMainOp() << "\n";
4209 dbgs() << "AltOp: " << *S.getAltOp() << "\n";
4210 } else {
4211 dbgs() << "MainOp: NULL\n";
4212 dbgs() << "AltOp: NULL\n";
4213 }
4214 dbgs() << "VectorizedValue: ";
4215 if (VectorizedValue)
4216 dbgs() << *VectorizedValue << "\n";
4217 else
4218 dbgs() << "NULL\n";
4219 dbgs() << "ReuseShuffleIndices: ";
4220 if (ReuseShuffleIndices.empty())
4221 dbgs() << "Empty";
4222 else
4223 for (int ReuseIdx : ReuseShuffleIndices)
4224 dbgs() << ReuseIdx << ", ";
4225 dbgs() << "\n";
4226 dbgs() << "ReorderIndices: ";
4227 for (unsigned ReorderIdx : ReorderIndices)
4228 dbgs() << ReorderIdx << ", ";
4229 dbgs() << "\n";
4230 dbgs() << "UserTreeIndex: ";
4231 if (UserTreeIndex)
4232 dbgs() << UserTreeIndex;
4233 else
4234 dbgs() << "<invalid>";
4235 dbgs() << "\n";
4236 if (!CombinedEntriesWithIndices.empty()) {
4237 dbgs() << "Combined entries: ";
4238 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
4239 dbgs() << "Entry index " << P.first << " with offset " << P.second;
4240 });
4241 dbgs() << "\n";
4242 }
4243 }
4244#endif
4245 };
4246
4247#ifndef NDEBUG
4248 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
4249 InstructionCost VecCost, InstructionCost ScalarCost,
4250 StringRef Banner) const {
4251 dbgs() << "SLP: " << Banner << ":\n";
4252 E->dump();
4253 dbgs() << "SLP: Costs:\n";
4254 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
4255 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
4256 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
4257 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
4258 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
4259 }
4260#endif
4261
4262 /// Create a new gather TreeEntry
4263 TreeEntry *newGatherTreeEntry(ArrayRef<Value *> VL,
4264 const InstructionsState &S,
4265 const EdgeInfo &UserTreeIdx,
4266 ArrayRef<int> ReuseShuffleIndices = {}) {
4267 auto Invalid = ScheduleBundle::invalid();
4268 return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4269 }
4270
4271 /// Create a new VectorizableTree entry.
4272 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
4273 const InstructionsState &S,
4274 const EdgeInfo &UserTreeIdx,
4275 ArrayRef<int> ReuseShuffleIndices = {},
4276 ArrayRef<unsigned> ReorderIndices = {},
4277 unsigned InterleaveFactor = 0) {
4278 TreeEntry::EntryState EntryState =
4279 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4280 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4281 ReuseShuffleIndices, ReorderIndices);
4282 if (E && InterleaveFactor > 0)
4283 E->setInterleave(InterleaveFactor);
4284 return E;
4285 }
4286
4287 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
4288 TreeEntry::EntryState EntryState,
4289 ScheduleBundle &Bundle, const InstructionsState &S,
4290 const EdgeInfo &UserTreeIdx,
4291 ArrayRef<int> ReuseShuffleIndices = {},
4292 ArrayRef<unsigned> ReorderIndices = {}) {
4293 assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
4294 EntryState == TreeEntry::SplitVectorize)) ||
4295 (Bundle && EntryState != TreeEntry::NeedToGather &&
4296 EntryState != TreeEntry::SplitVectorize)) &&
4297 "Need to vectorize gather entry?");
4298 // Gathered loads still gathered? Do not create entry, use the original one.
4299 if (GatheredLoadsEntriesFirst.has_value() &&
4300 EntryState == TreeEntry::NeedToGather && S &&
4301 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
4302 !UserTreeIdx.UserTE)
4303 return nullptr;
4304 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
4305 TreeEntry *Last = VectorizableTree.back().get();
4306 Last->Idx = VectorizableTree.size() - 1;
4307 Last->State = EntryState;
4308 if (UserTreeIdx.UserTE)
4309 OperandsToTreeEntry.try_emplace(
4310 std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
4311 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
4312 // for non-power-of-two vectors.
4313 assert(
4314 (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
4315 ReuseShuffleIndices.empty()) &&
4316 "Reshuffling scalars not yet supported for nodes with padding");
4317 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
4318 ReuseShuffleIndices.end());
4319 if (ReorderIndices.empty()) {
4320 Last->Scalars.assign(VL.begin(), VL.end());
4321 if (S)
4322 Last->setOperations(S);
4323 } else {
4324 // Reorder scalars and build final mask.
4325 Last->Scalars.assign(VL.size(), nullptr);
4326 transform(ReorderIndices, Last->Scalars.begin(),
4327 [VL](unsigned Idx) -> Value * {
4328 if (Idx >= VL.size())
4329 return UndefValue::get(VL.front()->getType());
4330 return VL[Idx];
4331 });
4332 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
4333 if (S)
4334 Last->setOperations(S);
4335 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
4336 }
4337 if (EntryState == TreeEntry::SplitVectorize) {
4338 assert(S && "Split nodes must have operations.");
4339 Last->setOperations(S);
4340 SmallPtrSet<Value *, 4> Processed;
4341 for (Value *V : VL) {
4342 auto *I = dyn_cast<Instruction>(V);
4343 if (!I)
4344 continue;
4345 auto It = ScalarsInSplitNodes.find(V);
4346 if (It == ScalarsInSplitNodes.end()) {
4347 ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
4348 (void)Processed.insert(V);
4349 } else if (Processed.insert(V).second) {
4350 assert(!is_contained(It->getSecond(), Last) &&
4351 "Value already associated with the node.");
4352 It->getSecond().push_back(Last);
4353 }
4354 }
4355 } else if (!Last->isGather()) {
4356 if (isa<PHINode>(S.getMainOp()) ||
4357 isVectorLikeInstWithConstOps(S.getMainOp()) ||
4358 (!S.areInstructionsWithCopyableElements() &&
4359 doesNotNeedToSchedule(VL)) ||
4360 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
4361 Last->setDoesNotNeedToSchedule();
4362 SmallPtrSet<Value *, 4> Processed;
4363 for (Value *V : VL) {
4364 if (isa<PoisonValue>(V))
4365 continue;
4366 if (S.isCopyableElement(V)) {
4367 Last->addCopyableElement(V);
4368 continue;
4369 }
4370 auto It = ScalarToTreeEntries.find(V);
4371 if (It == ScalarToTreeEntries.end()) {
4372 ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
4373 (void)Processed.insert(V);
4374 } else if (Processed.insert(V).second) {
4375 assert(!is_contained(It->getSecond(), Last) &&
4376 "Value already associated with the node.");
4377 It->getSecond().push_back(Last);
4378 }
4379 }
4380 // Update the scheduler bundle to point to this TreeEntry.
4381 assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
4382 "Bundle and VL out of sync");
4383 if (!Bundle.getBundle().empty()) {
4384#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
4385 auto *BundleMember = Bundle.getBundle().begin();
4386 SmallPtrSet<Value *, 4> Processed;
4387 for (Value *V : VL) {
4388 if (S.isNonSchedulable(V) || !Processed.insert(V).second)
4389 continue;
4390 ++BundleMember;
4391 }
4392 assert(BundleMember == Bundle.getBundle().end() &&
4393 "Bundle and VL out of sync");
4394#endif
4395 Bundle.setTreeEntry(Last);
4396 }
4397 } else {
4398 // Build a map for gathered scalars to the nodes where they are used.
4399 bool AllConstsOrCasts = true;
4400 for (Value *V : VL) {
4401 if (S && S.areInstructionsWithCopyableElements() &&
4402 S.isCopyableElement(V))
4403 Last->addCopyableElement(V);
4404 if (!isConstant(V)) {
4405 auto *I = dyn_cast<CastInst>(V);
4406 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
4407 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
4408 !UserTreeIdx.UserTE->isGather())
4409 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
4410 }
4411 }
4412 if (AllConstsOrCasts)
4413 CastMaxMinBWSizes =
4414 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
4415 MustGather.insert_range(VL);
4416 }
4417
4418 if (UserTreeIdx.UserTE)
4419 Last->UserTreeIndex = UserTreeIdx;
4420 return Last;
4421 }
4422
4423 /// -- Vectorization State --
4424 /// Holds all of the tree entries.
4425 TreeEntry::VecTreeTy VectorizableTree;
4426
4427#ifndef NDEBUG
4428 /// Debug printer.
4429 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
4430 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4431 VectorizableTree[Id]->dump();
4432 dbgs() << "\n";
4433 }
4434 }
4435#endif
4436
4437 /// Get list of vector entries, associated with the value \p V.
4438 ArrayRef<TreeEntry *> getTreeEntries(Value *V) const {
4439 assert(V && "V cannot be nullptr.");
4440 auto It = ScalarToTreeEntries.find(V);
4441 if (It == ScalarToTreeEntries.end())
4442 return {};
4443 return It->getSecond();
4444 }
4445
4446 /// Get list of split vector entries, associated with the value \p V.
4447 ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
4448 assert(V && "V cannot be nullptr.");
4449 auto It = ScalarsInSplitNodes.find(V);
4450 if (It == ScalarsInSplitNodes.end())
4451 return {};
4452 return It->getSecond();
4453 }
4454
4455 /// Returns first vector node for value \p V, matching values \p VL.
4456 TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
4457 bool SameVF = false) const {
4458 assert(V && "V cannot be nullptr.");
4459 for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
4460 if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
4461 return TE;
4462 return nullptr;
4463 }
4464
4465  /// Check that the operand node of an alternate node does not generate a
4466  /// buildvector sequence. If it does, it is probably not worth building the
4467  /// alternate shuffle, when the number of buildvector operands plus the
4468  /// alternate instruction exceeds the number of buildvector instructions.
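  /// For example (illustrative only): for VL = {add, sub, add, sub} the
  /// alternate node costs one vector add, one vector sub and a blend shuffle;
  /// if several of its operands would themselves have to be gathered through
  /// buildvector sequences, keeping the scalars may be the cheaper option.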
4469 /// \param S the instructions state of the analyzed values.
4470 /// \param VL list of the instructions with alternate opcodes.
4471 bool areAltOperandsProfitable(const InstructionsState &S,
4472 ArrayRef<Value *> VL) const;
4473
4474 /// Contains all the outputs of legality analysis for a list of values to
4475 /// vectorize.
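  /// A rough usage sketch (illustrative only): callers first consult
  /// isLegal(); if it returns false, they may still retry after eliminating
  /// duplicate scalars when tryToFindDuplicates() is set, or attempt to split
  /// the bundle into smaller vectorizable parts when trySplitVectorize() is
  /// set.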
4476 class ScalarsVectorizationLegality {
4477 InstructionsState S;
4478 bool IsLegal;
4479 bool TryToFindDuplicates;
4480 bool TrySplitVectorize;
4481
4482 public:
4483 ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
4484 bool TryToFindDuplicates = true,
4485 bool TrySplitVectorize = false)
4486 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4487 TrySplitVectorize(TrySplitVectorize) {
4488 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4489 "Inconsistent state");
4490 }
4491 const InstructionsState &getInstructionsState() const { return S; };
4492 bool isLegal() const { return IsLegal; }
4493 bool tryToFindDuplicates() const { return TryToFindDuplicates; }
4494 bool trySplitVectorize() const { return TrySplitVectorize; }
4495 };
4496
4497 /// Checks if the specified list of the instructions/values can be vectorized
4498 /// in general.
4499 ScalarsVectorizationLegality
4500 getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
4501 const EdgeInfo &UserTreeIdx,
4502 bool TryCopyableElementsVectorization) const;
4503
4504 /// Checks if the specified list of the instructions/values can be vectorized
4505 /// and fills required data before actual scheduling of the instructions.
4506 TreeEntry::EntryState getScalarsVectorizationState(
4507 const InstructionsState &S, ArrayRef<Value *> VL,
4508 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
4509 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
4510
4511 /// Maps a specific scalar to its tree entry(ies).
4512 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
4513
4514 /// Maps the operand index and entry to the corresponding tree entry.
4515 SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
4516 OperandsToTreeEntry;
4517
4518 /// Scalars, used in split vectorize nodes.
4519 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
4520
4521 /// Maps a value to the proposed vectorizable size.
4522 SmallDenseMap<Value *, unsigned> InstrElementSize;
4523
4524 /// A list of scalars that we found that we need to keep as scalars.
4525 ValueSet MustGather;
4526
4527 /// A set of first non-schedulable values.
4528 ValueSet NonScheduledFirst;
4529
4530  /// A map between the vectorized entries and the last instructions in their
4531  /// bundles. The bundles are built in use order, not in the def order of the
4532  /// instructions, so we cannot rely on the last instruction in the bundle
4533  /// being the last instruction in program order during the vectorization
4534  /// process. Since the basic blocks are modified in the process, these
4535  /// instructions need to be pre-gathered beforehand.
4536 SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
4537
4538  /// List of gather nodes that depend on other gather/vector nodes, which
4539  /// should be emitted after the vector instruction emission process to
4540  /// correctly handle the order of the vector instructions and shuffles.
4541 SetVector<const TreeEntry *> PostponedGathers;
4542
4543 using ValueToGatherNodesMap =
4544 DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
4545 ValueToGatherNodesMap ValueToGatherNodes;
4546
4547  /// A list of load entries (node indices) that can be vectorized using a
4548  /// strided or masked gather approach, but which we first attempt to
4549  /// represent as contiguous loads.
4550 SetVector<unsigned> LoadEntriesToVectorize;
4551
4552 /// true if graph nodes transforming mode is on.
4553 bool IsGraphTransformMode = false;
4554
4555 /// The index of the first gathered load entry in the VectorizeTree.
4556 std::optional<unsigned> GatheredLoadsEntriesFirst;
4557
4558 /// Maps compress entries to their mask data for the final codegen.
4559 SmallDenseMap<const TreeEntry *,
4560 std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
4561 CompressEntryToData;
4562
4563 /// This POD struct describes one external user in the vectorized tree.
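  /// For illustration: if the scalar in lane 2 of a vectorized node is also
  /// used by an instruction that is not part of the tree, an ExternalUser
  /// record (Scalar, User, Entry, /*Lane=*/2) is created so that an
  /// extractelement from lane 2 can later be emitted for that user.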
4564 struct ExternalUser {
4565 ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
4566 : Scalar(S), User(U), E(E), Lane(L) {}
4567
4568 /// Which scalar in our function.
4569 Value *Scalar = nullptr;
4570
4571    /// The user that uses the scalar.
4572 llvm::User *User = nullptr;
4573
4574    /// Vector node that the value is part of.
4575 const TreeEntry &E;
4576
4577 /// Which lane does the scalar belong to.
4578 unsigned Lane;
4579 };
4580 using UserList = SmallVector<ExternalUser, 16>;
4581
4582 /// Checks if two instructions may access the same memory.
4583 ///
4584 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
4585 /// is invariant in the calling loop.
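  /// For illustration (a hedged example): isAliased(MemoryLocation::get(SI),
  /// SI, LI) asks whether the load LI may access the memory written by the
  /// store SI; the result is cached in AliasCache under the (Inst1, Inst2)
  /// key.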
4586 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
4587 Instruction *Inst2) {
4588 assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction.");
4589 // First check if the result is already in the cache.
4590 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
4591 auto Res = AliasCache.try_emplace(Key);
4592 if (!Res.second)
4593 return Res.first->second;
4594 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
4595 // Store the result in the cache.
4596 Res.first->getSecond() = Aliased;
4597 return Aliased;
4598 }
4599
4600 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4601
4602 /// Cache for alias results.
4603 /// TODO: consider moving this to the AliasAnalysis itself.
4604 SmallDenseMap<AliasCacheKey, bool> AliasCache;
4605
4606 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
4607 // globally through SLP because we don't perform any action which
4608 // invalidates capture results.
4609 BatchAAResults BatchAA;
4610
4611 /// Temporary store for deleted instructions. Instructions will be deleted
4612 /// eventually when the BoUpSLP is destructed. The deferral is required to
4613 /// ensure that there are no incorrect collisions in the AliasCache, which
4614 /// can happen if a new instruction is allocated at the same address as a
4615 /// previously deleted instruction.
4616 DenseSet<Instruction *> DeletedInstructions;
4617
4618  /// Set of the instructions already analyzed for reductions.
4619 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4620
4621 /// Set of hashes for the list of reduction values already being analyzed.
4622 DenseSet<size_t> AnalyzedReductionVals;
4623
4624  /// Values that have already been analyzed for minimal bitwidth and found to
4625  /// be non-profitable.
4626 DenseSet<Value *> AnalyzedMinBWVals;
4627
4628  /// A list of values that need to be extracted out of the tree.
4629 /// This list holds pairs of (Internal Scalar : External User). External User
4630 /// can be nullptr, it means that this Internal Scalar will be used later,
4631 /// after vectorization.
4632 UserList ExternalUses;
4633
4634  /// A list of GEPs which can be replaced by scalar GEPs instead of
4635 /// extractelement instructions.
4636 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4637
4638  /// A list of scalars to be extracted without a specific user because of too many
4639 /// uses.
4640 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
4641
4642 /// Values used only by @llvm.assume calls.
4643 SmallPtrSet<const Value *, 32> EphValues;
4644
4645 /// Holds all of the instructions that we gathered, shuffle instructions and
4646 /// extractelements.
4647 SetVector<Instruction *> GatherShuffleExtractSeq;
4648
4649 /// A list of blocks that we are going to CSE.
4650 DenseSet<BasicBlock *> CSEBlocks;
4651
4652  /// List of hashes of vectors of loads which are known to be non-vectorizable.
4653 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4654
4655  /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData
4656  /// or ScheduleBundle. ScheduleData is used to gather dependencies for a
4657  /// single instruction, while ScheduleBundle represents a batch of
4658  /// instructions that are going to be grouped together. ScheduleCopyableData
4659  /// models an extra user for "copyable" instructions.
4660 class ScheduleEntity {
4661 friend class ScheduleBundle;
4662 friend class ScheduleData;
4663 friend class ScheduleCopyableData;
4664
4665 protected:
4666 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
4667 Kind getKind() const { return K; }
4668 ScheduleEntity(Kind K) : K(K) {}
4669
4670 private:
4671 /// Used for getting a "good" final ordering of instructions.
4672 int SchedulingPriority = 0;
4673 /// True if this instruction (or bundle) is scheduled (or considered as
4674 /// scheduled in the dry-run).
4675 bool IsScheduled = false;
4676 /// The kind of the ScheduleEntity.
4677 const Kind K = Kind::ScheduleData;
4678
4679 public:
4680 ScheduleEntity() = delete;
4681 /// Gets/sets the scheduling priority.
4682 void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
4683 int getSchedulingPriority() const { return SchedulingPriority; }
4684 bool isReady() const {
4685 if (const auto *SD = dyn_cast<ScheduleData>(this))
4686 return SD->isReady();
4687 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4688 return CD->isReady();
4689 return cast<ScheduleBundle>(this)->isReady();
4690 }
4691 /// Returns true if the dependency information has been calculated.
4692    /// Note that dependency validity can vary between instructions within
4693 /// a single bundle.
4694 bool hasValidDependencies() const {
4695 if (const auto *SD = dyn_cast<ScheduleData>(this))
4696 return SD->hasValidDependencies();
4697 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4698 return CD->hasValidDependencies();
4699 return cast<ScheduleBundle>(this)->hasValidDependencies();
4700 }
4701 /// Gets the number of unscheduled dependencies.
4702 int getUnscheduledDeps() const {
4703 if (const auto *SD = dyn_cast<ScheduleData>(this))
4704 return SD->getUnscheduledDeps();
4705 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4706 return CD->getUnscheduledDeps();
4707 return cast<ScheduleBundle>(this)->unscheduledDepsInBundle();
4708 }
4709 /// Increments the number of unscheduled dependencies.
4710 int incrementUnscheduledDeps(int Incr) {
4711 if (auto *SD = dyn_cast<ScheduleData>(this))
4712 return SD->incrementUnscheduledDeps(Incr);
4713 return cast<ScheduleCopyableData>(this)->incrementUnscheduledDeps(Incr);
4714 }
4715 /// Gets the number of dependencies.
4716 int getDependencies() const {
4717 if (const auto *SD = dyn_cast<ScheduleData>(this))
4718 return SD->getDependencies();
4719 return cast<ScheduleCopyableData>(this)->getDependencies();
4720 }
4721 /// Gets the instruction.
4722 Instruction *getInst() const {
4723 if (const auto *SD = dyn_cast<ScheduleData>(this))
4724 return SD->getInst();
4725 return cast<ScheduleCopyableData>(this)->getInst();
4726 }
4727
4728 /// Gets/sets if the bundle is scheduled.
4729 bool isScheduled() const { return IsScheduled; }
4730 void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
4731
4732 static bool classof(const ScheduleEntity *) { return true; }
4733
4734#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4735 void dump(raw_ostream &OS) const {
4736 if (const auto *SD = dyn_cast<ScheduleData>(this))
4737 return SD->dump(OS);
4738 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4739 return CD->dump(OS);
4740 return cast<ScheduleBundle>(this)->dump(OS);
4741 }
4742
4743 LLVM_DUMP_METHOD void dump() const {
4744 dump(dbgs());
4745 dbgs() << '\n';
4746 }
4747#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4748 };
4749
4750#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4751  friend inline raw_ostream &operator<<(raw_ostream &OS,
4752                                        const BoUpSLP::ScheduleEntity &SE) {
4753 SE.dump(OS);
4754 return OS;
4755 }
4756#endif
4757
4758 /// Contains all scheduling relevant data for an instruction.
4759 /// A ScheduleData either represents a single instruction or a member of an
4760 /// instruction bundle (= a group of instructions which is combined into a
4761 /// vector instruction).
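  /// For example (illustrative only): an instruction with three in-region
  /// users and one dependent memory instruction starts with Dependencies =
  /// UnscheduledDeps = 4 once its dependencies are calculated; the counter is
  /// decremented as those dependent instructions are scheduled, and the
  /// instruction becomes ready when it reaches zero.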
4762 class ScheduleData final : public ScheduleEntity {
4763 public:
4764 // The initial value for the dependency counters. It means that the
4765 // dependencies are not calculated yet.
4766 enum { InvalidDeps = -1 };
4767
4768 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4769 static bool classof(const ScheduleEntity *Entity) {
4770 return Entity->getKind() == Kind::ScheduleData;
4771 }
4772
4773 void init(int BlockSchedulingRegionID, Instruction *I) {
4774 NextLoadStore = nullptr;
4775 IsScheduled = false;
4776 SchedulingRegionID = BlockSchedulingRegionID;
4777 clearDependencies();
4778 Inst = I;
4779 }
4780
4781 /// Verify basic self consistency properties
4782 void verify() {
4783 if (hasValidDependencies()) {
4784 assert(UnscheduledDeps <= Dependencies && "invariant");
4785 } else {
4786 assert(UnscheduledDeps == Dependencies && "invariant");
4787 }
4788
4789 if (IsScheduled) {
4790 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4791 "unexpected scheduled state");
4792 }
4793 }
4794
4795 /// Returns true if the dependency information has been calculated.
4796    /// Note that dependency validity can vary between instructions within
4797 /// a single bundle.
4798 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
4799
4800 /// Returns true if it is ready for scheduling, i.e. it has no more
4801 /// unscheduled depending instructions/bundles.
4802 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
4803
4804 /// Modifies the number of unscheduled dependencies for this instruction,
4805 /// and returns the number of remaining dependencies for the containing
4806 /// bundle.
4807 int incrementUnscheduledDeps(int Incr) {
4808 assert(hasValidDependencies() &&
4809 "increment of unscheduled deps would be meaningless");
4810 UnscheduledDeps += Incr;
4811 return UnscheduledDeps;
4812 }
4813
4814 /// Sets the number of unscheduled dependencies to the number of
4815 /// dependencies.
4816 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4817
4818 /// Clears all dependency information.
4819 void clearDependencies() {
4820 clearDirectDependencies();
4821 MemoryDependencies.clear();
4822 ControlDependencies.clear();
4823 }
4824
4825 /// Clears all direct dependencies only, except for control and memory
4826 /// dependencies.
4827 /// Required for copyable elements to correctly handle control/memory deps
4828    /// and avoid extra recalculation of such deps.
4829 void clearDirectDependencies() {
4830 Dependencies = InvalidDeps;
4831 resetUnscheduledDeps();
4832 IsScheduled = false;
4833 }
4834
4835 /// Gets the number of unscheduled dependencies.
4836 int getUnscheduledDeps() const { return UnscheduledDeps; }
4837 /// Gets the number of dependencies.
4838 int getDependencies() const { return Dependencies; }
4839 /// Initializes the number of dependencies.
4840 void initDependencies() { Dependencies = 0; }
4841 /// Increments the number of dependencies.
4842 void incDependencies() { Dependencies++; }
4843
4844 /// Gets scheduling region ID.
4845 int getSchedulingRegionID() const { return SchedulingRegionID; }
4846
4847 /// Gets the instruction.
4848 Instruction *getInst() const { return Inst; }
4849
4850 /// Gets the list of memory dependencies.
4851 ArrayRef<ScheduleData *> getMemoryDependencies() const {
4852 return MemoryDependencies;
4853 }
4854 /// Adds a memory dependency.
4855 void addMemoryDependency(ScheduleData *Dep) {
4856 MemoryDependencies.push_back(Dep);
4857 }
4858 /// Gets the list of control dependencies.
4859 ArrayRef<ScheduleData *> getControlDependencies() const {
4860 return ControlDependencies;
4861 }
4862 /// Adds a control dependency.
4863 void addControlDependency(ScheduleData *Dep) {
4864 ControlDependencies.push_back(Dep);
4865 }
4866 /// Gets/sets the next load/store instruction in the block.
4867 ScheduleData *getNextLoadStore() const { return NextLoadStore; }
4868 void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }
4869
4870 void dump(raw_ostream &OS) const { OS << *Inst; }
4871
4872 LLVM_DUMP_METHOD void dump() const {
4873 dump(dbgs());
4874 dbgs() << '\n';
4875 }
4876
4877 private:
4878 Instruction *Inst = nullptr;
4879
4880 /// Single linked list of all memory instructions (e.g. load, store, call)
4881 /// in the block - until the end of the scheduling region.
4882 ScheduleData *NextLoadStore = nullptr;
4883
4884 /// The dependent memory instructions.
4885 /// This list is derived on demand in calculateDependencies().
4886 SmallVector<ScheduleData *> MemoryDependencies;
4887
4888 /// List of instructions which this instruction could be control dependent
4889 /// on. Allowing such nodes to be scheduled below this one could introduce
4890 /// a runtime fault which didn't exist in the original program.
4891    /// e.g., this is a load or udiv following a readonly call which infinitely loops
4892 SmallVector<ScheduleData *> ControlDependencies;
4893
4894 /// This ScheduleData is in the current scheduling region if this matches
4895 /// the current SchedulingRegionID of BlockScheduling.
4896 int SchedulingRegionID = 0;
4897
4898    /// The number of dependencies. Consists of the number of users of the
4899    /// instruction plus the number of dependent memory instructions (if any).
4900 /// This value is calculated on demand.
4901 /// If InvalidDeps, the number of dependencies is not calculated yet.
4902 int Dependencies = InvalidDeps;
4903
4904 /// The number of dependencies minus the number of dependencies of scheduled
4905 /// instructions. As soon as this is zero, the instruction/bundle gets ready
4906 /// for scheduling.
4907 /// Note that this is negative as long as Dependencies is not calculated.
4908 int UnscheduledDeps = InvalidDeps;
4909 };
4910
4911#ifndef NDEBUG
4912  friend inline raw_ostream &operator<<(raw_ostream &OS,
4913                                        const BoUpSLP::ScheduleData &SD) {
4914 SD.dump(OS);
4915 return OS;
4916 }
4917#endif
4918
4919 class ScheduleBundle final : public ScheduleEntity {
4920 /// The schedule data for the instructions in the bundle.
4921    SmallVector<ScheduleEntity *> Bundle;
4922    /// True if this bundle is valid.
4923 bool IsValid = true;
4924 /// The TreeEntry that this instruction corresponds to.
4925 TreeEntry *TE = nullptr;
4926 ScheduleBundle(bool IsValid)
4927 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
4928
4929 public:
4930 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
4931 static bool classof(const ScheduleEntity *Entity) {
4932 return Entity->getKind() == Kind::ScheduleBundle;
4933 }
4934
4935 /// Verify basic self consistency properties
4936 void verify() const {
4937 for (const ScheduleEntity *SD : Bundle) {
4938 if (SD->hasValidDependencies()) {
4939 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
4940 "invariant");
4941 } else {
4942 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
4943 "invariant");
4944 }
4945
4946 if (isScheduled()) {
4947 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
4948 "unexpected scheduled state");
4949 }
4950 }
4951 }
4952
4953 /// Returns the number of unscheduled dependencies in the bundle.
4954 int unscheduledDepsInBundle() const {
4955 assert(*this && "bundle must not be empty");
4956 int Sum = 0;
4957 for (const ScheduleEntity *BundleMember : Bundle) {
4958 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
4959 return ScheduleData::InvalidDeps;
4960 Sum += BundleMember->getUnscheduledDeps();
4961 }
4962 return Sum;
4963 }
4964
4965 /// Returns true if the dependency information has been calculated.
4966    /// Note that dependency validity can vary between instructions within
4967 /// a single bundle.
4968 bool hasValidDependencies() const {
4969 return all_of(Bundle, [](const ScheduleEntity *SD) {
4970 return SD->hasValidDependencies();
4971 });
4972 }
4973
4974 /// Returns true if it is ready for scheduling, i.e. it has no more
4975 /// unscheduled depending instructions/bundles.
4976 bool isReady() const {
4977 assert(*this && "bundle must not be empty");
4978 return unscheduledDepsInBundle() == 0 && !isScheduled();
4979 }
4980
4981 /// Returns the bundle of scheduling data, associated with the current
4982 /// instruction.
4983 ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
4984 ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }
4985 /// Adds an instruction to the bundle.
4986 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
4987
4988 /// Gets/sets the associated tree entry.
4989 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
4990 TreeEntry *getTreeEntry() const { return TE; }
4991
4992 static ScheduleBundle invalid() { return {false}; }
4993
4994 operator bool() const { return IsValid; }
4995
4996#ifndef NDEBUG
4997 void dump(raw_ostream &OS) const {
4998 if (!*this) {
4999 OS << "[]";
5000 return;
5001 }
5002 OS << '[';
5003 interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) {
5004        if (isa<ScheduleCopyableData>(SD))
5005          OS << "<Copyable>";
5006 OS << *SD->getInst();
5007 });
5008 OS << ']';
5009 }
5010
5011 LLVM_DUMP_METHOD void dump() const {
5012 dump(dbgs());
5013 dbgs() << '\n';
5014 }
5015#endif // NDEBUG
5016 };
5017
5018#ifndef NDEBUG
5019  friend inline raw_ostream &operator<<(raw_ostream &OS,
5020                                        const BoUpSLP::ScheduleBundle &Bundle) {
5021 Bundle.dump(OS);
5022 return OS;
5023 }
5024#endif
5025
5026 /// Contains all scheduling relevant data for the copyable instruction.
5027 /// It models the virtual instructions, supposed to replace the original
5028 /// instructions. E.g., if instruction %0 = load is a part of the bundle [%0,
5029 /// %1], where %1 = add, then the ScheduleCopyableData models virtual
5030 /// instruction %virt = add %0, 0.
5031 class ScheduleCopyableData final : public ScheduleEntity {
5032 /// The source schedule data for the instruction.
5033 Instruction *Inst = nullptr;
5034 /// The edge information for the instruction.
5035 const EdgeInfo EI;
5036 /// This ScheduleData is in the current scheduling region if this matches
5037 /// the current SchedulingRegionID of BlockScheduling.
5038 int SchedulingRegionID = 0;
5039 /// Bundle, this data is part of.
5040 ScheduleBundle &Bundle;
5041
5042 public:
5043 ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
5044 const EdgeInfo &EI, ScheduleBundle &Bundle)
5045 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
5046 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5047 static bool classof(const ScheduleEntity *Entity) {
5048 return Entity->getKind() == Kind::ScheduleCopyableData;
5049 }
5050
5051 /// Verify basic self consistency properties
5052 void verify() {
5053 if (hasValidDependencies()) {
5054 assert(UnscheduledDeps <= Dependencies && "invariant");
5055 } else {
5056 assert(UnscheduledDeps == Dependencies && "invariant");
5057 }
5058
5059 if (IsScheduled) {
5060 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5061 "unexpected scheduled state");
5062 }
5063 }
5064
5065 /// Returns true if the dependency information has been calculated.
5066    /// Note that dependency validity can vary between instructions within
5067 /// a single bundle.
5068 bool hasValidDependencies() const {
5069 return Dependencies != ScheduleData::InvalidDeps;
5070 }
5071
5072 /// Returns true if it is ready for scheduling, i.e. it has no more
5073 /// unscheduled depending instructions/bundles.
5074 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
5075
5076 /// Modifies the number of unscheduled dependencies for this instruction,
5077 /// and returns the number of remaining dependencies for the containing
5078 /// bundle.
5079 int incrementUnscheduledDeps(int Incr) {
5080 assert(hasValidDependencies() &&
5081 "increment of unscheduled deps would be meaningless");
5082 UnscheduledDeps += Incr;
5083 assert(UnscheduledDeps >= 0 && "invariant");
5084 return UnscheduledDeps;
5085 }
5086
5087 /// Sets the number of unscheduled dependencies to the number of
5088 /// dependencies.
5089 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5090
5091 /// Gets the number of unscheduled dependencies.
5092 int getUnscheduledDeps() const { return UnscheduledDeps; }
5093 /// Gets the number of dependencies.
5094 int getDependencies() const { return Dependencies; }
5095 /// Initializes the number of dependencies.
5096 void initDependencies() { Dependencies = 0; }
5097 /// Increments the number of dependencies.
5098 void incDependencies() { Dependencies++; }
5099
5100 /// Gets scheduling region ID.
5101 int getSchedulingRegionID() const { return SchedulingRegionID; }
5102
5103 /// Gets the instruction.
5104 Instruction *getInst() const { return Inst; }
5105
5106 /// Clears all dependency information.
5107 void clearDependencies() {
5108 Dependencies = ScheduleData::InvalidDeps;
5109 UnscheduledDeps = ScheduleData::InvalidDeps;
5110 IsScheduled = false;
5111 }
5112
5113 /// Gets the edge information.
5114 const EdgeInfo &getEdgeInfo() const { return EI; }
5115
5116 /// Gets the bundle.
5117 ScheduleBundle &getBundle() { return Bundle; }
5118 const ScheduleBundle &getBundle() const { return Bundle; }
5119
5120#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5121 void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }
5122
5123 LLVM_DUMP_METHOD void dump() const {
5124 dump(dbgs());
5125 dbgs() << '\n';
5126 }
5127#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5128
5129 private:
5130    /// The number of dependencies. These nodes always have only a single
5131    /// dependency.
5132 int Dependencies = ScheduleData::InvalidDeps;
5133
5134 /// The number of dependencies minus the number of dependencies of scheduled
5135 /// instructions. As soon as this is zero, the instruction/bundle gets ready
5136 /// for scheduling.
5137 /// Note that this is negative as long as Dependencies is not calculated.
5138 int UnscheduledDeps = ScheduleData::InvalidDeps;
5139 };
5140
5141#ifndef NDEBUG
5142 friend inline raw_ostream &
5143 operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) {
5144 SD.dump(OS);
5145 return OS;
5146 }
5147#endif
5148
5149 friend struct GraphTraits<BoUpSLP *>;
5150 friend struct DOTGraphTraits<BoUpSLP *>;
5151
5152 /// Contains all scheduling data for a basic block.
5153  /// It does not schedule instructions that are not memory read/write
5154  /// instructions and whose operands are either constants, or arguments, or
5155  /// phis, or instructions from other blocks, or whose users are phis or are
5156  /// from other blocks. The resulting vector instructions can be placed at the
5157  /// beginning of the basic block without scheduling (if their operands do not
5158  /// need to be scheduled) or at the end of the block (if their users are
5159  /// outside of the block). This allows saving some compile time and memory
5160  /// used by the compiler.
5161  /// ScheduleData is assigned to each instruction between the boundaries of
5162  /// the tree entry, even to those which are not part of the graph. It is
5163  /// required to correctly follow the dependencies between the instructions
5164  /// and to schedule them correctly. ScheduleData is not allocated for
5165  /// instructions which do not require scheduling, like phis, nodes with only
5166  /// extractelements/insertelements, or nodes whose instructions have
5167  /// uses/operands outside of the block.
5168 struct BlockScheduling {
5169 BlockScheduling(BasicBlock *BB)
5170 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
5171
5172 void clear() {
5173 ScheduledBundles.clear();
5174 ScheduledBundlesList.clear();
5175 ScheduleCopyableDataMap.clear();
5176 ScheduleCopyableDataMapByInst.clear();
5177 ScheduleCopyableDataMapByInstUser.clear();
5178 ScheduleCopyableDataMapByUsers.clear();
5179 ReadyInsts.clear();
5180 ScheduleStart = nullptr;
5181 ScheduleEnd = nullptr;
5182 FirstLoadStoreInRegion = nullptr;
5183 LastLoadStoreInRegion = nullptr;
5184 RegionHasStackSave = false;
5185
5186 // Reduce the maximum schedule region size by the size of the
5187 // previous scheduling run.
5188 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5189 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
5190 ScheduleRegionSizeLimit = MinScheduleRegionSize;
5191 ScheduleRegionSize = 0;
5192
5193 // Make a new scheduling region, i.e. all existing ScheduleData is not
5194 // in the new region yet.
5195 ++SchedulingRegionID;
5196 }
5197
5198 ScheduleData *getScheduleData(Instruction *I) {
5199 if (!I)
5200 return nullptr;
5201 if (BB != I->getParent())
5202 // Avoid lookup if can't possibly be in map.
5203 return nullptr;
5204 ScheduleData *SD = ScheduleDataMap.lookup(I);
5205 if (SD && isInSchedulingRegion(*SD))
5206 return SD;
5207 return nullptr;
5208 }
5209
5210 ScheduleData *getScheduleData(Value *V) {
5211 return getScheduleData(dyn_cast<Instruction>(V));
5212 }
5213
5214 /// Returns the ScheduleCopyableData for the given edge (user tree entry and
5215 /// operand number) and value.
5216 ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
5217 const Value *V) const {
5218 if (ScheduleCopyableDataMap.empty())
5219 return nullptr;
5220 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5221 if (It == ScheduleCopyableDataMap.end())
5222 return nullptr;
5223 ScheduleCopyableData *SD = It->getSecond().get();
5224 if (!isInSchedulingRegion(*SD))
5225 return nullptr;
5226 return SD;
5227 }
5228
5229 /// Returns the ScheduleCopyableData for the given user \p User, operand
5230 /// number and operand \p V.
5231    SmallVector<ScheduleCopyableData *>
5232    getScheduleCopyableData(const Value *User, unsigned OperandIdx,
5233 const Value *V) {
5234 if (ScheduleCopyableDataMapByInstUser.empty())
5235 return {};
5236 const auto It = ScheduleCopyableDataMapByInstUser.find(
5237 std::make_pair(std::make_pair(User, OperandIdx), V));
5238 if (It == ScheduleCopyableDataMapByInstUser.end())
5239 return {};
5240      SmallVector<ScheduleCopyableData *> Res;
5241      for (ScheduleCopyableData *SD : It->getSecond()) {
5242 if (isInSchedulingRegion(*SD))
5243 Res.push_back(SD);
5244 }
5245 return Res;
5246 }
5247
5248 /// Returns true if all operands of the given instruction \p User are
5249 /// replaced by copyable data.
5250 /// \param User The user instruction.
5251 /// \param Op The operand, which might be replaced by the copyable data.
5252 /// \param SLP The SLP tree.
5253 /// \param NumOps The number of operands used. If the instruction uses the
5254 /// same operand several times, check for the first use, then the second,
5255 /// etc.
5256 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5257 Instruction *Op, BoUpSLP &SLP,
5258 unsigned NumOps) const {
5259 assert(NumOps > 0 && "No operands");
5260 if (ScheduleCopyableDataMap.empty())
5261 return false;
5262 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5263 SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
5264 for (const Use &U : User->operands()) {
5265 if (U.get() != Op)
5266 continue;
5267 ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User);
5268 if (Entries.empty())
5269 return false;
5270 // Check all tree entries, if they have operands replaced by copyable
5271 // data.
5272 for (TreeEntry *TE : Entries) {
5273 // Check if the user is commutative.
5274          // The commutatives are handled later, as their operands can be
5275 // reordered.
5276 // Same applies even for non-commutative cmps, because we can invert
5277 // their predicate potentially and, thus, reorder the operands.
5278 bool IsCommutativeUser =
5279 ::isCommutative(User) ||
5280 ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User);
5281 EdgeInfo EI(TE, U.getOperandNo());
5282 if (!IsCommutativeUser && !isa<CmpInst>(User)) {
5283 unsigned &OpCnt =
5284 OrderedEntriesCount.try_emplace(TE, 0).first->getSecond();
5285 if (!getScheduleCopyableData(EI, Op) && OpCnt < NumOps)
5286 return false;
5287 // Found copyable operand - continue.
5288 ++OpCnt;
5289 continue;
5290 }
5291 ++PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
5292 .first->getSecond();
5293 }
5294 }
5295 // Check the commutative/cmp entries.
5296 if (!PotentiallyReorderedEntriesCount.empty()) {
5297 for (auto &P : PotentiallyReorderedEntriesCount) {
5298 auto *It = find(P.first->Scalars, User);
5299 assert(It != P.first->Scalars.end() &&
5300 "User is not in the tree entry");
5301 int Lane = std::distance(P.first->Scalars.begin(), It);
5302 assert(Lane >= 0 && "Lane is not found");
5303 if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty())
5304 Lane = P.first->ReorderIndices[Lane];
5305 assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
5306 "Couldn't find extract lane");
5307 SmallVector<unsigned> OpIndices;
5308 for (unsigned OpIdx :
5310 P.first->getMainOp()))) {
5311 if (P.first->getOperand(OpIdx)[Lane] == Op &&
5312 getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
5313 --P.getSecond();
5314 }
5315 }
5316 return all_of(PotentiallyReorderedEntriesCount,
5317 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5318 return P.second == NumOps - 1;
5319 });
5320 }
5321 return true;
5322 }
5323
5324    SmallVector<ScheduleCopyableData *>
5325    getScheduleCopyableData(const Instruction *I) const {
5326 if (ScheduleCopyableDataMapByInst.empty())
5327 return {};
5328 const auto It = ScheduleCopyableDataMapByInst.find(I);
5329 if (It == ScheduleCopyableDataMapByInst.end())
5330 return {};
5331      SmallVector<ScheduleCopyableData *> Res;
5332      for (ScheduleCopyableData *SD : It->getSecond()) {
5333 if (isInSchedulingRegion(*SD))
5334 Res.push_back(SD);
5335 }
5336 return Res;
5337 }
5338
5339    SmallVector<ScheduleCopyableData *>
5340    getScheduleCopyableDataUsers(const Instruction *User) const {
5341 if (ScheduleCopyableDataMapByUsers.empty())
5342 return {};
5343 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5344 if (It == ScheduleCopyableDataMapByUsers.end())
5345 return {};
5346      SmallVector<ScheduleCopyableData *> Res;
5347      for (ScheduleCopyableData *SD : It->getSecond()) {
5348 if (isInSchedulingRegion(*SD))
5349 Res.push_back(SD);
5350 }
5351 return Res;
5352 }
5353
5354 ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
5355 Instruction *I,
5356 int SchedulingRegionID,
5357 ScheduleBundle &Bundle) {
5358 assert(!getScheduleCopyableData(EI, I) && "already in the map");
5359 ScheduleCopyableData *CD =
5360 ScheduleCopyableDataMap
5361 .try_emplace(std::make_pair(EI, I),
5362 std::make_unique<ScheduleCopyableData>(
5363 SchedulingRegionID, I, EI, Bundle))
5364 .first->getSecond()
5365 .get();
5366 ScheduleCopyableDataMapByInst[I].push_back(CD);
5367 if (EI.UserTE) {
5368 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
5369 const auto *It = find(Op, I);
5370 assert(It != Op.end() && "Lane not set");
5371 SmallPtrSet<Instruction *, 4> Visited;
5372 do {
5373 int Lane = std::distance(Op.begin(), It);
5374 assert(Lane >= 0 && "Lane not set");
5375 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
5376 !EI.UserTE->ReorderIndices.empty())
5377 Lane = EI.UserTE->ReorderIndices[Lane];
5378 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
5379 "Couldn't find extract lane");
5380 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
5381 if (!Visited.insert(In).second) {
5382 It = find(make_range(std::next(It), Op.end()), I);
5383 continue;
5384 }
5385 ScheduleCopyableDataMapByInstUser
5386 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I))
5387 .first->getSecond()
5388 .push_back(CD);
5389 ScheduleCopyableDataMapByUsers.try_emplace(I)
5390 .first->getSecond()
5391 .insert(CD);
5392          // Remove extra deps for users which become non-immediate users of
5393          // the instruction. This may happen if a chain of the same copyable
5394          // elements appears in the tree.
5395 if (In == I) {
5396 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5397 if (ScheduleCopyableData *UserCD =
5398 getScheduleCopyableData(UserEI, In))
5399 ScheduleCopyableDataMapByUsers[I].remove(UserCD);
5400 }
5401 It = find(make_range(std::next(It), Op.end()), I);
5402 } while (It != Op.end());
5403 } else {
5404 ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert(
5405 CD);
5406 }
5407 return *CD;
5408 }
5409
5410 ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
5411 auto *I = dyn_cast<Instruction>(V);
5412 if (!I)
5413 return {};
5414 auto It = ScheduledBundles.find(I);
5415 if (It == ScheduledBundles.end())
5416 return {};
5417 return It->getSecond();
5418 }
5419
5420 /// Returns true if the entity is in the scheduling region.
5421 bool isInSchedulingRegion(const ScheduleEntity &SD) const {
5422 if (const auto *Data = dyn_cast<ScheduleData>(&SD))
5423 return Data->getSchedulingRegionID() == SchedulingRegionID;
5424 if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SD))
5425 return CD->getSchedulingRegionID() == SchedulingRegionID;
5426 return all_of(cast<ScheduleBundle>(SD).getBundle(),
5427 [&](const ScheduleEntity *BundleMember) {
5428 return isInSchedulingRegion(*BundleMember);
5429 });
5430 }
5431
5432 /// Marks an instruction as scheduled and puts all dependent ready
5433 /// instructions into the ready-list.
5434 template <typename ReadyListType>
5435 void schedule(const BoUpSLP &R, const InstructionsState &S,
5436 const EdgeInfo &EI, ScheduleEntity *Data,
5437 ReadyListType &ReadyList) {
5438 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5439                                     ArrayRef<ScheduleBundle *> Bundles) {
5440        // Handle the def-use chain dependencies.
5441
5442 // Decrement the unscheduled counter and insert to ready list if ready.
5443 auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
5444 if ((IsControl || Data->hasValidDependencies()) &&
5445 Data->incrementUnscheduledDeps(-1) == 0) {
5446 // There are no more unscheduled dependencies after
5447 // decrementing, so we can put the dependent instruction
5448 // into the ready list.
5449 SmallVector<ScheduleBundle *, 1> CopyableBundle;
5450            ArrayRef<ScheduleBundle *> Bundles;
5451            if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
5452 CopyableBundle.push_back(&CD->getBundle());
5453 Bundles = CopyableBundle;
5454 } else {
5455 Bundles = getScheduleBundles(Data->getInst());
5456 }
5457 if (!Bundles.empty()) {
5458 for (ScheduleBundle *Bundle : Bundles) {
5459 if (Bundle->unscheduledDepsInBundle() == 0) {
5460 assert(!Bundle->isScheduled() &&
5461 "already scheduled bundle gets ready");
5462 ReadyList.insert(Bundle);
5463                  LLVM_DEBUG(dbgs()
5464                             << "SLP: gets ready: " << *Bundle << "\n");
5465 }
5466 }
5467 return;
5468 }
5469 assert(!Data->isScheduled() &&
5470 "already scheduled bundle gets ready");
5472 "Expected non-copyable data");
5473 ReadyList.insert(Data);
5474 LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
5475 }
5476 };
5477
5478 auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx,
5479 Instruction *I) {
5480 if (!ScheduleCopyableDataMap.empty()) {
5481            SmallVector<ScheduleCopyableData *> CopyableData =
5482                getScheduleCopyableData(User, OpIdx, I);
5483 for (ScheduleCopyableData *CD : CopyableData)
5484 DecrUnsched(CD, /*IsControl=*/false);
5485 if (!CopyableData.empty())
5486 return;
5487 }
5488 if (ScheduleData *OpSD = getScheduleData(I))
5489 DecrUnsched(OpSD, /*IsControl=*/false);
5490 };
5491
5492 // If BundleMember is a vector bundle, its operands may have been
5493 // reordered during buildTree(). We therefore need to get its operands
5494 // through the TreeEntry.
5495 if (!Bundles.empty()) {
5496 auto *In = BundleMember->getInst();
5497 // Count uses of each instruction operand.
5498 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5499 unsigned TotalOpCount = 0;
5500 if (isa<ScheduleCopyableData>(BundleMember)) {
5501 // Copyable data is used only once (uses itself).
5502 TotalOpCount = OperandsUses[In] = 1;
5503 } else {
5504 for (const Use &U : In->operands()) {
5505 if (auto *I = dyn_cast<Instruction>(U.get())) {
5506 auto Res = OperandsUses.try_emplace(I, 0);
5507 ++Res.first->getSecond();
5508 ++TotalOpCount;
5509 }
5510 }
5511 }
5512 // Decrement the unscheduled counter and insert to ready list if
5513 // ready.
5514 auto DecrUnschedForInst = [&](Instruction *I, TreeEntry *UserTE,
5515 unsigned OpIdx) {
5516 if (!ScheduleCopyableDataMap.empty()) {
5517 const EdgeInfo EI = {UserTE, OpIdx};
5518 if (ScheduleCopyableData *CD = getScheduleCopyableData(EI, I)) {
5519 DecrUnsched(CD, /*IsControl=*/false);
5520 return;
5521 }
5522 }
5523 auto It = OperandsUses.find(I);
5524 assert(It != OperandsUses.end() && "Operand not found");
5525 if (It->second > 0) {
5526 --It->getSecond();
5527 assert(TotalOpCount > 0 && "No more operands to decrement");
5528 --TotalOpCount;
5529 if (ScheduleData *OpSD = getScheduleData(I))
5530 DecrUnsched(OpSD, /*IsControl=*/false);
5531 }
5532 };
5533
5534 for (ScheduleBundle *Bundle : Bundles) {
5535 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5536 break;
5537 // Need to search for the lane since the tree entry can be
5538 // reordered.
5539 int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(),
5540 find(Bundle->getTreeEntry()->Scalars, In));
5541 assert(Lane >= 0 && "Lane not set");
5542 if (isa<StoreInst>(In) &&
5543 !Bundle->getTreeEntry()->ReorderIndices.empty())
5544 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5545 assert(Lane < static_cast<int>(
5546 Bundle->getTreeEntry()->Scalars.size()) &&
5547 "Couldn't find extract lane");
5548
5549            // Since the vectorization tree is built recursively, this
5550            // assertion ensures that the tree entry has all operands set before
5551            // reaching this code. A couple of exceptions known at the moment are
5552            // extracts where their second (immediate) operand is not added.
5553            // Since immediates do not affect scheduler behavior this is
5554            // considered okay.
5555 assert(In &&
5556                   (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
5557                    In->getNumOperands() ==
5558 Bundle->getTreeEntry()->getNumOperands() ||
5559 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5560 "Missed TreeEntry operands?");
5561
5562 for (unsigned OpIdx :
5563 seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
5564 if (auto *I = dyn_cast<Instruction>(
5565 Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
5566 LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): " << *I
5567 << "\n");
5568 DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx);
5569 }
5570 }
5571 } else {
5572 // If BundleMember is a stand-alone instruction, no operand reordering
5573 // has taken place, so we directly access its operands.
5574 for (Use &U : BundleMember->getInst()->operands()) {
5575 if (auto *I = dyn_cast<Instruction>(U.get())) {
5576              LLVM_DEBUG(dbgs()
5577                         << "SLP: check for readiness (def): " << *I << "\n");
5578 DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
5579 }
5580 }
5581 }
5582 // Handle the memory dependencies.
5583 auto *SD = dyn_cast<ScheduleData>(BundleMember);
5584 if (!SD)
5585 return;
5586 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
5587 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5588 if (!VisitedMemory.insert(MemoryDep).second)
5589 continue;
5590 // There are no more unscheduled dependencies after decrementing,
5591 // so we can put the dependent instruction into the ready list.
5592 LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): "
5593 << *MemoryDep << "\n");
5594 DecrUnsched(MemoryDep);
5595 }
5596 // Handle the control dependencies.
5597 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
5598 for (ScheduleData *Dep : SD->getControlDependencies()) {
5599 if (!VisitedControl.insert(Dep).second)
5600 continue;
5601 // There are no more unscheduled dependencies after decrementing,
5602 // so we can put the dependent instruction into the ready list.
5603          LLVM_DEBUG(dbgs()
5604                     << "SLP: check for readiness (ctrl): " << *Dep << "\n");
5605 DecrUnsched(Dep, /*IsControl=*/true);
5606 }
5607 };
5608 if (auto *SD = dyn_cast<ScheduleData>(Data)) {
5609 SD->setScheduled(/*Scheduled=*/true);
5610 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
5611        SmallVector<ScheduleBundle *> Bundles;
5612        SmallVector<std::unique_ptr<ScheduleBundle>> PseudoBundles;
5613        Instruction *In = SD->getInst();
5614 if (R.isVectorized(In)) {
5615 ArrayRef<TreeEntry *> Entries = R.getTreeEntries(In);
5616 for (TreeEntry *TE : Entries) {
5618 In->getNumOperands() != TE->getNumOperands())
5619 continue;
5620 auto &BundlePtr =
5621 PseudoBundles.emplace_back(std::make_unique<ScheduleBundle>());
5622 BundlePtr->setTreeEntry(TE);
5623 BundlePtr->add(SD);
5624 Bundles.push_back(BundlePtr.get());
5625 }
5626 }
5627 ProcessBundleMember(SD, Bundles);
5628 } else {
5629 ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
5630 Bundle.setScheduled(/*Scheduled=*/true);
5631 LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n");
5632 auto AreAllBundlesScheduled =
5633 [&](const ScheduleEntity *SD,
5634 ArrayRef<ScheduleBundle *> SDBundles) {
5635            if (isa<ScheduleCopyableData>(SD))
5636              return true;
5637 return !SDBundles.empty() &&
5638 all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
5639 return SDBundle->isScheduled();
5640 });
5641 };
5642 for (ScheduleEntity *SD : Bundle.getBundle()) {
5645 SDBundles = getScheduleBundles(SD->getInst());
5646 if (AreAllBundlesScheduled(SD, SDBundles)) {
5647 SD->setScheduled(/*Scheduled=*/true);
5648 ProcessBundleMember(SD, isa<ScheduleCopyableData>(SD) ? &Bundle
5649 : SDBundles);
5650 }
5651 }
5652 }
5653 }
5654
5655 /// Verify basic self consistency properties of the data structure.
5656 void verify() {
5657 if (!ScheduleStart)
5658 return;
5659
5660 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5661 ScheduleStart->comesBefore(ScheduleEnd) &&
5662 "Not a valid scheduling region?");
5663
5664 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5665 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5666 if (!Bundles.empty()) {
5667 for (ScheduleBundle *Bundle : Bundles) {
5668 assert(isInSchedulingRegion(*Bundle) &&
5669 "primary schedule data not in window?");
5670 Bundle->verify();
5671 }
5672 continue;
5673 }
5674 auto *SD = getScheduleData(I);
5675 if (!SD)
5676 continue;
5677 assert(isInSchedulingRegion(*SD) &&
5678 "primary schedule data not in window?");
5679 SD->verify();
5680 }
5681
5682 assert(all_of(ReadyInsts,
5683 [](const ScheduleEntity *Bundle) {
5684 return Bundle->isReady();
5685 }) &&
5686 "item in ready list not ready?");
5687 }
5688
5689 /// Put all instructions into the ReadyList which are ready for scheduling.
5690 template <typename ReadyListType>
5691 void initialFillReadyList(ReadyListType &ReadyList) {
5692 SmallPtrSet<ScheduleBundle *, 16> Visited;
5693 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5694 ScheduleData *SD = getScheduleData(I);
5695 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5696 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5697 !Bundles.empty()) {
5698 for (ScheduleBundle *Bundle : Bundles) {
5699 if (!Visited.insert(Bundle).second)
5700 continue;
5701 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5702 ReadyList.insert(Bundle);
5703 LLVM_DEBUG(dbgs() << "SLP: initially in ready list: "
5704 << *Bundle << "\n");
5705 }
5706 }
5707 continue;
5708 }
5709 ReadyList.insert(SD);
5710          LLVM_DEBUG(dbgs()
5711                     << "SLP: initially in ready list: " << *SD << "\n");
5712 }
5713 }
5714 }
5715
5716 /// Build a bundle from the ScheduleData nodes corresponding to the
5717 /// scalar instruction for each lane.
5718 /// \param VL The list of scalar instructions.
5719 /// \param S The state of the instructions.
5720 /// \param EI The edge in the SLP graph or the user node/operand number.
5721 ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
5722 const InstructionsState &S, const EdgeInfo &EI);
5723
5724 /// Checks if a bundle of instructions can be scheduled, i.e. has no
5725 /// cyclic dependencies. This is only a dry-run, no instructions are
5726 /// actually moved at this stage.
5727 /// \returns the scheduling bundle. The returned Optional value is not
5728 /// std::nullopt if \p VL is allowed to be scheduled.
5729 std::optional<ScheduleBundle *>
5730 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
5731 const InstructionsState &S, const EdgeInfo &EI);
5732
5733 /// Allocates schedule data chunk.
5734 ScheduleData *allocateScheduleDataChunks();
5735
5736 /// Extends the scheduling region so that V is inside the region.
5737 /// \returns true if the region size is within the limit.
5738 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
5739
5740 /// Initialize the ScheduleData structures for new instructions in the
5741 /// scheduling region.
5742 void initScheduleData(Instruction *FromI, Instruction *ToI,
5743 ScheduleData *PrevLoadStore,
5744 ScheduleData *NextLoadStore);
5745
5746 /// Updates the dependency information of a bundle and of all instructions/
5747 /// bundles which depend on the original bundle.
5748 void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
5749 BoUpSLP *SLP,
5750 ArrayRef<ScheduleData *> ControlDeps = {});
5751
5752    /// Sets all instructions in the scheduling region to un-scheduled.
5753 void resetSchedule();
5754
5755 BasicBlock *BB;
5756
5757 /// Simple memory allocation for ScheduleData.
5758    std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
5759
5760 /// The size of a ScheduleData array in ScheduleDataChunks.
5761 int ChunkSize;
5762
5763 /// The allocator position in the current chunk, which is the last entry
5764 /// of ScheduleDataChunks.
5765 int ChunkPos;
5766
5767 /// Attaches ScheduleData to Instruction.
5768 /// Note that the mapping survives during all vectorization iterations, i.e.
5769 /// ScheduleData structures are recycled.
5770 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
5771
5772 /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand
5773 /// number) and the operand instruction, represented as copyable element.
5774 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
5775 std::unique_ptr<ScheduleCopyableData>>
5776 ScheduleCopyableDataMap;
5777
5778 /// Represents mapping between instruction and all related
5779    /// ScheduleCopyableData (for all uses in the tree, represented as copyable
5780 /// element). The SLP tree may contain several representations of the same
5781 /// instruction.
5782 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
5783 ScheduleCopyableDataMapByInst;
5784
5785 /// Represents mapping between user value and operand number, the operand
5786 /// value and all related ScheduleCopyableData. The relation is 1:n, because
5787    /// the same user may reference the same operand in different tree entries
5788 /// and the operand may be modelled by the different copyable data element.
5789 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>,
5790                  SmallVector<ScheduleCopyableData *>>
5791        ScheduleCopyableDataMapByInstUser;
5792
5793 /// Represents mapping between instruction and all related
5794 /// ScheduleCopyableData. It represents the mapping between the actual
5795 /// instruction and the last copyable data element in the chain. E.g., if
5796 /// the graph models the following instructions:
5797 /// %0 = non-add instruction ...
5798 /// ...
5799 /// %4 = add %3, 1
5800 /// %5 = add %4, 1
5801 /// %6 = insertelement poison, %0, 0
5802 /// %7 = insertelement %6, %5, 1
5803 /// And the graph is modeled as:
5804 /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ]
5805 /// -> [1, 0] -> [%1, 0]
5806 ///
5807 /// this map will map %0 only to the copyable element <1>, which is the last
5808 /// user (direct user of the actual instruction). <0> uses <1>, so <1> will
5809 /// keep the map to <0>, not the %0.
5810 SmallDenseMap<const Instruction *,
5811 SmallSetVector<ScheduleCopyableData *, 4>>
5812 ScheduleCopyableDataMapByUsers;
5813
5814 /// Attaches ScheduleBundle to Instruction.
5815 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
5816 ScheduledBundles;
5817 /// The list of ScheduleBundles.
5818 SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;
5819
5820 /// The ready-list for scheduling (only used for the dry-run).
5821 SetVector<ScheduleEntity *> ReadyInsts;
5822
5823 /// The first instruction of the scheduling region.
5824 Instruction *ScheduleStart = nullptr;
5825
5826 /// The first instruction _after_ the scheduling region.
5827 Instruction *ScheduleEnd = nullptr;
5828
5829 /// The first memory accessing instruction in the scheduling region
5830 /// (can be null).
5831 ScheduleData *FirstLoadStoreInRegion = nullptr;
5832
5833 /// The last memory accessing instruction in the scheduling region
5834 /// (can be null).
5835 ScheduleData *LastLoadStoreInRegion = nullptr;
5836
5837 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
5838 /// region? Used to optimize the dependence calculation for the
5839 /// common case where there isn't.
5840 bool RegionHasStackSave = false;
5841
5842 /// The current size of the scheduling region.
5843 int ScheduleRegionSize = 0;
5844
5845 /// The maximum size allowed for the scheduling region.
5846 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
5847
5848 /// The ID of the scheduling region. For a new vectorization iteration this
5849 /// is incremented which "removes" all ScheduleData from the region.
5850 /// Make sure that the initial SchedulingRegionID is greater than the
5851 /// initial SchedulingRegionID in ScheduleData (which is 0).
5852 int SchedulingRegionID = 1;
5853 };
5854
5855 /// Attaches the BlockScheduling structures to basic blocks.
5856 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
5857
5858 /// Performs the "real" scheduling. Done before vectorization is actually
5859 /// performed in a basic block.
5860 void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);
5861
5862 /// List of users to ignore during scheduling and that don't need extracting.
5863 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
5864
5865 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
5866 /// sorted SmallVectors of unsigned.
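  /// Note (illustrative rationale): the empty and tombstone keys below use the
  /// sentinel values ~1U and ~2U, which never occur as lane indices in real
  /// orders, so they cannot collide with actual keys.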
5867 struct OrdersTypeDenseMapInfo {
5868 static OrdersType getEmptyKey() {
5869 OrdersType V;
5870 V.push_back(~1U);
5871 return V;
5872 }
5873
5874 static OrdersType getTombstoneKey() {
5875 OrdersType V;
5876 V.push_back(~2U);
5877 return V;
5878 }
5879
5880 static unsigned getHashValue(const OrdersType &V) {
5881 return static_cast<unsigned>(hash_combine_range(V));
5882 }
5883
5884 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
5885 return LHS == RHS;
5886 }
5887 };
5888
5889 // Analysis and block reference.
5890 Function *F;
5891 ScalarEvolution *SE;
5892 TargetTransformInfo *TTI;
5893 TargetLibraryInfo *TLI;
5894 LoopInfo *LI;
5895 DominatorTree *DT;
5896 AssumptionCache *AC;
5897 DemandedBits *DB;
5898 const DataLayout *DL;
5899 OptimizationRemarkEmitter *ORE;
5900
5901 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
5902 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
5903
5904 /// Instruction builder to construct the vectorized tree.
5905 IRBuilder<TargetFolder> Builder;
5906
5907  /// A map of tree entries with scalar integer values to the smallest bit width with which they
5908 /// can legally be represented. The values map to (width, signed) pairs,
5909 /// where "width" indicates the minimum bit width and "signed" is True if the
5910 /// value must be signed-extended, rather than zero-extended, back to its
5911 /// original width.
5912 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
5913
5914 /// Final size of the reduced vector, if the current graph represents the
5915 /// input for the reduction and it was possible to narrow the size of the
5916 /// reduction.
5917 unsigned ReductionBitWidth = 0;
5918
5919 /// Canonical graph size before the transformations.
5920 unsigned BaseGraphSize = 1;
5921
5922 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
5923 /// type sizes, used in the tree.
5924 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
5925
5926  /// Indices of the vectorized nodes, which are supposed to be the roots of the new
5927 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
5928 DenseSet<unsigned> ExtraBitWidthNodes;
5929};
5930
5931} // end namespace slpvectorizer
5932
5933template <> struct DenseMapInfo<BoUpSLP::EdgeInfo> {
5934  using FirstInfo = DenseMapInfo<BoUpSLP::TreeEntry *>;
5935  using SecondInfo = DenseMapInfo<unsigned>;
5936  static BoUpSLP::EdgeInfo getEmptyKey() {
5937    return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
5938 SecondInfo::getEmptyKey());
5939 }
5940
5941  static BoUpSLP::EdgeInfo getTombstoneKey() {
5942    return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(),
5943 SecondInfo::getTombstoneKey());
5944 }
5945
5946 static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
5947 return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE),
5948 SecondInfo::getHashValue(Val.EdgeIdx));
5949 }
5950
5951 static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
5952 const BoUpSLP::EdgeInfo &RHS) {
5953 return LHS == RHS;
5954 }
5955};
5956
5957template <> struct GraphTraits<BoUpSLP *> {
5958 using TreeEntry = BoUpSLP::TreeEntry;
5959
5960 /// NodeRef has to be a pointer per the GraphWriter.
5961  using NodeRef = TreeEntry *;
5962
5963 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
5964
5965 /// Add the VectorizableTree to the index iterator to be able to return
5966 /// TreeEntry pointers.
5967  struct ChildIteratorType
5968      : public iterator_adaptor_base<
5969 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
5971
5975
5976 NodeRef operator*() { return I->UserTE; }
5977 };
5978
5979 static NodeRef getEntryNode(BoUpSLP &R) {
5980 return R.VectorizableTree[0].get();
5981 }
5982
5983 static ChildIteratorType child_begin(NodeRef N) {
5984 return {&N->UserTreeIndex, N->Container};
5985 }
5986
5987 static ChildIteratorType child_end(NodeRef N) {
5988 return {&N->UserTreeIndex + 1, N->Container};
5989 }
5990
5991 /// For the node iterator we just need to turn the TreeEntry iterator into a
5992 /// TreeEntry* iterator so that it dereferences to NodeRef.
5993 class nodes_iterator {
5994 using ItTy = ContainerTy::iterator;
5995 ItTy It;
5996
5997 public:
5998 nodes_iterator(const ItTy &It2) : It(It2) {}
5999 NodeRef operator*() { return It->get(); }
6000 nodes_iterator operator++() {
6001 ++It;
6002 return *this;
6003 }
6004 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
6005 };
6006
6007 static nodes_iterator nodes_begin(BoUpSLP *R) {
6008 return nodes_iterator(R->VectorizableTree.begin());
6009 }
6010
6011 static nodes_iterator nodes_end(BoUpSLP *R) {
6012 return nodes_iterator(R->VectorizableTree.end());
6013 }
6014
6015 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
6016};
6017
6018template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
6019 using TreeEntry = BoUpSLP::TreeEntry;
6020
6021 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
6022
6023 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
6024 std::string Str;
6025 raw_string_ostream OS(Str);
6026 OS << Entry->Idx << ".\n";
6027 if (isSplat(Entry->Scalars))
6028 OS << "<splat> ";
6029 for (auto *V : Entry->Scalars) {
6030 OS << *V;
6031 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
6032 return EU.Scalar == V;
6033 }))
6034 OS << " <extract>";
6035 OS << "\n";
6036 }
6037 return Str;
6038 }
6039
6040 static std::string getNodeAttributes(const TreeEntry *Entry,
6041 const BoUpSLP *) {
6042 if (Entry->isGather())
6043 return "color=red";
6044 if (Entry->State == TreeEntry::ScatterVectorize ||
6045 Entry->State == TreeEntry::StridedVectorize ||
6046 Entry->State == TreeEntry::CompressVectorize)
6047 return "color=blue";
6048 return "";
6049 }
6050};
6051
6052} // end namespace llvm
6053
6054 BoUpSLP::~BoUpSLP() {
6055 SmallVector<WeakTrackingVH> DeadInsts;
6056 for (auto *I : DeletedInstructions) {
6057 if (!I->getParent()) {
6058 // Temporarily insert the instruction back to erase it from its parent and
6059 // from memory later.
6060 if (isa<PHINode>(I))
6061 // Phi nodes must be the very first instructions in the block.
6062 I->insertBefore(F->getEntryBlock(),
6063 F->getEntryBlock().getFirstNonPHIIt());
6064 else
6065 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6066 continue;
6067 }
6068 for (Use &U : I->operands()) {
6069 auto *Op = dyn_cast<Instruction>(U.get());
6070 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
6071 wouldInstructionBeTriviallyDead(Op, TLI))
6072 DeadInsts.emplace_back(Op);
6073 }
6074 I->dropAllReferences();
6075 }
6076 for (auto *I : DeletedInstructions) {
6077 assert(I->use_empty() &&
6078 "trying to erase instruction with users.");
6079 I->eraseFromParent();
6080 }
6081
6082 // Cleanup any dead scalar code feeding the vectorized instructions
6083 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
6084
6085#ifdef EXPENSIVE_CHECKS
6086 // If we could guarantee that this call is not extremely slow, we could
6087 // remove the ifdef limitation (see PR47712).
6088 assert(!verifyFunction(*F, &dbgs()));
6089#endif
6090}
6091
6092/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
6093 /// contains the original mask for the scalars reused in the node. The
6094 /// procedure transforms this mask in accordance with the given \p Mask.
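/// Example (illustrative): with Reuses = {1, 0, 3, 2} and Mask = {2, 3, 0, 1},
/// the element at position I of the old mask moves to position Mask[I],
/// giving Reuses = {3, 2, 1, 0}.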
6095 static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
6096 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
6097 "Expected non-empty mask.");
6098 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
6099 Prev.swap(Reuses);
6100 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
6101 if (Mask[I] != PoisonMaskElem)
6102 Reuses[Mask[I]] = Prev[I];
6103}
6104
6105 /// Reorders the given \p Order according to the given \p Mask. \p Order is
6106 /// the original order of the scalars. The procedure transforms the provided
6107 /// order in accordance with the given \p Mask. If the resulting \p Order is
6108 /// just an identity order, \p Order is cleared.
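/// Example (illustrative): with an empty (identity) \p Order and
/// Mask = {2, 0, 1, 3}, the resulting \p Order is {2, 0, 1, 3}; if the result
/// had been the identity permutation, \p Order would be left empty.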
6109 static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
6110 bool BottomOrder = false) {
6111 assert(!Mask.empty() && "Expected non-empty mask.");
6112 unsigned Sz = Mask.size();
6113 if (BottomOrder) {
6114 SmallVector<unsigned> PrevOrder;
6115 if (Order.empty()) {
6116 PrevOrder.resize(Sz);
6117 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
6118 } else {
6119 PrevOrder.swap(Order);
6120 }
6121 Order.assign(Sz, Sz);
6122 for (unsigned I = 0; I < Sz; ++I)
6123 if (Mask[I] != PoisonMaskElem)
6124 Order[I] = PrevOrder[Mask[I]];
6125 if (all_of(enumerate(Order), [&](const auto &Data) {
6126 return Data.value() == Sz || Data.index() == Data.value();
6127 })) {
6128 Order.clear();
6129 return;
6130 }
6131 fixupOrderingIndices(Order);
6132 return;
6133 }
6134 SmallVector<int> MaskOrder;
6135 if (Order.empty()) {
6136 MaskOrder.resize(Sz);
6137 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
6138 } else {
6139 inversePermutation(Order, MaskOrder);
6140 }
6141 reorderReuses(MaskOrder, Mask);
6142 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
6143 Order.clear();
6144 return;
6145 }
6146 Order.assign(Sz, Sz);
6147 for (unsigned I = 0; I < Sz; ++I)
6148 if (MaskOrder[I] != PoisonMaskElem)
6149 Order[MaskOrder[I]] = I;
6150 fixupOrderingIndices(Order);
6151}
6152
6153std::optional<BoUpSLP::OrdersType>
6154BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
6155 bool TopToBottom, bool IgnoreReorder) {
6156 assert(TE.isGather() && "Expected gather node only.");
6157 // Try to find subvector extract/insert patterns and reorder only such
6158 // patterns.
6159 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
6160 Type *ScalarTy = GatheredScalars.front()->getType();
6161 size_t NumScalars = GatheredScalars.size();
6162 if (!isValidElementType(ScalarTy))
6163 return std::nullopt;
6164 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
6165 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
6166 SmallVector<int> ExtractMask;
6167 SmallVector<int> Mask;
6168 SmallVector<SmallVector<const TreeEntry *>> Entries;
6169 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
6170 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6171 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
6172 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6173 /*ForOrder=*/true);
6174 // No shuffled operands - ignore.
6175 if (GatherShuffles.empty() && ExtractShuffles.empty())
6176 return std::nullopt;
6177 OrdersType CurrentOrder(NumScalars, NumScalars);
6178 if (GatherShuffles.size() == 1 &&
6179 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
6180 Entries.front().front()->isSame(TE.Scalars)) {
6181 // If the node fully matches an existing entry during whole-tree rotation -
6182 // no need to consider the matching order, the whole tree is rotated.
6183 if (TopToBottom)
6184 return std::nullopt;
6185 // No need to keep the order for the same user node.
6186 if (Entries.front().front()->UserTreeIndex.UserTE ==
6187 TE.UserTreeIndex.UserTE)
6188 return std::nullopt;
6189 // No need to keep the order for the matched root node, if it can be freely
6190 // reordered.
6191 if (!IgnoreReorder && Entries.front().front()->Idx == 0)
6192 return std::nullopt;
6193 // If shuffling 2 elements only and the matching node has reverse reuses -
6194 // no need to compute the order, both orders work fine.
6195 if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
6196 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6197 any_of(enumerate(Entries.front().front()->ReuseShuffleIndices),
6198 [](const auto &P) {
6199 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6200 }))
6201 return std::nullopt;
6202
6203 // Perfect match in the graph, will reuse the previously vectorized
6204 // node. Cost is 0.
6205 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
6206 return CurrentOrder;
6207 }
6208 auto IsSplatMask = [](ArrayRef<int> Mask) {
6209 int SingleElt = PoisonMaskElem;
6210 return all_of(Mask, [&](int I) {
6211 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
6212 SingleElt = I;
6213 return I == PoisonMaskElem || I == SingleElt;
6214 });
6215 };
6216 // Exclusive broadcast mask - ignore.
6217 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
6218 (Entries.size() != 1 ||
6219 Entries.front().front()->ReorderIndices.empty())) ||
6220 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
6221 return std::nullopt;
6222 SmallBitVector ShuffledSubMasks(NumParts);
6223 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
6224 ArrayRef<int> Mask, int PartSz, int NumParts,
6225 function_ref<unsigned(unsigned)> GetVF) {
6226 for (int I : seq<int>(0, NumParts)) {
6227 if (ShuffledSubMasks.test(I))
6228 continue;
6229 const int VF = GetVF(I);
6230 if (VF == 0)
6231 continue;
6232 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
6233 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
6234 // Shuffle of at least 2 vectors - ignore.
6235 if (any_of(Slice, [&](unsigned I) { return I != NumScalars; })) {
6236 llvm::fill(Slice, NumScalars);
6237 ShuffledSubMasks.set(I);
6238 continue;
6239 }
6240 // Try to include as many elements from the mask as possible.
6241 int FirstMin = INT_MAX;
6242 bool SecondVecFound = false;
6243 for (int K : seq<int>(Limit)) {
6244 int Idx = Mask[I * PartSz + K];
6245 if (Idx == PoisonMaskElem) {
6246 Value *V = GatheredScalars[I * PartSz + K];
6247 if (isConstant(V) && !isa<PoisonValue>(V)) {
6248 SecondVecFound = true;
6249 break;
6250 }
6251 continue;
6252 }
6253 if (Idx < VF) {
6254 if (FirstMin > Idx)
6255 FirstMin = Idx;
6256 } else {
6257 SecondVecFound = true;
6258 break;
6259 }
6260 }
6261 FirstMin = (FirstMin / PartSz) * PartSz;
6262 // Shuffle of at least 2 vectors - ignore.
6263 if (SecondVecFound) {
6264 llvm::fill(Slice, NumScalars);
6265 ShuffledSubMasks.set(I);
6266 continue;
6267 }
6268 for (int K : seq<int>(Limit)) {
6269 int Idx = Mask[I * PartSz + K];
6270 if (Idx == PoisonMaskElem)
6271 continue;
6272 Idx -= FirstMin;
6273 if (Idx >= PartSz) {
6274 SecondVecFound = true;
6275 break;
6276 }
6277 if (CurrentOrder[I * PartSz + Idx] >
6278 static_cast<unsigned>(I * PartSz + K) &&
6279 CurrentOrder[I * PartSz + Idx] !=
6280 static_cast<unsigned>(I * PartSz + Idx))
6281 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
6282 }
6283 // Shuffle of at least 2 vectors - ignore.
6284 if (SecondVecFound) {
6285 llvm::fill(Slice, NumScalars);
6286 ShuffledSubMasks.set(I);
6287 continue;
6288 }
6289 }
6290 };
6291 int PartSz = getPartNumElems(NumScalars, NumParts);
6292 if (!ExtractShuffles.empty())
6293 TransformMaskToOrder(
6294 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
6295 if (!ExtractShuffles[I])
6296 return 0U;
6297 unsigned VF = 0;
6298 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
6299 for (unsigned Idx : seq<unsigned>(Sz)) {
6300 int K = I * PartSz + Idx;
6301 if (ExtractMask[K] == PoisonMaskElem)
6302 continue;
6303 if (!TE.ReuseShuffleIndices.empty())
6304 K = TE.ReuseShuffleIndices[K];
6305 if (K == PoisonMaskElem)
6306 continue;
6307 if (!TE.ReorderIndices.empty())
6308 K = std::distance(TE.ReorderIndices.begin(),
6309 find(TE.ReorderIndices, K));
6310 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
6311 if (!EI)
6312 continue;
6313 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
6314 ->getElementCount()
6315 .getKnownMinValue());
6316 }
6317 return VF;
6318 });
6319 // Check special corner case - single shuffle of the same entry.
6320 if (GatherShuffles.size() == 1 && NumParts != 1) {
6321 if (ShuffledSubMasks.any())
6322 return std::nullopt;
6323 PartSz = NumScalars;
6324 NumParts = 1;
6325 }
6326 if (!Entries.empty())
6327 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
6328 if (!GatherShuffles[I])
6329 return 0U;
6330 return std::max(Entries[I].front()->getVectorFactor(),
6331 Entries[I].back()->getVectorFactor());
6332 });
6333 unsigned NumUndefs = count(CurrentOrder, NumScalars);
6334 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6335 return std::nullopt;
6336 return std::move(CurrentOrder);
6337}
6338
6339static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
6340 const TargetLibraryInfo &TLI,
6341 bool CompareOpcodes = true) {
6342 if (getUnderlyingObject(Ptr1, RecursionMaxDepth) !=
6343 getUnderlyingObject(Ptr2, RecursionMaxDepth))
6344 return false;
6345 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
6346 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
6347 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6348 (!GEP2 || GEP2->getNumOperands() == 2) &&
6349 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
6350 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
6351 !CompareOpcodes ||
6352 (GEP1 && GEP2 &&
6353 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6354}
6355
6356/// Calculates minimal alignment as a common alignment.
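/// Example (illustrative): for three loads with alignments 16, 8 and 4 the
/// common alignment is Align(4), the minimum over the list.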
6357template <typename T>
6358 static Align computeCommonAlignment(ArrayRef<Value *> VL) {
6359 Align CommonAlignment = cast<T>(VL.consume_front())->getAlign();
6360 for (Value *V : VL)
6361 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
6362 return CommonAlignment;
6363}
6364
6365/// Check if \p Order represents reverse order.
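/// Example (illustrative): Order = {3, 2, 1, 0} is a reverse order; positions
/// holding the sentinel value Order.size() are treated as matching.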
6366 static bool isReverseOrder(ArrayRef<unsigned> Order) {
6367 assert(!Order.empty() &&
6368 "Order is empty. Please check it before using isReverseOrder.");
6369 unsigned Sz = Order.size();
6370 return all_of(enumerate(Order), [&](const auto &Pair) {
6371 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6372 });
6373}
6374
6375/// Checks if the provided list of pointers \p Pointers represents the strided
6376/// pointers for type ElemTy. If they are not, nullptr is returned.
6377/// Otherwise, SCEV* of the stride value is returned.
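/// Example (illustrative): for pointers at runtime offsets {0, 3*S, S, 2*S}
/// from the lowest address (element size 1, S not a compile-time constant),
/// the returned stride is S and SortedIndices becomes {0, 2, 3, 1}.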
6378static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
6379 const DataLayout &DL, ScalarEvolution &SE,
6380 SmallVectorImpl<unsigned> &SortedIndices) {
6381 SmallVector<const SCEV *> SCEVs;
6382 const SCEV *PtrSCEVLowest = nullptr;
6383 const SCEV *PtrSCEVHighest = nullptr;
6384 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
6385 // addresses).
6386 for (Value *Ptr : PointerOps) {
6387 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
6388 if (!PtrSCEV)
6389 return nullptr;
6390 SCEVs.push_back(PtrSCEV);
6391 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6392 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6393 continue;
6394 }
6395 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6396 if (isa<SCEVCouldNotCompute>(Diff))
6397 return nullptr;
6398 if (Diff->isNonConstantNegative()) {
6399 PtrSCEVLowest = PtrSCEV;
6400 continue;
6401 }
6402 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
6403 if (isa<SCEVCouldNotCompute>(Diff1))
6404 return nullptr;
6405 if (Diff1->isNonConstantNegative()) {
6406 PtrSCEVHighest = PtrSCEV;
6407 continue;
6408 }
6409 }
6410 // Dist = PtrSCEVHighest - PtrSCEVLowest;
6411 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
6412 if (isa<SCEVCouldNotCompute>(Dist))
6413 return nullptr;
6414 int Size = DL.getTypeStoreSize(ElemTy);
6415 auto TryGetStride = [&](const SCEV *Dist,
6416 const SCEV *Multiplier) -> const SCEV * {
6417 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
6418 if (M->getOperand(0) == Multiplier)
6419 return M->getOperand(1);
6420 if (M->getOperand(1) == Multiplier)
6421 return M->getOperand(0);
6422 return nullptr;
6423 }
6424 if (Multiplier == Dist)
6425 return SE.getConstant(Dist->getType(), 1);
6426 return SE.getUDivExactExpr(Dist, Multiplier);
6427 };
6428 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
6429 const SCEV *Stride = nullptr;
6430 if (Size != 1 || SCEVs.size() > 2) {
6431 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
6432 Stride = TryGetStride(Dist, Sz);
6433 if (!Stride)
6434 return nullptr;
6435 }
6436 if (!Stride || isa<SCEVConstant>(Stride))
6437 return nullptr;
6438 // Iterate through all pointers and check if all distances are
6439 // unique multiple of Stride.
6440 using DistOrdPair = std::pair<int64_t, int>;
6441 auto Compare = llvm::less_first();
6442 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
6443 int Cnt = 0;
6444 bool IsConsecutive = true;
6445 for (const SCEV *PtrSCEV : SCEVs) {
6446 unsigned Dist = 0;
6447 if (PtrSCEV != PtrSCEVLowest) {
6448 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6449 const SCEV *Coeff = TryGetStride(Diff, Stride);
6450 if (!Coeff)
6451 return nullptr;
6452 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
6453 if (!SC || isa<SCEVCouldNotCompute>(SC))
6454 return nullptr;
6455 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
6456 SE.getMulExpr(Stride, SC)))
6457 ->isZero())
6458 return nullptr;
6459 Dist = SC->getAPInt().getZExtValue();
6460 }
6461 // If the strides are not the same or repeated, we can't vectorize.
6462 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
6463 return nullptr;
6464 auto Res = Offsets.emplace(Dist, Cnt);
6465 if (!Res.second)
6466 return nullptr;
6467 // Consecutive order if the inserted element is the last one.
6468 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6469 ++Cnt;
6470 }
6471 if (Offsets.size() != SCEVs.size())
6472 return nullptr;
6473 SortedIndices.clear();
6474 if (!IsConsecutive) {
6475 // Fill SortedIndices array only if it is non-consecutive.
6476 SortedIndices.resize(PointerOps.size());
6477 Cnt = 0;
6478 for (const std::pair<int64_t, int> &Pair : Offsets) {
6479 SortedIndices[Cnt] = Pair.second;
6480 ++Cnt;
6481 }
6482 }
6483 return Stride;
6484}
6485
6486static std::pair<InstructionCost, InstructionCost>
6487getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
6488 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
6489 Type *ScalarTy, VectorType *VecTy);
6490
6491/// Returns the cost of the shuffle instructions with the given \p Kind, vector
6492 /// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
6493/// subvector pattern.
6494 static InstructionCost
6495 getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
6496 VectorType *Tp, ArrayRef<int> Mask = {},
6497 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
6498 int Index = 0, VectorType *SubTp = nullptr,
6499 ArrayRef<const Value *> Args = {}) {
6500 VectorType *DstTy = Tp;
6501 if (!Mask.empty())
6502 DstTy = FixedVectorType::get(Tp->getScalarType(), Mask.size());
6503
6504 if (Kind != TTI::SK_PermuteTwoSrc)
6505 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6506 Args);
6507 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6508 int NumSubElts;
6510 Mask, NumSrcElts, NumSubElts, Index)) {
6511 if (Index + NumSubElts > NumSrcElts &&
6512 Index + NumSrcElts <= static_cast<int>(Mask.size()))
6513 return TTI.getShuffleCost(TTI::SK_InsertSubvector, DstTy, Tp, Mask,
6514 TTI::TCK_RecipThroughput, Index, Tp);
6515 }
6516 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6517 Args);
6518}
6519
6520/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
6521/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
6522/// instead of a scalar.
6523static InstructionCost
6525 VectorType *Ty, const APInt &DemandedElts, bool Insert,
6526 bool Extract, TTI::TargetCostKind CostKind,
6527 bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
6529 "ScalableVectorType is not supported.");
6530 assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
6531 getNumElements(Ty) &&
6532 "Incorrect usage.");
6533 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6534 assert(SLPReVec && "Only supported by REVEC.");
6535 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
6536 // of CreateInsertElement.
6537 unsigned ScalarTyNumElements = VecTy->getNumElements();
6538 InstructionCost Cost = 0;
6539 for (unsigned I : seq(DemandedElts.getBitWidth())) {
6540 if (!DemandedElts[I])
6541 continue;
6542 if (Insert)
6544 I * ScalarTyNumElements, VecTy);
6545 if (Extract)
6547 I * ScalarTyNumElements, VecTy);
6548 }
6549 return Cost;
6550 }
6551 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
6552 CostKind, ForPoisonSrc, VL);
6553}
6554
6555/// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy
6556/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6558 const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val,
6559 TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
6560 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6561 if (Opcode == Instruction::ExtractElement) {
6562 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6563 assert(SLPReVec && "Only supported by REVEC.");
6564 assert(isa<VectorType>(Val) && "Val must be a vector type.");
6566 cast<VectorType>(Val), {}, CostKind,
6567 Index * VecTy->getNumElements(), VecTy);
6568 }
6569 }
6570 return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
6571 ScalarUserAndIdx);
6572}
6573
6574/// This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst
6575/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6577 const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
6578 VectorType *VecTy, unsigned Index,
6580 if (auto *ScalarTy = dyn_cast<FixedVectorType>(Dst)) {
6581 assert(SLPReVec && "Only supported by REVEC.");
6582 auto *SubTp =
6583 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
6585 Index * ScalarTy->getNumElements(), SubTp) +
6586 TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None,
6587 CostKind);
6588 }
6589 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
6590}
6591
6592/// Creates subvector insert. Generates shuffle using \p Generator or
6593/// using default shuffle.
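/// Example (illustrative): inserting a <2 x i32> \p V at index 2 of a
/// <4 x i32> \p Vec first resizes V with mask {0, 1, poison, poison} and then
/// shuffles Vec with the resized V using mask {0, 1, 4, 5}.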
6594 static Value *createInsertVector(
6595 IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
6596 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
6597 if (isa<PoisonValue>(Vec) && isa<PoisonValue>(V))
6598 return Vec;
6599 const unsigned SubVecVF = getNumElements(V->getType());
6600 // Create shuffle, insertvector requires that index is multiple of
6601 // the subvector length.
6602 const unsigned VecVF = getNumElements(Vec->getType());
6603 SmallVector<int> Mask(VecVF, PoisonMaskElem);
6604 if (isa<PoisonValue>(Vec)) {
6605 auto *Begin = std::next(Mask.begin(), Index);
6606 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6607 Vec = Builder.CreateShuffleVector(V, Mask);
6608 return Vec;
6609 }
6610 std::iota(Mask.begin(), Mask.end(), 0);
6611 std::iota(std::next(Mask.begin(), Index),
6612 std::next(Mask.begin(), Index + SubVecVF), VecVF);
6613 if (Generator)
6614 return Generator(Vec, V, Mask);
6615 // 1. Resize V to the size of Vec.
6616 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
6617 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6618 V = Builder.CreateShuffleVector(V, ResizeMask);
6619 // 2. Insert V into Vec.
6620 return Builder.CreateShuffleVector(Vec, V, Mask);
6621}
6622
6623/// Generates subvector extract using \p Generator or using default shuffle.
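/// Example (illustrative): extracting 2 elements starting at index 2 from a
/// <4 x i32> vector uses the shuffle mask {2, 3}.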
6624 static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
6625 unsigned SubVecVF, unsigned Index) {
6626 SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
6627 std::iota(Mask.begin(), Mask.end(), Index);
6628 return Builder.CreateShuffleVector(Vec, Mask);
6629}
6630
6631/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
6632/// with \p Order.
6633/// \return true if the mask represents strided access, false - otherwise.
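/// Example (illustrative): pointers at element offsets {0, 2, 4, 6} produce
/// CompressMask = {0, 2, 4, 6} and the function returns true (stride 2);
/// offsets {0, 1, 3, 6} produce CompressMask = {0, 1, 3, 6} and it returns
/// false.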
6634 static bool buildCompressMask(ArrayRef<Value *> PointerOps,
6635 ArrayRef<unsigned> Order, Type *ScalarTy,
6636 const DataLayout &DL, ScalarEvolution &SE,
6637 SmallVectorImpl<int> &CompressMask) {
6638 const unsigned Sz = PointerOps.size();
6639 CompressMask.assign(Sz, PoisonMaskElem);
6640 // The first element is always set.
6641 CompressMask[0] = 0;
6642 // Check if the mask represents strided access.
6643 std::optional<unsigned> Stride = 0;
6644 Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
6645 for (unsigned I : seq<unsigned>(1, Sz)) {
6646 Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
6647 std::optional<int64_t> OptPos =
6648 getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
6649 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6650 return false;
6651 unsigned Pos = static_cast<unsigned>(*OptPos);
6652 CompressMask[I] = Pos;
6653 if (!Stride)
6654 continue;
6655 if (*Stride == 0) {
6656 *Stride = Pos;
6657 continue;
6658 }
6659 if (Pos != *Stride * I)
6660 Stride.reset();
6661 }
6662 return Stride.has_value();
6663}
6664
6665/// Checks if the \p VL can be transformed to a (masked)load + compress or
6666/// (masked) interleaved load.
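/// Example (illustrative): scalars loaded from element offsets {0, 2, 3, 5}
/// may be turned into a single (possibly masked) load of 6 contiguous elements
/// followed by a shuffle with mask {0, 2, 3, 5}, when the cost model finds
/// this cheaper than gathering.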
6667 static bool isMaskedLoadCompress(
6668 ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6669 ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
6670 const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
6671 const DominatorTree &DT, const TargetLibraryInfo &TLI,
6672 const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
6673 unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
6674 VectorType *&LoadVecTy) {
6675 InterleaveFactor = 0;
6676 Type *ScalarTy = VL.front()->getType();
6677 const size_t Sz = VL.size();
6678 auto *VecTy = getWidenedType(ScalarTy, Sz);
6679 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6680 SmallVector<int> Mask;
6681 if (!Order.empty())
6682 inversePermutation(Order, Mask);
6683 // Check external uses.
6684 for (const auto [I, V] : enumerate(VL)) {
6685 if (AreAllUsersVectorized(V))
6686 continue;
6687 InstructionCost ExtractCost =
6688 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
6689 Mask.empty() ? I : Mask[I]);
6690 InstructionCost ScalarCost =
6691 TTI.getInstructionCost(cast<Instruction>(V), CostKind);
6692 if (ExtractCost <= ScalarCost)
6693 return false;
6694 }
6695 Value *Ptr0;
6696 Value *PtrN;
6697 if (Order.empty()) {
6698 Ptr0 = PointerOps.front();
6699 PtrN = PointerOps.back();
6700 } else {
6701 Ptr0 = PointerOps[Order.front()];
6702 PtrN = PointerOps[Order.back()];
6703 }
6704 std::optional<int64_t> Diff =
6705 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
6706 if (!Diff)
6707 return false;
6708 const size_t MaxRegSize =
6709 TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
6710 .getFixedValue();
6711 // Check for very large distances between elements.
6712 if (*Diff / Sz >= MaxRegSize / 8)
6713 return false;
6714 LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
6715 auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
6716 Align CommonAlignment = LI->getAlign();
6717 IsMasked = !isSafeToLoadUnconditionally(
6718 Ptr0, LoadVecTy, CommonAlignment, DL,
6719 cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
6720 &TLI);
6721 if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
6722 LI->getPointerAddressSpace()))
6723 return false;
6724 // TODO: perform the analysis of each scalar load for better
6725 // safe-load-unconditionally analysis.
6726 bool IsStrided =
6727 buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
6728 assert(CompressMask.size() >= 2 && "At least two elements are required");
6729 SmallVector<Value *> OrderedPointerOps(PointerOps);
6730 if (!Order.empty())
6731 reorderScalars(OrderedPointerOps, Mask);
6732 auto [ScalarGEPCost, VectorGEPCost] =
6733 getGEPCosts(TTI, OrderedPointerOps, OrderedPointerOps.front(),
6734 Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
6735 // The cost of scalar loads.
6736 InstructionCost ScalarLoadsCost =
6737 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
6738 [&](InstructionCost C, Value *V) {
6739 return C + TTI.getInstructionCost(cast<Instruction>(V),
6740 CostKind);
6741 }) +
6742 ScalarGEPCost;
6743 APInt DemandedElts = APInt::getAllOnes(Sz);
6744 InstructionCost GatherCost =
6745 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
6746 /*Insert=*/true,
6747 /*Extract=*/false, CostKind) +
6748 ScalarLoadsCost;
6749 InstructionCost LoadCost = 0;
6750 if (IsMasked) {
6751 LoadCost =
6752 TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6753 LI->getPointerAddressSpace(), CostKind);
6754 } else {
6755 LoadCost =
6756 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6757 LI->getPointerAddressSpace(), CostKind);
6758 }
6759 if (IsStrided && !IsMasked && Order.empty()) {
6760 // Check for potential segmented(interleaved) loads.
6761 VectorType *AlignedLoadVecTy = getWidenedType(
6762 ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
6763 if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
6764 DL, cast<LoadInst>(VL.back()), &AC, &DT,
6765 &TLI))
6766 AlignedLoadVecTy = LoadVecTy;
6767 if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
6768 CommonAlignment,
6769 LI->getPointerAddressSpace())) {
6770 InstructionCost InterleavedCost =
6771 VectorGEPCost + TTI.getInterleavedMemoryOpCost(
6772 Instruction::Load, AlignedLoadVecTy,
6773 CompressMask[1], {}, CommonAlignment,
6774 LI->getPointerAddressSpace(), CostKind, IsMasked);
6775 if (InterleavedCost < GatherCost) {
6776 InterleaveFactor = CompressMask[1];
6777 LoadVecTy = AlignedLoadVecTy;
6778 return true;
6779 }
6780 }
6781 }
6782 InstructionCost CompressCost = ::getShuffleCost(
6783 TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
6784 if (!Order.empty()) {
6785 SmallVector<int> NewMask(Sz, PoisonMaskElem);
6786 for (unsigned I : seq<unsigned>(Sz)) {
6787 NewMask[I] = CompressMask[Mask[I]];
6788 }
6789 CompressMask.swap(NewMask);
6790 }
6791 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
6792 return TotalVecCost < GatherCost;
6793}
6794
6795/// Checks if the \p VL can be transformed to a (masked)load + compress or
6796/// (masked) interleaved load.
6797 static bool
6798 isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6799 ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
6800 const DataLayout &DL, ScalarEvolution &SE,
6801 AssumptionCache &AC, const DominatorTree &DT,
6802 const TargetLibraryInfo &TLI,
6803 const function_ref<bool(Value *)> AreAllUsersVectorized) {
6804 bool IsMasked;
6805 unsigned InterleaveFactor;
6806 SmallVector<int> CompressMask;
6807 VectorType *LoadVecTy;
6808 return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
6809 AreAllUsersVectorized, IsMasked, InterleaveFactor,
6810 CompressMask, LoadVecTy);
6811}
6812
6813/// Checks if strided loads can be generated out of \p VL loads with pointers \p
6814/// PointerOps:
6815/// 1. Target with strided load support is detected.
6816/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
6817/// potential stride <= MaxProfitableLoadStride and the potential stride is
6818/// power-of-2 (to avoid perf regressions for the very small number of loads)
6819/// and max distance > number of loads, or potential stride is -1.
6820/// 3. The loads are ordered, or number of unordered loads <=
6821/// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is
6822/// to avoid extra costs for very expensive shuffles).
6823/// 4. Any pointer operand is an instruction with the users outside of the
6824/// current graph (for masked gathers extra extractelement instructions
6825/// might be required).
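/// Example (illustrative): 4 loads at element offsets {0, 3, 6, 9} have
/// Diff == 9, which is divisible by Sz - 1 == 3, so they can form a strided
/// access with stride 3, subject to the profitability checks above and to
/// target support for strided loads.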
6826 bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
6827 Align Alignment, const int64_t Diff, Value *Ptr0,
6828 Value *PtrN, StridedPtrInfo &SPtrInfo) const {
6829 const size_t Sz = PointerOps.size();
6830 if (Diff % (Sz - 1) != 0)
6831 return false;
6832
6833 // Try to generate strided load node.
6834 auto IsAnyPointerUsedOutGraph = any_of(PointerOps, [&](Value *V) {
6835 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
6836 return !isVectorized(U) && !MustGather.contains(U);
6837 });
6838 });
6839
6840 const uint64_t AbsoluteDiff = std::abs(Diff);
6841 auto *VecTy = getWidenedType(ScalarTy, Sz);
6842 if (IsAnyPointerUsedOutGraph ||
6843 (AbsoluteDiff > Sz &&
6844 (Sz > MinProfitableStridedLoads ||
6845 (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
6846 AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
6847 Diff == -(static_cast<int64_t>(Sz) - 1)) {
6848 int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
6849 if (Diff != Stride * static_cast<int64_t>(Sz - 1))
6850 return false;
6851 if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
6852 return false;
6853
6854 // Iterate through all pointers and check if all distances are
6855 // unique multiple of Dist.
6857 for (Value *Ptr : PointerOps) {
6858 int64_t Dist = 0;
6859 if (Ptr == PtrN)
6860 Dist = Diff;
6861 else if (Ptr != Ptr0)
6862 Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
6863 // If the strides are not the same or repeated, we can't
6864 // vectorize.
6865 if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
6866 break;
6867 }
6868 if (Dists.size() == Sz) {
6869 Type *StrideTy = DL->getIndexType(Ptr0->getType());
6870 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
6871 SPtrInfo.Ty = getWidenedType(ScalarTy, Sz);
6872 return true;
6873 }
6874 }
6875 return false;
6876}
6877
6878 BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
6879 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
6880 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo,
6881 unsigned *BestVF, bool TryRecursiveCheck) const {
6882 // Check that a vectorized load would load the same memory as a scalar
6883 // load. For example, we don't want to vectorize loads that are smaller
6884 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6885 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6886 // from such a struct, we read/write packed bits disagreeing with the
6887 // unvectorized version.
6888 if (BestVF)
6889 *BestVF = 0;
6891 return LoadsState::Gather;
6892 Type *ScalarTy = VL0->getType();
6893
6894 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
6895 return LoadsState::Gather;
6896
6897 // Make sure all loads in the bundle are simple - we can't vectorize
6898 // atomic or volatile loads.
6899 PointerOps.clear();
6900 const size_t Sz = VL.size();
6901 PointerOps.resize(Sz);
6902 auto *POIter = PointerOps.begin();
6903 for (Value *V : VL) {
6904 auto *L = dyn_cast<LoadInst>(V);
6905 if (!L || !L->isSimple())
6906 return LoadsState::Gather;
6907 *POIter = L->getPointerOperand();
6908 ++POIter;
6909 }
6910
6911 Order.clear();
6912 // Check the order of pointer operands or that all pointers are the same.
6913 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
6914
6915 auto *VecTy = getWidenedType(ScalarTy, Sz);
6916 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
6917 if (!IsSorted) {
6918 if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) {
6919 if (const SCEV *Stride =
6920 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order);
6921 Stride && TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
6922 SPtrInfo.Ty = getWidenedType(ScalarTy, PointerOps.size());
6923 SPtrInfo.StrideSCEV = Stride;
6924 return LoadsState::StridedVectorize;
6925 }
6926 }
6927
6928 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
6929 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
6930 return LoadsState::Gather;
6931
6932 if (!all_of(PointerOps, [&](Value *P) {
6933 return arePointersCompatible(P, PointerOps.front(), *TLI);
6934 }))
6935 return LoadsState::Gather;
6936
6937 } else {
6938 Value *Ptr0;
6939 Value *PtrN;
6940 if (Order.empty()) {
6941 Ptr0 = PointerOps.front();
6942 PtrN = PointerOps.back();
6943 } else {
6944 Ptr0 = PointerOps[Order.front()];
6945 PtrN = PointerOps[Order.back()];
6946 }
6947 std::optional<int64_t> Diff =
6948 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
6949 // Check that the sorted loads are consecutive.
6950 if (static_cast<uint64_t>(*Diff) == Sz - 1)
6951 return LoadsState::Vectorize;
6952 if (isMaskedLoadCompress(VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT,
6953 *TLI, [&](Value *V) {
6954 return areAllUsersVectorized(
6955 cast<Instruction>(V), UserIgnoreList);
6956 }))
6957 return LoadsState::CompressVectorize;
6958 Align Alignment =
6959 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
6960 ->getAlign();
6961 if (isStridedLoad(PointerOps, ScalarTy, Alignment, *Diff, Ptr0, PtrN,
6962 SPtrInfo))
6963 return LoadsState::StridedVectorize;
6964 }
6965 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
6966 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
6967 return LoadsState::Gather;
6968 // Correctly compare the cost of loads + shuffles rather than
6969 // strided/masked gather loads. Returns true if the vectorized + shuffles
6970 // representation is better than just gather.
6971 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
6972 unsigned *BestVF,
6973 bool ProfitableGatherPointers) {
6974 if (BestVF)
6975 *BestVF = 0;
6976 // Compare masked gather cost and loads + insert subvector costs.
6977 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6978 auto [ScalarGEPCost, VectorGEPCost] =
6979 getGEPCosts(TTI, PointerOps, PointerOps.front(),
6980 Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
6981 // Estimate the cost of masked gather GEP. If not a splat, roughly
6982 // estimate as a buildvector, otherwise estimate as splat.
6983 APInt DemandedElts = APInt::getAllOnes(Sz);
6984 Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
6985 VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
6986 if (static_cast<unsigned>(count_if(
6987 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
6988 any_of(PointerOps, [&](Value *V) {
6989 return getUnderlyingObject(V) !=
6990 getUnderlyingObject(PointerOps.front());
6991 }))
6992 VectorGEPCost += getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
6993 DemandedElts, /*Insert=*/true,
6994 /*Extract=*/false, CostKind);
6995 else
6996 VectorGEPCost +=
6997 getScalarizationOverhead(
6998 TTI, PtrScalarTy, PtrVecTy, APInt::getOneBitSet(Sz, 0),
6999 /*Insert=*/true, /*Extract=*/false, CostKind) +
7000 ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
7001 // The cost of scalar loads.
7002 InstructionCost ScalarLoadsCost =
7003 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
7004 [&](InstructionCost C, Value *V) {
7005 return C + TTI.getInstructionCost(
7006 cast<Instruction>(V), CostKind);
7007 }) +
7008 ScalarGEPCost;
7009 // The cost of masked gather.
7010 InstructionCost MaskedGatherCost =
7011 TTI.getGatherScatterOpCost(
7012 Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
7013 /*VariableMask=*/false, CommonAlignment, CostKind) +
7014 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7015 InstructionCost GatherCost =
7016 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7017 /*Insert=*/true,
7018 /*Extract=*/false, CostKind) +
7019 ScalarLoadsCost;
7020 // The list of loads is small, or a partial check was already performed -
7021 // directly compare masked gather cost and gather cost.
7022 constexpr unsigned ListLimit = 4;
7023 if (!TryRecursiveCheck || VL.size() < ListLimit)
7024 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7025
7026 // FIXME: The following code has not been updated for non-power-of-2
7027 // vectors (and not whole registers). The splitting logic here does not
7028 // cover the original vector if the vector factor is not a power of two.
7029 if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
7030 return false;
7031
7032 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7033 unsigned MinVF = getMinVF(2 * Sz);
7034 DemandedElts.clearAllBits();
7035 // Iterate through possible vectorization factors and check if vectorized +
7036 // shuffles is better than just gather.
7037 for (unsigned VF =
7038 getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
7039 VF >= MinVF;
7040 VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
7041 SmallVector<LoadsState> States;
7042 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
7043 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
7044 SmallVector<unsigned> Order;
7045 SmallVector<Value *> PointerOps;
7046 LoadsState LS = canVectorizeLoads(Slice, Slice.front(), Order,
7047 PointerOps, SPtrInfo, BestVF,
7048 /*TryRecursiveCheck=*/false);
7049 // Check that the sorted loads are consecutive.
7050 if (LS == LoadsState::Gather) {
7051 if (BestVF) {
7052 DemandedElts.setAllBits();
7053 break;
7054 }
7055 DemandedElts.setBits(Cnt, Cnt + VF);
7056 continue;
7057 }
7058 // If the reorder is needed - consider it as a high-cost masked gather for now.
7059 if ((LS == LoadsState::Vectorize ||
7060 LS == LoadsState::StridedVectorize ||
7061 LS == LoadsState::CompressVectorize) &&
7062 !Order.empty() && !isReverseOrder(Order))
7063 LS = LoadsState::ScatterVectorize;
7064 States.push_back(LS);
7065 }
7066 if (DemandedElts.isAllOnes())
7067 // All loads gathered - try smaller VF.
7068 continue;
7069 // Can be vectorized later as a series of loads/insertelements.
7070 InstructionCost VecLdCost = 0;
7071 if (!DemandedElts.isZero()) {
7072 VecLdCost = getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7073 /*Insert=*/true,
7074 /*Extract=*/false, CostKind) +
7075 ScalarGEPCost;
7076 for (unsigned Idx : seq<unsigned>(VL.size()))
7077 if (DemandedElts[Idx])
7078 VecLdCost +=
7079 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
7080 }
7081 auto *SubVecTy = getWidenedType(ScalarTy, VF);
7082 for (auto [I, LS] : enumerate(States)) {
7083 auto *LI0 = cast<LoadInst>(VL[I * VF]);
7084 InstructionCost VectorGEPCost =
7085 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
7086 ? 0
7087 : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
7088 LI0->getPointerOperand(),
7089 Instruction::GetElementPtr, CostKind, ScalarTy,
7090 SubVecTy)
7091 .second;
7092 if (LS == LoadsState::ScatterVectorize) {
7093 if (static_cast<unsigned>(
7094 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
7095 PointerOps.size() - 1 ||
7096 any_of(PointerOps, [&](Value *V) {
7097 return getUnderlyingObject(V) !=
7098 getUnderlyingObject(PointerOps.front());
7099 }))
7100 VectorGEPCost += getScalarizationOverhead(
7101 TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
7102 /*Insert=*/true, /*Extract=*/false, CostKind);
7103 else
7104 VectorGEPCost +=
7106 TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(VF, 0),
7107 /*Insert=*/true, /*Extract=*/false, CostKind) +
7108 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
7109 CostKind);
7110 }
7111 switch (LS) {
7112 case LoadsState::Vectorize:
7113 VecLdCost +=
7114 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7115 LI0->getPointerAddressSpace(), CostKind,
7116 TTI::OperandValueInfo()) +
7117 VectorGEPCost;
7118 break;
7119 case LoadsState::StridedVectorize:
7120 VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
7121 LI0->getPointerOperand(),
7122 /*VariableMask=*/false,
7123 CommonAlignment, CostKind) +
7124 VectorGEPCost;
7125 break;
7126 case LoadsState::CompressVectorize:
7127 VecLdCost += TTI.getMaskedMemoryOpCost(
7128 Instruction::Load, SubVecTy, CommonAlignment,
7129 LI0->getPointerAddressSpace(), CostKind) +
7130 VectorGEPCost +
7131 ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, SubVecTy,
7132 {}, CostKind);
7133 break;
7134 case LoadsState::ScatterVectorize:
7135 VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
7136 LI0->getPointerOperand(),
7137 /*VariableMask=*/false,
7138 CommonAlignment, CostKind) +
7139 VectorGEPCost;
7140 break;
7141 case LoadsState::Gather:
7142 // Gathers are already calculated - ignore.
7143 continue;
7144 }
7145 SmallVector<int> ShuffleMask(VL.size());
7146 for (int Idx : seq<int>(0, VL.size()))
7147 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
7148 if (I > 0)
7149 VecLdCost +=
7150 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
7151 CostKind, I * VF, SubVecTy);
7152 }
7153 // If masked gather cost is higher - better to vectorize, so
7154 // consider it as a gather node. It will be better estimated
7155 // later.
7156 if (MaskedGatherCost >= VecLdCost &&
7157 VecLdCost - GatherCost < -SLPCostThreshold) {
7158 if (BestVF)
7159 *BestVF = VF;
7160 return true;
7161 }
7162 }
7163 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7164 };
7165 // TODO: need to improve analysis of the pointers, if not all of them are
7166 // GEPs or have > 2 operands, we end up with a gather node, which just
7167 // increases the cost.
7168 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
7169 bool ProfitableGatherPointers =
7170 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
7171 return L->isLoopInvariant(V);
7172 })) <= Sz / 2;
7173 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
7174 auto *GEP = dyn_cast<GetElementPtrInst>(P);
7175 return (!GEP && doesNotNeedToBeScheduled(P)) ||
7176 (GEP && GEP->getNumOperands() == 2 &&
7177 isa<Constant, Instruction>(GEP->getOperand(1)));
7178 })) {
7179 // Check if potential masked gather can be represented as series
7180 // of loads + insertsubvectors.
7181 // If masked gather cost is higher - better to vectorize, so
7182 // consider it as a gather node. It will be better estimated
7183 // later.
7184 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7185 ProfitableGatherPointers))
7186 return LoadsState::ScatterVectorize;
7187 }
7188
7189 return LoadsState::Gather;
7190}
7191
7192 static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
7193 ArrayRef<BasicBlock *> BBs, Type *ElemTy,
7194 const DataLayout &DL, ScalarEvolution &SE,
7195 SmallVectorImpl<unsigned> &SortedIndices) {
7196 assert(
7197 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
7198 "Expected list of pointer operands.");
7199 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
7200 // Ptr into, sort and return the sorted indices with values next to one
7201 // another.
7203 std::pair<BasicBlock *, Value *>,
7205 Bases;
7206 Bases
7207 .try_emplace(std::make_pair(
7209 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
7210
7211 SortedIndices.clear();
7212 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
7213 auto Key = std::make_pair(BBs[Cnt + 1],
7214 getUnderlyingObject(Ptr, RecursionMaxDepth));
7215 bool Found = any_of(Bases.try_emplace(Key).first->second,
7216 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
7217 std::optional<int64_t> Diff =
7218 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7219 ElemTy, Ptr, DL, SE,
7220 /*StrictCheck=*/true);
7221 if (!Diff)
7222 return false;
7223
7224 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7225 return true;
7226 });
7227
7228 if (!Found) {
7229 // If we haven't found enough to usefully cluster, return early.
7230 if (Bases.size() > VL.size() / 2 - 1)
7231 return false;
7232
7233 // Not found already - add a new Base
7234 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
7235 }
7236 }
7237
7238 if (Bases.size() == VL.size())
7239 return false;
7240
7241 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7242 Bases.front().second.size() == VL.size()))
7243 return false;
7244
7245 // For each of the bases sort the pointers by Offset and check if any of the
7246 // bases become consecutively allocated.
7247 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
7248 SmallPtrSet<Value *, 13> FirstPointers;
7249 SmallPtrSet<Value *, 13> SecondPointers;
7250 Value *P1 = Ptr1;
7251 Value *P2 = Ptr2;
7252 unsigned Depth = 0;
7253 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
7254 if (P1 == P2 || Depth > RecursionMaxDepth)
7255 return false;
7256 FirstPointers.insert(P1);
7257 SecondPointers.insert(P2);
7258 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
7259 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
7260 ++Depth;
7261 }
7262 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
7263 "Unable to find matching root.");
7264 return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
7265 };
7266 for (auto &Base : Bases) {
7267 for (auto &Vec : Base.second) {
7268 if (Vec.size() > 1) {
7270 int64_t InitialOffset = std::get<1>(Vec[0]);
7271 bool AnyConsecutive =
7272 all_of(enumerate(Vec), [InitialOffset](const auto &P) {
7273 return std::get<1>(P.value()) ==
7274 int64_t(P.index()) + InitialOffset;
7275 });
7276 // Fill SortedIndices array only if it looks worth-while to sort the
7277 // ptrs.
7278 if (!AnyConsecutive)
7279 return false;
7280 }
7281 }
7282 stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
7283 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7284 });
7285 }
7286
7287 for (auto &T : Bases)
7288 for (const auto &Vec : T.second)
7289 for (const auto &P : Vec)
7290 SortedIndices.push_back(std::get<2>(P));
7291
7292 assert(SortedIndices.size() == VL.size() &&
7293 "Expected SortedIndices to be the size of VL");
7294 return true;
7295}
7296
7297std::optional<BoUpSLP::OrdersType>
7298BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
7299 assert(TE.isGather() && "Expected gather node only.");
7300 Type *ScalarTy = TE.Scalars[0]->getType();
7301
7303 Ptrs.reserve(TE.Scalars.size());
7305 BBs.reserve(TE.Scalars.size());
7306 for (Value *V : TE.Scalars) {
7307 auto *L = dyn_cast<LoadInst>(V);
7308 if (!L || !L->isSimple())
7309 return std::nullopt;
7310 Ptrs.push_back(L->getPointerOperand());
7311 BBs.push_back(L->getParent());
7312 }
7313
7314 BoUpSLP::OrdersType Order;
7315 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7316 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
7317 return std::move(Order);
7318 return std::nullopt;
7319}
7320
7321/// Check if two insertelement instructions are from the same buildvector.
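/// Example (illustrative): for the chain
///   %v0 = insertelement <2 x i32> poison, i32 %a, i32 0
///   %v1 = insertelement <2 x i32> %v0, i32 %b, i32 1
/// the two insertelement instructions are recognized as parts of the same
/// buildvector sequence.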
7322 static bool areTwoInsertFromSameBuildVector(
7323 InsertElementInst *VU, InsertElementInst *V,
7324 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
7325 // Instructions must be from the same basic blocks.
7326 if (VU->getParent() != V->getParent())
7327 return false;
7328 // Checks if 2 insertelements are from the same buildvector.
7329 if (VU->getType() != V->getType())
7330 return false;
7331 // Multiple used inserts are separate nodes.
7332 if (!VU->hasOneUse() && !V->hasOneUse())
7333 return false;
7334 auto *IE1 = VU;
7335 auto *IE2 = V;
7336 std::optional<unsigned> Idx1 = getElementIndex(IE1);
7337 std::optional<unsigned> Idx2 = getElementIndex(IE2);
7338 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7339 return false;
7340 // Go through the vector operand of insertelement instructions trying to find
7341 // either VU as the original vector for IE2 or V as the original vector for
7342 // IE1.
7343 SmallBitVector ReusedIdx(
7344 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
7345 bool IsReusedIdx = false;
7346 do {
7347 if (IE2 == VU && !IE1)
7348 return VU->hasOneUse();
7349 if (IE1 == V && !IE2)
7350 return V->hasOneUse();
7351 if (IE1 && IE1 != V) {
7352 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
7353 IsReusedIdx |= ReusedIdx.test(Idx1);
7354 ReusedIdx.set(Idx1);
7355 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
7356 IE1 = nullptr;
7357 else
7358 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
7359 }
7360 if (IE2 && IE2 != VU) {
7361 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
7362 IsReusedIdx |= ReusedIdx.test(Idx2);
7363 ReusedIdx.set(Idx2);
7364 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7365 IE2 = nullptr;
7366 else
7367 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
7368 }
7369 } while (!IsReusedIdx && (IE1 || IE2));
7370 return false;
7371}
7372
7373/// Checks if the specified instruction \p I is an alternate operation for
7374/// the given \p MainOp and \p AltOp instructions.
7375static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
7376 Instruction *AltOp,
7377 const TargetLibraryInfo &TLI);
7378
7379std::optional<BoUpSLP::OrdersType>
7380BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
7381 bool IgnoreReorder) {
7382 // No need to reorder if need to shuffle reuses, still need to shuffle the
7383 // node.
7384 if (!TE.ReuseShuffleIndices.empty()) {
7385 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
7386 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7387 "Reshuffling scalars not yet supported for nodes with padding");
7388
7389 if (isSplat(TE.Scalars))
7390 return std::nullopt;
7391 // Check if reuse shuffle indices can be improved by reordering.
7392 // For this, check that reuse mask is "clustered", i.e. each scalar values
7393 // is used once in each submask of size <number_of_scalars>.
7394 // Example: 4 scalar values.
7395 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
7396 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
7397 // element 3 is used twice in the second submask.
7398 unsigned Sz = TE.Scalars.size();
7399 if (TE.isGather()) {
7400 if (std::optional<OrdersType> CurrentOrder =
7401 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
7402 SmallVector<int> Mask;
7403 fixupOrderingIndices(*CurrentOrder);
7404 inversePermutation(*CurrentOrder, Mask);
7405 ::addMask(Mask, TE.ReuseShuffleIndices);
7406 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7407 unsigned Sz = TE.Scalars.size();
7408 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7409 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
7410 if (Idx != PoisonMaskElem)
7411 Res[Idx + K * Sz] = I + K * Sz;
7412 }
7413 return std::move(Res);
7414 }
7415 }
7416 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7417 ::getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(),
7418 2 * TE.getVectorFactor())) == 1)
7419 return std::nullopt;
7420 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7421 return std::nullopt;
7422 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
7423 Sz)) {
7424 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7425 if (TE.ReorderIndices.empty())
7426 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7427 else
7428 inversePermutation(TE.ReorderIndices, ReorderMask);
7429 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
7430 unsigned VF = ReorderMask.size();
7431 OrdersType ResOrder(VF, VF);
7432 unsigned NumParts = divideCeil(VF, Sz);
7433 SmallBitVector UsedVals(NumParts);
7434 for (unsigned I = 0; I < VF; I += Sz) {
7435 int Val = PoisonMaskElem;
7436 unsigned UndefCnt = 0;
7437 unsigned Limit = std::min(Sz, VF - I);
7438 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
7439 [&](int Idx) {
7440 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
7441 Val = Idx;
7442 if (Idx == PoisonMaskElem)
7443 ++UndefCnt;
7444 return Idx != PoisonMaskElem && Idx != Val;
7445 }) ||
7446 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
7447 UndefCnt > Sz / 2)
7448 return std::nullopt;
7449 UsedVals.set(Val);
7450 for (unsigned K = 0; K < NumParts; ++K) {
7451 unsigned Idx = Val + Sz * K;
7452 if (Idx < VF && I + K < VF)
7453 ResOrder[Idx] = I + K;
7454 }
7455 }
7456 return std::move(ResOrder);
7457 }
7458 unsigned VF = TE.getVectorFactor();
7459 // Try to build the correct order for extractelement instructions.
7460 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
7461 TE.ReuseShuffleIndices.end());
7462 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
7463 all_of(TE.Scalars, [Sz](Value *V) {
7464 if (isa<PoisonValue>(V))
7465 return true;
7466 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
7467 return Idx && *Idx < Sz;
7468 })) {
7469 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
7470 "by BinaryOperator and CastInst.");
7471 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7472 if (TE.ReorderIndices.empty())
7473 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7474 else
7475 inversePermutation(TE.ReorderIndices, ReorderMask);
7476 for (unsigned I = 0; I < VF; ++I) {
7477 int &Idx = ReusedMask[I];
7478 if (Idx == PoisonMaskElem)
7479 continue;
7480 Value *V = TE.Scalars[ReorderMask[Idx]];
7481 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
7482 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
7483 }
7484 }
7485 // Build the order of the VF size; the reuses shuffles need to be reordered,
7486 // as they are always of VF size.
7487 OrdersType ResOrder(VF);
7488 std::iota(ResOrder.begin(), ResOrder.end(), 0);
7489 auto *It = ResOrder.begin();
7490 for (unsigned K = 0; K < VF; K += Sz) {
7491 OrdersType CurrentOrder(TE.ReorderIndices);
7492 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
7493 if (SubMask.front() == PoisonMaskElem)
7494 std::iota(SubMask.begin(), SubMask.end(), 0);
7495 reorderOrder(CurrentOrder, SubMask);
7496 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
7497 std::advance(It, Sz);
7498 }
7499 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
7500 return Data.index() == Data.value();
7501 }))
7502 return std::nullopt; // No need to reorder.
7503 return std::move(ResOrder);
7504 }
7505 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
7506 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
7507 !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
7508 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
7509 return std::nullopt;
7510 if (TE.State == TreeEntry::SplitVectorize ||
7511 ((TE.State == TreeEntry::Vectorize ||
7512 TE.State == TreeEntry::StridedVectorize ||
7513 TE.State == TreeEntry::CompressVectorize) &&
7514 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
7515 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
7516 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
7517 "Alternate instructions are only supported by "
7518 "BinaryOperator and CastInst.");
7519 return TE.ReorderIndices;
7520 }
7521 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
7522 TE.isAltShuffle()) {
7523 assert(TE.ReuseShuffleIndices.empty() &&
7524 "ReuseShuffleIndices should be "
7525 "empty for alternate instructions.");
7526 SmallVector<int> Mask;
7527 TE.buildAltOpShuffleMask(
7528 [&](Instruction *I) {
7529 assert(TE.getMatchingMainOpOrAltOp(I) &&
7530 "Unexpected main/alternate opcode");
7531 return isAlternateInstruction(I, TE.getMainOp(), TE.getAltOp(), *TLI);
7532 },
7533 Mask);
7534 const int VF = TE.getVectorFactor();
7535 OrdersType ResOrder(VF, VF);
7536 for (unsigned I : seq<unsigned>(VF)) {
7537 if (Mask[I] == PoisonMaskElem)
7538 continue;
7539 ResOrder[Mask[I] % VF] = I;
7540 }
7541 return std::move(ResOrder);
7542 }
7543 if (!TE.ReorderIndices.empty())
7544 return TE.ReorderIndices;
7545 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
7546 if (!TE.ReorderIndices.empty())
7547 return TE.ReorderIndices;
7548
7549 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
7550 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
7551 if (isa<Constant>(V) || !V->hasNUsesOrMore(1))
7552 continue;
7553 auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
7554 if (!II)
7555 continue;
7556 Instruction *BVHead = nullptr;
7557 BasicBlock *BB = II->getParent();
7558 while (II && II->hasOneUse() && II->getParent() == BB) {
7559 BVHead = II;
7560 II = dyn_cast<InsertElementInst>(II->getOperand(0));
7561 }
7562 I = BVHead;
7563 }
7564
7565 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
7566 assert(BB1 != BB2 && "Expected different basic blocks.");
7567 if (!DT->isReachableFromEntry(BB1))
7568 return false;
7569 if (!DT->isReachableFromEntry(BB2))
7570 return true;
7571 auto *NodeA = DT->getNode(BB1);
7572 auto *NodeB = DT->getNode(BB2);
7573 assert(NodeA && "Should only process reachable instructions");
7574 assert(NodeB && "Should only process reachable instructions");
7575 assert((NodeA == NodeB) ==
7576 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
7577 "Different nodes should have different DFS numbers");
7578 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
7579 };
7580 auto PHICompare = [&](unsigned I1, unsigned I2) {
7581 Value *V1 = TE.Scalars[I1];
7582 Value *V2 = TE.Scalars[I2];
7583 if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
7584 return false;
7585 if (isa<PoisonValue>(V1))
7586 return true;
7587 if (isa<PoisonValue>(V2))
7588 return false;
7589 if (V1->getNumUses() < V2->getNumUses())
7590 return true;
7591 if (V1->getNumUses() > V2->getNumUses())
7592 return false;
7593 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
7594 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
7595 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
7596 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
7597 FirstUserOfPhi2->getParent());
7598 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
7599 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
7600 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
7601 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
7602 if (IE1 && !IE2)
7603 return true;
7604 if (!IE1 && IE2)
7605 return false;
7606 if (IE1 && IE2) {
7607 if (UserBVHead[I1] && !UserBVHead[I2])
7608 return true;
7609 if (!UserBVHead[I1])
7610 return false;
7611 if (UserBVHead[I1] == UserBVHead[I2])
7612 return getElementIndex(IE1) < getElementIndex(IE2);
7613 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
7614 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
7615 UserBVHead[I2]->getParent());
7616 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
7617 }
7618 if (EE1 && !EE2)
7619 return true;
7620 if (!EE1 && EE2)
7621 return false;
7622 if (EE1 && EE2) {
7623 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
7624 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
7625 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
7626 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
7627 if (!Inst2 && !P2)
7628 return Inst1 || P1;
7629 if (EE1->getOperand(0) == EE2->getOperand(0))
7630 return getElementIndex(EE1) < getElementIndex(EE2);
7631 if (!Inst1 && Inst2)
7632 return false;
7633 if (Inst1 && Inst2) {
7634 if (Inst1->getParent() != Inst2->getParent())
7635 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
7636 return Inst1->comesBefore(Inst2);
7637 }
7638 if (!P1 && P2)
7639 return false;
7640 assert(P1 && P2 &&
7641 "Expected either instructions or arguments vector operands.");
7642 return P1->getArgNo() < P2->getArgNo();
7643 }
7644 return false;
7645 };
7646 OrdersType Phis(TE.Scalars.size());
7647 std::iota(Phis.begin(), Phis.end(), 0);
7648 stable_sort(Phis, PHICompare);
7649 if (isIdentityOrder(Phis))
7650 return std::nullopt; // No need to reorder.
7651 return std::move(Phis);
7652 }
7653 if (TE.isGather() &&
7654 (!TE.hasState() || !TE.isAltShuffle() ||
7655 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
7656 allSameType(TE.Scalars)) {
7657 // TODO: add analysis of other gather nodes with extractelement
7658 // instructions and other values/instructions, not only undefs.
7659 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
7660 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
7661 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
7662 all_of(TE.Scalars, [](Value *V) {
7663 auto *EE = dyn_cast<ExtractElementInst>(V);
7664 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
7665 })) {
7666 // Check that gather of extractelements can be represented as
7667 // just a shuffle of a single vector.
7668 OrdersType CurrentOrder;
7669 bool Reuse =
7670 canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
7671 if (Reuse || !CurrentOrder.empty())
7672 return std::move(CurrentOrder);
7673 }
7674 // If the gather node is <undef, v, .., poison> and
7675 // insertelement poison, v, 0 [+ permute]
7676 // is cheaper than
7677 // insertelement poison, v, n - try to reorder.
7678 // If rotating the whole graph, exclude the permute cost, the whole graph
7679 // might be transformed.
7680 int Sz = TE.Scalars.size();
7681 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
7682 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
7683 const auto *It = find_if_not(TE.Scalars, isConstant);
7684 if (It == TE.Scalars.begin())
7685 return OrdersType();
7686 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
7687 if (It != TE.Scalars.end()) {
7688 OrdersType Order(Sz, Sz);
7689 unsigned Idx = std::distance(TE.Scalars.begin(), It);
7690 Order[Idx] = 0;
7691 fixupOrderingIndices(Order);
7692 SmallVector<int> Mask;
7693 inversePermutation(Order, Mask);
7694 InstructionCost PermuteCost =
7695 TopToBottom
7696 ? 0
7697 : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
7698 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
7699 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
7700 PoisonValue::get(Ty), *It);
7701 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
7702 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
7703 PoisonValue::get(Ty), *It);
7704 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
7705 OrdersType Order(Sz, Sz);
7706 Order[Idx] = 0;
7707 return std::move(Order);
7708 }
7709 }
7710 }
7711 if (isSplat(TE.Scalars))
7712 return std::nullopt;
7713 if (TE.Scalars.size() >= 3)
7714 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
7715 return Order;
7716 // Check if we can include the order of vectorized loads. For masked gathers do
7717 // extra analysis later, so include such nodes into a special list.
7718 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
7719 SmallVector<Value *> PointerOps;
7720 StridedPtrInfo SPtrInfo;
7721 OrdersType CurrentOrder;
7722 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
7723 CurrentOrder, PointerOps, SPtrInfo);
7724 if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
7725 Res == LoadsState::CompressVectorize)
7726 return std::move(CurrentOrder);
7727 }
7728 // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
7729 // has been audited for correctness with non-power-of-two vectors.
7730 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
7731 if (std::optional<OrdersType> CurrentOrder =
7732 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
7733 return CurrentOrder;
7734 }
7735 return std::nullopt;
7736}
7737
7738/// Checks if the given mask is a "clustered" mask with the same clusters of
7739/// size \p Sz, which are not identity submasks.
7740static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
7741 unsigned Sz) {
7742 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
7743 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
7744 return false;
7745 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
7746 ArrayRef<int> Cluster = Mask.slice(I, Sz);
7747 if (Cluster != FirstCluster)
7748 return false;
7749 }
7750 return true;
7751}
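// --- Illustrative sketch (not part of SLPVectorizer.cpp) ---------------------
// A standalone, std::vector-based rendering of the clustered-mask check above,
// so the shape of a "repeated non-identity clustered" mask can be seen in
// isolation. The name is hypothetical, Mask.size() is assumed to be a multiple
// of Sz, and poison elements are ignored (the real check goes through
// ShuffleVectorInst::isIdentityMask).
#include <cstddef>
#include <vector>

static bool isRepeatedNonIdentityClusteredMaskSketch(const std::vector<int> &Mask,
                                                     unsigned Sz) {
  // The first cluster must not be the identity permutation {0, 1, ..., Sz - 1}.
  bool IdentityFirst = true;
  for (unsigned I = 0; I < Sz; ++I)
    IdentityFirst &= Mask[I] == static_cast<int>(I);
  if (IdentityFirst)
    return false;
  // Every following cluster must repeat the first cluster exactly.
  for (std::size_t I = Sz, E = Mask.size(); I < E; I += Sz)
    for (unsigned J = 0; J < Sz; ++J)
      if (Mask[I + J] != Mask[J])
        return false;
  return true;
}
// Example (Sz = 2): {1, 0, 1, 0, 1, 0} -> true; {0, 1, 0, 1} -> false (identity
// first cluster); {1, 0, 0, 1} -> false (second cluster differs from the first).
// -----------------------------------------------------------------------------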
7752
7753void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
7754 // Reorder reuses mask.
7755 reorderReuses(TE.ReuseShuffleIndices, Mask);
7756 const unsigned Sz = TE.Scalars.size();
7757 // For vectorized and non-clustered reuses, no need to do anything else.
7758 if (!TE.isGather() ||
7759 !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
7760 Sz) ||
7761 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
7762 return;
7763 SmallVector<int> NewMask;
7764 inversePermutation(TE.ReorderIndices, NewMask);
7765 addMask(NewMask, TE.ReuseShuffleIndices);
7766 // Clear reorder since it is going to be applied to the new mask.
7767 TE.ReorderIndices.clear();
7768 // Try to improve gathered nodes with clustered reuses, if possible.
7769 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
7770 SmallVector<unsigned> NewOrder(Slice);
7771 inversePermutation(NewOrder, NewMask);
7772 reorderScalars(TE.Scalars, NewMask);
7773 // Fill the reuses mask with the identity submasks.
7774 for (auto *It = TE.ReuseShuffleIndices.begin(),
7775 *End = TE.ReuseShuffleIndices.end();
7776 It != End; std::advance(It, Sz))
7777 std::iota(It, std::next(It, Sz), 0);
7778}
7779
7780static void combineOrders(MutableArrayRef<unsigned> Order,
7781 ArrayRef<unsigned> SecondaryOrder) {
7782 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
7783 "Expected same size of orders");
7784 size_t Sz = Order.size();
7785 SmallBitVector UsedIndices(Sz);
7786 for (unsigned Idx : seq<unsigned>(0, Sz)) {
7787 if (Order[Idx] != Sz)
7788 UsedIndices.set(Order[Idx]);
7789 }
7790 if (SecondaryOrder.empty()) {
7791 for (unsigned Idx : seq<unsigned>(0, Sz))
7792 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
7793 Order[Idx] = Idx;
7794 } else {
7795 for (unsigned Idx : seq<unsigned>(0, Sz))
7796 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
7797 !UsedIndices.test(SecondaryOrder[Idx]))
7798 Order[Idx] = SecondaryOrder[Idx];
7799 }
7800}
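// --- Illustrative sketch (not part of SLPVectorizer.cpp) ---------------------
// The order-merging step above, restated over std::vector: the value Sz marks
// an unset slot; an unset slot keeps its own index if that index is still
// unused, or borrows the index proposed by a secondary order. Hypothetical
// name and plain standard-library types.
#include <vector>

static void combineOrdersSketch(std::vector<unsigned> &Order,
                                const std::vector<unsigned> &Secondary) {
  const unsigned Sz = static_cast<unsigned>(Order.size());
  std::vector<bool> Used(Sz, false);
  for (unsigned Idx : Order)
    if (Idx != Sz)
      Used[Idx] = true;
  for (unsigned Idx = 0; Idx < Sz; ++Idx) {
    if (Order[Idx] != Sz)
      continue; // already set
    if (Secondary.empty()) {
      if (!Used[Idx])
        Order[Idx] = Idx; // keep the element in place
    } else if (Secondary[Idx] != Sz && !Used[Secondary[Idx]]) {
      Order[Idx] = Secondary[Idx]; // take the position from the secondary order
    }
  }
}
// Example (Sz = 4): Order = {4, 0, 4, 4}, Secondary = {1, 3, 3, 2} fills the
// unset slots from the secondary order and yields {1, 0, 3, 2}.
// -----------------------------------------------------------------------------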
7801
7804 return false;
7805
7806 constexpr unsigned TinyVF = 2;
7807 constexpr unsigned TinyTree = 10;
7808 constexpr unsigned PhiOpsLimit = 12;
7809 constexpr unsigned GatherLoadsLimit = 2;
7810 if (VectorizableTree.size() <= TinyTree)
7811 return true;
7812 if (VectorizableTree.front()->hasState() &&
7813 !VectorizableTree.front()->isGather() &&
7814 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
7815 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
7816 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
7817 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
7818 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
7819 VectorizableTree.front()->ReorderIndices.empty()) {
7820 // Check if the tree has only a single store and a single (unordered) load
7821 // node; the other nodes are phis or geps/binops combined with phis, and/or a
7822 // single gather load node.
7823 if (VectorizableTree.front()->hasState() &&
7824 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
7825 VectorizableTree.front()->Scalars.size() == TinyVF &&
7826 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
7827 return false;
7828 // Single node, which requires reordering - skip.
7829 if (VectorizableTree.front()->hasState() &&
7830 VectorizableTree.front()->getOpcode() == Instruction::Store &&
7831 VectorizableTree.front()->ReorderIndices.empty()) {
7832 const unsigned ReorderedSplitsCnt =
7833 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
7834 return TE->State == TreeEntry::SplitVectorize &&
7835 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
7836 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
7837 ::isCommutative(TE->UserTreeIndex.UserTE->getMainOp());
7838 });
7839 if (ReorderedSplitsCnt <= 1 &&
7840 static_cast<unsigned>(count_if(
7841 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
7842 return ((!TE->isGather() &&
7843 (TE->ReorderIndices.empty() ||
7844 (TE->UserTreeIndex.UserTE &&
7845 TE->UserTreeIndex.UserTE->State ==
7846 TreeEntry::Vectorize &&
7847 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
7848 .empty()))) ||
7849 (TE->isGather() && TE->ReorderIndices.empty() &&
7850 (!TE->hasState() || TE->isAltShuffle() ||
7851 TE->getOpcode() == Instruction::Load ||
7852 TE->getOpcode() == Instruction::ZExt ||
7853 TE->getOpcode() == Instruction::SExt))) &&
7854 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
7855 !TE->isGather() || none_of(TE->Scalars, [&](Value *V) {
7856 return !isConstant(V) && isVectorized(V);
7857 }));
7858 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
7859 return false;
7860 }
7861 bool HasPhis = false;
7862 bool HasLoad = true;
7863 unsigned GatherLoads = 0;
7864 for (const std::unique_ptr<TreeEntry> &TE :
7865 ArrayRef(VectorizableTree).drop_front()) {
7866 if (TE->State == TreeEntry::SplitVectorize)
7867 continue;
7868 if (!TE->hasState()) {
7869 if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
7871 continue;
7872 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7874 continue;
7875 return true;
7876 }
7877 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
7878 if (!TE->isGather()) {
7879 HasLoad = false;
7880 continue;
7881 }
7882 if (HasLoad)
7883 return true;
7884 ++GatherLoads;
7885 if (GatherLoads >= GatherLoadsLimit)
7886 return true;
7887 }
7888 if (TE->getOpcode() == Instruction::GetElementPtr ||
7889 Instruction::isBinaryOp(TE->getOpcode()))
7890 continue;
7891 if (TE->getOpcode() != Instruction::PHI &&
7892 (!TE->hasCopyableElements() ||
7893 static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) <
7894 TE->Scalars.size() / 2))
7895 return true;
7896 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7897 TE->getNumOperands() > PhiOpsLimit)
7898 return false;
7899 HasPhis = true;
7900 }
7901 return !HasPhis;
7902 }
7903 return true;
7904}
7905
7906void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
7907 ArrayRef<int> MaskOrder) {
7908 assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
7909 SmallVector<int> NewMask(getVectorFactor());
7910 SmallVector<int> NewMaskOrder(getVectorFactor());
7911 std::iota(NewMask.begin(), NewMask.end(), 0);
7912 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
7913 if (Idx == 0) {
7914 copy(Mask, NewMask.begin());
7915 copy(MaskOrder, NewMaskOrder.begin());
7916 } else {
7917 assert(Idx == 1 && "Expected either 0 or 1 index.");
7918 unsigned Offset = CombinedEntriesWithIndices.back().second;
7919 for (unsigned I : seq<unsigned>(Mask.size())) {
7920 NewMask[I + Offset] = Mask[I] + Offset;
7921 NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
7922 }
7923 }
7924 reorderScalars(Scalars, NewMask);
7925 reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true);
7926 if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(ReorderIndices))
7927 ReorderIndices.clear();
7928}
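// --- Illustrative sketch (not part of SLPVectorizer.cpp) ---------------------
// How the Idx == 1 branch above folds a sub-mask for the second operand of a
// split node into a mask over the whole node: lanes outside the operand's range
// stay identity, lanes inside are shifted by the operand's starting offset.
// Hypothetical name, std::vector instead of SmallVector.
#include <numeric>
#include <vector>

static std::vector<int> widenSplitOperandMaskSketch(const std::vector<int> &SubMask,
                                                    unsigned VF, unsigned Offset) {
  std::vector<int> NewMask(VF);
  std::iota(NewMask.begin(), NewMask.end(), 0); // identity outside the sub-range
  for (unsigned I = 0; I < SubMask.size(); ++I)
    NewMask[I + Offset] = SubMask[I] + static_cast<int>(Offset);
  return NewMask;
}
// Example: VF = 8, Offset = 4, SubMask = {1, 0, 3, 2} -> {0, 1, 2, 3, 5, 4, 7, 6}.
// -----------------------------------------------------------------------------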
7929
7930void BoUpSLP::reorderTopToBottom() {
7931 // Maps VF to the graph nodes.
7932 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
7933 // ExtractElement gather nodes which can be vectorized and need to handle
7934 // their ordering.
7935 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
7936
7937 // Phi nodes can have preferred ordering based on their result users
7938 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
7939
7940 // AltShuffles can also have a preferred ordering that leads to fewer
7941 // instructions, e.g., the addsub instruction in x86.
7942 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
7943
7944 // Maps a TreeEntry to the reorder indices of external users.
7945 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
7946 ExternalUserReorderMap;
7947 // Find all reorderable nodes with the given VF.
7948 // Currently these are vectorized stores, loads, extracts + some gathering of
7949 // extracts.
7950 for_each(VectorizableTree, [&, &TTIRef = *TTI](
7951 const std::unique_ptr<TreeEntry> &TE) {
7952 // Look for external users that will probably be vectorized.
7953 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
7954 findExternalStoreUsersReorderIndices(TE.get());
7955 if (!ExternalUserReorderIndices.empty()) {
7956 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
7957 ExternalUserReorderMap.try_emplace(TE.get(),
7958 std::move(ExternalUserReorderIndices));
7959 }
7960
7961 // Patterns like [fadd,fsub] can be combined into a single instruction in
7962 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
7963 // to take into account their order when looking for the most used order.
7964 if (TE->hasState() && TE->isAltShuffle() &&
7965 TE->State != TreeEntry::SplitVectorize) {
7966 Type *ScalarTy = TE->Scalars[0]->getType();
7967 VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size());
7968 unsigned Opcode0 = TE->getOpcode();
7969 unsigned Opcode1 = TE->getAltOpcode();
7970 SmallBitVector OpcodeMask(
7971 getAltInstrMask(TE->Scalars, ScalarTy, Opcode0, Opcode1));
7972 // If this pattern is supported by the target then we consider the order.
7973 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
7974 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
7975 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
7976 }
7977 // TODO: Check the reverse order too.
7978 }
7979
7980 bool IgnoreReorder =
7981 !UserIgnoreList && VectorizableTree.front()->hasState() &&
7982 (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
7983 VectorizableTree.front()->getOpcode() == Instruction::Store);
7984 if (std::optional<OrdersType> CurrentOrder =
7985 getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
7986 // Do not include ordering for nodes used in the alt opcode vectorization,
7987 // better to reorder them during the bottom-to-top stage. If we follow the
7988 // order here, it causes reordering of the whole graph, though actually it
7989 // is profitable just to reorder the subgraph that starts from the alternate
7990 // opcode vectorization node. Such nodes already end up with a shuffle
7991 // instruction, and it is enough to change this shuffle rather than rotate
7992 // the scalars for the whole graph.
7993 unsigned Cnt = 0;
7994 const TreeEntry *UserTE = TE.get();
7995 while (UserTE && Cnt < RecursionMaxDepth) {
7996 if (!UserTE->UserTreeIndex)
7997 break;
7998 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
7999 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8000 UserTE->UserTreeIndex.UserTE->Idx != 0)
8001 return;
8002 UserTE = UserTE->UserTreeIndex.UserTE;
8003 ++Cnt;
8004 }
8005 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8006 if (!(TE->State == TreeEntry::Vectorize ||
8007 TE->State == TreeEntry::StridedVectorize ||
8008 TE->State == TreeEntry::SplitVectorize ||
8009 TE->State == TreeEntry::CompressVectorize) ||
8010 !TE->ReuseShuffleIndices.empty())
8011 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
8012 if (TE->State == TreeEntry::Vectorize &&
8013 TE->getOpcode() == Instruction::PHI)
8014 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
8015 }
8016 });
8017
8018 // Reorder the graph nodes according to their vectorization factor.
8019 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
8020 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
8021 auto It = VFToOrderedEntries.find(VF);
8022 if (It == VFToOrderedEntries.end())
8023 continue;
8024 // Try to find the most profitable order. We are just looking for the most
8025 // used order and reorder scalar elements in the nodes according to this
8026 // most used order.
8027 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
8028 // Delete VF entry upon exit.
8029 auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });
8030
8031 // All operands are reordered and used only in this node - propagate the
8032 // most used order to the user node.
8033 MapVector<OrdersType, unsigned,
8034 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
8035 OrdersUses;
8036 for (const TreeEntry *OpTE : OrderedEntries) {
8037 // No need to reorder this nodes, still need to extend and to use shuffle,
8038 // just need to merge reordering shuffle and the reuse shuffle.
8039 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
8040 OpTE->State != TreeEntry::SplitVectorize)
8041 continue;
8042 // Count number of orders uses.
8043 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8044 &PhisToOrders]() -> const OrdersType & {
8045 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8046 auto It = GathersToOrders.find(OpTE);
8047 if (It != GathersToOrders.end())
8048 return It->second;
8049 }
8050 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8051 auto It = AltShufflesToOrders.find(OpTE);
8052 if (It != AltShufflesToOrders.end())
8053 return It->second;
8054 }
8055 if (OpTE->State == TreeEntry::Vectorize &&
8056 OpTE->getOpcode() == Instruction::PHI) {
8057 auto It = PhisToOrders.find(OpTE);
8058 if (It != PhisToOrders.end())
8059 return It->second;
8060 }
8061 return OpTE->ReorderIndices;
8062 }();
8063 // First consider the order of the external scalar users.
8064 auto It = ExternalUserReorderMap.find(OpTE);
8065 if (It != ExternalUserReorderMap.end()) {
8066 const auto &ExternalUserReorderIndices = It->second;
8067 // If the OpTE vector factor != number of scalars - use natural order,
8068 // it is an attempt to reorder node with reused scalars but with
8069 // external uses.
8070 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8071 OrdersUses.try_emplace(OrdersType(), 0).first->second +=
8072 ExternalUserReorderIndices.size();
8073 } else {
8074 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
8075 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
8076 }
8077 // No other useful reorder data in this entry.
8078 if (Order.empty())
8079 continue;
8080 }
8081 // Stores actually store the mask, not the order, need to invert.
8082 if (OpTE->State == TreeEntry::Vectorize &&
8083 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8084 assert(!OpTE->isAltShuffle() &&
8085 "Alternate instructions are only supported by BinaryOperator "
8086 "and CastInst.");
8087 SmallVector<int> Mask;
8088 inversePermutation(Order, Mask);
8089 unsigned E = Order.size();
8090 OrdersType CurrentOrder(E, E);
8091 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8092 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8093 });
8094 fixupOrderingIndices(CurrentOrder);
8095 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
8096 } else {
8097 ++OrdersUses.try_emplace(Order, 0).first->second;
8098 }
8099 }
8100 if (OrdersUses.empty())
8101 continue;
8102 // Choose the most used order.
8103 unsigned IdentityCnt = 0;
8104 unsigned FilledIdentityCnt = 0;
8105 OrdersType IdentityOrder(VF, VF);
8106 for (auto &Pair : OrdersUses) {
8107 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8108 if (!Pair.first.empty())
8109 FilledIdentityCnt += Pair.second;
8110 IdentityCnt += Pair.second;
8111 combineOrders(IdentityOrder, Pair.first);
8112 }
8113 }
8114 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8115 unsigned Cnt = IdentityCnt;
8116 for (auto &Pair : OrdersUses) {
8117 // Prefer identity order. But if a filled identity (non-empty) order is
8118 // found with the same number of uses as the new candidate order, we can
8119 // choose this candidate order.
8120 if (Cnt < Pair.second ||
8121 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
8122 Cnt == Pair.second && !BestOrder.empty() &&
8123 isIdentityOrder(BestOrder))) {
8124 combineOrders(Pair.first, BestOrder);
8125 BestOrder = Pair.first;
8126 Cnt = Pair.second;
8127 } else {
8128 combineOrders(BestOrder, Pair.first);
8129 }
8130 }
8131 // Set order of the user node.
8132 if (isIdentityOrder(BestOrder))
8133 continue;
8134 fixupOrderingIndices(BestOrder);
8135 SmallVector<int> Mask;
8136 inversePermutation(BestOrder, Mask);
8137 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8138 unsigned E = BestOrder.size();
8139 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8140 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8141 });
8142 // Do an actual reordering, if profitable.
8143 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8144 // Just do the reordering for the nodes with the given VF.
8145 if (TE->Scalars.size() != VF) {
8146 if (TE->ReuseShuffleIndices.size() == VF) {
8147 assert(TE->State != TreeEntry::SplitVectorize &&
8148 "Split vectorized not expected.");
8149 // Need to reorder the reuses masks of the operands with smaller VF to
8150 // be able to find the match between the graph nodes and scalar
8151 // operands of the given node during vectorization/cost estimation.
8152 assert(
8153 (!TE->UserTreeIndex ||
8154 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8155 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8156 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8157 "All users must be of VF size.");
8158 if (SLPReVec) {
8159 assert(SLPReVec && "Only supported by REVEC.");
8160 // ShuffleVectorInst does not do reorderOperands (and it should not
8161 // because ShuffleVectorInst supports only a limited set of
8162 // patterns). Only do reorderNodeWithReuses if the user is not
8163 // ShuffleVectorInst.
8164 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
8165 isa<ShuffleVectorInst>(TE->UserTreeIndex.UserTE->getMainOp()))
8166 continue;
8167 }
8168 // Update ordering of the operands with the smaller VF than the given
8169 // one.
8170 reorderNodeWithReuses(*TE, Mask);
8171 // Update orders in user split vectorize nodes.
8172 if (TE->UserTreeIndex &&
8173 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8174 TE->UserTreeIndex.UserTE->reorderSplitNode(
8175 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
8176 }
8177 continue;
8178 }
8179 if ((TE->State == TreeEntry::SplitVectorize &&
8180 TE->ReuseShuffleIndices.empty()) ||
8181 ((TE->State == TreeEntry::Vectorize ||
8182 TE->State == TreeEntry::StridedVectorize ||
8183 TE->State == TreeEntry::CompressVectorize) &&
8184 (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
8185 InsertElementInst>(TE->getMainOp()) ||
8186 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
8187 assert(
8188 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
8189 TE->ReuseShuffleIndices.empty())) &&
8190 "Alternate instructions are only supported by BinaryOperator "
8191 "and CastInst.");
8192 // Build correct orders for extract{element,value}, loads,
8193 // stores and alternate (split) nodes.
8194 reorderOrder(TE->ReorderIndices, Mask);
8195 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
8196 TE->reorderOperands(Mask);
8197 } else {
8198 // Reorder the node and its operands.
8199 TE->reorderOperands(Mask);
8200 assert(TE->ReorderIndices.empty() &&
8201 "Expected empty reorder sequence.");
8202 reorderScalars(TE->Scalars, Mask);
8203 }
8204 if (!TE->ReuseShuffleIndices.empty()) {
8205 // Apply reversed order to keep the original ordering of the reused
8206 // elements to avoid extra reorder indices shuffling.
8207 OrdersType CurrentOrder;
8208 reorderOrder(CurrentOrder, MaskOrder);
8209 SmallVector<int> NewReuses;
8210 inversePermutation(CurrentOrder, NewReuses);
8211 addMask(NewReuses, TE->ReuseShuffleIndices);
8212 TE->ReuseShuffleIndices.swap(NewReuses);
8213 } else if (TE->UserTreeIndex &&
8214 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8215 // Update orders in user split vectorize nodes.
8216 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
8217 Mask, MaskOrder);
8218 }
8219 }
8220}
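// --- Illustrative sketch (not part of SLPVectorizer.cpp) ---------------------
// The core of the order-voting loop above, reduced to plain counting: every
// entry votes for an order, the empty order stands for identity, and identity
// is kept unless some non-identity order is strictly more popular. The real
// code additionally folds filled identity orders into the identity bucket and
// merges partial orders via combineOrders; this sketch keeps only the idea.
#include <map>
#include <vector>

using OrderSketch = std::vector<unsigned>; // empty == identity order

static OrderSketch
pickMostUsedOrderSketch(const std::map<OrderSketch, unsigned> &Uses) {
  unsigned IdentityCnt = 0;
  for (const auto &[Order, Cnt] : Uses)
    if (Order.empty())
      IdentityCnt += Cnt;
  OrderSketch Best; // identity by default
  unsigned BestCnt = IdentityCnt;
  for (const auto &[Order, Cnt] : Uses)
    if (!Order.empty() && Cnt > BestCnt) { // identity wins ties
      Best = Order;
      BestCnt = Cnt;
    }
  return Best;
}
// Example: votes {{} -> 2, {1, 0, 3, 2} -> 3} pick {1, 0, 3, 2}; with 3 votes
// each, the identity order is kept and nothing is reordered.
// -----------------------------------------------------------------------------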
8221
8222void BoUpSLP::buildReorderableOperands(
8223 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
8224 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
8225 SmallVectorImpl<TreeEntry *> &GatherOps) {
8226 for (unsigned I : seq<unsigned>(UserTE->getNumOperands())) {
8227 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
8228 return OpData.first == I &&
8229 (OpData.second->State == TreeEntry::Vectorize ||
8230 OpData.second->State == TreeEntry::StridedVectorize ||
8231 OpData.second->State == TreeEntry::CompressVectorize ||
8232 OpData.second->State == TreeEntry::SplitVectorize);
8233 }))
8234 continue;
8235 // Do not request operands, if they do not exist.
8236 if (UserTE->hasState()) {
8237 if (UserTE->getOpcode() == Instruction::ExtractElement ||
8238 UserTE->getOpcode() == Instruction::ExtractValue)
8239 continue;
8240 if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
8241 continue;
8242 if (UserTE->getOpcode() == Instruction::Store &&
8243 UserTE->State == TreeEntry::Vectorize && I == 1)
8244 continue;
8245 if (UserTE->getOpcode() == Instruction::Load &&
8246 (UserTE->State == TreeEntry::Vectorize ||
8247 UserTE->State == TreeEntry::StridedVectorize ||
8248 UserTE->State == TreeEntry::CompressVectorize))
8249 continue;
8250 }
8251 TreeEntry *TE = getOperandEntry(UserTE, I);
8252 assert(TE && "Expected operand entry.");
8253 if (!TE->isGather()) {
8254 // Add the node to the list of the ordered nodes with the identity
8255 // order.
8256 Edges.emplace_back(I, TE);
8257 // Add ScatterVectorize nodes to the list of operands, where just
8258 // reordering of the scalars is required. Similar to the gathers, so
8259 // simply add to the list of gathered ops.
8260 // If there are reused scalars, process this node as a regular vectorize
8261 // node, just reorder reuses mask.
8262 if (TE->State == TreeEntry::ScatterVectorize &&
8263 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
8264 GatherOps.push_back(TE);
8265 continue;
8266 }
8267 if (ReorderableGathers.contains(TE))
8268 GatherOps.push_back(TE);
8269 }
8270}
8271
8272void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
8273 struct TreeEntryCompare {
8274 bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
8275 if (LHS->UserTreeIndex && RHS->UserTreeIndex)
8276 return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
8277 return LHS->Idx < RHS->Idx;
8278 }
8279 };
8280 PriorityQueue<TreeEntry *, SmallVector<TreeEntry *>, TreeEntryCompare> Queue;
8281 DenseSet<const TreeEntry *> GathersToOrders;
8282 // Find all reorderable leaf nodes with the given VF.
8283 // Currently these are vectorized loads, extracts without alternate operands +
8284 // some gathering of extracts.
8285 SmallPtrSet<const TreeEntry *, 4> NonVectorized;
8286 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8287 if (TE->State != TreeEntry::Vectorize &&
8288 TE->State != TreeEntry::StridedVectorize &&
8289 TE->State != TreeEntry::CompressVectorize &&
8290 TE->State != TreeEntry::SplitVectorize)
8291 NonVectorized.insert(TE.get());
8292 if (std::optional<OrdersType> CurrentOrder =
8293 getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) {
8294 Queue.push(TE.get());
8295 if (!(TE->State == TreeEntry::Vectorize ||
8296 TE->State == TreeEntry::StridedVectorize ||
8297 TE->State == TreeEntry::CompressVectorize ||
8298 TE->State == TreeEntry::SplitVectorize) ||
8299 !TE->ReuseShuffleIndices.empty())
8300 GathersToOrders.insert(TE.get());
8301 }
8302 }
8303
8304 // 1. Propagate order to the graph nodes, which use only reordered nodes.
8305 // I.e., if the node has operands that are reordered, try to make at least
8306 // one operand order in the natural order and reorder others + reorder the
8307 // user node itself.
8308 SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
8309 while (!Queue.empty()) {
8310 // 1. Filter out only reordered nodes.
8311 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
8312 TreeEntry *TE = Queue.top();
8313 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
8314 Queue.pop();
8315 SmallVector<TreeEntry *> OrderedOps(1, TE);
8316 while (!Queue.empty()) {
8317 TE = Queue.top();
8318 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
8319 break;
8320 Queue.pop();
8321 OrderedOps.push_back(TE);
8322 }
8323 for (TreeEntry *TE : OrderedOps) {
8324 if (!(TE->State == TreeEntry::Vectorize ||
8325 TE->State == TreeEntry::StridedVectorize ||
8326 TE->State == TreeEntry::CompressVectorize ||
8327 TE->State == TreeEntry::SplitVectorize ||
8328 (TE->isGather() && GathersToOrders.contains(TE))) ||
8329 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
8330 !Visited.insert(TE).second)
8331 continue;
8332 // Build a map between user nodes and their operand orders to speed up
8333 // search. The graph currently does not provide this dependency directly.
8334 Users.first = TE->UserTreeIndex.UserTE;
8335 Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
8336 }
8337 if (Users.first) {
8338 auto &Data = Users;
8339 if (Data.first->State == TreeEntry::SplitVectorize) {
8340 assert(
8341 Data.second.size() <= 2 &&
8342 "Expected not greater than 2 operands for split vectorize node.");
8343 if (any_of(Data.second,
8344 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
8345 continue;
8346 // Update orders in user split vectorize nodes.
8347 assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
8348 "Expected exactly 2 entries.");
8349 for (const auto &P : Data.first->CombinedEntriesWithIndices) {
8350 TreeEntry &OpTE = *VectorizableTree[P.first];
8351 OrdersType Order = OpTE.ReorderIndices;
8352 if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
8353 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
8354 continue;
8355 const auto BestOrder =
8356 getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder);
8357 if (!BestOrder || BestOrder->empty() || isIdentityOrder(*BestOrder))
8358 continue;
8359 Order = *BestOrder;
8360 }
8361 fixupOrderingIndices(Order);
8362 SmallVector<int> Mask;
8363 inversePermutation(Order, Mask);
8364 const unsigned E = Order.size();
8365 SmallVector<int> MaskOrder(E, PoisonMaskElem);
8366 transform(Order, MaskOrder.begin(), [E](unsigned I) {
8367 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8368 });
8369 Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
8370 // Clear ordering of the operand.
8371 if (!OpTE.ReorderIndices.empty()) {
8372 OpTE.ReorderIndices.clear();
8373 } else if (!OpTE.ReuseShuffleIndices.empty()) {
8374 reorderReuses(OpTE.ReuseShuffleIndices, Mask);
8375 } else {
8376 assert(OpTE.isGather() && "Expected only gather/buildvector node.");
8377 reorderScalars(OpTE.Scalars, Mask);
8378 }
8379 }
8380 if (Data.first->ReuseShuffleIndices.empty() &&
8381 !Data.first->ReorderIndices.empty()) {
8382 // Insert user node to the list to try to sink reordering deeper in
8383 // the graph.
8384 Queue.push(Data.first);
8385 }
8386 continue;
8387 }
8388 // Check that operands are used only in the User node.
8389 SmallVector<TreeEntry *> GatherOps;
8390 buildReorderableOperands(Data.first, Data.second, NonVectorized,
8391 GatherOps);
8392 // All operands are reordered and used only in this node - propagate the
8393 // most used order to the user node.
8394 MapVector<OrdersType, unsigned,
8395 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
8396 OrdersUses;
8397 // Do the analysis for each tree entry only once, otherwise the order of
8398 // the same node may be considered several times, though it might not be
8399 // profitable.
8400 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
8401 SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
8402 for (const auto &Op : Data.second) {
8403 TreeEntry *OpTE = Op.second;
8404 if (!VisitedOps.insert(OpTE).second)
8405 continue;
8406 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
8407 continue;
8408 const auto Order = [&]() -> const OrdersType {
8409 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
8410 return getReorderingData(*OpTE, /*TopToBottom=*/false,
8411 IgnoreReorder)
8412 .value_or(OrdersType(1));
8413 return OpTE->ReorderIndices;
8414 }();
8415 // The order is partially ordered, skip it in favor of fully non-ordered
8416 // orders.
8417 if (Order.size() == 1)
8418 continue;
8419
8420 // Check that the reordering does not increase the number of shuffles, i.e.
8421 // same-values nodes have the same parents or their parents have the same parents.
8422 if (!Order.empty() && !isIdentityOrder(Order)) {
8423 Value *Root = OpTE->hasState()
8424 ? OpTE->getMainOp()
8425 : *find_if_not(OpTE->Scalars, isConstant);
8426 auto GetSameNodesUsers = [&](Value *Root) {
8427 SmallSetVector<TreeEntry *, 4> Res;
8428 for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
8429 if (TE != OpTE && TE->UserTreeIndex &&
8430 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8431 TE->Scalars.size() == OpTE->Scalars.size() &&
8432 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8433 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8434 Res.insert(TE->UserTreeIndex.UserTE);
8435 }
8436 for (const TreeEntry *TE : getTreeEntries(Root)) {
8437 if (TE != OpTE && TE->UserTreeIndex &&
8438 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8439 TE->Scalars.size() == OpTE->Scalars.size() &&
8440 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8441 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8442 Res.insert(TE->UserTreeIndex.UserTE);
8443 }
8444 return Res.takeVector();
8445 };
8446 auto GetNumOperands = [](const TreeEntry *TE) {
8447 if (TE->State == TreeEntry::SplitVectorize)
8448 return TE->getNumOperands();
8449 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8450 return CI->arg_size();
8451 return TE->getNumOperands();
8452 };
8453 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
8454 const TreeEntry *TE) {
8455 Intrinsic::ID ID = Intrinsic::not_intrinsic;
8456 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8457 ID = getVectorIntrinsicIDForCall(CI, TLI);
8458 for (unsigned Idx : seq<unsigned>(GetNumOperands(TE))) {
8459 if (ID != Intrinsic::not_intrinsic &&
8460 isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI))
8461 continue;
8462 const TreeEntry *Op = getOperandEntry(TE, Idx);
8463 if (Op->isGather() && Op->hasState()) {
8464 const TreeEntry *VecOp =
8465 getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
8466 if (VecOp)
8467 Op = VecOp;
8468 }
8469 if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
8470 return false;
8471 }
8472 return true;
8473 };
8474 SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
8475 if (!Users.empty() && !all_of(Users, [&](TreeEntry *UTE) {
8476 if (!RevisitedOps.insert(UTE).second)
8477 return false;
8478 return UTE == Data.first || !UTE->ReorderIndices.empty() ||
8479 !UTE->ReuseShuffleIndices.empty() ||
8480 (UTE->UserTreeIndex &&
8481 UTE->UserTreeIndex.UserTE == Data.first) ||
8482 (Data.first->UserTreeIndex &&
8483 Data.first->UserTreeIndex.UserTE == UTE) ||
8484 (IgnoreReorder && UTE->UserTreeIndex &&
8485 UTE->UserTreeIndex.UserTE->Idx == 0) ||
8486 NodeShouldBeReorderedWithOperands(UTE);
8487 }))
8488 continue;
8489 for (TreeEntry *UTE : Users) {
8490 Intrinsic::ID ID = Intrinsic::not_intrinsic;
8491 if (auto *CI = dyn_cast<CallInst>(UTE->getMainOp()); CI)
8492 ID = getVectorIntrinsicIDForCall(CI, TLI);
8493 for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
8494 if (ID != Intrinsic::not_intrinsic &&
8495 isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI))
8496 continue;
8497 const TreeEntry *Op = getOperandEntry(UTE, Idx);
8498 Visited.erase(Op);
8499 Queue.push(const_cast<TreeEntry *>(Op));
8500 }
8501 }
8502 }
8503 unsigned NumOps = count_if(
8504 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
8505 return P.second == OpTE;
8506 });
8507 // Stores actually store the mask, not the order, need to invert.
8508 if (OpTE->State == TreeEntry::Vectorize &&
8509 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8510 assert(!OpTE->isAltShuffle() &&
8511 "Alternate instructions are only supported by BinaryOperator "
8512 "and CastInst.");
8513 SmallVector<int> Mask;
8514 inversePermutation(Order, Mask);
8515 unsigned E = Order.size();
8516 OrdersType CurrentOrder(E, E);
8517 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8518 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8519 });
8520 fixupOrderingIndices(CurrentOrder);
8521 OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
8522 } else {
8523 OrdersUses.try_emplace(Order, 0).first->second += NumOps;
8524 }
8525 auto Res = OrdersUses.try_emplace(OrdersType(), 0);
8526 const auto AllowsReordering = [&](const TreeEntry *TE) {
8527 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
8528 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
8529 (IgnoreReorder && TE->Idx == 0))
8530 return true;
8531 if (TE->isGather()) {
8532 if (GathersToOrders.contains(TE))
8533 return !getReorderingData(*TE, /*TopToBottom=*/false,
8534 IgnoreReorder)
8535 .value_or(OrdersType(1))
8536 .empty();
8537 return true;
8538 }
8539 return false;
8540 };
8541 if (OpTE->UserTreeIndex) {
8542 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
8543 if (!VisitedUsers.insert(UserTE).second)
8544 continue;
8545 // May reorder user node if it requires reordering, has reused
8546 // scalars, is an alternate op vectorize node or its op nodes require
8547 // reordering.
8548 if (AllowsReordering(UserTE))
8549 continue;
8550 // Check if users allow reordering.
8551 // Currently look up just 1 level of operands to avoid increase of
8552 // the compile time.
8553 // Profitable to reorder if definitely more operands allow
8554 // reordering rather than those with natural order.
8555 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Data.second;
8556 if (static_cast<unsigned>(count_if(
8557 Ops, [UserTE, &AllowsReordering](
8558 const std::pair<unsigned, TreeEntry *> &Op) {
8559 return AllowsReordering(Op.second) &&
8560 Op.second->UserTreeIndex.UserTE == UserTE;
8561 })) <= Ops.size() / 2)
8562 ++Res.first->second;
8563 }
8564 }
8565 if (OrdersUses.empty()) {
8566 Visited.insert_range(llvm::make_second_range(Data.second));
8567 continue;
8568 }
8569 // Choose the most used order.
8570 unsigned IdentityCnt = 0;
8571 unsigned VF = Data.second.front().second->getVectorFactor();
8572 OrdersType IdentityOrder(VF, VF);
8573 for (auto &Pair : OrdersUses) {
8574 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8575 IdentityCnt += Pair.second;
8576 combineOrders(IdentityOrder, Pair.first);
8577 }
8578 }
8579 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8580 unsigned Cnt = IdentityCnt;
8581 for (auto &Pair : OrdersUses) {
8582 // Prefer identity order. But if a filled identity (non-empty) order is
8583 // found with the same number of uses as the new candidate order, we can
8584 // choose this candidate order.
8585 if (Cnt < Pair.second) {
8586 combineOrders(Pair.first, BestOrder);
8587 BestOrder = Pair.first;
8588 Cnt = Pair.second;
8589 } else {
8590 combineOrders(BestOrder, Pair.first);
8591 }
8592 }
8593 // Set order of the user node.
8594 if (isIdentityOrder(BestOrder)) {
8595 Visited.insert_range(llvm::make_second_range(Data.second));
8596 continue;
8597 }
8598 fixupOrderingIndices(BestOrder);
8599 // Erase operands from OrderedEntries list and adjust their orders.
8600 VisitedOps.clear();
8601 SmallVector<int> Mask;
8602 inversePermutation(BestOrder, Mask);
8603 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8604 unsigned E = BestOrder.size();
8605 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8606 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8607 });
8608 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
8609 TreeEntry *TE = Op.second;
8610 if (!VisitedOps.insert(TE).second)
8611 continue;
8612 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
8613 reorderNodeWithReuses(*TE, Mask);
8614 continue;
8615 }
8616 // Gathers are processed separately.
8617 if (TE->State != TreeEntry::Vectorize &&
8618 TE->State != TreeEntry::StridedVectorize &&
8619 TE->State != TreeEntry::CompressVectorize &&
8620 TE->State != TreeEntry::SplitVectorize &&
8621 (TE->State != TreeEntry::ScatterVectorize ||
8622 TE->ReorderIndices.empty()))
8623 continue;
8624 assert((BestOrder.size() == TE->ReorderIndices.size() ||
8625 TE->ReorderIndices.empty()) &&
8626 "Non-matching sizes of user/operand entries.");
8627 reorderOrder(TE->ReorderIndices, Mask);
8628 if (IgnoreReorder && TE == VectorizableTree.front().get())
8629 IgnoreReorder = false;
8630 }
8631 // For gathers just need to reorder its scalars.
8632 for (TreeEntry *Gather : GatherOps) {
8633 assert(Gather->ReorderIndices.empty() &&
8634 "Unexpected reordering of gathers.");
8635 if (!Gather->ReuseShuffleIndices.empty()) {
8636 // Just reorder reuses indices.
8637 reorderReuses(Gather->ReuseShuffleIndices, Mask);
8638 continue;
8639 }
8640 reorderScalars(Gather->Scalars, Mask);
8641 Visited.insert(Gather);
8642 }
8643 // Reorder operands of the user node and set the ordering for the user
8644 // node itself.
8645 auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
8646 return TE.isAltShuffle() &&
8647 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
8648 TE.ReorderIndices.empty());
8649 };
8650 if (Data.first->State != TreeEntry::Vectorize ||
8651 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
8652 Data.first->getMainOp()) ||
8653 IsNotProfitableAltCodeNode(*Data.first))
8654 Data.first->reorderOperands(Mask);
8655 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
8656 IsNotProfitableAltCodeNode(*Data.first) ||
8657 Data.first->State == TreeEntry::StridedVectorize ||
8658 Data.first->State == TreeEntry::CompressVectorize) {
8659 reorderScalars(Data.first->Scalars, Mask);
8660 reorderOrder(Data.first->ReorderIndices, MaskOrder,
8661 /*BottomOrder=*/true);
8662 if (Data.first->ReuseShuffleIndices.empty() &&
8663 !Data.first->ReorderIndices.empty() &&
8664 !IsNotProfitableAltCodeNode(*Data.first)) {
8665 // Insert user node to the list to try to sink reordering deeper in
8666 // the graph.
8667 Queue.push(Data.first);
8668 }
8669 } else {
8670 reorderOrder(Data.first->ReorderIndices, Mask);
8671 }
8672 }
8673 }
8674 // If the reordering is unnecessary, just remove the reorder.
8675 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
8676 VectorizableTree.front()->ReuseShuffleIndices.empty())
8677 VectorizableTree.front()->ReorderIndices.clear();
8678}
8679
8680Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
8681 if (Entry.hasState() &&
8682 (Entry.getOpcode() == Instruction::Store ||
8683 Entry.getOpcode() == Instruction::Load) &&
8684 Entry.State == TreeEntry::StridedVectorize &&
8685 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
8686 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
8687 return dyn_cast<Instruction>(Entry.Scalars.front());
8688}
8689
8690void BoUpSLP::buildExternalUses(
8691 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
8692 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
8693 DenseMap<Value *, unsigned> ScalarToExtUses;
8694 SmallPtrSet<Value *, 4> ExternalUsers;
8695 // Collect the values that we need to extract from the tree.
8696 for (auto &TEPtr : VectorizableTree) {
8697 TreeEntry *Entry = TEPtr.get();
8698
8699 // No need to handle users of gathered values.
8700 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
8701 continue;
8702
8703 // For each lane:
8704 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
8705 Value *Scalar = Entry->Scalars[Lane];
8706 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
8707 continue;
8708
8709 // All uses must be replaced already? No need to do it again.
8710 auto It = ScalarToExtUses.find(Scalar);
8711 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
8712 continue;
8713
8714 if (Scalar->hasNUsesOrMore(NumVectScalars)) {
8715 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8716 LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
8717 << " from " << *Scalar << " for many users.\n");
8718 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
8719 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8720 ExternalUsesWithNonUsers.insert(Scalar);
8721 continue;
8722 }
8723
8724 // Check if the scalar is externally used as an extra arg.
8725 const auto ExtI = ExternallyUsedValues.find(Scalar);
8726 if (ExtI != ExternallyUsedValues.end()) {
8727 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8728 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
8729 << FoundLane << " from " << *Scalar << ".\n");
8730 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
8731 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8732 continue;
8733 }
8734 for (User *U : Scalar->users()) {
8735 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
8736
8737 Instruction *UserInst = dyn_cast<Instruction>(U);
8738 if (!UserInst || isDeleted(UserInst))
8739 continue;
8740
8741 // Ignore users in the user ignore list.
8742 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
8743 continue;
8744
8745 // Skip in-tree scalars that become vectors
8746 if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
8747 !UseEntries.empty()) {
8748 // Some in-tree scalars will remain as scalar in vectorized
8749 // instructions. If that is the case, the one in FoundLane will
8750 // be used.
8751 if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
8752 isa<LoadInst, StoreInst>(UserInst)) ||
8753 isa<CallInst>(UserInst)) ||
8754 all_of(UseEntries, [&](TreeEntry *UseEntry) {
8755 return UseEntry->State == TreeEntry::ScatterVectorize ||
8756 !doesInTreeUserNeedToExtract(
8757 Scalar, getRootEntryInstruction(*UseEntry), TLI,
8758 TTI);
8759 })) {
8760 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
8761 << ".\n");
8762 assert(none_of(UseEntries,
8763 [](TreeEntry *UseEntry) {
8764 return UseEntry->isGather();
8765 }) &&
8766 "Bad state");
8767 continue;
8768 }
8769 U = nullptr;
8770 if (It != ScalarToExtUses.end()) {
8771 ExternalUses[It->second].User = nullptr;
8772 break;
8773 }
8774 }
8775
8776 if (U && Scalar->hasNUsesOrMore(UsesLimit))
8777 U = nullptr;
8778 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8779 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
8780 << " from lane " << FoundLane << " from " << *Scalar
8781 << ".\n");
8782 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
8783 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
8784 ExternalUsesWithNonUsers.insert(Scalar);
8785 if (!U)
8786 break;
8787 }
8788 }
8789 }
8790}
8791
8792SmallVector<SmallVector<StoreInst *>>
8793BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
8794 SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
8795 SmallVector<StoreInst *>, 8>
8796 PtrToStoresMap;
8797 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
8798 Value *V = TE->Scalars[Lane];
8799 // Don't iterate over the users of constant data.
8800 if (!isa<Instruction>(V))
8801 continue;
8802 // To save compilation time we don't visit if we have too many users.
8803 if (V->hasNUsesOrMore(UsesLimit))
8804 break;
8805
8806 // Collect stores per pointer object.
8807 for (User *U : V->users()) {
8808 auto *SI = dyn_cast<StoreInst>(U);
8809 // Test whether we can handle the store. V might be a global, which could
8810 // be used in a different function.
8811 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
8812 !isValidElementType(SI->getValueOperand()->getType()))
8813 continue;
8814 // Skip entry if already vectorized.
8815 if (isVectorized(U))
8816 continue;
8817
8818 Value *Ptr =
8819 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
8820 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
8821 SI->getValueOperand()->getType(), Ptr}];
8822 // For now just keep one store per pointer object per lane.
8823 // TODO: Extend this to support multiple stores per pointer per lane
8824 if (StoresVec.size() > Lane)
8825 continue;
8826 if (!StoresVec.empty()) {
8827 std::optional<int64_t> Diff = getPointersDiff(
8828 SI->getValueOperand()->getType(), SI->getPointerOperand(),
8829 SI->getValueOperand()->getType(),
8830 StoresVec.front()->getPointerOperand(), *DL, *SE,
8831 /*StrictCheck=*/true);
8832 // We failed to compare the pointers so just abandon this store.
8833 if (!Diff)
8834 continue;
8835 }
8836 StoresVec.push_back(SI);
8837 }
8838 }
8839 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
8840 unsigned I = 0;
8841 for (auto &P : PtrToStoresMap) {
8842 Res[I].swap(P.second);
8843 ++I;
8844 }
8845 return Res;
8846}
8847
8848bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
8849 OrdersType &ReorderIndices) const {
8850 // We check whether the stores in StoresVec can form a vector by sorting them
8851 // and checking whether they are consecutive.
8852
8853 // To avoid calling getPointersDiff() while sorting we create a vector of
8854 // pairs {store, offset from first} and sort this instead.
8855 SmallVector<std::pair<int64_t, unsigned>> StoreOffsetVec;
8856 StoreInst *S0 = StoresVec[0];
8857 StoreOffsetVec.emplace_back(0, 0);
8858 Type *S0Ty = S0->getValueOperand()->getType();
8859 Value *S0Ptr = S0->getPointerOperand();
8860 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
8861 StoreInst *SI = StoresVec[Idx];
8862 std::optional<int64_t> Diff =
8863 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
8864 SI->getPointerOperand(), *DL, *SE,
8865 /*StrictCheck=*/true);
8866 StoreOffsetVec.emplace_back(*Diff, Idx);
8867 }
8868
8869 // Check if the stores are consecutive by checking if their difference is 1.
8870 if (StoreOffsetVec.size() != StoresVec.size())
8871 return false;
8872 sort(StoreOffsetVec, llvm::less_first());
8873 unsigned Idx = 0;
8874 int64_t PrevDist = 0;
8875 for (const auto &P : StoreOffsetVec) {
8876 if (Idx > 0 && P.first != PrevDist + 1)
8877 return false;
8878 PrevDist = P.first;
8879 ++Idx;
8880 }
8881
8882 // Calculate the shuffle indices according to their offset against the sorted
8883 // StoreOffsetVec.
8884 ReorderIndices.assign(StoresVec.size(), 0);
8885 bool IsIdentity = true;
8886 for (auto [I, P] : enumerate(StoreOffsetVec)) {
8887 ReorderIndices[P.second] = I;
8888 IsIdentity &= P.second == I;
8889 }
8890 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
8891 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
8892 // same convention here.
8893 if (IsIdentity)
8894 ReorderIndices.clear();
8895
8896 return true;
8897}
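// --- Illustrative sketch (not part of SLPVectorizer.cpp) ---------------------
// The consecutive-store check above over plain element offsets: given each
// store's offset from the first one, the group forms a vector only if the
// sorted offsets are consecutive, and a store's reorder index is its rank in
// the sorted sequence. Hypothetical name; the real code derives the offsets
// with getPointersDiff.
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

static bool canFormVectorSketch(const std::vector<std::int64_t> &OffsetsFromFirst,
                                std::vector<unsigned> &ReorderIndices) {
  std::vector<std::pair<std::int64_t, unsigned>> Sorted;
  for (unsigned Idx = 0; Idx < OffsetsFromFirst.size(); ++Idx)
    Sorted.emplace_back(OffsetsFromFirst[Idx], Idx);
  std::sort(Sorted.begin(), Sorted.end());
  for (unsigned I = 1; I < Sorted.size(); ++I)
    if (Sorted[I].first != Sorted[I - 1].first + 1)
      return false; // not consecutive
  ReorderIndices.assign(Sorted.size(), 0);
  bool IsIdentity = true;
  for (unsigned I = 0; I < Sorted.size(); ++I) {
    ReorderIndices[Sorted[I].second] = I;
    IsIdentity &= Sorted[I].second == I;
  }
  if (IsIdentity)
    ReorderIndices.clear(); // identity order is modeled as an empty order
  return true;
}
// Example: offsets {0, 2, 1, 3} are consecutive once sorted and give
// ReorderIndices = {0, 2, 1, 3}; offsets {0, 2, 4, 6} are rejected.
// -----------------------------------------------------------------------------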
8898
8899#ifndef NDEBUG
8900static void dumpOrder(const BoUpSLP::OrdersType &Order) {
8901 for (unsigned Idx : Order)
8902 dbgs() << Idx << ", ";
8903 dbgs() << "\n";
8904}
8905#endif
8906
8907SmallVector<BoUpSLP::OrdersType, 1>
8908BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
8909 unsigned NumLanes = TE->Scalars.size();
8910
8911 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
8912
8913 // Holds the reorder indices for each candidate store vector that is a user of
8914 // the current TreeEntry.
8915 SmallVector<OrdersType, 1> ExternalReorderIndices;
8916
8917 // Now inspect the stores collected per pointer and look for vectorization
8918 // candidates. For each candidate calculate the reorder index vector and push
8919 // it into `ExternalReorderIndices`
8920 for (ArrayRef<StoreInst *> StoresVec : Stores) {
8921 // If we have fewer than NumLanes stores, then we can't form a vector.
8922 if (StoresVec.size() != NumLanes)
8923 continue;
8924
8925 // If the stores are not consecutive then abandon this StoresVec.
8926 OrdersType ReorderIndices;
8927 if (!canFormVector(StoresVec, ReorderIndices))
8928 continue;
8929
8930 // We now know that the scalars in StoresVec can form a vector instruction,
8931 // so set the reorder indices.
8932 ExternalReorderIndices.push_back(ReorderIndices);
8933 }
8934 return ExternalReorderIndices;
8935}
8936
8937void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
8938 const SmallDenseSet<Value *> &UserIgnoreLst) {
8939 deleteTree();
8940 assert(TreeEntryToStridedPtrInfoMap.empty() &&
8941 "TreeEntryToStridedPtrInfoMap is not cleared");
8942 UserIgnoreList = &UserIgnoreLst;
8943 if (!allSameType(Roots))
8944 return;
8945 buildTreeRec(Roots, 0, EdgeInfo());
8946}
8947
8948void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
8949 deleteTree();
8950 assert(TreeEntryToStridedPtrInfoMap.empty() &&
8951 "TreeEntryToStridedPtrInfoMap is not cleared");
8952 if (!allSameType(Roots))
8953 return;
8954 buildTreeRec(Roots, 0, EdgeInfo());
8955}
8956
8957 /// Tries to find a subvector of loads and builds a new vector of only loads if
8958 /// it can be profitable.
8959static void gatherPossiblyVectorizableLoads(
8960 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
8961 ScalarEvolution &SE, const TargetTransformInfo &TTI,
8962 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>> &GatheredLoads,
8963 bool AddNew = true) {
8964 if (VL.empty())
8965 return;
8966 Type *ScalarTy = getValueType(VL.front());
8967 if (!isValidElementType(ScalarTy))
8968 return;
8969 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>> ClusteredLoads;
8970 SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
8971 for (Value *V : VL) {
8972 auto *LI = dyn_cast<LoadInst>(V);
8973 if (!LI)
8974 continue;
8975 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
8976 continue;
8977 bool IsFound = false;
8978 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
8979 assert(LI->getParent() == Data.front().first->getParent() &&
8980 LI->getType() == Data.front().first->getType() &&
8981 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
8982 getUnderlyingObject(Data.front().first->getPointerOperand(),
8983 RecursionMaxDepth) &&
8984 "Expected loads with the same type, same parent and same "
8985 "underlying pointer.");
8986 std::optional<int64_t> Dist = getPointersDiff(
8987 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
8988 Data.front().first->getPointerOperand(), DL, SE,
8989 /*StrictCheck=*/true);
8990 if (!Dist)
8991 continue;
8992 auto It = Map.find(*Dist);
8993 if (It != Map.end() && It->second != LI)
8994 continue;
8995 if (It == Map.end()) {
8996 Data.emplace_back(LI, *Dist);
8997 Map.try_emplace(*Dist, LI);
8998 }
8999 IsFound = true;
9000 break;
9001 }
9002 if (!IsFound) {
9003 ClusteredLoads.emplace_back().emplace_back(LI, 0);
9004 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
9005 }
9006 }
9007 auto FindMatchingLoads =
9008 [&](ArrayRef<std::pair<LoadInst *, int64_t>> Loads,
9009 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>>
9010 &GatheredLoads,
9011 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
9012 int64_t &Offset, unsigned &Start) {
9013 if (Loads.empty())
9014 return GatheredLoads.end();
9015 LoadInst *LI = Loads.front().first;
9016 for (auto [Idx, Data] : enumerate(GatheredLoads)) {
9017 if (Idx < Start)
9018 continue;
9019 ToAdd.clear();
9020 if (LI->getParent() != Data.front().first->getParent() ||
9021 LI->getType() != Data.front().first->getType())
9022 continue;
9023 std::optional<int64_t> Dist =
9024 getPointersDiff(LI->getType(), LI->getPointerOperand(),
9025 Data.front().first->getType(),
9026 Data.front().first->getPointerOperand(), DL, SE,
9027 /*StrictCheck=*/true);
9028 if (!Dist)
9029 continue;
9030 SmallSet<int64_t, 4> DataDists;
9031 SmallPtrSet<LoadInst *, 4> DataLoads;
9032 for (std::pair<LoadInst *, int64_t> P : Data) {
9033 DataDists.insert(P.second);
9034 DataLoads.insert(P.first);
9035 }
9036 // Found matching gathered loads - check if all loads are unique or
9037 // can be effectively vectorized.
9038 unsigned NumUniques = 0;
9039 for (auto [Cnt, Pair] : enumerate(Loads)) {
9040 bool Used = DataLoads.contains(Pair.first);
9041 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
9042 ++NumUniques;
9043 ToAdd.insert(Cnt);
9044 } else if (Used) {
9045 Repeated.insert(Cnt);
9046 }
9047 }
9048 if (NumUniques > 0 &&
9049 (Loads.size() == NumUniques ||
9050 (Loads.size() - NumUniques >= 2 &&
9051 Loads.size() - NumUniques >= Loads.size() / 2 &&
9052 (has_single_bit(Data.size() + NumUniques) ||
9053 bit_ceil(Data.size()) <
9054 bit_ceil(Data.size() + NumUniques))))) {
9055 Offset = *Dist;
9056 Start = Idx + 1;
9057 return std::next(GatheredLoads.begin(), Idx);
9058 }
9059 }
9060 ToAdd.clear();
9061 return GatheredLoads.end();
9062 };
9063 for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
9064 unsigned Start = 0;
9065 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
9066 int64_t Offset = 0;
9067 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
9068 Offset, Start);
9069 while (It != GatheredLoads.end()) {
9070 assert(!LocalToAdd.empty() && "Expected some elements to add.");
9071 for (unsigned Idx : LocalToAdd)
9072 It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
9073 ToAdd.insert_range(LocalToAdd);
9074 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
9075 Start);
9076 }
9077 if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
9078 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
9079 })) {
9080 auto AddNewLoads =
9081 [&](SmallVectorImpl<std::pair<LoadInst *, int64_t>> &Loads) {
9082 for (unsigned Idx : seq<unsigned>(Data.size())) {
9083 if (ToAdd.contains(Idx) || Repeated.contains(Idx))
9084 continue;
9085 Loads.push_back(Data[Idx]);
9086 }
9087 };
9088 if (!AddNew) {
9089 LoadInst *LI = Data.front().first;
9090 It = find_if(
9091 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9092 return PD.front().first->getParent() == LI->getParent() &&
9093 PD.front().first->getType() == LI->getType();
9094 });
9095 while (It != GatheredLoads.end()) {
9096 AddNewLoads(*It);
9097 It = std::find_if(
9098 std::next(It), GatheredLoads.end(),
9099 [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9100 return PD.front().first->getParent() == LI->getParent() &&
9101 PD.front().first->getType() == LI->getType();
9102 });
9103 }
9104 }
9105 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
9106 AddNewLoads(GatheredLoads.emplace_back());
9107 }
9108 }
9109}
9110
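// Tries to build vectorizable tree entries for loads that were previously
// marked as gathered. The incoming loads are already grouped (by block,
// pointer and type); each group is split into consecutive or masked-gather
// slices that are fed back into buildTreeRec.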
9111void BoUpSLP::tryToVectorizeGatheredLoads(
9112 const SmallMapVector<
9113 std::tuple<BasicBlock *, Value *, Type *>,
9114 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
9115 &GatheredLoads) {
9116 GatheredLoadsEntriesFirst = VectorizableTree.size();
9117
9118 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
9119 LoadEntriesToVectorize.size());
9120 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
9121 Set.insert_range(VectorizableTree[Idx]->Scalars);
9122
9123 // Sort loads by distance.
9124 auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
9125 const std::pair<LoadInst *, int64_t> &L2) {
9126 return L1.second > L2.second;
9127 };
9128
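// Returns true if the target has a legal masked gather for the widened type of
// these loads and does not prefer to scalarize it.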
9129 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
9130 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
9131 Loads.size());
9132 Align Alignment = computeCommonAlignment<LoadInst>(Values);
9133 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
9134 return TTI->isLegalMaskedGather(Ty, Alignment) &&
9135 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
9136 };
9137
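// For each candidate VF, walks the load list in slices and records the slices
// that can form vector/strided/masked-gather loads; loads left over are
// returned via NonVectorized.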
9138 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
9139 BoUpSLP::ValueSet &VectorizedLoads,
9140 SmallVectorImpl<LoadInst *> &NonVectorized,
9141 bool Final, unsigned MaxVF) {
9142 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
9143 unsigned StartIdx = 0;
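// Collect candidate vector factors, from the widest full-register VF down to 2
// elements (also trying non-power-of-2 sizes when allowed).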
9144 SmallVector<int> CandidateVFs;
9145 if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
9146 CandidateVFs.push_back(MaxVF);
9147 for (int NumElts = getFloorFullVectorNumberOfElements(
9148 *TTI, Loads.front()->getType(), MaxVF);
9149 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
9150 *TTI, Loads.front()->getType(), NumElts - 1)) {
9151 CandidateVFs.push_back(NumElts);
9152 if (VectorizeNonPowerOf2 && NumElts > 2)
9153 CandidateVFs.push_back(NumElts - 1);
9154 }
9155
9156 if (Final && CandidateVFs.empty())
9157 return Results;
9158
9159 unsigned BestVF = Final ? CandidateVFs.back() : 0;
9160 for (unsigned NumElts : CandidateVFs) {
9161 if (Final && NumElts > BestVF)
9162 continue;
9163 SmallVector<unsigned> MaskedGatherVectorized;
9164 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
9165 ++Cnt) {
9166 ArrayRef<LoadInst *> Slice =
9167 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
9168 if (VectorizedLoads.count(Slice.front()) ||
9169 VectorizedLoads.count(Slice.back()) ||
9171 continue;
9172 // Check if it is profitable to try vectorizing gathered loads. It is
9173 // profitable if we have more than 3 consecutive loads or if we have
9174 // fewer, but all their users are vectorized or deleted.
9175 bool AllowToVectorize = false;
9176 // Check if it is profitable to vectorize 2-elements loads.
9177 if (NumElts == 2) {
9178 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
9179 Slice.front()->getType(), ElementCount::getFixed(NumElts));
9180 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
9181 for (LoadInst *LI : Slice) {
9182 // If single use/user - allow to vectorize.
9183 if (LI->hasOneUse())
9184 continue;
9185 // 1. Check if number of uses equals number of users.
9186 // 2. All users are deleted.
9187 // 3. The load broadcasts are not allowed or the load is not
9188 // broadcasted.
9189 if (static_cast<unsigned int>(std::distance(
9190 LI->user_begin(), LI->user_end())) != LI->getNumUses())
9191 return false;
9192 if (!IsLegalBroadcastLoad)
9193 continue;
9194 if (LI->hasNUsesOrMore(UsesLimit))
9195 return false;
9196 for (User *U : LI->users()) {
9197 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
9198 continue;
9199 for (const TreeEntry *UTE : getTreeEntries(U)) {
9200 for (int I : seq<int>(UTE->getNumOperands())) {
9201 if (all_of(UTE->getOperand(I), [LI](Value *V) {
9202 return V == LI || isa<PoisonValue>(V);
9203 }))
9204 // Found legal broadcast - do not vectorize.
9205 return false;
9206 }
9207 }
9208 }
9209 }
9210 return true;
9211 };
9212 AllowToVectorize = CheckIfAllowed(Slice);
9213 } else {
9214 AllowToVectorize =
9215 (NumElts >= 3 ||
9216 any_of(ValueToGatherNodes.at(Slice.front()),
9217 [=](const TreeEntry *TE) {
9218 return TE->Scalars.size() == 2 &&
9219 ((TE->Scalars.front() == Slice.front() &&
9220 TE->Scalars.back() == Slice.back()) ||
9221 (TE->Scalars.front() == Slice.back() &&
9222 TE->Scalars.back() == Slice.front()));
9223 })) &&
9224 hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
9225 Slice.size());
9226 }
9227 if (AllowToVectorize) {
9228 SmallVector<Value *> PointerOps;
9229 OrdersType CurrentOrder;
9230 // Try to build vector load.
9231 ArrayRef<Value *> Values(
9232 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9233 StridedPtrInfo SPtrInfo;
9234 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
9235 PointerOps, SPtrInfo, &BestVF);
9236 if (LS != LoadsState::Gather ||
9237 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
9238 if (LS == LoadsState::ScatterVectorize) {
9239 if (MaskedGatherVectorized.empty() ||
9240 Cnt >= MaskedGatherVectorized.back() + NumElts)
9241 MaskedGatherVectorized.push_back(Cnt);
9242 continue;
9243 }
9244 if (LS != LoadsState::Gather) {
9245 Results.emplace_back(Values, LS);
9246 VectorizedLoads.insert_range(Slice);
9247 // If we vectorized initial block, no need to try to vectorize it
9248 // again.
9249 if (Cnt == StartIdx)
9250 StartIdx += NumElts;
9251 }
9252 // Check if the whole array was vectorized already - exit.
9253 if (StartIdx >= Loads.size())
9254 break;
9255 // Erase last masked gather candidate, if another candidate within
9256 // the range is found to be better.
9257 if (!MaskedGatherVectorized.empty() &&
9258 Cnt < MaskedGatherVectorized.back() + NumElts)
9259 MaskedGatherVectorized.pop_back();
9260 Cnt += NumElts - 1;
9261 continue;
9262 }
9263 }
9264 if (!AllowToVectorize || BestVF == 0)
9266 }
9267 // Mark masked gathers candidates as vectorized, if any.
9268 for (unsigned Cnt : MaskedGatherVectorized) {
9269 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
9270 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
9271 ArrayRef<Value *> Values(
9272 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9273 Results.emplace_back(Values, LoadsState::ScatterVectorize);
9274 VectorizedLoads.insert_range(Slice);
9275 // If we vectorized initial block, no need to try to vectorize it again.
9276 if (Cnt == StartIdx)
9277 StartIdx += NumElts;
9278 }
9279 }
9280 for (LoadInst *LI : Loads) {
9281 if (!VectorizedLoads.contains(LI))
9282 NonVectorized.push_back(LI);
9283 }
9284 return Results;
9285 };
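// Processes each group of related loads: sorts them by distance, collects
// maximal consecutive runs and tries to vectorize them via GetVectorizedRanges,
// returning the loads that stay scalar.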
9286 auto ProcessGatheredLoads =
9287 [&, &TTI = *TTI](
9288 ArrayRef<SmallVector<std::pair<LoadInst *, int64_t>>> GatheredLoads,
9289 bool Final = false) {
9290 SmallVector<LoadInst *> NonVectorized;
9291 for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
9292 GatheredLoads) {
9293 if (LoadsDists.size() <= 1) {
9294 NonVectorized.push_back(LoadsDists.back().first);
9295 continue;
9296 }
9297 SmallVector<std::pair<LoadInst *, int64_t>> LocalLoadsDists(
9298 LoadsDists);
9299 SmallVector<LoadInst *> OriginalLoads(make_first_range(LoadsDists));
9300 stable_sort(LocalLoadsDists, LoadSorter);
9301 SmallVector<LoadInst *> Loads;
9302 unsigned MaxConsecutiveDistance = 0;
9303 unsigned CurrentConsecutiveDist = 1;
9304 int64_t LastDist = LocalLoadsDists.front().second;
9305 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
9306 for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
9307 if (isVectorized(L.first))
9308 continue;
9309 assert(LastDist >= L.second &&
9310 "Expected first distance always not less than second");
9311 if (static_cast<uint64_t>(LastDist - L.second) ==
9312 CurrentConsecutiveDist) {
9313 ++CurrentConsecutiveDist;
9314 MaxConsecutiveDistance =
9315 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
9316 Loads.push_back(L.first);
9317 continue;
9318 }
9319 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
9320 !Loads.empty())
9321 Loads.pop_back();
9322 CurrentConsecutiveDist = 1;
9323 LastDist = L.second;
9324 Loads.push_back(L.first);
9325 }
9326 if (Loads.size() <= 1)
9327 continue;
9328 if (AllowMaskedGather)
9329 MaxConsecutiveDistance = Loads.size();
9330 else if (MaxConsecutiveDistance < 2)
9331 continue;
9332 BoUpSLP::ValueSet VectorizedLoads;
9333 SmallVector<LoadInst *> SortedNonVectorized;
9334 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
9335 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
9336 Final, MaxConsecutiveDistance);
9337 if (!Results.empty() && !SortedNonVectorized.empty() &&
9338 OriginalLoads.size() == Loads.size() &&
9339 MaxConsecutiveDistance == Loads.size() &&
9341 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
9342 return P.second == LoadsState::ScatterVectorize;
9343 })) {
9344 VectorizedLoads.clear();
9345 SmallVector<LoadInst *> UnsortedNonVectorized;
9346 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
9347 UnsortedResults =
9348 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
9349 UnsortedNonVectorized, Final,
9350 OriginalLoads.size());
9351 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
9352 SortedNonVectorized.swap(UnsortedNonVectorized);
9353 Results.swap(UnsortedResults);
9354 }
9355 }
9356 for (auto [Slice, _] : Results) {
9357 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
9358 << Slice.size() << ")\n");
9359 if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
9360 for (Value *L : Slice)
9361 if (!isVectorized(L))
9362 SortedNonVectorized.push_back(cast<LoadInst>(L));
9363 continue;
9364 }
9365
9366 // Select maximum VF as a maximum of user gathered nodes and
9367 // distance between scalar loads in these nodes.
9368 unsigned MaxVF = Slice.size();
9369 unsigned UserMaxVF = 0;
9370 unsigned InterleaveFactor = 0;
9371 if (MaxVF == 2) {
9372 UserMaxVF = MaxVF;
9373 } else {
9374 // Find the distance between segments of the interleaved loads.
9375 std::optional<unsigned> InterleavedLoadsDistance = 0;
9376 unsigned Order = 0;
9377 std::optional<unsigned> CommonVF = 0;
9378 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
9379 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
9380 for (auto [Idx, V] : enumerate(Slice)) {
9381 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
9382 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
9383 unsigned Pos =
9384 EntryToPosition.try_emplace(E, Idx).first->second;
9385 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
9386 if (CommonVF) {
9387 if (*CommonVF == 0) {
9388 CommonVF = E->Scalars.size();
9389 continue;
9390 }
9391 if (*CommonVF != E->Scalars.size())
9392 CommonVF.reset();
9393 }
9394 // Check if the load is the part of the interleaved load.
9395 if (Pos != Idx && InterleavedLoadsDistance) {
9396 if (!DeinterleavedNodes.contains(E) &&
9397 any_of(E->Scalars, [&, Slice = Slice](Value *V) {
9398 if (isa<Constant>(V))
9399 return false;
9400 if (isVectorized(V))
9401 return true;
9402 const auto &Nodes = ValueToGatherNodes.at(V);
9403 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
9404 !is_contained(Slice, V);
9405 })) {
9406 InterleavedLoadsDistance.reset();
9407 continue;
9408 }
9409 DeinterleavedNodes.insert(E);
9410 if (*InterleavedLoadsDistance == 0) {
9411 InterleavedLoadsDistance = Idx - Pos;
9412 continue;
9413 }
9414 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
9415 (Idx - Pos) / *InterleavedLoadsDistance < Order)
9416 InterleavedLoadsDistance.reset();
9417 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
9418 }
9419 }
9420 }
9421 DeinterleavedNodes.clear();
9422 // Check if the large load represents an interleaved load operation.
9423 if (InterleavedLoadsDistance.value_or(0) > 1 &&
9424 CommonVF.value_or(0) != 0) {
9425 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
9426 unsigned VF = *CommonVF;
9427 OrdersType Order;
9428 SmallVector<Value *> PointerOps;
9429 StridedPtrInfo SPtrInfo;
9430 // Segmented load detected - vectorize at maximum vector factor.
9431 if (InterleaveFactor <= Slice.size() &&
9432 TTI.isLegalInterleavedAccessType(
9433 getWidenedType(Slice.front()->getType(), VF),
9434 InterleaveFactor,
9435 cast<LoadInst>(Slice.front())->getAlign(),
9436 cast<LoadInst>(Slice.front())
9437 ->getPointerAddressSpace()) &&
9438 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
9439 SPtrInfo) == LoadsState::Vectorize) {
9440 UserMaxVF = InterleaveFactor * VF;
9441 } else {
9442 InterleaveFactor = 0;
9443 }
9444 }
9445 // Cannot represent the loads as consecutive vectorizable nodes -
9446 // just exit.
9447 unsigned ConsecutiveNodesSize = 0;
9448 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
9449 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9450 [&, Slice = Slice](const auto &P) {
9451 const auto *It = find_if(Slice, [&](Value *V) {
9452 return std::get<1>(P).contains(V);
9453 });
9454 if (It == Slice.end())
9455 return false;
9456 const TreeEntry &TE =
9457 *VectorizableTree[std::get<0>(P)];
9458 ArrayRef<Value *> VL = TE.Scalars;
9459 OrdersType Order;
9460 SmallVector<Value *> PointerOps;
9461 StridedPtrInfo SPtrInfo;
9462 LoadsState State = canVectorizeLoads(
9463 VL, VL.front(), Order, PointerOps, SPtrInfo);
9464 if (State == LoadsState::ScatterVectorize ||
9466 return false;
9467 ConsecutiveNodesSize += VL.size();
9468 size_t Start = std::distance(Slice.begin(), It);
9469 size_t Sz = Slice.size() - Start;
9470 return Sz < VL.size() ||
9471 Slice.slice(Start, VL.size()) != VL;
9472 }))
9473 continue;
9474 // Try to build long masked gather loads.
9475 UserMaxVF = bit_ceil(UserMaxVF);
9476 if (InterleaveFactor == 0 &&
9477 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
9478 [&, Slice = Slice](unsigned Idx) {
9479 OrdersType Order;
9480 SmallVector<Value *> PointerOps;
9481 StridedPtrInfo SPtrInfo;
9482 return canVectorizeLoads(
9483 Slice.slice(Idx * UserMaxVF, UserMaxVF),
9484 Slice[Idx * UserMaxVF], Order, PointerOps,
9485 SPtrInfo) == LoadsState::ScatterVectorize;
9486 }))
9487 UserMaxVF = MaxVF;
9488 if (Slice.size() != ConsecutiveNodesSize)
9489 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
9490 }
9491 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
9492 bool IsVectorized = true;
9493 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
9494 ArrayRef<Value *> SubSlice =
9495 Slice.slice(I, std::min(VF, E - I));
9496 if (isVectorized(SubSlice.front()))
9497 continue;
9498 // Check if the subslice belongs to a to-be-vectorized entry that is
9499 // not equal to this entry.
9500 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9501 [&](const auto &P) {
9502 return !SubSlice.equals(
9503 VectorizableTree[std::get<0>(P)]
9504 ->Scalars) &&
9505 set_is_subset(SubSlice, std::get<1>(P));
9506 }))
9507 continue;
9508 unsigned Sz = VectorizableTree.size();
9509 buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
9510 if (Sz == VectorizableTree.size()) {
9511 IsVectorized = false;
9512 // Try non-interleaved vectorization with smaller vector
9513 // factor.
9514 if (InterleaveFactor > 0) {
9515 VF = 2 * (MaxVF / InterleaveFactor);
9516 InterleaveFactor = 0;
9517 }
9518 continue;
9519 }
9520 }
9521 if (IsVectorized)
9522 break;
9523 }
9524 }
9525 NonVectorized.append(SortedNonVectorized);
9526 }
9527 return NonVectorized;
9528 };
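// Vectorize every group of gathered loads; if masked gathers are supported,
// re-cluster the remaining scalar loads and make one final attempt.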
9529 for (const auto &GLs : GatheredLoads) {
9530 const auto &Ref = GLs.second;
9531 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
9532 if (!Ref.empty() && !NonVectorized.empty() &&
9533 std::accumulate(
9534 Ref.begin(), Ref.end(), 0u,
9535 [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
9536 -> unsigned { return S + LoadsDists.size(); }) !=
9537 NonVectorized.size() &&
9538 IsMaskedGatherSupported(NonVectorized)) {
9539 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>
9540 FinalGatheredLoads;
9541 for (LoadInst *LI : NonVectorized) {
9542 // Reinsert non-vectorized loads to other list of loads with the same
9543 // base pointers.
9544 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
9545 FinalGatheredLoads,
9546 /*AddNew=*/false);
9547 }
9548 // Final attempt to vectorize non-vectorized loads.
9549 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
9550 }
9551 }
9552 // Try to vectorize postponed load entries, previously marked as gathered.
9553 for (unsigned Idx : LoadEntriesToVectorize) {
9554 const TreeEntry &E = *VectorizableTree[Idx];
9555 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
9556 // Avoid reordering, if possible.
9557 if (!E.ReorderIndices.empty()) {
9558 // Build a mask out of the reorder indices and reorder scalars per this
9559 // mask.
9560 SmallVector<int> ReorderMask;
9561 inversePermutation(E.ReorderIndices, ReorderMask);
9562 reorderScalars(GatheredScalars, ReorderMask);
9563 }
9564 buildTreeRec(GatheredScalars, 0, EdgeInfo());
9565 }
9566 // If no new entries were created, there are no gathered-load entries that
9567 // must be handled.
9568 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
9569 VectorizableTree.size())
9570 GatheredLoadsEntriesFirst.reset();
9571}
9572
9573/// Generates a key/subkey pair for the given value to provide effective sorting
9574/// of the values and better detection of vectorizable value sequences. The
9575/// keys/subkeys can be used for better sorting of the values themselves (keys)
9576/// and within value subgroups (subkeys).
9577static std::pair<size_t, size_t> generateKeySubkey(
9578 Value *V, const TargetLibraryInfo *TLI,
9579 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
9580 bool AllowAlternate) {
9581 hash_code Key = hash_value(V->getValueID() + 2);
9582 hash_code SubKey = hash_value(0);
9583 // Sort the loads by the distance between the pointers.
9584 if (auto *LI = dyn_cast<LoadInst>(V)) {
9585 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
9586 if (LI->isSimple())
9587 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
9588 else
9589 Key = SubKey = hash_value(LI);
9590 } else if (isVectorLikeInstWithConstOps(V)) {
9591 // Sort extracts by the vector operands.
9592 if (isa<ExtractElementInst, UndefValue>(V))
9593 Key = hash_value(Value::UndefValueVal + 1);
9594 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
9595 if (!isUndefVector(EI->getVectorOperand()).all() &&
9596 !isa<UndefValue>(EI->getIndexOperand()))
9597 SubKey = hash_value(EI->getVectorOperand());
9598 }
9599 } else if (auto *I = dyn_cast<Instruction>(V)) {
9600 // Sort other instructions just by the opcodes except for CMPInst.
9601 // For CMP also sort by the predicate kind.
9602 if (isa<BinaryOperator, CastInst>(I) &&
9603 isValidForAlternation(I->getOpcode())) {
9604 if (AllowAlternate)
9605 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
9606 else
9607 Key = hash_combine(hash_value(I->getOpcode()), Key);
9608 SubKey = hash_combine(
9609 hash_value(I->getOpcode()), hash_value(I->getType()),
9610 hash_value(isa<BinaryOperator>(I)
9611 ? I->getType()
9612 : cast<CastInst>(I)->getOperand(0)->getType()));
9613 // For casts, look through the only operand to improve compile time.
9614 if (isa<CastInst>(I)) {
9615 std::pair<size_t, size_t> OpVals =
9616 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
9617 /*AllowAlternate=*/true);
9618 Key = hash_combine(OpVals.first, Key);
9619 SubKey = hash_combine(OpVals.first, SubKey);
9620 }
9621 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
9622 CmpInst::Predicate Pred = CI->getPredicate();
9623 if (CI->isCommutative())
9624 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
9625 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
9626 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
9627 hash_value(SwapPred),
9628 hash_value(CI->getOperand(0)->getType()));
9629 } else if (auto *Call = dyn_cast<CallInst>(I)) {
9630 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
9631 if (isTriviallyVectorizable(ID)) {
9632 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
9633 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
9634 SubKey = hash_combine(hash_value(I->getOpcode()),
9635 hash_value(Call->getCalledFunction()));
9636 } else {
9638 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
9639 }
9640 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
9641 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
9642 hash_value(Op.Tag), SubKey);
9643 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
9644 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
9645 SubKey = hash_value(Gep->getPointerOperand());
9646 else
9647 SubKey = hash_value(Gep);
9648 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
9649 !isa<ConstantInt>(I->getOperand(1))) {
9650 // Do not try to vectorize instructions with potentially high cost.
9651 SubKey = hash_value(I);
9652 } else {
9653 SubKey = hash_value(I->getOpcode());
9654 }
9655 Key = hash_combine(hash_value(I->getParent()), Key);
9656 }
9657 return std::make_pair(Key, SubKey);
9658}
9659
9660/// Checks if the specified instruction \p I is the main operation for the given
9661/// \p MainOp and \p AltOp instructions.
9662static bool isMainInstruction(Instruction *I, Instruction *MainOp,
9663 Instruction *AltOp, const TargetLibraryInfo &TLI);
9664
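// Heuristically estimates whether an alternate-opcode sequence is profitable to
// vectorize as a single node by comparing the expected number of vector and
// shuffle instructions against building the operands as buildvectors.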
9665bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
9666 ArrayRef<Value *> VL) const {
9667 Type *ScalarTy = S.getMainOp()->getType();
9668 unsigned Opcode0 = S.getOpcode();
9669 unsigned Opcode1 = S.getAltOpcode();
9670 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
9671 // If this pattern is supported by the target then consider it profitable.
9672 if (TTI->isLegalAltInstr(getWidenedType(ScalarTy, VL.size()), Opcode0,
9673 Opcode1, OpcodeMask))
9674 return true;
9675 SmallVector<SmallVector<Value *>> Operands;
9676 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
9677 Operands.emplace_back();
9678 // Prepare the operand vector.
9679 for (Value *V : VL) {
9680 if (isa<PoisonValue>(V)) {
9681 Operands.back().push_back(
9682 PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
9683 continue;
9684 }
9685 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
9686 }
9687 }
9688 if (Operands.size() == 2) {
9689 // Try find best operands candidates.
9690 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
9692 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
9693 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
9694 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
9695 std::optional<int> Res = findBestRootPair(Candidates);
9696 switch (Res.value_or(0)) {
9697 case 0:
9698 break;
9699 case 1:
9700 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
9701 break;
9702 case 2:
9703 std::swap(Operands[0][I], Operands[1][I]);
9704 break;
9705 default:
9706 llvm_unreachable("Unexpected index.");
9707 }
9708 }
9709 }
9710 DenseSet<unsigned> UniqueOpcodes;
9711 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
9712 unsigned NonInstCnt = 0;
9713 // Estimate number of instructions, required for the vectorized node and for
9714 // the buildvector node.
9715 unsigned UndefCnt = 0;
9716 // Count the number of extra shuffles, required for vector nodes.
9717 unsigned ExtraShuffleInsts = 0;
9718 // Check that operands do not contain same values and create either perfect
9719 // diamond match or shuffled match.
9720 if (Operands.size() == 2) {
9721 // Do not count same operands twice.
9722 if (Operands.front() == Operands.back()) {
9723 Operands.erase(Operands.begin());
9724 } else if (!allConstant(Operands.front()) &&
9725 all_of(Operands.front(), [&](Value *V) {
9726 return is_contained(Operands.back(), V);
9727 })) {
9728 Operands.erase(Operands.begin());
9729 ++ExtraShuffleInsts;
9730 }
9731 }
9732 const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
9733 // Vectorize the node if:
9734 // 1. At least one operand is a constant or a splat.
9735 // 2. Operands have many loop invariants (the instructions themselves are
9736 // not loop invariant).
9737 // 3. At least one unique operand is supposed to be vectorized.
9738 return none_of(Operands,
9739 [&](ArrayRef<Value *> Op) {
9740 if (allConstant(Op) ||
9741 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
9742 getSameOpcode(Op, *TLI)))
9743 return false;
9744 DenseMap<Value *, unsigned> Uniques;
9745 for (Value *V : Op) {
9747 isVectorized(V) || (L && L->isLoopInvariant(V))) {
9748 if (isa<UndefValue>(V))
9749 ++UndefCnt;
9750 continue;
9751 }
9752 auto Res = Uniques.try_emplace(V, 0);
9753 // Found first duplicate - need to add shuffle.
9754 if (!Res.second && Res.first->second == 1)
9755 ++ExtraShuffleInsts;
9756 ++Res.first->getSecond();
9757 if (auto *I = dyn_cast<Instruction>(V))
9758 UniqueOpcodes.insert(I->getOpcode());
9759 else if (Res.second)
9760 ++NonInstCnt;
9761 }
9762 return none_of(Uniques, [&](const auto &P) {
9763 return P.first->hasNUsesOrMore(P.second + 1) &&
9764 none_of(P.first->users(), [&](User *U) {
9765 return isVectorized(U) || Uniques.contains(U);
9766 });
9767 });
9768 }) ||
9769 // Do not vectorize node, if estimated number of vector instructions is
9770 // more than estimated number of buildvector instructions. Number of
9771 // vector operands is number of vector instructions + number of vector
9772 // instructions for operands (buildvectors). Number of buildvector
9773 // instructions is just number_of_operands * number_of_scalars.
9774 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
9775 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
9776 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
9777}
9778
9779/// Builds the argument types vector for the given call instruction with the
9780/// given \p ID for the specified vector factor.
9781static SmallVector<Type *>
9782buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
9783 const unsigned VF, unsigned MinBW,
9784 const TargetTransformInfo *TTI) {
9785 SmallVector<Type *> ArgTys;
9786 for (auto [Idx, Arg] : enumerate(CI->args())) {
9787 if (ID != Intrinsic::not_intrinsic) {
9788 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
9789 ArgTys.push_back(Arg->getType());
9790 continue;
9791 }
9792 if (MinBW > 0) {
9793 ArgTys.push_back(
9794 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
9795 continue;
9796 }
9797 }
9798 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
9799 }
9800 return ArgTys;
9801}
9802
9803/// Calculates the costs of vectorized intrinsic (if possible) and vectorized
9804/// function (if possible) calls. Returns invalid cost for the corresponding
9805/// calls, if they cannot be vectorized/will be scalarized.
9806static std::pair<InstructionCost, InstructionCost>
9807getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
9808 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9809 ArrayRef<Type *> ArgTys) {
9810 auto Shape = VFShape::get(CI->getFunctionType(),
9811 ElementCount::getFixed(VecTy->getNumElements()),
9812 false /*HasGlobalPred*/);
9813 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
9814 auto LibCost = InstructionCost::getInvalid();
9815 if (!CI->isNoBuiltin() && VecFunc) {
9816 // Calculate the cost of the vector library call.
9817 // If the corresponding vector call is cheaper, return its cost.
9818 LibCost =
9819 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
9820 }
9821 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9822
9823 // Calculate the cost of the vector intrinsic call.
9824 FastMathFlags FMF;
9825 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
9826 FMF = FPCI->getFastMathFlags();
9827 const InstructionCost ScalarLimit = 10000;
9828 IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
9829 LibCost.isValid() ? LibCost : ScalarLimit);
9830 auto IntrinsicCost =
9831 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
9832 if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
9833 (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
9834 IntrinsicCost = InstructionCost::getInvalid();
9835
9836 return {IntrinsicCost, LibCost};
9837}
9838
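// Classifies a bundle of scalars with the given main/alternate opcode state and
// decides whether it should be vectorized (possibly as a strided, compressed or
// scattered load) or gathered.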
9839BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
9840 const InstructionsState &S, ArrayRef<Value *> VL,
9841 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
9842 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
9843 assert(S.getMainOp() &&
9844 "Expected instructions with same/alternate opcodes only.");
9845
9846 unsigned ShuffleOrOp =
9847 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
9848 Instruction *VL0 = S.getMainOp();
9849 switch (ShuffleOrOp) {
9850 case Instruction::PHI: {
9851 // Too many operands - gather, most probably won't be vectorized.
9852 if (VL0->getNumOperands() > MaxPHINumOperands)
9853 return TreeEntry::NeedToGather;
9854 // Check for terminator values (e.g. invoke).
9855 for (Value *V : VL) {
9856 auto *PHI = dyn_cast<PHINode>(V);
9857 if (!PHI)
9858 continue;
9859 for (Value *Incoming : PHI->incoming_values()) {
9860 Instruction *Term = dyn_cast<Instruction>(Incoming);
9861 if (Term && Term->isTerminator()) {
9862 LLVM_DEBUG(dbgs()
9863 << "SLP: Need to swizzle PHINodes (terminator use).\n");
9864 return TreeEntry::NeedToGather;
9865 }
9866 }
9867 }
9868
9869 return TreeEntry::Vectorize;
9870 }
9871 case Instruction::ExtractElement:
9872 if (any_of(VL, [&](Value *V) {
9873 auto *EI = dyn_cast<ExtractElementInst>(V);
9874 if (!EI)
9875 return true;
9876 return isVectorized(EI->getOperand(0));
9877 }))
9878 return TreeEntry::NeedToGather;
9879 [[fallthrough]];
9880 case Instruction::ExtractValue: {
9881 bool Reuse = canReuseExtract(VL, CurrentOrder);
9882 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
9883 // non-full registers).
9884 if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
9885 return TreeEntry::NeedToGather;
9886 if (Reuse || !CurrentOrder.empty())
9887 return TreeEntry::Vectorize;
9888 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
9889 return TreeEntry::NeedToGather;
9890 }
9891 case Instruction::InsertElement: {
9892 // Check that we have a buildvector and not a shuffle of 2 or more
9893 // different vectors.
9894 ValueSet SourceVectors;
9895 for (Value *V : VL) {
9896 if (isa<PoisonValue>(V)) {
9897 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
9898 return TreeEntry::NeedToGather;
9899 }
9900 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
9901 assert(getElementIndex(V) != std::nullopt &&
9902 "Non-constant or undef index?");
9903 }
9904
9905 if (count_if(VL, [&SourceVectors](Value *V) {
9906 return !SourceVectors.contains(V);
9907 }) >= 2) {
9908 // Found 2nd source vector - cancel.
9909 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
9910 "different source vectors.\n");
9911 return TreeEntry::NeedToGather;
9912 }
9913
9914 if (any_of(VL, [&SourceVectors](Value *V) {
9915 // The last InsertElement can have multiple uses.
9916 return SourceVectors.contains(V) && !V->hasOneUse();
9917 })) {
9918 assert(SLPReVec && "Only supported by REVEC.");
9919 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
9920 "multiple uses.\n");
9921 return TreeEntry::NeedToGather;
9922 }
9923
9924 return TreeEntry::Vectorize;
9925 }
9926 case Instruction::Load: {
9927 // Check that a vectorized load would load the same memory as a scalar
9928 // load. For example, we don't want to vectorize loads that are smaller
9929 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
9930 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
9931 // from such a struct, we read/write packed bits disagreeing with the
9932 // unvectorized version.
9933 auto IsGatheredNode = [&]() {
9934 if (!GatheredLoadsEntriesFirst)
9935 return false;
9936 return all_of(VL, [&](Value *V) {
9937 if (isa<PoisonValue>(V))
9938 return true;
9939 return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
9940 return TE->Idx >= *GatheredLoadsEntriesFirst;
9941 });
9942 });
9943 };
9944 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
9945 case LoadsState::Vectorize:
9946 return TreeEntry::Vectorize;
9947 case LoadsState::CompressVectorize:
9948 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
9949 // Delay slow vectorized nodes for better vectorization attempts.
9950 LoadEntriesToVectorize.insert(VectorizableTree.size());
9951 return TreeEntry::NeedToGather;
9952 }
9953 return IsGatheredNode() ? TreeEntry::NeedToGather
9954 : TreeEntry::CompressVectorize;
9955 case LoadsState::ScatterVectorize:
9956 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
9957 // Delay slow vectorized nodes for better vectorization attempts.
9958 LoadEntriesToVectorize.insert(VectorizableTree.size());
9959 return TreeEntry::NeedToGather;
9960 }
9961 return IsGatheredNode() ? TreeEntry::NeedToGather
9962 : TreeEntry::ScatterVectorize;
9963 case LoadsState::StridedVectorize:
9964 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
9965 // Delay slow vectorized nodes for better vectorization attempts.
9966 LoadEntriesToVectorize.insert(VectorizableTree.size());
9967 return TreeEntry::NeedToGather;
9968 }
9969 return IsGatheredNode() ? TreeEntry::NeedToGather
9970 : TreeEntry::StridedVectorize;
9971 case LoadsState::Gather:
9972#ifndef NDEBUG
9973 Type *ScalarTy = VL0->getType();
9974 if (DL->getTypeSizeInBits(ScalarTy) !=
9975 DL->getTypeAllocSizeInBits(ScalarTy))
9976 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
9977 else if (any_of(VL, [](Value *V) {
9978 auto *LI = dyn_cast<LoadInst>(V);
9979 return !LI || !LI->isSimple();
9980 }))
9981 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
9982 else
9983 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
9984#endif // NDEBUG
9986 return TreeEntry::NeedToGather;
9987 }
9988 llvm_unreachable("Unexpected state of loads");
9989 }
9990 case Instruction::ZExt:
9991 case Instruction::SExt:
9992 case Instruction::FPToUI:
9993 case Instruction::FPToSI:
9994 case Instruction::FPExt:
9995 case Instruction::PtrToInt:
9996 case Instruction::IntToPtr:
9997 case Instruction::SIToFP:
9998 case Instruction::UIToFP:
9999 case Instruction::Trunc:
10000 case Instruction::FPTrunc:
10001 case Instruction::BitCast: {
10002 Type *SrcTy = VL0->getOperand(0)->getType();
10003 for (Value *V : VL) {
10004 if (isa<PoisonValue>(V))
10005 continue;
10006 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
10007 if (Ty != SrcTy || !isValidElementType(Ty)) {
10008 LLVM_DEBUG(
10009 dbgs() << "SLP: Gathering casts with different src types.\n");
10010 return TreeEntry::NeedToGather;
10011 }
10012 }
10013 return TreeEntry::Vectorize;
10014 }
10015 case Instruction::ICmp:
10016 case Instruction::FCmp: {
10017 // Check that all of the compares have the same predicate.
10018 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
10019 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
10020 Type *ComparedTy = VL0->getOperand(0)->getType();
10021 for (Value *V : VL) {
10022 if (isa<PoisonValue>(V))
10023 continue;
10024 auto *Cmp = cast<CmpInst>(V);
10025 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
10026 Cmp->getOperand(0)->getType() != ComparedTy) {
10027 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
10028 return TreeEntry::NeedToGather;
10029 }
10030 }
10031 return TreeEntry::Vectorize;
10032 }
10033 case Instruction::Select:
10034 case Instruction::FNeg:
10035 case Instruction::Add:
10036 case Instruction::FAdd:
10037 case Instruction::Sub:
10038 case Instruction::FSub:
10039 case Instruction::Mul:
10040 case Instruction::FMul:
10041 case Instruction::UDiv:
10042 case Instruction::SDiv:
10043 case Instruction::FDiv:
10044 case Instruction::URem:
10045 case Instruction::SRem:
10046 case Instruction::FRem:
10047 case Instruction::Shl:
10048 case Instruction::LShr:
10049 case Instruction::AShr:
10050 case Instruction::And:
10051 case Instruction::Or:
10052 case Instruction::Xor:
10053 case Instruction::Freeze:
10054 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10055 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10056 auto *I = dyn_cast<Instruction>(V);
10057 return I && I->isBinaryOp() && !I->isFast();
10058 }))
10059 return TreeEntry::NeedToGather;
10060 return TreeEntry::Vectorize;
10061 case Instruction::GetElementPtr: {
10062 // We don't combine GEPs with complicated (nested) indexing.
10063 for (Value *V : VL) {
10064 auto *I = dyn_cast<GetElementPtrInst>(V);
10065 if (!I)
10066 continue;
10067 if (I->getNumOperands() != 2) {
10068 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
10069 return TreeEntry::NeedToGather;
10070 }
10071 }
10072
10073 // We can't combine several GEPs into one vector if they operate on
10074 // different types.
10075 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
10076 for (Value *V : VL) {
10077 auto *GEP = dyn_cast<GEPOperator>(V);
10078 if (!GEP)
10079 continue;
10080 Type *CurTy = GEP->getSourceElementType();
10081 if (Ty0 != CurTy) {
10082 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
10083 return TreeEntry::NeedToGather;
10084 }
10085 }
10086
10087 // We don't combine GEPs with non-constant indexes.
10088 Type *Ty1 = VL0->getOperand(1)->getType();
10089 for (Value *V : VL) {
10090 auto *I = dyn_cast<GetElementPtrInst>(V);
10091 if (!I)
10092 continue;
10093 auto *Op = I->getOperand(1);
10094 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10095 (Op->getType() != Ty1 &&
10096 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10097 Op->getType()->getScalarSizeInBits() >
10098 DL->getIndexSizeInBits(
10099 V->getType()->getPointerAddressSpace())))) {
10100 LLVM_DEBUG(
10101 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
10102 return TreeEntry::NeedToGather;
10103 }
10104 }
10105
10106 return TreeEntry::Vectorize;
10107 }
10108 case Instruction::Store: {
10109 // Check if the stores are consecutive or if we need to swizzle them.
10110 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
10111 // Avoid types that are padded when being allocated as scalars, while
10112 // being packed together in a vector (such as i1).
10113 if (DL->getTypeSizeInBits(ScalarTy) !=
10114 DL->getTypeAllocSizeInBits(ScalarTy)) {
10115 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
10116 return TreeEntry::NeedToGather;
10117 }
10118 // Make sure all stores in the bundle are simple - we can't vectorize
10119 // atomic or volatile stores.
10120 for (Value *V : VL) {
10121 auto *SI = cast<StoreInst>(V);
10122 if (!SI->isSimple()) {
10123 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
10124 return TreeEntry::NeedToGather;
10125 }
10126 PointerOps.push_back(SI->getPointerOperand());
10127 }
10128
10129 // Check the order of pointer operands.
10130 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
10131 Value *Ptr0;
10132 Value *PtrN;
10133 if (CurrentOrder.empty()) {
10134 Ptr0 = PointerOps.front();
10135 PtrN = PointerOps.back();
10136 } else {
10137 Ptr0 = PointerOps[CurrentOrder.front()];
10138 PtrN = PointerOps[CurrentOrder.back()];
10139 }
10140 std::optional<int64_t> Dist =
10141 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
10142 // Check that the sorted pointer operands are consecutive.
10143 if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
10144 return TreeEntry::Vectorize;
10145 }
10146
10147 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
10148 return TreeEntry::NeedToGather;
10149 }
10150 case Instruction::Call: {
10151 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10152 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10153 auto *I = dyn_cast<Instruction>(V);
10154 return I && !I->isFast();
10155 }))
10156 return TreeEntry::NeedToGather;
10157 // Check if the calls are all to the same vectorizable intrinsic or
10158 // library function.
10159 CallInst *CI = cast<CallInst>(VL0);
10160 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
10161
10162 VFShape Shape = VFShape::get(
10163 CI->getFunctionType(),
10164 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
10165 false /*HasGlobalPred*/);
10166 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10167
10168 if (!VecFunc && !isTriviallyVectorizable(ID)) {
10169 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
10170 return TreeEntry::NeedToGather;
10171 }
10172 Function *F = CI->getCalledFunction();
10173 unsigned NumArgs = CI->arg_size();
10174 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
10175 for (unsigned J = 0; J != NumArgs; ++J)
10176 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI))
10177 ScalarArgs[J] = CI->getArgOperand(J);
10178 for (Value *V : VL) {
10179 CallInst *CI2 = dyn_cast<CallInst>(V);
10180 if (!CI2 || CI2->getCalledFunction() != F ||
10181 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
10182 (VecFunc &&
10183 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
10184 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
10185 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
10186 << "\n");
10187 return TreeEntry::NeedToGather;
10188 }
10189 // Some intrinsics have scalar arguments and should be same in order for
10190 // them to be vectorized.
10191 for (unsigned J = 0; J != NumArgs; ++J) {
10192 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI)) {
10193 Value *A1J = CI2->getArgOperand(J);
10194 if (ScalarArgs[J] != A1J) {
10196 << "SLP: mismatched arguments in call:" << *CI
10197 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
10198 return TreeEntry::NeedToGather;
10199 }
10200 }
10201 }
10202 // Verify that the bundle operands are identical between the two calls.
10203 if (CI->hasOperandBundles() &&
10204 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
10205 CI->op_begin() + CI->getBundleOperandsEndIndex(),
10206 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
10207 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
10208 << "!=" << *V << '\n');
10209 return TreeEntry::NeedToGather;
10210 }
10211 }
10212 SmallVector<Type *> ArgTys =
10213 buildIntrinsicArgTypes(CI, ID, VL.size(), 0, TTI);
10214 auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
10215 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
10216 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
10217 return TreeEntry::NeedToGather;
10218
10219 return TreeEntry::Vectorize;
10220 }
10221 case Instruction::ShuffleVector: {
10222 if (!S.isAltShuffle()) {
10223 // REVEC can support non alternate shuffle.
10225 return TreeEntry::Vectorize;
10226 // If this is not an alternate sequence of opcode like add-sub
10227 // then do not vectorize this instruction.
10228 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
10229 return TreeEntry::NeedToGather;
10230 }
10231 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
10232 LLVM_DEBUG(
10233 dbgs()
10234 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
10235 "the whole alt sequence is not profitable.\n");
10236 return TreeEntry::NeedToGather;
10237 }
10238
10239 return TreeEntry::Vectorize;
10240 }
10241 default:
10242 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
10243 return TreeEntry::NeedToGather;
10244 }
10245}
10246
10247namespace {
10248/// Allows correct handling of the operands of PHI nodes, based on the \p Main
10249/// PHINode's order of incoming basic blocks/values.
10250class PHIHandler {
10251 DominatorTree &DT;
10252 PHINode *Main = nullptr;
10253 SmallVector<Value *> Phis;
10254 SmallVector<SmallVector<Value *>> Operands;
10255
10256public:
10257 PHIHandler() = delete;
10258 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
10259 : DT(DT), Main(Main), Phis(Phis),
10260 Operands(Main->getNumIncomingValues(),
10261 SmallVector<Value *>(Phis.size(), nullptr)) {}
10262 void buildOperands() {
10263 constexpr unsigned FastLimit = 4;
10264 if (Main->getNumIncomingValues() <= FastLimit) {
10265 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
10266 BasicBlock *InBB = Main->getIncomingBlock(I);
10267 if (!DT.isReachableFromEntry(InBB)) {
10268 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10269 continue;
10270 }
10271 // Prepare the operand vector.
10272 for (auto [Idx, V] : enumerate(Phis)) {
10273 auto *P = dyn_cast<PHINode>(V);
10274 if (!P) {
10276 "Expected isa instruction or poison value.");
10277 Operands[I][Idx] = V;
10278 continue;
10279 }
10280 if (P->getIncomingBlock(I) == InBB)
10281 Operands[I][Idx] = P->getIncomingValue(I);
10282 else
10283 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
10284 }
10285 }
10286 return;
10287 }
10288 SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
10289 Blocks;
10290 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
10291 BasicBlock *InBB = Main->getIncomingBlock(I);
10292 if (!DT.isReachableFromEntry(InBB)) {
10293 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10294 continue;
10295 }
10296 Blocks.try_emplace(InBB).first->second.push_back(I);
10297 }
10298 for (auto [Idx, V] : enumerate(Phis)) {
10299 if (isa<PoisonValue>(V)) {
10300 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
10301 Operands[I][Idx] = V;
10302 continue;
10303 }
10304 auto *P = cast<PHINode>(V);
10305 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
10306 BasicBlock *InBB = P->getIncomingBlock(I);
10307 if (InBB == Main->getIncomingBlock(I)) {
10309 continue;
10310 Operands[I][Idx] = P->getIncomingValue(I);
10311 continue;
10312 }
10313 auto *It = Blocks.find(InBB);
10314 if (It == Blocks.end())
10315 continue;
10316 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
10317 }
10318 }
10319 for (const auto &P : Blocks) {
10320 ArrayRef<unsigned> IncomingValues = P.second;
10321 if (IncomingValues.size() <= 1)
10322 continue;
10323 unsigned BasicI = IncomingValues.consume_front();
10324 for (unsigned I : IncomingValues) {
10325 assert(all_of(enumerate(Operands[I]),
10326 [&](const auto &Data) {
10327 return !Data.value() ||
10328 Data.value() == Operands[BasicI][Data.index()];
10329 }) &&
10330 "Expected empty operands list.");
10331 Operands[I] = Operands[BasicI];
10332 }
10333 }
10334 }
10335 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
10336};
10337} // namespace
10338
10339/// Returns the main/alternate instructions for the given \p VL. Unlike
10340/// getSameOpcode, this supports non-compatible instructions for better
10341/// SplitVectorize node support.
10342/// \returns the first main/alt instructions if the list contains only poisons
10343/// and instructions with exactly 2 opcodes; returns a pair of nullptrs otherwise.
10344static std::pair<Instruction *, Instruction *>
10346 Instruction *MainOp = nullptr;
10347 Instruction *AltOp = nullptr;
10348 for (Value *V : VL) {
10349 if (isa<PoisonValue>(V))
10350 continue;
10351 auto *I = dyn_cast<Instruction>(V);
10352 if (!I)
10353 return {};
10354 if (!MainOp) {
10355 MainOp = I;
10356 continue;
10357 }
10358 if (MainOp->getOpcode() == I->getOpcode()) {
10359 if (I->getParent() != MainOp->getParent())
10360 return {};
10361 continue;
10362 }
10363 if (!AltOp) {
10364 AltOp = I;
10365 continue;
10366 }
10367 if (AltOp->getOpcode() == I->getOpcode()) {
10368 if (I->getParent() != AltOp->getParent())
10369 return {};
10370 continue;
10371 }
10372 return {};
10373 }
10374 if (!AltOp)
10375 return {};
10376 assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
10377 "Expected different main and alt instructions.");
10378 return std::make_pair(MainOp, AltOp);
10379}
10380
10381/// Checks that every instruction appears once in the list and if not, packs
10382/// them, building \p ReuseShuffleIndices mask and mutating \p VL. The list of
10383/// unique scalars is extended by poison values to the whole register size.
10384///
10385/// \returns false if \p VL could not be uniquified, in which case \p VL is
10386/// unchanged and \p ReuseShuffleIndices is empty.
10387static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
10388 SmallVectorImpl<int> &ReuseShuffleIndices,
10389 const TargetTransformInfo &TTI,
10390 const TargetLibraryInfo &TLI,
10391 const InstructionsState &S,
10392 const BoUpSLP::EdgeInfo &UserTreeIdx,
10393 bool TryPad = false) {
10394 // Check that every instruction appears once in this bundle.
10395 SmallVector<Value *> UniqueValues;
10396 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
10397 for (Value *V : VL) {
10398 if (isConstant(V)) {
10399 // Constants are always considered distinct, even if the same constant
10400 // appears multiple times in VL.
10401 ReuseShuffleIndices.emplace_back(
10402 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
10403 UniqueValues.emplace_back(V);
10404 continue;
10405 }
10406 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
10407 ReuseShuffleIndices.emplace_back(Res.first->second);
10408 if (Res.second)
10409 UniqueValues.emplace_back(V);
10410 }
10411
10412 // Easy case: VL has unique values and a "natural" size
10413 size_t NumUniqueScalarValues = UniqueValues.size();
10414 bool IsFullVectors = hasFullVectorsOrPowerOf2(
10415 TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
10416 if (NumUniqueScalarValues == VL.size() &&
10417 (VectorizeNonPowerOf2 || IsFullVectors)) {
10418 ReuseShuffleIndices.clear();
10419 return true;
10420 }
10421
10422 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
10423 if ((UserTreeIdx.UserTE &&
10424 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
10426 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
10427 "for nodes with padding.\n");
10428 ReuseShuffleIndices.clear();
10429 return false;
10430 }
10431
10432 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
10433 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
10434 (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
10435 return isa<UndefValue>(V) || !isConstant(V);
10436 }))) {
10437 if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
10438 S.getMainOp()->isSafeToRemove() &&
10439 (S.areInstructionsWithCopyableElements() ||
10440 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>))) {
10441 // Find the number of elements, which forms full vectors.
10442 unsigned PWSz = getFullVectorNumberOfElements(
10443 TTI, UniqueValues.front()->getType(), UniqueValues.size());
10444 PWSz = std::min<unsigned>(PWSz, VL.size());
10445 if (PWSz == VL.size()) {
10446 // We ended up with the same size after removing duplicates and
10447 // upgrading the resulting vector size to a "nice size". Just keep
10448 // the initial VL then.
10449 ReuseShuffleIndices.clear();
10450 } else {
10451 // Pad unique values with poison to grow the vector to a "nice" size
10452 SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
10453 UniqueValues.end());
10454 PaddedUniqueValues.append(
10455 PWSz - UniqueValues.size(),
10456 PoisonValue::get(UniqueValues.front()->getType()));
10457 // Check that the operations, extended with poisons/copyable elements, are
10458 // still valid for vectorization (div/rem are not allowed).
10459 if (!S.areInstructionsWithCopyableElements() &&
10460 !getSameOpcode(PaddedUniqueValues, TLI).valid()) {
10461 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10462 ReuseShuffleIndices.clear();
10463 return false;
10464 }
10465 VL = std::move(PaddedUniqueValues);
10466 }
10467 return true;
10468 }
10469 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10470 ReuseShuffleIndices.clear();
10471 return false;
10472 }
10473 VL = std::move(UniqueValues);
10474 return true;
10475}
10476
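// Decides whether an alternate-opcode bundle should be split into two separate
// nodes (Op1/Op2, one per opcode), checking legality of the halves and
// comparing a rough cost against keeping a single shuffled alternate node.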
10477bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
10478 const InstructionsState &LocalState,
10479 SmallVectorImpl<Value *> &Op1,
10480 SmallVectorImpl<Value *> &Op2,
10481 OrdersType &ReorderIndices) const {
10482 constexpr unsigned SmallNodeSize = 4;
10483 if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
10485 return false;
10486
10487 // Check if this is a duplicate of another split entry.
10488 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
10489 << ".\n");
10490 for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
10491 if (E->isSame(VL)) {
10492 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
10493 << *LocalState.getMainOp() << ".\n");
10494 return false;
10495 }
10496 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
10497 if (all_of(VL, [&](Value *V) {
10498 return isa<PoisonValue>(V) || Values.contains(V);
10499 })) {
10500 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
10501 return false;
10502 }
10503 }
10504
10505 ReorderIndices.assign(VL.size(), VL.size());
10506 SmallBitVector Op1Indices(VL.size());
10507 for (auto [Idx, V] : enumerate(VL)) {
10508 auto *I = dyn_cast<Instruction>(V);
10509 if (!I) {
10510 Op1.push_back(V);
10511 Op1Indices.set(Idx);
10512 continue;
10513 }
10514 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
10515 isMainInstruction(I, LocalState.getMainOp(), LocalState.getAltOp(),
10516 *TLI)) ||
10517 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
10518 !isAlternateInstruction(I, LocalState.getMainOp(),
10519 LocalState.getAltOp(), *TLI))) {
10520 Op1.push_back(V);
10521 Op1Indices.set(Idx);
10522 continue;
10523 }
10524 Op2.push_back(V);
10525 }
10526 Type *ScalarTy = getValueType(VL.front());
10527 VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
10528 unsigned Opcode0 = LocalState.getOpcode();
10529 unsigned Opcode1 = LocalState.getAltOpcode();
10530 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
10531 // Enable the split node only if the nodes do not form a legal alternate
10532 // instruction (like X86 addsub).
10533 SmallPtrSet<Value *, 4> UOp1(llvm::from_range, Op1);
10534 SmallPtrSet<Value *, 4> UOp2(llvm::from_range, Op2);
10535 if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
10536 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
10537 !hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), Op1.size()) ||
10538 !hasFullVectorsOrPowerOf2(*TTI, Op2.front()->getType(), Op2.size()))
10539 return false;
10540 // Enable split node, only if all nodes are power-of-2/full registers.
10541 unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
10542 for (unsigned Idx : seq<unsigned>(VL.size())) {
10543 if (Op1Indices.test(Idx)) {
10544 ReorderIndices[Op1Cnt] = Idx;
10545 ++Op1Cnt;
10546 } else {
10547 ReorderIndices[Op2Cnt] = Idx;
10548 ++Op2Cnt;
10549 }
10550 }
10551 if (isIdentityOrder(ReorderIndices))
10552 ReorderIndices.clear();
10553 SmallVector<int> Mask;
10554 if (!ReorderIndices.empty())
10555 inversePermutation(ReorderIndices, Mask);
10556 unsigned NumParts = TTI->getNumberOfParts(VecTy);
10557 VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
10558 VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
10559 // Check for non-profitable single-register ops, which are better represented
10560 // as alternate ops.
10561 if (NumParts >= VL.size())
10562 return false;
10564 InstructionCost InsertCost = ::getShuffleCost(
10565 *TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
10566 FixedVectorType *SubVecTy =
10567 getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
10568 InstructionCost NewShuffleCost =
10569 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
10570 if (!LocalState.isCmpOp() && NumParts <= 1 &&
10571 (Mask.empty() || InsertCost >= NewShuffleCost))
10572 return false;
10573 if ((LocalState.getMainOp()->isBinaryOp() &&
10574 LocalState.getAltOp()->isBinaryOp() &&
10575 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
10576 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
10577 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
10578 (LocalState.getMainOp()->isUnaryOp() &&
10579 LocalState.getAltOp()->isUnaryOp())) {
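    // Compare the estimated cost of keeping VL as a single alternate node
    // (both opcodes over the full width plus a two-source blend) against the
    // cost of two narrower vector ops plus the subvector insert.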
10580 InstructionCost OriginalVecOpsCost =
10581 TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
10582 TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
10583 SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
10584 for (unsigned Idx : seq<unsigned>(VL.size())) {
10585 if (isa<PoisonValue>(VL[Idx]))
10586 continue;
10587 OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
10588 }
10589 InstructionCost OriginalCost =
10590 OriginalVecOpsCost + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
10591 VecTy, OriginalMask, Kind);
10592 InstructionCost NewVecOpsCost =
10593 TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
10594 TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
10595 InstructionCost NewCost =
10596 NewVecOpsCost + InsertCost +
10597 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
10598 VectorizableTree.front()->getOpcode() == Instruction::Store
10599 ? NewShuffleCost
10600 : 0);
10601 // If not profitable to split - exit.
10602 if (NewCost >= OriginalCost)
10603 return false;
10604 }
10605 return true;
10606}
10607
10608namespace {
10609/// Class accepts an incoming list of values, checks if it is able to model
10610/// "copyable" values as compatible operations, and generates the list of values
10611/// for scheduling and the list of operands for the new nodes.
10612class InstructionsCompatibilityAnalysis {
10613 DominatorTree &DT;
10614 const DataLayout &DL;
10615 const TargetTransformInfo &TTI;
10616 const TargetLibraryInfo &TLI;
10617 unsigned MainOpcode = 0;
10618 Instruction *MainOp = nullptr;
10619
10620 /// Checks if the opcode is supported as the main opcode for copyable
10621 /// elements.
10622 static bool isSupportedOpcode(const unsigned Opcode) {
10623 return Opcode == Instruction::Add || Opcode == Instruction::LShr;
10624 }
10625
10626 /// Identifies the best candidate value, which represents the main opcode
10627 /// operation.
10628 /// Currently the best candidate is an instruction with a supported opcode in
10629 /// the parent block with the highest DFS-in number (the latest such block).
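 /// For example, for VL = {a + b, c + d, x}, where x is a non-Add instruction
 /// from the same block, one of the Add instructions may be selected as the
 /// main operation and x may later be modeled as a copyable element (x + 0).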
10630 void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
10631 BasicBlock *Parent = nullptr;
10632    // Checks if the instruction has a supported opcode.
10633 auto IsSupportedInstruction = [&](Instruction *I) {
10634 return I && isSupportedOpcode(I->getOpcode()) &&
10635 (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
10636 };
10637    // Exclude instructions that are operands of the other candidates to improve
10638    // compile time; such a choice would not be schedulable anyway.
10639 SmallDenseSet<Value *, 8> Operands;
10640 SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
10641 for (Value *V : VL) {
10642 auto *I = dyn_cast<Instruction>(V);
10643 if (!I)
10644 continue;
10645 if (!DT.isReachableFromEntry(I->getParent()))
10646 continue;
10647 if (Candidates.empty()) {
10648 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10649 Parent = I->getParent();
10650 Operands.insert(I->op_begin(), I->op_end());
10651 continue;
10652 }
10653 if (Parent == I->getParent()) {
10654 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10655 Operands.insert(I->op_begin(), I->op_end());
10656 continue;
10657 }
10658 auto *NodeA = DT.getNode(Parent);
10659 auto *NodeB = DT.getNode(I->getParent());
10660 assert(NodeA && "Should only process reachable instructions");
10661 assert(NodeB && "Should only process reachable instructions");
10662 assert((NodeA == NodeB) ==
10663 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10664 "Different nodes should have different DFS numbers");
10665 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
10666 Candidates.clear();
10667 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10668 Parent = I->getParent();
10669 Operands.clear();
10670 Operands.insert(I->op_begin(), I->op_end());
10671 }
10672 }
10673 unsigned BestOpcodeNum = 0;
10674 MainOp = nullptr;
10675 for (const auto &P : Candidates) {
10676 if (P.second.size() < BestOpcodeNum)
10677 continue;
10678 for (Instruction *I : P.second) {
10679 if (IsSupportedInstruction(I) && !Operands.contains(I)) {
10680 MainOp = I;
10681 BestOpcodeNum = P.second.size();
10682 break;
10683 }
10684 }
10685 }
10686 if (MainOp) {
10687      // Do not match if any copyable element is a terminator in the same block
10688      // as the main operation.
10689 if (any_of(VL, [&](Value *V) {
10690 auto *I = dyn_cast<Instruction>(V);
10691 return I && I->getParent() == MainOp->getParent() &&
10692 I->isTerminator();
10693 })) {
10694 MainOp = nullptr;
10695 return;
10696 }
10697 MainOpcode = MainOp->getOpcode();
10698 }
10699 }
10700
10701 /// Returns the idempotent value for the \p MainOp with the detected \p
10702 /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
10703 /// the operand itself, since V or V == V.
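 /// For LShr, which is not commutative, the RHS identity 0 is used, since
 /// V >> 0 == V.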
10704 Value *selectBestIdempotentValue() const {
10705 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10706 return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
10707 !MainOp->isCommutative());
10708 }
10709
10710  /// Returns the operands for \p V, considering whether it is an original
10711  /// instruction, whose actual operands should be returned, or a copyable
10712  /// element, which should be represented as an idempotent instruction.
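  /// For example, with MainOpcode == Add, a copyable value V is returned as the
  /// operand pair {V, 0}, i.e. it is modeled as V + 0.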
10713 SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const {
10714 if (isa<PoisonValue>(V))
10715 return {V, V};
10716 if (!S.isCopyableElement(V))
10717 return convertTo(cast<Instruction>(V), S).second;
10718 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10719 return {V, selectBestIdempotentValue()};
10720 }
10721
10722 /// Builds operands for the original instructions.
10723 void
10724 buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
10725 SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
10726
10727 unsigned ShuffleOrOp =
10728 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10729 Instruction *VL0 = S.getMainOp();
10730
10731 switch (ShuffleOrOp) {
10732 case Instruction::PHI: {
10733 auto *PH = cast<PHINode>(VL0);
10734
10735 // Keeps the reordered operands to avoid code duplication.
10736 PHIHandler Handler(DT, PH, VL);
10737 Handler.buildOperands();
10738 Operands.assign(PH->getNumOperands(), {});
10739 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
10740 Operands[I].assign(Handler.getOperands(I).begin(),
10741 Handler.getOperands(I).end());
10742 return;
10743 }
10744 case Instruction::ExtractValue:
10745 case Instruction::ExtractElement:
10746 // This is a special case, as it does not gather, but at the same time
10747      // we are not extending buildTreeRec() towards the operands.
10748 Operands.assign(1, {VL.size(), VL0->getOperand(0)});
10749 return;
10750 case Instruction::InsertElement:
10751 Operands.assign(2, {VL.size(), nullptr});
10752 for (auto [Idx, V] : enumerate(VL)) {
10753 auto *IE = cast<InsertElementInst>(V);
10754 for (auto [OpIdx, Ops] : enumerate(Operands))
10755 Ops[Idx] = IE->getOperand(OpIdx);
10756 }
10757 return;
10758 case Instruction::Load:
10759 Operands.assign(
10760 1, {VL.size(),
10761 PoisonValue::get(cast<LoadInst>(VL0)->getPointerOperandType())});
10762 for (auto [V, Op] : zip(VL, Operands.back())) {
10763 auto *LI = dyn_cast<LoadInst>(V);
10764 if (!LI)
10765 continue;
10766 Op = LI->getPointerOperand();
10767 }
10768 return;
10769 case Instruction::ZExt:
10770 case Instruction::SExt:
10771 case Instruction::FPToUI:
10772 case Instruction::FPToSI:
10773 case Instruction::FPExt:
10774 case Instruction::PtrToInt:
10775 case Instruction::IntToPtr:
10776 case Instruction::SIToFP:
10777 case Instruction::UIToFP:
10778 case Instruction::Trunc:
10779 case Instruction::FPTrunc:
10780 case Instruction::BitCast:
10781 case Instruction::ICmp:
10782 case Instruction::FCmp:
10783 case Instruction::Select:
10784 case Instruction::FNeg:
10785 case Instruction::Add:
10786 case Instruction::FAdd:
10787 case Instruction::Sub:
10788 case Instruction::FSub:
10789 case Instruction::Mul:
10790 case Instruction::FMul:
10791 case Instruction::UDiv:
10792 case Instruction::SDiv:
10793 case Instruction::FDiv:
10794 case Instruction::URem:
10795 case Instruction::SRem:
10796 case Instruction::FRem:
10797 case Instruction::Shl:
10798 case Instruction::LShr:
10799 case Instruction::AShr:
10800 case Instruction::And:
10801 case Instruction::Or:
10802 case Instruction::Xor:
10803 case Instruction::Freeze:
10804 case Instruction::Store:
10805 case Instruction::ShuffleVector:
10806 Operands.assign(VL0->getNumOperands(), {VL.size(), nullptr});
10807 for (auto [Idx, V] : enumerate(VL)) {
10808 auto *I = dyn_cast<Instruction>(V);
10809 if (!I) {
10810 for (auto [OpIdx, Ops] : enumerate(Operands))
10811 Ops[Idx] = PoisonValue::get(VL0->getOperand(OpIdx)->getType());
10812 continue;
10813 }
10814 auto [Op, ConvertedOps] = convertTo(I, S);
10815 for (auto [OpIdx, Ops] : enumerate(Operands))
10816 Ops[Idx] = ConvertedOps[OpIdx];
10817 }
10818 return;
10819 case Instruction::GetElementPtr: {
10820 Operands.assign(2, {VL.size(), nullptr});
10821      // Need to cast all indices to the same type before vectorization to
10822      // avoid a crash.
10823 // Required to be able to find correct matches between different gather
10824 // nodes and reuse the vectorized values rather than trying to gather them
10825 // again.
10826 const unsigned IndexIdx = 1;
10827 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
10828 Type *Ty =
10829 all_of(VL,
10830 [&](Value *V) {
10831                   auto *GEP = dyn_cast<GetElementPtrInst>(V);
10832 return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
10833 })
10834 ? VL0Ty
10835 : DL.getIndexType(cast<GetElementPtrInst>(VL0)
10836 ->getPointerOperandType()
10837 ->getScalarType());
10838 for (auto [Idx, V] : enumerate(VL)) {
10839      auto *GEP = dyn_cast<GetElementPtrInst>(V);
10840 if (!GEP) {
10841 Operands[0][Idx] = V;
10842 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
10843 continue;
10844 }
10845 Operands[0][Idx] = GEP->getPointerOperand();
10846 auto *Op = GEP->getOperand(IndexIdx);
10847 auto *CI = dyn_cast<ConstantInt>(Op);
10848 Operands[1][Idx] = CI ? ConstantFoldIntegerCast(
10849 CI, Ty, CI->getValue().isSignBitSet(), DL)
10850 : Op;
10851 }
10852 return;
10853 }
10854 case Instruction::Call: {
10855 auto *CI = cast<CallInst>(VL0);
10857 for (unsigned Idx : seq<unsigned>(CI->arg_size())) {
10859 continue;
10860 auto &Ops = Operands.emplace_back();
10861 for (Value *V : VL) {
10862 auto *I = dyn_cast<Instruction>(V);
10863 Ops.push_back(I ? I->getOperand(Idx)
10864 : PoisonValue::get(VL0->getOperand(Idx)->getType()));
10865 }
10866 }
10867 return;
10868 }
10869 default:
10870 break;
10871 }
10872 llvm_unreachable("Unexpected vectorization of the instructions.");
10873 }
10874
10875public:
10876 InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
10877 const TargetTransformInfo &TTI,
10878 const TargetLibraryInfo &TLI)
10879 : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
10880
10881 InstructionsState
10882 buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
10883 bool TryCopyableElementsVectorization,
10884 bool WithProfitabilityCheck = false,
10885 bool SkipSameCodeCheck = false) {
10886 InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
10887 ? InstructionsState::invalid()
10888 : getSameOpcode(VL, TLI);
10889 if (S)
10890 return S;
10891 if (!VectorizeCopyableElements || !TryCopyableElementsVectorization)
10892 return S;
10893 findAndSetMainInstruction(VL, R);
10894 if (!MainOp)
10895 return InstructionsState::invalid();
10896 S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
10897 if (!WithProfitabilityCheck)
10898 return S;
10899 // Check if it is profitable to vectorize the instruction.
10900 SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
10901 auto BuildCandidates =
10902 [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
10903 Value *V2) {
10904 if (V1 != V2 && isa<PHINode>(V1))
10905 return;
10906 auto *I1 = dyn_cast<Instruction>(V1);
10907 auto *I2 = dyn_cast<Instruction>(V2);
10908 if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
10909 I1->getParent() != I2->getParent())
10910 return;
10911 Candidates.emplace_back(V1, (I1 || I2) ? V2 : V1);
10912 };
10913 if (VL.size() == 2) {
10914 // Check if the operands allow better vectorization.
10915 SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
10916 BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
10917 BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
10918 bool Res = !Candidates1.empty() && !Candidates2.empty() &&
10919 R.findBestRootPair(Candidates1) &&
10920 R.findBestRootPair(Candidates2);
10921 if (!Res && isCommutative(MainOp)) {
10922 Candidates1.clear();
10923 Candidates2.clear();
10924 BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
10925 BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
10926 Res = !Candidates1.empty() && !Candidates2.empty() &&
10927 R.findBestRootPair(Candidates1) &&
10928 R.findBestRootPair(Candidates2);
10929 }
10930 if (!Res)
10931 return InstructionsState::invalid();
10933 InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
10934 InstructionCost VectorCost;
10935 FixedVectorType *VecTy =
10936 getWidenedType(S.getMainOp()->getType(), VL.size());
10937 switch (MainOpcode) {
10938 case Instruction::Add:
10939 case Instruction::LShr:
10940 VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
10941 break;
10942 default:
10943 llvm_unreachable("Unexpected instruction.");
10944 }
10945 if (VectorCost > ScalarCost)
10946 return InstructionsState::invalid();
10947 return S;
10948 }
10949 assert(Operands.size() == 2 && "Unexpected number of operands!");
10950 unsigned CopyableNum =
10951 count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
10952 if (CopyableNum < VL.size() / 2)
10953 return S;
10954 // Too many phi copyables - exit.
10955 const unsigned Limit = VL.size() / 24;
10956 if ((CopyableNum >= VL.size() - Limit ||
10957 (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
10958 CopyableNum >= MaxPHINumOperands) &&
10959 all_of(VL, [&](Value *V) {
10960 return isa<PHINode>(V) || !S.isCopyableElement(V);
10961 }))
10962 return InstructionsState::invalid();
10963 // Check profitability if number of copyables > VL.size() / 2.
10964 // 1. Reorder operands for better matching.
10965 if (isCommutative(MainOp)) {
10966 for (auto &Ops : Operands) {
10967 // Make instructions the first operands.
10968 if (!isa<Instruction>(Ops.front()) && isa<Instruction>(Ops.back())) {
10969 std::swap(Ops.front(), Ops.back());
10970 continue;
10971 }
10972 // Make constants the second operands.
10973 if (isa<Constant>(Ops.front())) {
10974 std::swap(Ops.front(), Ops.back());
10975 continue;
10976 }
10977 }
10978 }
10979    // 2. Check if the operands can be vectorized.
10980 if (count_if(Operands.back(), IsaPred<Instruction>) > 1)
10981 return InstructionsState::invalid();
10982 auto CheckOperand = [&](ArrayRef<Value *> Ops) {
10983 if (allConstant(Ops) || isSplat(Ops))
10984 return true;
10985      // Check if it is an "almost" splat, i.e. it has >= 4 elements and only a
10986      // single one is different.
10987 constexpr unsigned Limit = 4;
10988 if (Operands.front().size() >= Limit) {
10989 SmallDenseMap<const Value *, unsigned> Counters;
10990 for (Value *V : Ops) {
10991 if (isa<UndefValue>(V))
10992 continue;
10993 ++Counters[V];
10994 }
10995 if (Counters.size() == 2 &&
10996 any_of(Counters, [&](const std::pair<const Value *, unsigned> &C) {
10997 return C.second == 1;
10998 }))
10999 return true;
11000 }
11001 // First operand not a constant or splat? Last attempt - check for
11002 // potential vectorization.
11003 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
11004 InstructionsState OpS = Analysis.buildInstructionsState(
11005 Ops, R, /*TryCopyableElementsVectorization=*/true);
11006 if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
11007 return false;
11008 unsigned CopyableNum =
11009 count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
11010 return CopyableNum <= VL.size() / 2;
11011 };
11012 if (!CheckOperand(Operands.front()))
11013 return InstructionsState::invalid();
11014
11015 return S;
11016 }
11017
11018 SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
11019 ArrayRef<Value *> VL) {
11020 assert(S && "Invalid state!");
11022 if (S.areInstructionsWithCopyableElements()) {
11023 MainOp = S.getMainOp();
11024 MainOpcode = S.getOpcode();
11025 Operands.assign(MainOp->getNumOperands(),
11026 BoUpSLP::ValueList(VL.size(), nullptr));
11027 for (auto [Idx, V] : enumerate(VL)) {
11028 SmallVector<Value *> OperandsForValue = getOperands(S, V);
11029 for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
11030 Operands[OperandIdx][Idx] = Operand;
11031 }
11032 } else {
11033 buildOriginalOperands(S, VL, Operands);
11034 }
11035 return Operands;
11036 }
11037};
11038} // namespace
11039
11040BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
11041 ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
11042 bool TryCopyableElementsVectorization) const {
11043 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
11044
11045 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11046 InstructionsState S = Analysis.buildInstructionsState(
11047 VL, *this, TryCopyableElementsVectorization,
11048 /*WithProfitabilityCheck=*/true, TryCopyableElementsVectorization);
11049
11050 // Don't go into catchswitch blocks, which can happen with PHIs.
11051 // Such blocks can only have PHIs and the catchswitch. There is no
11052 // place to insert a shuffle if we need to, so just avoid that issue.
11053 if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
11054 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
11055 // Do not try to pack to avoid extra instructions here.
11056 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11057 /*TryToFindDuplicates=*/false);
11058 }
11059
11060 // Check if this is a duplicate of another entry.
11061 if (S) {
11062 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
11063 for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
11064 if (E->isSame(VL)) {
11065 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
11066 << ".\n");
11067 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11068 }
11069 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
11070 if (all_of(VL, [&](Value *V) {
11071 return isa<PoisonValue>(V) || Values.contains(V) ||
11072 (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
11073 LI->getLoopFor(S.getMainOp()->getParent()) &&
11074 isVectorized(V));
11075 })) {
11076 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
11077 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11078 }
11079 }
11080 }
11081
11082 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
11083 // a load), in which case peek through to include it in the tree, without
11084 // ballooning over-budget.
11085 if (Depth >= RecursionMaxDepth &&
11086 !(S && !S.isAltShuffle() && VL.size() >= 4 &&
11087 (match(S.getMainOp(), m_Load(m_Value())) ||
11088 all_of(VL, [&S](const Value *I) {
11089 return match(I,
11091 cast<Instruction>(I)->getOpcode() == S.getOpcode();
11092 })))) {
11093 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
11094 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11095 }
11096
11097 // Don't handle scalable vectors
11098 if (S && S.getOpcode() == Instruction::ExtractElement &&
11099      isa<ScalableVectorType>(
11100 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
11101 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
11102 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11103 }
11104
11105 // Don't handle vectors.
11106 if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
11107 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
11108 // Do not try to pack to avoid extra instructions here.
11109 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11110 /*TryToFindDuplicates=*/false);
11111 }
11112
11113 // If all of the operands are identical or constant we have a simple solution.
11114 // If we deal with insert/extract instructions, they all must have constant
11115 // indices, otherwise we should gather them, not try to vectorize.
11116  // If this is an alternate-opcode node with 2 elements and gathered operands,
11117  // do not vectorize.
11118 auto NotProfitableForVectorization = [&S, this, Depth](ArrayRef<Value *> VL) {
11119 if (!S || !S.isAltShuffle() || VL.size() > 2)
11120 return false;
11121 if (VectorizableTree.size() < MinTreeSize)
11122 return false;
11123 if (Depth >= RecursionMaxDepth - 1)
11124 return true;
11125    // Check if all operands are extracts, part of a vectorized node, or can
11126    // form a regular vectorizable node.
11127 SmallVector<unsigned, 8> InstsCount;
11128 for (Value *V : VL) {
11129 auto *I = cast<Instruction>(V);
11130 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
11131 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
11132 }));
11133 }
11134 bool IsCommutative =
11135 isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
11136 if ((IsCommutative &&
11137 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
11138 (!IsCommutative &&
11139 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
11140 return true;
11141 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
11143 auto *I1 = cast<Instruction>(VL.front());
11144 auto *I2 = cast<Instruction>(VL.back());
11145 for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
11146 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11147 I2->getOperand(Op));
11148 if (static_cast<unsigned>(count_if(
11149 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11151 })) >= S.getMainOp()->getNumOperands() / 2)
11152 return false;
11153 if (S.getMainOp()->getNumOperands() > 2)
11154 return true;
11155 if (IsCommutative) {
11156 // Check permuted operands.
11157 Candidates.clear();
11158 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
11159 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11160 I2->getOperand((Op + 1) % E));
11161 if (any_of(
11162 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11164 }))
11165 return false;
11166 }
11167 return true;
11168 };
11169 SmallVector<unsigned> SortedIndices;
11170 BasicBlock *BB = nullptr;
11171 bool IsScatterVectorizeUserTE =
11172 UserTreeIdx.UserTE &&
11173 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11174 bool AreAllSameBlock = S.valid();
11175 bool AreScatterAllGEPSameBlock =
11176 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
11177 VL.size() > 2 &&
11178 all_of(VL,
11179 [&BB](Value *V) {
11180 auto *I = dyn_cast<GetElementPtrInst>(V);
11181 if (!I)
11182 return doesNotNeedToBeScheduled(V);
11183 if (!BB)
11184 BB = I->getParent();
11185 return BB == I->getParent() && I->getNumOperands() == 2;
11186 }) &&
11187 BB &&
11188 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
11189 SortedIndices));
11190 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
11191 if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
11192 (S &&
11194 S.getMainOp()) &&
11196 NotProfitableForVectorization(VL)) {
11197 if (!S) {
11198 LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
11199 "C,S,B,O, small shuffle. \n";
11200 dbgs() << "[";
11201 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11202 dbgs() << "]\n");
11203 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11204 /*TryToFindDuplicates=*/true,
11205 /*TrySplitVectorize=*/true);
11206 }
11207 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
11208 dbgs() << "[";
11209 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11210 dbgs() << "]\n");
11211 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11212 }
11213
11214 // Don't vectorize ephemeral values.
11215 if (S && !EphValues.empty()) {
11216 for (Value *V : VL) {
11217 if (EphValues.count(V)) {
11218 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
11219 << ") is ephemeral.\n");
11220 // Do not try to pack to avoid extra instructions here.
11221 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11222 /*TryToFindDuplicates=*/false);
11223 }
11224 }
11225 }
11226
11227 // We now know that this is a vector of instructions of the same type from
11228 // the same block.
11229
11230  // Check that none of the instructions in the bundle are already in the tree
11231  // and that the node is not unprofitable to vectorize as a small alternate
11232  // node.
11233 if (S && S.isAltShuffle()) {
11234 auto GetNumVectorizedExtracted = [&]() {
11235 APInt Extracted = APInt::getZero(VL.size());
11236 APInt Vectorized = APInt::getAllOnes(VL.size());
11237 for (auto [Idx, V] : enumerate(VL)) {
11238 auto *I = dyn_cast<Instruction>(V);
11239 if (!I || doesNotNeedToBeScheduled(I) ||
11240 all_of(I->operands(), [&](const Use &U) {
11241 return isa<ExtractElementInst>(U.get());
11242 }))
11243 continue;
11244 if (isVectorized(I))
11245 Vectorized.clearBit(Idx);
11246 else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
11247 Extracted.setBit(Idx);
11248 }
11249 return std::make_pair(Vectorized, Extracted);
11250 };
11251 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
11253 bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
11254 if (!Vectorized.isAllOnes() && !PreferScalarize) {
11255      // Rough cost estimation: check whether the vector code (+ potential
11256      // extracts) is more profitable than the scalar code + buildvector.
11257 Type *ScalarTy = VL.front()->getType();
11258 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11259 InstructionCost VectorizeCostEstimate =
11260 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, {}, Kind) +
11261 ::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Extracted,
11262 /*Insert=*/false, /*Extract=*/true, Kind);
11263 InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
11264 *TTI, ScalarTy, VecTy, Vectorized,
11265 /*Insert=*/true, /*Extract=*/false, Kind, /*ForPoisonSrc=*/false);
11266 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
11267 }
11268 if (PreferScalarize) {
11269 LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
11270 "node is not profitable.\n");
11271 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11272 }
11273 }
11274
11275  // The reduction nodes (stored in UserIgnoreList) should also stay scalar.
11276 if (UserIgnoreList && !UserIgnoreList->empty()) {
11277 for (Value *V : VL) {
11278 if (UserIgnoreList->contains(V)) {
11279 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
11280 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11281 }
11282 }
11283 }
11284
11285  // Special processing for sorted pointers for the ScatterVectorize node with
11286  // constant indices only.
11287 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
11288 assert(VL.front()->getType()->isPointerTy() &&
11290 "Expected pointers only.");
11291    // Reset S to make it a GetElementPtr kind of node.
11292 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
11293 assert(It != VL.end() && "Expected at least one GEP.");
11294 S = getSameOpcode(*It, *TLI);
11295 }
11296
11297 // Check that all of the users of the scalars that we want to vectorize are
11298 // schedulable.
11299 Instruction *VL0 = S.getMainOp();
11300 BB = VL0->getParent();
11301
11302 if (S &&
11304 !DT->isReachableFromEntry(BB))) {
11305 // Don't go into unreachable blocks. They may contain instructions with
11306 // dependency cycles which confuse the final scheduling.
11307 // Do not vectorize EH and non-returning blocks, not profitable in most
11308 // cases.
11309 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
11310 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11311 }
11312 return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
11313}
11314
11315void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
11316 const EdgeInfo &UserTreeIdx,
11317 unsigned InterleaveFactor) {
11318 assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!");
11319
11320 SmallVector<int> ReuseShuffleIndices;
11321 SmallVector<Value *> VL(VLRef);
11322
11323  // Tries to build a split node.
11324 auto TrySplitNode = [&](const InstructionsState &LocalState) {
11325 SmallVector<Value *> Op1, Op2;
11326 OrdersType ReorderIndices;
11327 if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
11328 return false;
11329
11330 auto Invalid = ScheduleBundle::invalid();
11331 auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
11332 UserTreeIdx, {}, ReorderIndices);
11333 LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
11334 auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
11335 InstructionsState S = getSameOpcode(Op, *TLI);
11336 if (S && (isa<LoadInst>(S.getMainOp()) ||
11337 getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
11338        // Build a gather node for loads; they will be gathered later.
11339 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11340 Idx == 0 ? 0 : Op1.size());
11341 (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
11342 } else {
11343 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11344 Idx == 0 ? 0 : Op1.size());
11345 buildTreeRec(Op, Depth, {TE, Idx});
11346 }
11347 };
11348 AddNode(Op1, 0);
11349 AddNode(Op2, 1);
11350 return true;
11351 };
11352
11353 auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
11354 bool AreConsts = false;
11355 for (Value *V : VL) {
11356 if (isa<PoisonValue>(V))
11357 continue;
11358 if (isa<Constant>(V)) {
11359 AreConsts = true;
11360 continue;
11361 }
11362 if (!isa<PHINode>(V))
11363 return false;
11364 }
11365 return AreConsts;
11366 };
11367 if (AreOnlyConstsWithPHIs(VL)) {
11368 LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
11369 newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
11370 return;
11371 }
11372
11373 ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
11374 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
11375 InstructionsState S = Legality.getInstructionsState();
11376 if (!Legality.isLegal()) {
11377 if (Legality.trySplitVectorize()) {
11378 auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
11379 // Last chance to try to vectorize alternate node.
11380 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
11381 return;
11382 }
11383 if (!S)
11384 Legality = getScalarsVectorizationLegality(
11385 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
11386 if (!Legality.isLegal()) {
11387 if (Legality.tryToFindDuplicates())
11388 tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
11389 UserTreeIdx);
11390
11391 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11392 return;
11393 }
11394 S = Legality.getInstructionsState();
11395 }
11396
11397 // FIXME: investigate if there are profitable cases for VL.size() <= 4.
11398 if (S.isAltShuffle() && TrySplitNode(S))
11399 return;
11400
11401 // Check that every instruction appears once in this bundle.
11402 if (!tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx,
11403 /*TryPad=*/true)) {
11404 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11405 return;
11406 }
11407
11408 // Perform specific checks for each particular instruction kind.
11409 bool IsScatterVectorizeUserTE =
11410 UserTreeIdx.UserTE &&
11411 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11412 OrdersType CurrentOrder;
11413 SmallVector<Value *> PointerOps;
11414 StridedPtrInfo SPtrInfo;
11415 TreeEntry::EntryState State = getScalarsVectorizationState(
11416 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
11417 if (State == TreeEntry::NeedToGather) {
11418 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11419 return;
11420 }
11421
11422 Instruction *VL0 = S.getMainOp();
11423 BasicBlock *BB = VL0->getParent();
11424 auto &BSRef = BlocksSchedules[BB];
11425 if (!BSRef)
11426 BSRef = std::make_unique<BlockScheduling>(BB);
11427
11428 BlockScheduling &BS = *BSRef;
11429
11430 SetVector<Value *> UniqueValues(llvm::from_range, VL);
11431 std::optional<ScheduleBundle *> BundlePtr =
11432 BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
11433#ifdef EXPENSIVE_CHECKS
11434 // Make sure we didn't break any internal invariants
11435 BS.verify();
11436#endif
11437 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
11438 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
11439 // Last chance to try to vectorize alternate node.
11440 if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
11441 return;
11442 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11443 NonScheduledFirst.insert(VL.front());
11444 if (S.getOpcode() == Instruction::Load &&
11445 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
11447 return;
11448 }
11449 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11450 SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
11451 ScheduleBundle Empty;
11452 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
11453 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
11454
11455 unsigned ShuffleOrOp =
11456 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
11457 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
11458    // Postpone PHI node creation.
11459 SmallVector<unsigned> PHIOps;
11460 for (unsigned I : seq<unsigned>(Operands.size())) {
11461      ArrayRef<Value *> Op = Operands[I];
11462 if (Op.empty())
11463 continue;
11464 InstructionsState S = getSameOpcode(Op, *TLI);
11465 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
11466 buildTreeRec(Op, Depth + 1, {TE, I});
11467 else
11468 PHIOps.push_back(I);
11469 }
11470 for (unsigned I : PHIOps)
11471 buildTreeRec(Operands[I], Depth + 1, {TE, I});
11472 };
11473 switch (ShuffleOrOp) {
11474 case Instruction::PHI: {
11475 TreeEntry *TE =
11476 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
11477 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
11478 TE->dump());
11479
11480 TE->setOperands(Operands);
11481 CreateOperandNodes(TE, Operands);
11482 return;
11483 }
11484 case Instruction::ExtractValue:
11485 case Instruction::ExtractElement: {
11486 if (CurrentOrder.empty()) {
11487 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
11488 } else {
11489 LLVM_DEBUG({
11490 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
11491 "with order";
11492 for (unsigned Idx : CurrentOrder)
11493 dbgs() << " " << Idx;
11494 dbgs() << "\n";
11495 });
11496 fixupOrderingIndices(CurrentOrder);
11497 }
11498 // Insert new order with initial value 0, if it does not exist,
11499 // otherwise return the iterator to the existing one.
11500 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11501 ReuseShuffleIndices, CurrentOrder);
11502 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
11503 "(ExtractValueInst/ExtractElementInst).\n";
11504 TE->dump());
11505 // This is a special case, as it does not gather, but at the same time
11506 // we are not extending buildTreeRec() towards the operands.
11507 TE->setOperands(Operands);
11508 return;
11509 }
11510 case Instruction::InsertElement: {
11511 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
11512
11513 auto OrdCompare = [](const std::pair<int, int> &P1,
11514 const std::pair<int, int> &P2) {
11515 return P1.first > P2.first;
11516 };
11517 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
11518 decltype(OrdCompare)>
11519 Indices(OrdCompare);
11520 for (int I = 0, E = VL.size(); I < E; ++I) {
11521 unsigned Idx = *getElementIndex(VL[I]);
11522 Indices.emplace(Idx, I);
11523 }
11524 OrdersType CurrentOrder(VL.size(), VL.size());
11525 bool IsIdentity = true;
11526 for (int I = 0, E = VL.size(); I < E; ++I) {
11527 CurrentOrder[Indices.top().second] = I;
11528 IsIdentity &= Indices.top().second == I;
11529 Indices.pop();
11530 }
11531 if (IsIdentity)
11532 CurrentOrder.clear();
11533 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11534 {}, CurrentOrder);
11535 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
11536 TE->dump());
11537
11538 TE->setOperands(Operands);
11539 buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
11540 return;
11541 }
11542 case Instruction::Load: {
11543 // Check that a vectorized load would load the same memory as a scalar
11544 // load. For example, we don't want to vectorize loads that are smaller
11545    // than 8-bit. Given a packed struct {<i2, i2, i2, i2>}, LLVM treats
11546    // loading/storing it as an i8. If we vectorize loads/stores
11547 // from such a struct, we read/write packed bits disagreeing with the
11548 // unvectorized version.
11549 TreeEntry *TE = nullptr;
11550 fixupOrderingIndices(CurrentOrder);
11551 switch (State) {
11552 case TreeEntry::Vectorize:
11553 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11554 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
11555 if (CurrentOrder.empty())
11556 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
11557 TE->dump());
11558 else
11559        LLVM_DEBUG(dbgs()
11560 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
11561 TE->dump());
11562 break;
11563 case TreeEntry::CompressVectorize:
11564 // Vectorizing non-consecutive loads with (masked)load + compress.
11565 TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
11566 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
11567 LLVM_DEBUG(
11568 dbgs()
11569 << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
11570 TE->dump());
11571 break;
11572 case TreeEntry::StridedVectorize:
11573 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
11574 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
11575 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
11576 TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
11577 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
11578 TE->dump());
11579 break;
11580 case TreeEntry::ScatterVectorize:
11581 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
11582 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
11583 UserTreeIdx, ReuseShuffleIndices);
11584 LLVM_DEBUG(
11585 dbgs()
11586 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
11587 TE->dump());
11588 break;
11589 case TreeEntry::CombinedVectorize:
11590 case TreeEntry::SplitVectorize:
11591 case TreeEntry::NeedToGather:
11592 llvm_unreachable("Unexpected loads state.");
11593 }
11594 if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
11595 assert(Operands.size() == 1 && "Expected a single operand only");
11596 SmallVector<int> Mask;
11597 inversePermutation(CurrentOrder, Mask);
11598 reorderScalars(Operands.front(), Mask);
11599 }
11600 TE->setOperands(Operands);
11601 if (State == TreeEntry::ScatterVectorize)
11602 buildTreeRec(PointerOps, Depth + 1, {TE, 0});
11603 return;
11604 }
11605 case Instruction::ZExt:
11606 case Instruction::SExt:
11607 case Instruction::FPToUI:
11608 case Instruction::FPToSI:
11609 case Instruction::FPExt:
11610 case Instruction::PtrToInt:
11611 case Instruction::IntToPtr:
11612 case Instruction::SIToFP:
11613 case Instruction::UIToFP:
11614 case Instruction::Trunc:
11615 case Instruction::FPTrunc:
11616 case Instruction::BitCast: {
11617 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
11618 std::make_pair(std::numeric_limits<unsigned>::min(),
11619 std::numeric_limits<unsigned>::max()));
11620 if (ShuffleOrOp == Instruction::ZExt ||
11621 ShuffleOrOp == Instruction::SExt) {
11622 CastMaxMinBWSizes = std::make_pair(
11623 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
11624 PrevMaxBW),
11625 std::min<unsigned>(
11626 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
11627 PrevMinBW));
11628 } else if (ShuffleOrOp == Instruction::Trunc) {
11629 CastMaxMinBWSizes = std::make_pair(
11630 std::max<unsigned>(
11631 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
11632 PrevMaxBW),
11633 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
11634 PrevMinBW));
11635 }
11636 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11637 ReuseShuffleIndices);
11638 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
11639 TE->dump());
11640
11641 TE->setOperands(Operands);
11642 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11643 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11644 if (ShuffleOrOp == Instruction::Trunc) {
11645 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11646 } else if (ShuffleOrOp == Instruction::SIToFP ||
11647 ShuffleOrOp == Instruction::UIToFP) {
11648 unsigned NumSignBits =
11649 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
11650 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
11651 APInt Mask = DB->getDemandedBits(OpI);
11652 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
11653 }
11654 if (NumSignBits * 2 >=
11655 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
11656 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11657 }
11658 return;
11659 }
11660 case Instruction::ICmp:
11661 case Instruction::FCmp: {
11662 // Check that all of the compares have the same predicate.
11663 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
11664 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11665 ReuseShuffleIndices);
11666 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
11667 TE->dump());
11668
11669 VLOperands Ops(VL, Operands, S, *this);
11670 if (cast<CmpInst>(VL0)->isCommutative()) {
11671 // Commutative predicate - collect + sort operands of the instructions
11672 // so that each side is more likely to have the same opcode.
11674 "Commutative Predicate mismatch");
11675 Ops.reorder();
11676 Operands.front() = Ops.getVL(0);
11677 Operands.back() = Ops.getVL(1);
11678 } else {
11679 // Collect operands - commute if it uses the swapped predicate.
11680 for (auto [Idx, V] : enumerate(VL)) {
11681 if (isa<PoisonValue>(V))
11682 continue;
11683 auto *Cmp = cast<CmpInst>(V);
11684 if (Cmp->getPredicate() != P0)
11685 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11686 }
11687 }
11688 TE->setOperands(Operands);
11689 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
11690 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
11691 if (ShuffleOrOp == Instruction::ICmp) {
11692 unsigned NumSignBits0 =
11693 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
11694 if (NumSignBits0 * 2 >=
11695 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
11696 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11697 unsigned NumSignBits1 =
11698 ComputeNumSignBits(VL0->getOperand(1), *DL, AC, nullptr, DT);
11699 if (NumSignBits1 * 2 >=
11700 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
11701 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
11702 }
11703 return;
11704 }
11705 case Instruction::Select:
11706 case Instruction::FNeg:
11707 case Instruction::Add:
11708 case Instruction::FAdd:
11709 case Instruction::Sub:
11710 case Instruction::FSub:
11711 case Instruction::Mul:
11712 case Instruction::FMul:
11713 case Instruction::UDiv:
11714 case Instruction::SDiv:
11715 case Instruction::FDiv:
11716 case Instruction::URem:
11717 case Instruction::SRem:
11718 case Instruction::FRem:
11719 case Instruction::Shl:
11720 case Instruction::LShr:
11721 case Instruction::AShr:
11722 case Instruction::And:
11723 case Instruction::Or:
11724 case Instruction::Xor:
11725 case Instruction::Freeze: {
11726 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11727 ReuseShuffleIndices);
11728 LLVM_DEBUG(
11729 dbgs() << "SLP: added a new TreeEntry "
11730 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
11731 TE->dump());
11732
11733 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
11734 VLOperands Ops(VL, Operands, S, *this);
11735 Ops.reorder();
11736 Operands[0] = Ops.getVL(0);
11737 Operands[1] = Ops.getVL(1);
11738 }
11739 TE->setOperands(Operands);
11740 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11741 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11742 return;
11743 }
11744 case Instruction::GetElementPtr: {
11745 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11746 ReuseShuffleIndices);
11747 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
11748 TE->dump());
11749 TE->setOperands(Operands);
11750
11751 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
11752 buildTreeRec(Operands[I], Depth + 1, {TE, I});
11753 return;
11754 }
11755 case Instruction::Store: {
11756 bool Consecutive = CurrentOrder.empty();
11757 if (!Consecutive)
11758 fixupOrderingIndices(CurrentOrder);
11759 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11760 ReuseShuffleIndices, CurrentOrder);
11761 if (Consecutive)
11762 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
11763 TE->dump());
11764 else
11765 LLVM_DEBUG(
11766 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
11767 TE->dump());
11768 TE->setOperands(Operands);
11769 buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
11770 return;
11771 }
11772 case Instruction::Call: {
11773 // Check if the calls are all to the same vectorizable intrinsic or
11774 // library function.
11775 CallInst *CI = cast<CallInst>(VL0);
11777
11778 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11779 ReuseShuffleIndices);
11780 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
11781 TE->dump());
11782 if (isCommutative(VL0)) {
11783 VLOperands Ops(VL, Operands, S, *this);
11784 Ops.reorder();
11785 Operands[0] = Ops.getVL(0);
11786 Operands[1] = Ops.getVL(1);
11787 }
11788 TE->setOperands(Operands);
11789 for (unsigned I : seq<unsigned>(CI->arg_size())) {
11790      // For scalar operands there is no need to create an entry since there is
11791      // nothing to vectorize.
11793 continue;
11794 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11795 }
11796 return;
11797 }
11798 case Instruction::ShuffleVector: {
11799 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11800 ReuseShuffleIndices);
11801 if (S.isAltShuffle()) {
11802 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
11803 TE->dump());
11804 } else {
11805 assert(SLPReVec && "Only supported by REVEC.");
11806 LLVM_DEBUG(
11807 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
11808 TE->dump());
11809 }
11810
11811 // Reorder operands if reordering would enable vectorization.
11812 auto *CI = dyn_cast<CmpInst>(VL0);
11813 if (CI && any_of(VL, [](Value *V) {
11814 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
11815 })) {
11816 auto *MainCI = cast<CmpInst>(S.getMainOp());
11817 auto *AltCI = cast<CmpInst>(S.getAltOp());
11818 CmpInst::Predicate MainP = MainCI->getPredicate();
11819 CmpInst::Predicate AltP = AltCI->getPredicate();
11820 assert(MainP != AltP &&
11821 "Expected different main/alternate predicates.");
11822 // Collect operands - commute if it uses the swapped predicate or
11823 // alternate operation.
11824 for (auto [Idx, V] : enumerate(VL)) {
11825 if (isa<PoisonValue>(V))
11826 continue;
11827 auto *Cmp = cast<CmpInst>(V);
11828
11829 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
11830 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
11831 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11832 } else {
11833 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
11834 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11835 }
11836 }
11837 TE->setOperands(Operands);
11838 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
11839 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
11840 return;
11841 }
11842
11843 if (isa<BinaryOperator>(VL0) || CI) {
11844 VLOperands Ops(VL, Operands, S, *this);
11845 Ops.reorder();
11846 Operands[0] = Ops.getVL(0);
11847 Operands[1] = Ops.getVL(1);
11848 }
11849 TE->setOperands(Operands);
11850 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11851 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11852 return;
11853 }
11854 default:
11855 break;
11856 }
11857 llvm_unreachable("Unexpected vectorization of the instructions.");
11858}
11859
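// canMapToVector() maps an aggregate type to the number of scalar elements it
// can be treated as for vectorization, e.g. [4 x i32] maps to 4 and a
// homogeneous struct {i32, i32} maps to 2, provided the widened type matches
// the aggregate's store size and fits into the [MinVecRegSize, MaxVecRegSize]
// range; otherwise 0 is returned.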
11861 unsigned N = 1;
11862 Type *EltTy = T;
11863
11865 if (EltTy->isEmptyTy())
11866 return 0;
11867 if (auto *ST = dyn_cast<StructType>(EltTy)) {
11868 // Check that struct is homogeneous.
11869 for (const auto *Ty : ST->elements())
11870 if (Ty != *ST->element_begin())
11871 return 0;
11872 N *= ST->getNumElements();
11873 EltTy = *ST->element_begin();
11874 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
11875 N *= AT->getNumElements();
11876 EltTy = AT->getElementType();
11877 } else {
11878 auto *VT = cast<FixedVectorType>(EltTy);
11879 N *= VT->getNumElements();
11880 EltTy = VT->getElementType();
11881 }
11882 }
11883
11884 if (!isValidElementType(EltTy))
11885 return 0;
11886 size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
11887 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
11888 VTSize != DL->getTypeStoreSizeInBits(T))
11889 return 0;
11890 return N;
11891}
11892
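// canReuseExtract() checks that all extracts in VL read from a single source
// vector/aggregate and computes the lane order. For example, extracts with
// indices {1, 0, 3, 2} from a 4-element vector produce
// CurrentOrder = {1, 0, 3, 2} and the function returns false, while indices
// {0, 1, 2, 3} form an identity order, so CurrentOrder is cleared and the
// function returns true.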
11893bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
11894 SmallVectorImpl<unsigned> &CurrentOrder,
11895 bool ResizeAllowed) const {
11896  const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
11897 assert(It != VL.end() && "Expected at least one extract instruction.");
11898 auto *E0 = cast<Instruction>(*It);
11899 assert(
11901 "Invalid opcode");
11902 // Check if all of the extracts come from the same vector and from the
11903 // correct offset.
11904 Value *Vec = E0->getOperand(0);
11905
11906 CurrentOrder.clear();
11907
11908 // We have to extract from a vector/aggregate with the same number of elements.
11909 unsigned NElts;
11910 if (E0->getOpcode() == Instruction::ExtractValue) {
11911 NElts = canMapToVector(Vec->getType());
11912 if (!NElts)
11913 return false;
11914 // Check if load can be rewritten as load of vector.
11915 LoadInst *LI = dyn_cast<LoadInst>(Vec);
11916 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
11917 return false;
11918 } else {
11919 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
11920 }
11921
11922 unsigned E = VL.size();
11923 if (!ResizeAllowed && NElts != E)
11924 return false;
11925  SmallVector<int> Indices(E, PoisonMaskElem);
11926 unsigned MinIdx = NElts, MaxIdx = 0;
11927 for (auto [I, V] : enumerate(VL)) {
11928 auto *Inst = dyn_cast<Instruction>(V);
11929 if (!Inst)
11930 continue;
11931 if (Inst->getOperand(0) != Vec)
11932 return false;
11933 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
11934 if (isa<UndefValue>(EE->getIndexOperand()))
11935 continue;
11936 std::optional<unsigned> Idx = getExtractIndex(Inst);
11937 if (!Idx)
11938 return false;
11939 const unsigned ExtIdx = *Idx;
11940 if (ExtIdx >= NElts)
11941 continue;
11942 Indices[I] = ExtIdx;
11943 if (MinIdx > ExtIdx)
11944 MinIdx = ExtIdx;
11945 if (MaxIdx < ExtIdx)
11946 MaxIdx = ExtIdx;
11947 }
11948 if (MaxIdx - MinIdx + 1 > E)
11949 return false;
11950 if (MaxIdx + 1 <= E)
11951 MinIdx = 0;
11952
11953 // Check that all of the indices extract from the correct offset.
11954 bool ShouldKeepOrder = true;
11955  // Assign to all items the initial value E so we can check if the extract
11956  // instruction index was used already.
11957  // Also, later we can check that all the indices are used and we have a
11958  // consecutive access in the extract instructions, by checking that no
11959  // element of CurrentOrder still has value E.
11960 CurrentOrder.assign(E, E);
11961 for (unsigned I = 0; I < E; ++I) {
11962 if (Indices[I] == PoisonMaskElem)
11963 continue;
11964 const unsigned ExtIdx = Indices[I] - MinIdx;
11965 if (CurrentOrder[ExtIdx] != E) {
11966 CurrentOrder.clear();
11967 return false;
11968 }
11969 ShouldKeepOrder &= ExtIdx == I;
11970 CurrentOrder[ExtIdx] = I;
11971 }
11972 if (ShouldKeepOrder)
11973 CurrentOrder.clear();
11974
11975 return ShouldKeepOrder;
11976}
11977
11978bool BoUpSLP::areAllUsersVectorized(
11979 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
11980 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
11981 all_of(I->users(), [this](User *U) {
11982 return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
11983 (isa<ExtractElementInst>(U) && MustGather.contains(U));
11984 });
11985}
11986
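// buildAltOpShuffleMask() builds the blend mask that selects main-opcode lanes
// from the first vector and alternate-opcode lanes from the second one. For
// example, for Scalars = {a0 + b0, a1 - b1, a2 + b2, a3 - b3} with IsAltOp
// matching the subtractions (and no reorder/reuse indices), the resulting
// Mask is {0, 5, 2, 7}.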
11987void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
11988 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
11989 SmallVectorImpl<Value *> *OpScalars,
11990 SmallVectorImpl<Value *> *AltScalars) const {
11991 unsigned Sz = Scalars.size();
11992 Mask.assign(Sz, PoisonMaskElem);
11993 SmallVector<int> OrderMask;
11994 if (!ReorderIndices.empty())
11995 inversePermutation(ReorderIndices, OrderMask);
11996 for (unsigned I = 0; I < Sz; ++I) {
11997 unsigned Idx = I;
11998 if (!ReorderIndices.empty())
11999 Idx = OrderMask[I];
12000 if (isa<PoisonValue>(Scalars[Idx]))
12001 continue;
12002 auto *OpInst = cast<Instruction>(Scalars[Idx]);
12003 if (IsAltOp(OpInst)) {
12004 Mask[I] = Sz + Idx;
12005 if (AltScalars)
12006 AltScalars->push_back(OpInst);
12007 } else {
12008 Mask[I] = Idx;
12009 if (OpScalars)
12010 OpScalars->push_back(OpInst);
12011 }
12012 }
12013 if (!ReuseShuffleIndices.empty()) {
12014 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
12015 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
12016 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
12017 });
12018 Mask.swap(NewMask);
12019 }
12020}
12021
12023 Instruction *AltOp,
12024 const TargetLibraryInfo &TLI) {
12025 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp;
12026}
12027
12029 Instruction *AltOp,
12030 const TargetLibraryInfo &TLI) {
12031 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
12032 auto *AltCI = cast<CmpInst>(AltOp);
12033 CmpInst::Predicate MainP = MainCI->getPredicate();
12034 [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
12035 assert(MainP != AltP && "Expected different main/alternate predicates.");
12036 auto *CI = cast<CmpInst>(I);
12037 if (isCmpSameOrSwapped(MainCI, CI, TLI))
12038 return false;
12039 if (isCmpSameOrSwapped(AltCI, CI, TLI))
12040 return true;
12041 CmpInst::Predicate P = CI->getPredicate();
12042    CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
12043
12044 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
12045 "CmpInst expected to match either main or alternate predicate or "
12046 "their swap.");
12047 return MainP != P && MainP != SwappedP;
12048 }
12049 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
12050}
12051
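// getOperandInfo() summarizes an operand bundle for the cost model. For
// example, {4, 4, 4, 4} is classified as a uniform constant with the
// power-of-2 property, while {x, x, x, x} for a non-constant x is a uniform
// non-constant value.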
12052TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
12053 assert(!Ops.empty());
12054 const auto *Op0 = Ops.front();
12055
12056 const bool IsConstant = all_of(Ops, [](Value *V) {
12057 // TODO: We should allow undef elements here
12058 return isConstant(V) && !isa<UndefValue>(V);
12059 });
12060 const bool IsUniform = all_of(Ops, [=](Value *V) {
12061 // TODO: We should allow undef elements here
12062 return V == Op0;
12063 });
12064 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
12065 // TODO: We should allow undef elements here
12066 if (auto *CI = dyn_cast<ConstantInt>(V))
12067 return CI->getValue().isPowerOf2();
12068 return false;
12069 });
12070 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
12071 // TODO: We should allow undef elements here
12072 if (auto *CI = dyn_cast<ConstantInt>(V))
12073 return CI->getValue().isNegatedPowerOf2();
12074 return false;
12075 });
12076
12077 TTI::OperandValueKind VK = TTI::OK_AnyValue;
12078 if (IsConstant && IsUniform)
12079 VK = TTI::OK_UniformConstantValue;
12080 else if (IsConstant)
12081 VK = TTI::OK_NonUniformConstantValue;
12082 else if (IsUniform)
12083 VK = TTI::OK_UniformValue;
12084
12085 TTI::OperandValueProperties VP = TTI::OP_None;
12086 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
12087 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
12088
12089 return {VK, VP};
12090}
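// Example (illustrative): for Ops = {i32 8, i32 8, i32 8, i32 8} every check
// above except the negated-power-of-2 one succeeds, so the returned operand
// info is {TTI::OK_UniformConstantValue, TTI::OP_PowerOf2}; a constant but
// non-uniform, non-power-of-2 set such as {i32 3, i32 5} yields
// {TTI::OK_NonUniformConstantValue, TTI::OP_None}.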
12091
12092namespace {
12093/// The base class for shuffle instruction emission and shuffle cost estimation.
12094class BaseShuffleAnalysis {
12095protected:
12096 Type *ScalarTy = nullptr;
12097
12098 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
12099
12100 /// V is expected to be a vectorized value.
12101 /// When REVEC is disabled, there is no difference between VF and
12102 /// VNumElements.
12103 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
12104 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
12105 /// of 8.
12106 unsigned getVF(Value *V) const {
12107 assert(V && "V cannot be nullptr");
12108 assert(isa<FixedVectorType>(V->getType()) &&
12109 "V does not have FixedVectorType");
12110 assert(ScalarTy && "ScalarTy cannot be nullptr");
12111 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12112 unsigned VNumElements =
12113 cast<FixedVectorType>(V->getType())->getNumElements();
12114 assert(VNumElements > ScalarTyNumElements &&
12115 "the number of elements of V is not large enough");
12116 assert(VNumElements % ScalarTyNumElements == 0 &&
12117 "the number of elements of V is not a vectorized value");
12118 return VNumElements / ScalarTyNumElements;
12119 }
12120
12121 /// Checks if the mask is an identity mask.
12122 /// \param IsStrict if true, the function returns false if the mask size does
12123 /// not match the vector size.
12124 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
12125 bool IsStrict) {
12126 int Limit = Mask.size();
12127 int VF = VecTy->getNumElements();
12128 int Index = -1;
12129 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
12130 return true;
12131 if (!IsStrict) {
12132 // Consider extract subvector starting from index 0.
12133 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
12134 Index == 0)
12135 return true;
12136 // All VF-size submasks are identity (e.g.
12137 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
12138 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
12139 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
12140 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
12141 ShuffleVectorInst::isIdentityMask(Slice, VF);
12142 }))
12143 return true;
12144 }
12145 return false;
12146 }
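// Example (illustrative): against a <4 x i32> vector, Mask = <0, 1, 2, 3> is
// an identity mask in both modes, while Mask = <0, 1> is rejected when
// IsStrict is true (mask size does not match the vector size) but accepted
// when IsStrict is false, because it is an extract-subvector mask starting at
// index 0.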
12147
12148 /// Tries to combine 2 different masks into single one.
12149 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
12150 /// change the size of the vector, \p LocalVF is the original size of the
12151 /// shuffled vector.
12152 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
12153 ArrayRef<int> ExtMask) {
12154 unsigned VF = Mask.size();
12155 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
12156 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
12157 if (ExtMask[I] == PoisonMaskElem)
12158 continue;
12159 int MaskedIdx = Mask[ExtMask[I] % VF];
12160 NewMask[I] =
12161 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
12162 }
12163 Mask.swap(NewMask);
12164 }
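// Example (illustrative): with LocalVF = 2, Mask = <1, 0> (an earlier reverse
// shuffle of a 2-element vector) and ExtMask = <1, 0>, the combined mask is
// <Mask[1], Mask[0]> = <0, 1>, i.e. the two reverses fold into an identity
// permutation of the original operand.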
12165
12166 /// Looks through shuffles trying to reduce final number of shuffles in the
12167 /// code. The function looks through the previously emitted shuffle
12168 /// instructions and properly marks indices in the mask as undef.
12169 /// For example, given the code
12170 /// \code
12171 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
12172 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
12173 /// \endcode
12174 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
12175 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12176 /// <0, 1, 2, 3> for the shuffle.
12177 /// If 2 operands are of different size, the smallest one will be resized and
12178 /// the mask recalculated properly.
12179 /// For example, given the code
12180 /// \code
12181 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
12182 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
12183 /// \endcode
12184 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
12185 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12186 /// <0, 1, 2, 3> for the shuffle.
12187 /// So, it tries to transform permutations to simple vector merge, if
12188 /// possible.
12189 /// \param V The input vector which must be shuffled using the given \p Mask.
12190 /// If the better candidate is found, \p V is set to this best candidate
12191 /// vector.
12192 /// \param Mask The input mask for the shuffle. If the best candidate is found
12193 /// during looking-through-shuffles attempt, it is updated accordingly.
12194 /// \param SinglePermute true if the shuffle operation is originally a
12195 /// single-value-permutation. In this case the look-through-shuffles procedure
12196 /// may look for resizing shuffles as the best candidates.
12197 /// \return true if the shuffle results in the non-resizing identity shuffle
12198 /// (and thus can be ignored), false - otherwise.
12199 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
12200 bool SinglePermute) {
12201 Value *Op = V;
12202 ShuffleVectorInst *IdentityOp = nullptr;
12203 SmallVector<int> IdentityMask;
12204 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
12205 // Exit if not a fixed vector type or changing size shuffle.
12206 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
12207 if (!SVTy)
12208 break;
12209 // Remember the identity or broadcast mask, if it is not a resizing
12210 // shuffle. If no better candidates are found, this Op and Mask will be
12211 // used in the final shuffle.
12212 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
12213 if (!IdentityOp || !SinglePermute ||
12214 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
12215 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
12216 IdentityMask.size()))) {
12217 IdentityOp = SV;
12218 // Store the current mask in IdentityMask so that we do not lose this
12219 // info later if IdentityOp is selected as the best candidate for the
12220 // permutation.
12221 IdentityMask.assign(Mask);
12222 }
12223 }
12224 // Remember the broadcast mask. If no better candidates are found, this Op
12225 // and Mask will be used in the final shuffle.
12226 // Zero splat can be used as identity too, since it might be used with
12227 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
12228 // E.g., if we need to shuffle the vector with the mask <3, 1, 2, 0>, which is
12229 // expensive, and the analysis finds out that the source vector is just a
12230 // broadcast, this original mask can be transformed to the identity mask <0,
12231 // 1, 2, 3>.
12232 // \code
12233 // %0 = shuffle %v, poison, zeroinitializer
12234 // %res = shuffle %0, poison, <3, 1, 2, 0>
12235 // \endcode
12236 // may be transformed to
12237 // \code
12238 // %0 = shuffle %v, poison, zeroinitializer
12239 // %res = shuffle %0, poison, <0, 1, 2, 3>
12240 // \endcode
12241 if (SV->isZeroEltSplat()) {
12242 IdentityOp = SV;
12243 IdentityMask.assign(Mask);
12244 }
12245 int LocalVF = Mask.size();
12246 if (auto *SVOpTy =
12247 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
12248 LocalVF = SVOpTy->getNumElements();
12249 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
12250 for (auto [Idx, I] : enumerate(Mask)) {
12251 if (I == PoisonMaskElem ||
12252 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
12253 continue;
12254 ExtMask[Idx] = SV->getMaskValue(I);
12255 }
12256 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
12257 SV->getOperand(0),
12258 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
12259 .all();
12260 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
12261 SV->getOperand(1),
12262 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
12263 .all();
12264 if (!IsOp1Undef && !IsOp2Undef) {
12265 // Update mask and mark undef elems.
12266 for (int &I : Mask) {
12267 if (I == PoisonMaskElem)
12268 continue;
12269 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
12270 PoisonMaskElem)
12271 I = PoisonMaskElem;
12272 }
12273 break;
12274 }
12275 SmallVector<int> ShuffleMask(SV->getShuffleMask());
12276 combineMasks(LocalVF, ShuffleMask, Mask);
12277 Mask.swap(ShuffleMask);
12278 if (IsOp2Undef)
12279 Op = SV->getOperand(0);
12280 else
12281 Op = SV->getOperand(1);
12282 }
12283 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
12284 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
12285 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
12286 if (IdentityOp) {
12287 V = IdentityOp;
12288 assert(Mask.size() == IdentityMask.size() &&
12289 "Expected masks of same sizes.");
12290 // Clear known poison elements.
12291 for (auto [I, Idx] : enumerate(Mask))
12292 if (Idx == PoisonMaskElem)
12293 IdentityMask[I] = PoisonMaskElem;
12294 Mask.swap(IdentityMask);
12295 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
12296 return SinglePermute &&
12297 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
12298 /*IsStrict=*/true) ||
12299 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
12300 Shuffle->isZeroEltSplat() &&
12302 all_of(enumerate(Mask), [&](const auto &P) {
12303 return P.value() == PoisonMaskElem ||
12304 Shuffle->getShuffleMask()[P.index()] == 0;
12305 })));
12306 }
12307 V = Op;
12308 return false;
12309 }
12310 V = Op;
12311 return true;
12312 }
12313
12314 /// Smart shuffle instruction emission, walks through shuffles trees and
12315 /// tries to find the best matching vector for the actual shuffle
12316 /// instruction.
12317 template <typename T, typename ShuffleBuilderTy>
12318 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
12319 ShuffleBuilderTy &Builder, Type *ScalarTy) {
12320 assert(V1 && "Expected at least one vector value.");
12321 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12322 SmallVector<int> NewMask(Mask);
12323 if (ScalarTyNumElements != 1) {
12324 assert(SLPReVec && "FixedVectorType is not expected.");
12325 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewMask);
12326 Mask = NewMask;
12327 }
12328 if (V2)
12329 Builder.resizeToMatch(V1, V2);
12330 int VF = Mask.size();
12331 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
12332 VF = FTy->getNumElements();
12333 if (V2 && !isUndefVector</*isPoisonOnly=*/true>(
12334 V2, buildUseMask(VF, Mask, UseMask::SecondArg))
12335 .all()) {
12336 // Peek through shuffles.
12337 Value *Op1 = V1;
12338 Value *Op2 = V2;
12339 int VF =
12340 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
12341 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
12342 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
12343 for (int I = 0, E = Mask.size(); I < E; ++I) {
12344 if (Mask[I] < VF)
12345 CombinedMask1[I] = Mask[I];
12346 else
12347 CombinedMask2[I] = Mask[I] - VF;
12348 }
12349 Value *PrevOp1;
12350 Value *PrevOp2;
12351 do {
12352 PrevOp1 = Op1;
12353 PrevOp2 = Op2;
12354 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
12355 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
12356 // Check if we have 2 resizing shuffles - need to peek through operands
12357 // again.
12358 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
12359 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
12360 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
12361 for (auto [Idx, I] : enumerate(CombinedMask1)) {
12362 if (I == PoisonMaskElem)
12363 continue;
12364 ExtMask1[Idx] = SV1->getMaskValue(I);
12365 }
12366 SmallBitVector UseMask1 = buildUseMask(
12367 cast<FixedVectorType>(SV1->getOperand(1)->getType())
12368 ->getNumElements(),
12369 ExtMask1, UseMask::SecondArg);
12370 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
12371 for (auto [Idx, I] : enumerate(CombinedMask2)) {
12372 if (I == PoisonMaskElem)
12373 continue;
12374 ExtMask2[Idx] = SV2->getMaskValue(I);
12375 }
12376 SmallBitVector UseMask2 = buildUseMask(
12377 cast<FixedVectorType>(SV2->getOperand(1)->getType())
12378 ->getNumElements(),
12379 ExtMask2, UseMask::SecondArg);
12380 if (SV1->getOperand(0)->getType() ==
12381 SV2->getOperand(0)->getType() &&
12382 SV1->getOperand(0)->getType() != SV1->getType() &&
12383 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
12384 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
12385 Op1 = SV1->getOperand(0);
12386 Op2 = SV2->getOperand(0);
12387 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
12388 int LocalVF = ShuffleMask1.size();
12389 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
12390 LocalVF = FTy->getNumElements();
12391 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
12392 CombinedMask1.swap(ShuffleMask1);
12393 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
12394 LocalVF = ShuffleMask2.size();
12395 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
12396 LocalVF = FTy->getNumElements();
12397 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
12398 CombinedMask2.swap(ShuffleMask2);
12399 }
12400 }
12401 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
12402 Builder.resizeToMatch(Op1, Op2);
12403 VF = std::max(cast<VectorType>(Op1->getType())
12404 ->getElementCount()
12405 .getKnownMinValue(),
12406 cast<VectorType>(Op2->getType())
12407 ->getElementCount()
12408 .getKnownMinValue());
12409 for (int I = 0, E = Mask.size(); I < E; ++I) {
12410 if (CombinedMask2[I] != PoisonMaskElem) {
12411 assert(CombinedMask1[I] == PoisonMaskElem &&
12412 "Expected undefined mask element");
12413 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
12414 }
12415 }
12416 if (Op1 == Op2 &&
12417 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
12418 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
12419 isa<ShuffleVectorInst>(Op1) &&
12420 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
12421 ArrayRef(CombinedMask1))))
12422 return Builder.createIdentity(Op1);
12423 return Builder.createShuffleVector(
12424 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
12425 CombinedMask1);
12426 }
12427 if (isa<PoisonValue>(V1))
12428 return Builder.createPoison(
12429 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
12430 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
12431 assert(V1 && "Expected non-null value after looking through shuffles.");
12432
12433 if (!IsIdentity)
12434 return Builder.createShuffleVector(V1, NewMask);
12435 return Builder.createIdentity(V1);
12436 }
12437
12438 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
12439 /// shuffle emission.
12440 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
12441 ArrayRef<int> Mask) {
12442 for (unsigned I : seq<unsigned>(CommonMask.size()))
12443 if (Mask[I] != PoisonMaskElem)
12444 CommonMask[I] = I;
12445 }
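// Example (illustrative): after a shuffle has been emitted for
// Mask = <5, poison, 7, poison>, the lanes it produced must be addressed by
// their position in the freshly emitted vector, so CommonMask becomes
// <0, CommonMask[1], 2, CommonMask[3]>.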
12446};
12447} // namespace
12448
12449/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
12450static std::pair<InstructionCost, InstructionCost>
12451getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
12452 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
12453 Type *ScalarTy, VectorType *VecTy) {
12454 InstructionCost ScalarCost = 0;
12455 InstructionCost VecCost = 0;
12456 // Here we differentiate two cases: (1) when Ptrs represent a regular
12457 // vectorization tree node (as they are pointer arguments of scattered
12458 // loads) or (2) when Ptrs are the arguments of loads or stores being
12459 // vectorized as a plain wide unit-stride load/store since all the
12460 // loads/stores are known to be from/to adjacent locations.
12461 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
12462 // Case 2: estimate costs for pointer related costs when vectorizing to
12463 // a wide load/store.
12464 // Scalar cost is estimated as a set of pointers with known relationship
12465 // between them.
12466 // For vector code we will use BasePtr as argument for the wide load/store
12467 // but we also need to account all the instructions which are going to
12468 // stay in vectorized code due to uses outside of these scalar
12469 // loads/stores.
12470 ScalarCost = TTI.getPointersChainCost(
12471 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
12472 CostKind);
12473
12474 SmallVector<const Value *> PtrsRetainedInVecCode;
12475 for (Value *V : Ptrs) {
12476 if (V == BasePtr) {
12477 PtrsRetainedInVecCode.push_back(V);
12478 continue;
12479 }
12480 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
12481 // For simplicity, assume Ptr stays in vectorized code if it's not a
12482 // GEP instruction. We don't care since its cost is considered free.
12483 // TODO: We should check for any uses outside of vectorizable tree
12484 // rather than just single use.
12485 if (!Ptr || !Ptr->hasOneUse())
12486 PtrsRetainedInVecCode.push_back(V);
12487 }
12488
12489 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
12490 // If all pointers stay in vectorized code then we don't have
12491 // any savings on that.
12492 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
12493 }
12494 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
12495 TTI::PointersChainInfo::getKnownStride(),
12496 VecTy, CostKind);
12497 } else {
12498 // Case 1: Ptrs are the arguments of loads that we are going to transform
12499 // into masked gather load intrinsic.
12500 // All the scalar GEPs will be removed as a result of vectorization.
12502 // For any external uses of some lanes, extractelement instructions will
12503 // be generated (their cost is estimated separately).
12503 TTI::PointersChainInfo PtrsInfo =
12504 all_of(Ptrs,
12505 [](const Value *V) {
12506 const auto *Ptr = dyn_cast<GetElementPtrInst>(V);
12507 return Ptr && !Ptr->hasAllConstantIndices();
12508 })
12509 ? TTI::PointersChainInfo::getUnknownStride()
12510 : TTI::PointersChainInfo::getKnownStride();
12511
12512 ScalarCost =
12513 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
12514 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
12515 if (!BaseGEP) {
12516 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
12517 if (It != Ptrs.end())
12518 BaseGEP = cast<GEPOperator>(*It);
12519 }
12520 if (BaseGEP) {
12521 SmallVector<const Value *> Indices(BaseGEP->indices());
12522 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
12523 BaseGEP->getPointerOperand(), Indices, VecTy,
12524 CostKind);
12525 }
12526 }
12527
12528 return std::make_pair(ScalarCost, VecCost);
12529}
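// Example (illustrative): when four adjacent loads off the same base pointer
// are vectorized into one wide load, the scalar cost covers the whole
// unit-stride pointer chain, while the vector cost only covers the pointers
// that must stay around (BasePtr and any GEP with external uses); if every
// pointer is retained, both costs are reported as TCC_Free since nothing is
// saved.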
12530
12531void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
12532 assert(TE.isGather() && TE.ReorderIndices.empty() &&
12533 "Expected gather node without reordering.");
12534 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
12535 SmallSet<size_t, 2> LoadKeyUsed;
12536
12537 // Do not reorder the node if it is small (just 2 elements), all-constant, or
12538 // if all instructions already have the same opcode.
12539 if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
12540 all_of(TE.Scalars, isConstant))
12541 return;
12542
12543 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
12544 return VectorizableTree[Idx]->isSame(TE.Scalars);
12545 }))
12546 return;
12547
12548 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
12549 Key = hash_combine(hash_value(LI->getParent()), Key);
12550 Value *Ptr =
12551 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
12552 if (LoadKeyUsed.contains(Key)) {
12553 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
12554 if (LIt != LoadsMap.end()) {
12555 for (LoadInst *RLI : LIt->second) {
12556 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
12557 LI->getType(), LI->getPointerOperand(), *DL, *SE,
12558 /*StrictCheck=*/true))
12559 return hash_value(RLI->getPointerOperand());
12560 }
12561 for (LoadInst *RLI : LIt->second) {
12562 if (arePointersCompatible(RLI->getPointerOperand(),
12563 LI->getPointerOperand(), *TLI)) {
12564 hash_code SubKey = hash_value(RLI->getPointerOperand());
12565 return SubKey;
12566 }
12567 }
12568 if (LIt->second.size() > 2) {
12569 hash_code SubKey =
12570 hash_value(LIt->second.back()->getPointerOperand());
12571 return SubKey;
12572 }
12573 }
12574 }
12575 LoadKeyUsed.insert(Key);
12576 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
12577 return hash_value(LI->getPointerOperand());
12578 };
12579 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
12580 SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
12581 bool IsOrdered = true;
12582 unsigned NumInstructions = 0;
12583 // Try to "cluster" scalar instructions, to be able to build extra vectorized
12584 // nodes.
12585 for (auto [I, V] : enumerate(TE.Scalars)) {
12586 size_t Key = 1, Idx = 1;
12587 if (auto *Inst = dyn_cast<Instruction>(V);
12588 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
12589 !isDeleted(Inst) && !isVectorized(V)) {
12590 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
12591 /*AllowAlternate=*/false);
12592 ++NumInstructions;
12593 }
12594 auto &Container = SortedValues[Key];
12595 if (IsOrdered && !KeyToIndex.contains(V) &&
12596 !(isa<Constant, ExtractElementInst>(V) ||
12597 isVectorLikeInstWithConstOps(V)) &&
12598 ((Container.contains(Idx) &&
12599 KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
12600 (!Container.empty() && !Container.contains(Idx) &&
12601 KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
12602 IsOrdered = false;
12603 auto &KTI = KeyToIndex[V];
12604 if (KTI.empty())
12605 Container[Idx].push_back(V);
12606 KTI.push_back(I);
12607 }
12608 SmallVector<std::pair<unsigned, unsigned>> SubVectors;
12609 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
12610 if (!IsOrdered && NumInstructions > 1) {
12611 unsigned Cnt = 0;
12612 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
12613 for (const auto &D : SortedValues) {
12614 for (const auto &P : D.second) {
12615 unsigned Sz = 0;
12616 for (Value *V : P.second) {
12617 ArrayRef<unsigned> Indices = KeyToIndex.at(V);
12618 for (auto [K, Idx] : enumerate(Indices)) {
12619 TE.ReorderIndices[Cnt + K] = Idx;
12620 TE.Scalars[Cnt + K] = V;
12621 }
12622 Sz += Indices.size();
12623 Cnt += Indices.size();
12624 }
12625 if (Sz > 1 && isa<Instruction>(P.second.front())) {
12626 const unsigned SubVF = getFloorFullVectorNumberOfElements(
12627 *TTI, TE.Scalars.front()->getType(), Sz);
12628 SubVectors.emplace_back(Cnt - Sz, SubVF);
12629 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
12630 DemandedElts.clearBit(I);
12631 } else if (!P.second.empty() && isConstant(P.second.front())) {
12632 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
12633 DemandedElts.clearBit(I);
12634 }
12635 }
12636 }
12637 }
12638 // Reuses always require shuffles, so consider it as profitable.
12639 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
12640 return;
12641 // Do simple cost estimation.
12642 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12643 InstructionCost Cost = 0;
12644 auto *ScalarTy = TE.Scalars.front()->getType();
12645 auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
12646 for (auto [Idx, Sz] : SubVectors) {
12647 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
12648 Idx, getWidenedType(ScalarTy, Sz));
12649 }
12650 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
12651 /*Insert=*/true,
12652 /*Extract=*/false, CostKind);
12653 int Sz = TE.Scalars.size();
12654 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
12655 TE.ReorderIndices.end());
12656 for (unsigned I : seq<unsigned>(Sz)) {
12657 Value *V = TE.getOrdered(I);
12658 if (isa<PoisonValue>(V)) {
12659 ReorderMask[I] = PoisonMaskElem;
12660 } else if (isConstant(V) || DemandedElts[I]) {
12661 ReorderMask[I] = I + TE.ReorderIndices.size();
12662 }
12663 }
12664 Cost += ::getShuffleCost(*TTI,
12665 any_of(ReorderMask, [&](int I) { return I >= Sz; })
12666 ? TTI::SK_PermuteTwoSrc
12667 : TTI::SK_PermuteSingleSrc,
12668 VecTy, ReorderMask);
12669 DemandedElts = APInt::getAllOnes(TE.Scalars.size());
12670 ReorderMask.assign(Sz, PoisonMaskElem);
12671 for (unsigned I : seq<unsigned>(Sz)) {
12672 Value *V = TE.getOrdered(I);
12673 if (isConstant(V)) {
12674 DemandedElts.clearBit(I);
12675 if (!isa<PoisonValue>(V))
12676 ReorderMask[I] = I;
12677 } else {
12678 ReorderMask[I] = I + Sz;
12679 }
12680 }
12681 InstructionCost BVCost =
12682 getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
12683 /*Insert=*/true, /*Extract=*/false, CostKind);
12684 if (!DemandedElts.isAllOnes())
12685 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
12686 if (Cost >= BVCost) {
12687 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
12688 reorderScalars(TE.Scalars, Mask);
12689 TE.ReorderIndices.clear();
12690 }
12691}
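// Example (illustrative): a gather node {load %a, 7, load %b, 3} can be
// clustered as {load %a, load %b, 7, 3}, with ReorderIndices remembering the
// original lane positions; the adjacent loads may then be built as a
// subvector and the constants as cheap insertions, and the clustering is kept
// only if its estimated build cost beats building the node in the original
// order.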
12692
12693/// Check if we can convert fadd/fsub sequence to FMAD.
12694/// \returns Cost of the FMAD, if conversion is possible, invalid cost otherwise.
12695static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
12696 const InstructionsState &S,
12697 DominatorTree &DT, const DataLayout &DL,
12698 const TargetTransformInfo &TTI,
12699 const TargetLibraryInfo &TLI) {
12700 assert(all_of(VL,
12701 [](Value *V) {
12702 return V->getType()->getScalarType()->isFloatingPointTy();
12703 }) &&
12704 "Can only convert to FMA for floating point types");
12705 assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
12706
12707 auto CheckForContractable = [&](ArrayRef<Value *> VL) {
12708 FastMathFlags FMF;
12709 FMF.set();
12710 for (Value *V : VL) {
12711 auto *I = dyn_cast<Instruction>(V);
12712 if (!I)
12713 continue;
12714 if (S.isCopyableElement(I))
12715 continue;
12716 Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
12717 if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
12718 continue;
12719 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12720 FMF &= FPCI->getFastMathFlags();
12721 }
12722 return FMF.allowContract();
12723 };
12724 if (!CheckForContractable(VL))
12725 return InstructionCost::getInvalid();
12726 // fmul also should be contractable
12727 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
12728 SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);
12729
12730 InstructionsState OpS = getSameOpcode(Operands.front(), TLI);
12731 if (!OpS.valid())
12732 return InstructionCost::getInvalid();
12733
12734 if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
12735 return InstructionCost::getInvalid();
12736 if (!CheckForContractable(Operands.front()))
12737 return InstructionCost::getInvalid();
12738 // Compare the costs.
12739 InstructionCost FMulPlusFAddCost = 0;
12740 InstructionCost FMACost = 0;
12741 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12742 FastMathFlags FMF;
12743 FMF.set();
12744 for (Value *V : VL) {
12745 auto *I = dyn_cast<Instruction>(V);
12746 if (!I)
12747 continue;
12748 if (!S.isCopyableElement(I))
12749 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12750 FMF &= FPCI->getFastMathFlags();
12751 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
12752 }
12753 unsigned NumOps = 0;
12754 for (auto [V, Op] : zip(VL, Operands.front())) {
12755 if (S.isCopyableElement(V))
12756 continue;
12757 auto *I = dyn_cast<Instruction>(Op);
12758 if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
12759 if (auto *OpI = dyn_cast<Instruction>(V))
12760 FMACost += TTI.getInstructionCost(OpI, CostKind);
12761 if (I)
12762 FMACost += TTI.getInstructionCost(I, CostKind);
12763 continue;
12764 }
12765 ++NumOps;
12766 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12767 FMF &= FPCI->getFastMathFlags();
12768 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
12769 }
12770 Type *Ty = VL.front()->getType();
12771 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
12772 FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
12773 return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid();
12774}
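// Example (illustrative): for scalars of the form
// fadd contract (fmul contract %a, %b), %c the helper sums the cost of the
// scalar fmul/fadd pairs and compares it against the cost of the equivalent
// llvm.fmuladd calls, returning the FMA cost only if it is strictly cheaper;
// an fmul with extra users is still charged on the FMA side, since it has to
// stay around anyway.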
12775
12776void BoUpSLP::transformNodes() {
12777 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12778 BaseGraphSize = VectorizableTree.size();
12779 // Turn graph transforming mode on and off, when done.
12780 class GraphTransformModeRAAI {
12781 bool &SavedIsGraphTransformMode;
12782
12783 public:
12784 GraphTransformModeRAAI(bool &IsGraphTransformMode)
12785 : SavedIsGraphTransformMode(IsGraphTransformMode) {
12786 IsGraphTransformMode = true;
12787 }
12788 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
12789 } TransformContext(IsGraphTransformMode);
12790 // Operands are profitable if they are:
12791 // 1. At least one constant
12792 // or
12793 // 2. Splats
12794 // or
12795 // 3. Results in good vectorization opportunity, i.e. may generate vector
12796 // nodes and reduce cost of the graph.
12797 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
12798 const InstructionsState &S) {
12799 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
12800 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
12801 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
12802 I2->getOperand(Op));
12803 return all_of(
12804 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
12805 return all_of(Cand,
12806 [](const std::pair<Value *, Value *> &P) {
12807 return isa<Constant>(P.first) ||
12808 isa<Constant>(P.second) || P.first == P.second;
12809 }) ||
12810 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
12811 });
12812 };
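// Example (illustrative): operand pairs such as (%x, 7) or (%x, %x) across
// the two candidate instructions are treated as profitable right away; for
// anything else the look-ahead scoring of the operand pairs decides whether
// building these operands is likely to pay off.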
12813
12814 // Try to reorder gather nodes for better vectorization opportunities.
12815 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
12816 TreeEntry &E = *VectorizableTree[Idx];
12817 if (E.isGather())
12818 reorderGatherNode(E);
12819 }
12820
12821 // Better to use the full gathered-loads analysis, if there are only 2
12822 // gathered load nodes, each having fewer than 16 elements.
12823 constexpr unsigned VFLimit = 16;
12824 bool ForceLoadGather =
12825 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12826 return TE->isGather() && TE->hasState() &&
12827 TE->getOpcode() == Instruction::Load &&
12828 TE->getVectorFactor() < VFLimit;
12829 }) == 2;
12830
12831 // Checks if the scalars are used in other node.
12832 auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
12833 function_ref<bool(Value *)> CheckContainer) {
12834 return TE->isSame(VL) || all_of(VL, [&](Value *V) {
12835 if (isa<PoisonValue>(V))
12836 return true;
12837 auto *I = dyn_cast<Instruction>(V);
12838 if (!I)
12839 return false;
12840 return is_contained(TE->Scalars, I) || CheckContainer(I);
12841 });
12842 };
12843 auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
12844 if (E.hasState()) {
12845 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
12846 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
12847 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
12848 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
12849 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12850 return is_contained(TEs, TE);
12851 });
12852 });
12853 }))
12854 return true;
12855 ;
12856 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
12857 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
12858 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
12859 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
12860 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12861 return is_contained(TEs, TE);
12862 });
12863 });
12864 }))
12865 return true;
12866 } else {
12867 // Check if the gather node is a full copy of a split node.
12868 auto *It = find_if(E.Scalars, IsaPred<Instruction>);
12869 if (It != E.Scalars.end()) {
12870 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
12871 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
12872 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
12873 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
12874 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12875 return is_contained(TEs, TE);
12876 });
12877 });
12878 }))
12879 return true;
12880 }
12881 }
12882 return false;
12883 };
12884 // The tree may grow here, so iterate over nodes, built before.
12885 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
12886 TreeEntry &E = *VectorizableTree[Idx];
12887 if (E.isGather()) {
12888 ArrayRef<Value *> VL = E.Scalars;
12889 const unsigned Sz = getVectorElementSize(VL.front());
12890 unsigned MinVF = getMinVF(2 * Sz);
12891 // Do not try partial vectorization for small nodes (<= 2 elements), nodes
12892 // with the same opcode and same parent block, or all-constant nodes.
12893 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
12894 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
12895 // We use allSameOpcode instead of isAltShuffle because we don't
12896 // want to use interchangeable instruction here.
12897 !allSameOpcode(VL) || !allSameBlock(VL)) ||
12898 allConstant(VL) || isSplat(VL))
12899 continue;
12900 if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
12901 continue;
12902 // Check if the node is a copy of other vector nodes.
12903 if (CheckForSameVectorNodes(E))
12904 continue;
12905 // Try to find vectorizable sequences and transform them into a series of
12906 // insertvector instructions.
12907 unsigned StartIdx = 0;
12908 unsigned End = VL.size();
12909 for (unsigned VF = getFloorFullVectorNumberOfElements(
12910 *TTI, VL.front()->getType(), VL.size() - 1);
12911 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
12912 *TTI, VL.front()->getType(), VF - 1)) {
12913 if (StartIdx + VF > End)
12914 continue;
12915 SmallVector<std::pair<unsigned, unsigned>> Slices;
12916 bool AllStrided = true;
12917 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
12918 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
12919 // If any instruction is vectorized already - do not try again.
12920 // Reuse the existing node, if it fully matches the slice.
12921 if (isVectorized(Slice.front()) &&
12922 !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
12923 continue;
12924 // Constant already handled effectively - skip.
12925 if (allConstant(Slice))
12926 continue;
12927 // Do not try to vectorize small splats (smaller than a vector register and
12928 // with only a single non-undef element).
12929 bool IsSplat = isSplat(Slice);
12930 bool IsTwoRegisterSplat = true;
12931 if (IsSplat && VF == 2) {
12932 unsigned NumRegs2VF = ::getNumberOfParts(
12933 *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
12934 IsTwoRegisterSplat = NumRegs2VF == 2;
12935 }
12936 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
12937 count(Slice, Slice.front()) ==
12938 static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
12939 : 1)) {
12940 if (IsSplat)
12941 continue;
12942 InstructionsState S = getSameOpcode(Slice, *TLI);
12943 if (!S || !allSameOpcode(Slice) || !allSameBlock(Slice) ||
12944 (S.getOpcode() == Instruction::Load &&
12945 areKnownNonVectorizableLoads(Slice)) ||
12946 (S.getOpcode() != Instruction::Load &&
12947 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
12948 continue;
12949 if (VF == 2) {
12950 // Try to vectorize reduced values or if all users are vectorized.
12951 // For expensive instructions extra extracts might be profitable.
12952 if ((!UserIgnoreList || E.Idx != 0) &&
12953 TTI->getInstructionCost(S.getMainOp(), CostKind) <
12954 TTI::TCC_Expensive &&
12955 !all_of(Slice, [&](Value *V) {
12956 if (isa<PoisonValue>(V))
12957 return true;
12958 return areAllUsersVectorized(cast<Instruction>(V),
12959 UserIgnoreList);
12960 }))
12961 continue;
12962 if (S.getOpcode() == Instruction::Load) {
12963 OrdersType Order;
12964 SmallVector<Value *> PointerOps;
12965 StridedPtrInfo SPtrInfo;
12966 LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order,
12967 PointerOps, SPtrInfo);
12968 AllStrided &= Res == LoadsState::StridedVectorize ||
12969 Res == LoadsState::ScatterVectorize ||
12970 Res == LoadsState::Gather;
12971 // Do not vectorize gathers.
12972 if (Res == LoadsState::ScatterVectorize ||
12973 Res == LoadsState::Gather) {
12974 if (Res == LoadsState::Gather) {
12975 registerNonVectorizableLoads(Slice);
12976 // If reductions and the scalars from the root node are
12977 // analyzed - mark as non-vectorizable reduction.
12978 if (UserIgnoreList && E.Idx == 0)
12979 analyzedReductionVals(Slice);
12980 }
12981 continue;
12982 }
12983 } else if (S.getOpcode() == Instruction::ExtractElement ||
12984 (TTI->getInstructionCost(S.getMainOp(), CostKind) <
12985 TTI::TCC_Expensive &&
12986 !CheckOperandsProfitability(
12987 S.getMainOp(),
12988 cast<Instruction>(*find_if(reverse(Slice),
12989 IsaPred<Instruction>)),
12990 S))) {
12991 // Do not vectorize extractelements (handled effectively
12992 // already). Do not vectorize non-profitable instructions (with
12993 // low cost and non-vectorizable operands).
12994 continue;
12995 }
12996 }
12997 }
12998 Slices.emplace_back(Cnt, Slice.size());
12999 }
13000 // Do not try to vectorize if all slices are strided or gathered with
13001 // vector factor 2 and there are more than 2 slices. Better to handle
13002 // them in the gathered-loads analysis; it may result in better vectorization.
13003 if (VF == 2 && AllStrided && Slices.size() > 2)
13004 continue;
13005 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
13006 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
13007 if (StartIdx == Cnt)
13008 StartIdx = Cnt + Sz;
13009 if (End == Cnt + Sz)
13010 End = Cnt;
13011 };
13012 for (auto [Cnt, Sz] : Slices) {
13013 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
13014 const TreeEntry *SameTE = nullptr;
13015 if (const auto *It = find_if(Slice, IsaPred<Instruction>);
13016 It != Slice.end()) {
13017 // If any instruction is vectorized already - do not try again.
13018 SameTE = getSameValuesTreeEntry(*It, Slice);
13019 }
13020 unsigned PrevSize = VectorizableTree.size();
13021 [[maybe_unused]] unsigned PrevEntriesSize =
13022 LoadEntriesToVectorize.size();
13023 buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
13024 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
13025 VectorizableTree[PrevSize]->isGather() &&
13026 VectorizableTree[PrevSize]->hasState() &&
13027 VectorizableTree[PrevSize]->getOpcode() !=
13028 Instruction::ExtractElement &&
13029 !isSplat(Slice)) {
13030 if (UserIgnoreList && E.Idx == 0 && VF == 2)
13031 analyzedReductionVals(Slice);
13032 VectorizableTree.pop_back();
13033 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
13034 "LoadEntriesToVectorize expected to remain the same");
13035 continue;
13036 }
13037 AddCombinedNode(PrevSize, Cnt, Sz);
13038 }
13039 }
13040 // Restore ordering, if no extra vectorization happened.
13041 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
13042 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13043 reorderScalars(E.Scalars, Mask);
13044 E.ReorderIndices.clear();
13045 }
13046 }
13047 if (!E.hasState())
13048 continue;
13049 switch (E.getOpcode()) {
13050 case Instruction::Load: {
13051 // No need to reorder masked gather loads, just reorder the scalar
13052 // operands.
13053 if (E.State != TreeEntry::Vectorize)
13054 break;
13055 Type *ScalarTy = E.getMainOp()->getType();
13056 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13057 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
13058 // Check if profitable to represent consecutive load + reverse as strided
13059 // load with stride -1.
13060 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13061 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13062 SmallVector<int> Mask;
13063 inversePermutation(E.ReorderIndices, Mask);
13064 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
13065 InstructionCost OriginalVecCost =
13066 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
13067 BaseLI->getPointerAddressSpace(), CostKind,
13068 TTI::OperandValueInfo()) +
13069 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13070 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
13071 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
13072 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
13073 if (StridedCost < OriginalVecCost || ForceStridedLoads) {
13074 // Strided load is more profitable than consecutive load + reverse -
13075 // transform the node to strided load.
13076 Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
13077 ->getPointerOperand()
13078 ->getType());
13079 StridedPtrInfo SPtrInfo;
13080 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
13081 SPtrInfo.Ty = VecTy;
13082 TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
13083 E.State = TreeEntry::StridedVectorize;
13084 }
13085 }
13086 break;
13087 }
13088 case Instruction::Store: {
13089 Type *ScalarTy =
13090 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
13091 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13092 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
13093 // Check if profitable to represent consecutive store + reverse as strided
13094 // store with stride -1.
13095 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13096 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13097 SmallVector<int> Mask;
13098 inversePermutation(E.ReorderIndices, Mask);
13099 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
13100 InstructionCost OriginalVecCost =
13101 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
13102 BaseSI->getPointerAddressSpace(), CostKind,
13103 TTI::OperandValueInfo()) +
13104 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13105 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
13106 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
13107 /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
13108 if (StridedCost < OriginalVecCost)
13109 // Strided store is more profitable than reverse + consecutive store -
13110 // transform the node to strided store.
13111 E.State = TreeEntry::StridedVectorize;
13112 } else if (!E.ReorderIndices.empty()) {
13113 // Check for interleaved stores.
13114 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
13115 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
13116 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
13117 if (Mask.size() < 4)
13118 return 0u;
13119 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
13120 if (ShuffleVectorInst::isInterleaveMask(
13121 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
13122 TTI.isLegalInterleavedAccessType(
13123 VecTy, Factor, BaseSI->getAlign(),
13124 BaseSI->getPointerAddressSpace()))
13125 return Factor;
13126 }
13127
13128 return 0u;
13129 };
13130 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13131 unsigned InterleaveFactor = IsInterleaveMask(Mask);
13132 if (InterleaveFactor != 0)
13133 E.setInterleave(InterleaveFactor);
13134 }
13135 break;
13136 }
13137 case Instruction::Select: {
13138 if (E.State != TreeEntry::Vectorize)
13139 break;
13140 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
13141 if (MinMaxID == Intrinsic::not_intrinsic)
13142 break;
13143 // This node is a minmax node.
13144 E.CombinedOp = TreeEntry::MinMax;
13145 TreeEntry *CondEntry = getOperandEntry(&E, 0);
13146 if (SelectOnly && CondEntry->UserTreeIndex &&
13147 CondEntry->State == TreeEntry::Vectorize) {
13148 // The condition node is part of the combined minmax node.
13149 CondEntry->State = TreeEntry::CombinedVectorize;
13150 }
13151 break;
13152 }
13153 case Instruction::FSub:
13154 case Instruction::FAdd: {
13155 // Check if possible to convert (a*b)+c to fma.
13156 if (E.State != TreeEntry::Vectorize ||
13157 !E.getOperations().isAddSubLikeOp())
13158 break;
13159 if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
13160 .isValid())
13161 break;
13162 // This node is a fmuladd node.
13163 E.CombinedOp = TreeEntry::FMulAdd;
13164 TreeEntry *FMulEntry = getOperandEntry(&E, 0);
13165 if (FMulEntry->UserTreeIndex &&
13166 FMulEntry->State == TreeEntry::Vectorize) {
13167 // The FMul node is part of the combined fmuladd node.
13168 FMulEntry->State = TreeEntry::CombinedVectorize;
13169 }
13170 break;
13171 }
13172 default:
13173 break;
13174 }
13175 }
13176
13177 if (LoadEntriesToVectorize.empty()) {
13178 // Single load node - exit.
13179 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
13180 VectorizableTree.front()->getOpcode() == Instruction::Load)
13181 return;
13182 // Small graph with small VF - exit.
13183 constexpr unsigned SmallTree = 3;
13184 constexpr unsigned SmallVF = 2;
13185 if ((VectorizableTree.size() <= SmallTree &&
13186 VectorizableTree.front()->Scalars.size() == SmallVF) ||
13187 (VectorizableTree.size() <= 2 && UserIgnoreList))
13188 return;
13189
13190 if (VectorizableTree.front()->isNonPowOf2Vec() &&
13191 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
13192 getCanonicalGraphSize() <= SmallTree &&
13193 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
13194 [](const std::unique_ptr<TreeEntry> &TE) {
13195 return TE->isGather() && TE->hasState() &&
13196 TE->getOpcode() == Instruction::Load &&
13197 !allSameBlock(TE->Scalars);
13198 }) == 1)
13199 return;
13200 }
13201
13202 // A list of loads to be gathered during the vectorization process. We can
13203 // try to vectorize them at the end, if profitable.
13204 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
13205 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
13206 GatheredLoads;
13207
13208 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
13209 TreeEntry &E = *TE;
13210 if (E.isGather() &&
13211 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
13212 (!E.hasState() && any_of(E.Scalars,
13213 [&](Value *V) {
13214 return isa<LoadInst>(V) &&
13215 !isVectorized(V) &&
13216 !isDeleted(cast<Instruction>(V));
13217 }))) &&
13218 !isSplat(E.Scalars)) {
13219 for (Value *V : E.Scalars) {
13220 auto *LI = dyn_cast<LoadInst>(V);
13221 if (!LI)
13222 continue;
13223 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
13224 continue;
13225 gatherPossiblyVectorizableLoads(
13226 *this, V, *DL, *SE, *TTI,
13227 GatheredLoads[std::make_tuple(
13228 LI->getParent(),
13229 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
13230 LI->getType())]);
13231 }
13232 }
13233 }
13234 // Try to vectorize gathered loads if this is not just a gather of loads.
13235 if (!GatheredLoads.empty())
13236 tryToVectorizeGatheredLoads(GatheredLoads);
13237}
13238
13239/// Merges shuffle masks and emits final shuffle instruction, if required. It
13240/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
13241/// when the actual shuffle instruction is generated only if this is actually
13242/// required. Otherwise, the shuffle instruction emission is delayed till the
13243/// end of the process, to reduce the number of emitted instructions and further
13244/// analysis/transformations.
13245class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
13246 bool IsFinalized = false;
13247 SmallVector<int> CommonMask;
13248 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
13249 const TargetTransformInfo &TTI;
13250 InstructionCost Cost = 0;
13251 SmallDenseSet<Value *> VectorizedVals;
13252 BoUpSLP &R;
13253 SmallPtrSetImpl<Value *> &CheckedExtracts;
13254 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13255 /// While set, still trying to estimate the cost for the same nodes and we
13256 /// can delay actual cost estimation (virtual shuffle instruction emission).
13257 /// May help to better estimate the cost if the same nodes must be permuted, and
13258 /// allows moving most of the long shuffle cost estimation to TTI.
13259 bool SameNodesEstimated = true;
13260
13261 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
13262 if (Ty->getScalarType()->isPointerTy()) {
13263 Constant *Res = ConstantExpr::getIntToPtr(
13264 ConstantInt::getAllOnesValue(
13265 IntegerType::get(Ty->getContext(),
13266 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
13267 Ty->getScalarType());
13268 if (auto *VTy = dyn_cast<VectorType>(Ty))
13269 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
13270 return Res;
13271 }
13272 return Constant::getAllOnesValue(Ty);
13273 }
13274
13275 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
13276 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
13277 return TTI::TCC_Free;
13278 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13279 InstructionCost GatherCost = 0;
13280 SmallVector<Value *> Gathers(VL);
13281 if (!Root && isSplat(VL)) {
13282 // Found the broadcasting of the single scalar, calculate the cost as
13283 // the broadcast.
13284 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
13285 assert(It != VL.end() && "Expected at least one non-undef value.");
13286 // Add broadcast for non-identity shuffle only.
13287 bool NeedShuffle =
13288 count(VL, *It) > 1 &&
13289 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
13290 if (!NeedShuffle) {
13291 if (isa<FixedVectorType>(ScalarTy)) {
13292 assert(SLPReVec && "FixedVectorType is not expected.");
13293 return TTI.getShuffleCost(
13294 TTI::SK_InsertSubvector, VecTy, VecTy, {}, CostKind,
13295 std::distance(VL.begin(), It) * getNumElements(ScalarTy),
13296 cast<FixedVectorType>(ScalarTy));
13297 }
13298 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
13299 CostKind, std::distance(VL.begin(), It),
13300 PoisonValue::get(VecTy), *It);
13301 }
13302
13303 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
13304 transform(VL, ShuffleMask.begin(), [](Value *V) {
13305 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
13306 });
13307 InstructionCost InsertCost =
13308 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
13309 PoisonValue::get(VecTy), *It);
13310 return InsertCost + ::getShuffleCost(TTI,
13311 TTI::SK_Broadcast,
13312 VecTy, ShuffleMask, CostKind,
13313 /*Index=*/0, /*SubTp=*/nullptr,
13314 /*Args=*/*It);
13315 }
13316 return GatherCost +
13317 (all_of(Gathers, IsaPred<UndefValue>)
13318 ? TTI::TCC_Free
13319 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
13320 ScalarTy));
13321 };
13322
13323 /// Compute the cost of creating a vector containing the extracted values from
13324 /// \p VL.
13325 InstructionCost
13326 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
13327 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13328 unsigned NumParts) {
13329 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
13330 unsigned NumElts =
13331 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
13332 auto *EE = dyn_cast<ExtractElementInst>(V);
13333 if (!EE)
13334 return Sz;
13335 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
13336 if (!VecTy)
13337 return Sz;
13338 return std::max(Sz, VecTy->getNumElements());
13339 });
13340 // FIXME: this must be moved to TTI for better estimation.
13341 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
13342 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
13343 SmallVectorImpl<unsigned> &Indices,
13344 SmallVectorImpl<unsigned> &SubVecSizes)
13345 -> std::optional<TTI::ShuffleKind> {
13346 if (NumElts <= EltsPerVector)
13347 return std::nullopt;
13348 int OffsetReg0 =
13349 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
13350 [](int S, int I) {
13351 if (I == PoisonMaskElem)
13352 return S;
13353 return std::min(S, I);
13354 }),
13355 EltsPerVector);
13356 int OffsetReg1 = OffsetReg0;
13357 DenseSet<int> RegIndices;
13358 // Check if we are trying to permute the same single/2 input vectors.
13359 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
13360 int FirstRegId = -1;
13361 Indices.assign(1, OffsetReg0);
13362 for (auto [Pos, I] : enumerate(Mask)) {
13363 if (I == PoisonMaskElem)
13364 continue;
13365 int Idx = I - OffsetReg0;
13366 int RegId =
13367 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
13368 if (FirstRegId < 0)
13369 FirstRegId = RegId;
13370 RegIndices.insert(RegId);
13371 if (RegIndices.size() > 2)
13372 return std::nullopt;
13373 if (RegIndices.size() == 2) {
13374 ShuffleKind = TTI::SK_PermuteTwoSrc;
13375 if (Indices.size() == 1) {
13376 OffsetReg1 = alignDown(
13377 std::accumulate(
13378 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
13379 [&](int S, int I) {
13380 if (I == PoisonMaskElem)
13381 return S;
13382 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
13383 ((I - OffsetReg0) % NumElts) / EltsPerVector;
13384 if (RegId == FirstRegId)
13385 return S;
13386 return std::min(S, I);
13387 }),
13388 EltsPerVector);
13389 unsigned Index = OffsetReg1 % NumElts;
13390 Indices.push_back(Index);
13391 SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
13392 }
13393 Idx = I - OffsetReg1;
13394 }
13395 I = (Idx % NumElts) % EltsPerVector +
13396 (RegId == FirstRegId ? 0 : EltsPerVector);
13397 }
13398 return ShuffleKind;
13399 };
13400 InstructionCost Cost = 0;
13401
13402 // Process extracts in blocks of EltsPerVector to check if the source vector
13403 // operand can be re-used directly. If not, add the cost of creating a
13404 // shuffle to extract the values into a vector register.
13405 for (unsigned Part : seq<unsigned>(NumParts)) {
13406 if (!ShuffleKinds[Part])
13407 continue;
13408 ArrayRef<int> MaskSlice = Mask.slice(
13409 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
13410 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
13411 copy(MaskSlice, SubMask.begin());
13412 SmallVector<unsigned, 2> Indices;
13413 SmallVector<unsigned, 2> SubVecSizes;
13414 std::optional<TTI::ShuffleKind> RegShuffleKind =
13415 CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
13416 if (!RegShuffleKind) {
13417 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
13418 !ShuffleVectorInst::isIdentityMask(
13419 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
13420 Cost +=
13421 ::getShuffleCost(TTI, *ShuffleKinds[Part],
13422 getWidenedType(ScalarTy, NumElts), MaskSlice);
13423 continue;
13424 }
13425 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
13426 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
13427 Cost +=
13428 ::getShuffleCost(TTI, *RegShuffleKind,
13429 getWidenedType(ScalarTy, EltsPerVector), SubMask);
13430 }
13431 const unsigned BaseVF = getFullVectorNumberOfElements(
13432 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
13433 for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
13434 assert((Idx + SubVecSize) <= BaseVF &&
13435 "SK_ExtractSubvector index out of range");
13437 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
13438 Idx, getWidenedType(ScalarTy, SubVecSize));
13439 }
13440 // Second attempt to check if just a permute is estimated cheaper than
13441 // the subvector extracts.
13442 SubMask.assign(NumElts, PoisonMaskElem);
13443 copy(MaskSlice, SubMask.begin());
13444 InstructionCost OriginalCost = ::getShuffleCost(
13445 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
13446 if (OriginalCost < Cost)
13447 Cost = OriginalCost;
13448 }
13449 return Cost;
13450 }
13451 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
13452 /// mask \p Mask, register number \p Part, that includes \p SliceSize
13453 /// elements.
13454 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
13455 ArrayRef<int> Mask, unsigned Part,
13456 unsigned SliceSize) {
13457 if (SameNodesEstimated) {
13458 // Delay the cost estimation if the same nodes are being reshuffled.
13459 // If we already requested the cost of reshuffling of E1 and E2 before, no
13460 // need to estimate another cost with the sub-Mask, instead include this
13461 // sub-Mask into the CommonMask to estimate it later and avoid double cost
13462 // estimation.
13463 if ((InVectors.size() == 2 &&
13464 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
13465 cast<const TreeEntry *>(InVectors.back()) == E2) ||
13466 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
13467 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
13468 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
13469 [](int Idx) { return Idx == PoisonMaskElem; }) &&
13470 "Expected all poisoned elements.");
13471 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
13472 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
13473 return;
13474 }
13475 // Found non-matching nodes - need to estimate the cost for the matched
13476 // and transform mask.
13477 Cost += createShuffle(InVectors.front(),
13478 InVectors.size() == 1 ? nullptr : InVectors.back(),
13479 CommonMask);
13480 transformMaskAfterShuffle(CommonMask, CommonMask);
13481 } else if (InVectors.size() == 2) {
13482 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
13483 transformMaskAfterShuffle(CommonMask, CommonMask);
13484 }
13485 SameNodesEstimated = false;
13486 if (!E2 && InVectors.size() == 1) {
13487 unsigned VF = E1.getVectorFactor();
13488 if (Value *V1 = dyn_cast<Value *>(InVectors.front())) {
13489 VF = std::max(VF, getVF(V1));
13490 } else {
13491 const auto *E = cast<const TreeEntry *>(InVectors.front());
13492 VF = std::max(VF, E->getVectorFactor());
13493 }
13494 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13495 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
13496 CommonMask[Idx] = Mask[Idx] + VF;
13497 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
13498 transformMaskAfterShuffle(CommonMask, CommonMask);
13499 } else {
13500 auto P = InVectors.front();
13501 Cost += createShuffle(&E1, E2, Mask);
13502 unsigned VF = Mask.size();
13503 if (Value *V1 = dyn_cast<Value *>(P)) {
13504 VF = std::max(VF,
13505 getNumElements(V1->getType()));
13506 } else {
13507 const auto *E = cast<const TreeEntry *>(P);
13508 VF = std::max(VF, E->getVectorFactor());
13509 }
13510 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13511 if (Mask[Idx] != PoisonMaskElem)
13512 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
13513 Cost += createShuffle(P, InVectors.front(), CommonMask);
13514 transformMaskAfterShuffle(CommonMask, CommonMask);
13515 }
13516 }
13517
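/// Cost-model counterpart of the IR shuffle builder: it services the
/// callbacks of BaseShuffleAnalysis::createShuffle with TTI shuffle-cost
/// queries instead of emitting instructions; empty and identity masks are
/// treated as free.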
13518 class ShuffleCostBuilder {
13519 const TargetTransformInfo &TTI;
13520
13521 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
13522 int Index = -1;
13523 return Mask.empty() ||
13524 (VF == Mask.size() &&
13525 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
13526 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
13527 Index == 0);
13528 }
13529
13530 public:
13531 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
13532 ~ShuffleCostBuilder() = default;
13533 InstructionCost createShuffleVector(Value *V1, Value *,
13534 ArrayRef<int> Mask) const {
13535 // Empty mask or identity mask are free.
13536 unsigned VF =
13537 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
13538 if (isEmptyOrIdentity(Mask, VF))
13539 return TTI::TCC_Free;
13540 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
13541 cast<VectorType>(V1->getType()), Mask);
13542 }
13543 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
13544 // Empty mask or identity mask are free.
13545 unsigned VF =
13546 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
13547 if (isEmptyOrIdentity(Mask, VF))
13548 return TTI::TCC_Free;
13549 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
13550 cast<VectorType>(V1->getType()), Mask);
13551 }
13552 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
13553 InstructionCost createPoison(Type *Ty, unsigned VF) const {
13554 return TTI::TCC_Free;
13555 }
13556 void resizeToMatch(Value *&, Value *&) const {}
13557 };
13558
13559 /// Smart shuffle instruction emission, walks through shuffles trees and
13560 /// tries to find the best matching vector for the actual shuffle
13561 /// instruction.
13562 InstructionCost
13563 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
13564 const PointerUnion<Value *, const TreeEntry *> &P2,
13565 ArrayRef<int> Mask) {
13566 ShuffleCostBuilder Builder(TTI);
13567 SmallVector<int> CommonMask(Mask);
13568 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
13569 unsigned CommonVF = Mask.size();
13570 InstructionCost ExtraCost = 0;
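// The two helpers below charge the extra trunc/sext/zext cost incurred when
// a tree entry or vector operand was demoted to a narrower integer type
// (MinBWs) than the ScalarTy used for this shuffle.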
13571 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
13572 unsigned VF) -> InstructionCost {
13573 if (E.isGather() && allConstant(E.Scalars))
13574 return TTI::TCC_Free;
13575 Type *EScalarTy = E.Scalars.front()->getType();
13576 bool IsSigned = true;
13577 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
13578 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
13579 IsSigned = It->second.second;
13580 }
13581 if (EScalarTy != ScalarTy) {
13582 unsigned CastOpcode = Instruction::Trunc;
13583 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
13584 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
13585 if (DstSz > SrcSz)
13586 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
13587 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
13588 getWidenedType(EScalarTy, VF),
13589 TTI::CastContextHint::None, CostKind);
13590 }
13591 return TTI::TCC_Free;
13592 };
13593 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
13594 if (isa<Constant>(V))
13595 return TTI::TCC_Free;
13596 auto *VecTy = cast<VectorType>(V->getType());
13597 Type *EScalarTy = VecTy->getElementType();
13598 if (EScalarTy != ScalarTy) {
13599 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
13600 unsigned CastOpcode = Instruction::Trunc;
13601 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
13602 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
13603 if (DstSz > SrcSz)
13604 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
13605 return TTI.getCastInstrCost(
13606 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
13607 VecTy, TTI::CastContextHint::None, CostKind);
13608 }
13609 return TTI::TCC_Free;
13610 };
13611 if (!V1 && !V2 && !P2.isNull()) {
13612 // Shuffle 2 entry nodes.
13613 const TreeEntry *E = cast<const TreeEntry *>(P1);
13614 unsigned VF = E->getVectorFactor();
13615 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
13616 CommonVF = std::max(VF, E2->getVectorFactor());
13617 assert(all_of(Mask,
13618 [=](int Idx) {
13619 return Idx < 2 * static_cast<int>(CommonVF);
13620 }) &&
13621 "All elements in mask must be less than 2 * CommonVF.");
13622 if (E->Scalars.size() == E2->Scalars.size()) {
13623 SmallVector<int> EMask = E->getCommonMask();
13624 SmallVector<int> E2Mask = E2->getCommonMask();
13625 if (!EMask.empty() || !E2Mask.empty()) {
13626 for (int &Idx : CommonMask) {
13627 if (Idx == PoisonMaskElem)
13628 continue;
13629 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
13630 Idx = EMask[Idx];
13631 else if (Idx >= static_cast<int>(CommonVF))
13632 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
13633 E->Scalars.size();
13634 }
13635 }
13636 CommonVF = E->Scalars.size();
13637 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
13638 GetNodeMinBWAffectedCost(*E2, CommonVF);
13639 } else {
13640 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
13641 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
13642 }
13643 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13644 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13645 } else if (!V1 && P2.isNull()) {
13646 // Shuffle single entry node.
13647 const TreeEntry *E = cast<const TreeEntry *>(P1);
13648 unsigned VF = E->getVectorFactor();
13649 CommonVF = VF;
13650 assert(
13651 all_of(Mask,
13652 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
13653 "All elements in mask must be less than CommonVF.");
13654 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
13655 SmallVector<int> EMask = E->getCommonMask();
13656 assert(!EMask.empty() && "Expected non-empty common mask.");
13657 for (int &Idx : CommonMask) {
13658 if (Idx != PoisonMaskElem)
13659 Idx = EMask[Idx];
13660 }
13661 CommonVF = E->Scalars.size();
13662 } else if (unsigned Factor = E->getInterleaveFactor();
13663 Factor > 0 && E->Scalars.size() != Mask.size() &&
13664 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
13665 Factor)) {
13666 // Deinterleaved nodes are free.
13667 std::iota(CommonMask.begin(), CommonMask.end(), 0);
13668 }
13669 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
13670 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13671 // Not identity/broadcast? Try to see if the original vector is better.
13672 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
13673 CommonVF == CommonMask.size() &&
13674 any_of(enumerate(CommonMask),
13675 [](const auto &&P) {
13676 return P.value() != PoisonMaskElem &&
13677 static_cast<unsigned>(P.value()) != P.index();
13678 }) &&
13679 any_of(CommonMask,
13680 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
13681 SmallVector<int> ReorderMask;
13682 inversePermutation(E->ReorderIndices, ReorderMask);
13683 ::addMask(CommonMask, ReorderMask);
13684 }
13685 } else if (V1 && P2.isNull()) {
13686 // Shuffle single vector.
13687 ExtraCost += GetValueMinBWAffectedCost(V1);
13688 CommonVF = getVF(V1);
13689 assert(
13690 all_of(Mask,
13691 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
13692 "All elements in mask must be less than CommonVF.");
13693 } else if (V1 && !V2) {
13694 // Shuffle vector and tree node.
13695 unsigned VF = getVF(V1);
13696 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
13697 CommonVF = std::max(VF, E2->getVectorFactor());
13698 assert(all_of(Mask,
13699 [=](int Idx) {
13700 return Idx < 2 * static_cast<int>(CommonVF);
13701 }) &&
13702 "All elements in mask must be less than 2 * CommonVF.");
13703 if (E2->Scalars.size() == VF && VF != CommonVF) {
13704 SmallVector<int> E2Mask = E2->getCommonMask();
13705 assert(!E2Mask.empty() && "Expected non-empty common mask.");
13706 for (int &Idx : CommonMask) {
13707 if (Idx == PoisonMaskElem)
13708 continue;
13709 if (Idx >= static_cast<int>(CommonVF))
13710 Idx = E2Mask[Idx - CommonVF] + VF;
13711 }
13712 CommonVF = VF;
13713 }
13714 ExtraCost += GetValueMinBWAffectedCost(V1);
13715 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13716 ExtraCost += GetNodeMinBWAffectedCost(
13717 *E2, std::min(CommonVF, E2->getVectorFactor()));
13718 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13719 } else if (!V1 && V2) {
13720 // Shuffle vector and tree node.
13721 unsigned VF = getVF(V2);
13722 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
13723 CommonVF = std::max(VF, E1->getVectorFactor());
13724 assert(all_of(Mask,
13725 [=](int Idx) {
13726 return Idx < 2 * static_cast<int>(CommonVF);
13727 }) &&
13728 "All elements in mask must be less than 2 * CommonVF.");
13729 if (E1->Scalars.size() == VF && VF != CommonVF) {
13730 SmallVector<int> E1Mask = E1->getCommonMask();
13731 assert(!E1Mask.empty() && "Expected non-empty common mask.");
13732 for (int &Idx : CommonMask) {
13733 if (Idx == PoisonMaskElem)
13734 continue;
13735 if (Idx >= static_cast<int>(CommonVF))
13736 Idx = E1Mask[Idx - CommonVF] + VF;
13737 else
13738 Idx = E1Mask[Idx];
13739 }
13740 CommonVF = VF;
13741 }
13742 ExtraCost += GetNodeMinBWAffectedCost(
13743 *E1, std::min(CommonVF, E1->getVectorFactor()));
13744 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13745 ExtraCost += GetValueMinBWAffectedCost(V2);
13746 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13747 } else {
13748 assert(V1 && V2 && "Expected both vectors.");
13749 unsigned VF = getVF(V1);
13750 CommonVF = std::max(VF, getVF(V2));
13751 assert(all_of(Mask,
13752 [=](int Idx) {
13753 return Idx < 2 * static_cast<int>(CommonVF);
13754 }) &&
13755 "All elements in mask must be less than 2 * CommonVF.");
13756 ExtraCost +=
13757 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
13758 if (V1->getType() != V2->getType()) {
13759 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13760 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13761 } else {
13762 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
13763 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13764 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
13765 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13766 }
13767 }
13768 InVectors.front() =
13769 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
13770 if (InVectors.size() == 2)
13771 InVectors.pop_back();
13772 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
13773 V1, V2, CommonMask, Builder, ScalarTy);
13774 }
13775
13776public:
13777 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
13778 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
13779 SmallPtrSetImpl<Value *> &CheckedExtracts)
13780 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
13781 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
13782 CheckedExtracts(CheckedExtracts) {}
13783 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
13784 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13785 unsigned NumParts, bool &UseVecBaseAsInput) {
13786 UseVecBaseAsInput = false;
13787 if (Mask.empty())
13788 return nullptr;
13789 Value *VecBase = nullptr;
13790 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
13791 if (!E->ReorderIndices.empty()) {
13792 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
13793 E->ReorderIndices.end());
13794 reorderScalars(VL, ReorderMask);
13795 }
13796 // Check if the extracts can be considered reused, i.e. the same
13797 // extractelements were already vectorized in an earlier node.
13798 bool PrevNodeFound = any_of(
13799 ArrayRef(R.VectorizableTree).take_front(E->Idx),
13800 [&](const std::unique_ptr<TreeEntry> &TE) {
13801 return ((TE->hasState() && !TE->isAltShuffle() &&
13802 TE->getOpcode() == Instruction::ExtractElement) ||
13803 TE->isGather()) &&
13804 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
13805 return VL.size() > Data.index() &&
13806 (Mask[Data.index()] == PoisonMaskElem ||
13807 isa<UndefValue>(VL[Data.index()]) ||
13808 Data.value() == VL[Data.index()]);
13809 });
13810 });
13811 SmallPtrSet<Value *, 4> UniqueBases;
13812 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
13813 SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
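// For every register-sized part, record which lanes of each extract source
// vector are demanded; extracts that become dead once their users are
// vectorized get their scalarization overhead subtracted from the cost below.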
13814 for (unsigned Part : seq<unsigned>(NumParts)) {
13815 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
13816 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
13817 for (auto [I, V] :
13818 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
13819 // Ignore non-extractelement scalars.
13820 if (isa<UndefValue>(V) ||
13821 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
13822 continue;
13823 // If all users of the instruction are going to be vectorized and the
13824 // instruction itself is not going to be vectorized, consider this
13825 // instruction dead and remove its cost from the final cost of the
13826 // vectorized tree.
13827 // Also, avoid adjusting the cost for extractelements with multiple uses
13828 // in different graph entries.
13829 auto *EE = cast<ExtractElementInst>(V);
13830 VecBase = EE->getVectorOperand();
13831 UniqueBases.insert(VecBase);
13832 ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
13833 if (!CheckedExtracts.insert(V).second ||
13834 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
13835 any_of(EE->users(),
13836 [&](User *U) {
13837 return isa<GetElementPtrInst>(U) &&
13838 !R.areAllUsersVectorized(cast<Instruction>(U),
13839 &VectorizedVals);
13840 }) ||
13841 (!VEs.empty() && !is_contained(VEs, E)))
13842 continue;
13843 std::optional<unsigned> EEIdx = getExtractIndex(EE);
13844 if (!EEIdx)
13845 continue;
13846 unsigned Idx = *EEIdx;
13847 // Take credit for instruction that will become dead.
13848 if (EE->hasOneUse() || !PrevNodeFound) {
13849 Instruction *Ext = EE->user_back();
13850 if (isa<SExtInst, ZExtInst>(Ext) &&
13851 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
13852 // Use getExtractWithExtendCost() to calculate the cost of
13853 // extractelement/ext pair.
13854 Cost -= TTI.getExtractWithExtendCost(
13855 Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
13856 Idx, CostKind);
13857 // Add back the cost of s|zext which is subtracted separately.
13858 Cost += TTI.getCastInstrCost(
13859 Ext->getOpcode(), Ext->getType(), EE->getType(),
13860 TTI::CastContextHint::None, CostKind);
13861 continue;
13862 }
13863 }
13864 APInt &DemandedElts =
13865 VectorOpsToExtracts
13866 .try_emplace(VecBase,
13867 APInt::getZero(getNumElements(VecBase->getType())))
13868 .first->getSecond();
13869 DemandedElts.setBit(Idx);
13870 }
13871 }
13872 for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
13873 Cost -= TTI.getScalarizationOverhead(cast<VectorType>(Vec->getType()),
13874 DemandedElts, /*Insert=*/false,
13875 /*Extract=*/true, CostKind);
13876 // Check that the gather of extractelements can be represented as just a
13877 // shuffle of the one or two vectors the scalars are extracted from, i.e.
13878 // the bunch of extractelement instructions that must be gathered into a
13879 // vector forms a permutation of the elements of a single input vector or
13880 // of two input vectors.
13881 // Skipped for reused nodes whose extractelements were already vectorized.
13882 if (!PrevNodeFound)
13883 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
13884 InVectors.assign(1, E);
13885 CommonMask.assign(Mask.begin(), Mask.end());
13886 transformMaskAfterShuffle(CommonMask, CommonMask);
13887 SameNodesEstimated = false;
13888 if (NumParts != 1 && UniqueBases.size() != 1) {
13889 UseVecBaseAsInput = true;
13890 VecBase =
13891 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
13892 }
13893 return VecBase;
13894 }
13895 /// Checks if the specified entry \p E needs to be delayed because of its
13896 /// dependency nodes.
13897 std::optional<InstructionCost>
13898 needToDelay(const TreeEntry *,
13899 ArrayRef<SmallVector<const TreeEntry *>>) const {
13900 // No need to delay the cost estimation during analysis.
13901 return std::nullopt;
13902 }
13903 /// Reset the builder to handle perfect diamond match.
13904 void resetForSameNode() {
13905 IsFinalized = false;
13906 CommonMask.clear();
13907 InVectors.clear();
13908 Cost = 0;
13909 VectorizedVals.clear();
13910 SameNodesEstimated = true;
13911 }
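/// Adds 2 tree entries and the mask for their shuffling.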
13912 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
13913 if (&E1 == &E2) {
13914 assert(all_of(Mask,
13915 [&](int Idx) {
13916 return Idx < static_cast<int>(E1.getVectorFactor());
13917 }) &&
13918 "Expected single vector shuffle mask.");
13919 add(E1, Mask);
13920 return;
13921 }
13922 if (InVectors.empty()) {
13923 CommonMask.assign(Mask.begin(), Mask.end());
13924 InVectors.assign({&E1, &E2});
13925 return;
13926 }
13927 assert(!CommonMask.empty() && "Expected non-empty common mask.");
13928 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
13929 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
13930 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
13931 const auto *It =
13932 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
13933 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
13934 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
13935 }
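/// Adds a single tree entry and the mask for its shuffling.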
13936 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
13937 if (InVectors.empty()) {
13938 CommonMask.assign(Mask.begin(), Mask.end());
13939 InVectors.assign(1, &E1);
13940 return;
13941 }
13942 assert(!CommonMask.empty() && "Expected non-empty common mask.");
13943 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
13944 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
13945 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
13946 const auto *It =
13947 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
13948 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
13949 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
13950 if (!SameNodesEstimated && InVectors.size() == 1)
13951 InVectors.emplace_back(&E1);
13952 }
13953 /// Adds 2 input vectors and the mask for their shuffling.
13954 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
13955 // This may only be reached when shuffling 2 vectors of extractelements,
13956 // which is already handled in adjustExtracts.
13957 assert(InVectors.size() == 1 &&
13958 all_of(enumerate(CommonMask),
13959 [&](auto P) {
13960 if (P.value() == PoisonMaskElem)
13961 return Mask[P.index()] == PoisonMaskElem;
13962 auto *EI = cast<ExtractElementInst>(
13963 cast<const TreeEntry *>(InVectors.front())
13964 ->getOrdered(P.index()));
13965 return EI->getVectorOperand() == V1 ||
13966 EI->getVectorOperand() == V2;
13967 }) &&
13968 "Expected extractelement vectors.");
13969 }
13970 /// Adds another one input vector and the mask for the shuffling.
13971 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
13972 if (InVectors.empty()) {
13973 assert(CommonMask.empty() && !ForExtracts &&
13974 "Expected empty input mask/vectors.");
13975 CommonMask.assign(Mask.begin(), Mask.end());
13976 InVectors.assign(1, V1);
13977 return;
13978 }
13979 if (ForExtracts) {
13980 // No need to add vectors here, already handled them in adjustExtracts.
13981 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
13982 !CommonMask.empty() &&
13983 all_of(enumerate(CommonMask),
13984 [&](auto P) {
13985 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
13986 ->getOrdered(P.index());
13987 if (P.value() == PoisonMaskElem)
13988 return P.value() == Mask[P.index()] ||
13989 isa<UndefValue>(Scalar);
13990 if (isa<Constant>(V1))
13991 return true;
13992 auto *EI = cast<ExtractElementInst>(Scalar);
13993 return EI->getVectorOperand() == V1;
13994 }) &&
13995 "Expected only tree entry for extractelement vectors.");
13996 return;
13997 }
13998 assert(!InVectors.empty() && !CommonMask.empty() &&
13999 "Expected only tree entries from extracts/reused buildvectors.");
14000 unsigned VF = getVF(V1);
14001 if (InVectors.size() == 2) {
14002 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14003 transformMaskAfterShuffle(CommonMask, CommonMask);
14004 VF = std::max<unsigned>(VF, CommonMask.size());
14005 } else if (const auto *InTE =
14006 InVectors.front().dyn_cast<const TreeEntry *>()) {
14007 VF = std::max(VF, InTE->getVectorFactor());
14008 } else {
14009 VF = std::max(
14010 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
14011 ->getNumElements());
14012 }
14013 InVectors.push_back(V1);
14014 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14015 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14016 CommonMask[Idx] = Mask[Idx] + VF;
14017 }
14018 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
14019 Value *Root = nullptr) {
14020 Cost += getBuildVectorCost(VL, Root);
14021 if (!Root) {
14022 // FIXME: Need to find a way to avoid use of getNullValue here.
14023 SmallVector<Constant *> Vals;
14024 unsigned VF = VL.size();
14025 if (MaskVF != 0)
14026 VF = std::min(VF, MaskVF);
14027 Type *VLScalarTy = VL.front()->getType();
14028 for (Value *V : VL.take_front(VF)) {
14029 Type *ScalarTy = VLScalarTy->getScalarType();
14030 if (isa<PoisonValue>(V)) {
14031 Vals.push_back(PoisonValue::get(ScalarTy));
14032 continue;
14033 }
14034 if (isa<UndefValue>(V)) {
14035 Vals.push_back(UndefValue::get(ScalarTy));
14036 continue;
14037 }
14038 Vals.push_back(Constant::getNullValue(ScalarTy));
14039 }
14040 if (auto *VecTy = dyn_cast<FixedVectorType>(VLScalarTy)) {
14041 assert(SLPReVec && "FixedVectorType is not expected.");
14042 // When REVEC is enabled, we need to expand vector types into scalar
14043 // types.
14044 Vals = replicateMask(Vals, VecTy->getNumElements());
14045 }
14046 return ConstantVector::get(Vals);
14047 }
14048 return ConstantVector::getSplat(
14049 ElementCount::getFixed(
14050 cast<FixedVectorType>(Root->getType())->getNumElements()),
14051 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
14052 }
14053 InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
14054 /// Finalize emission of the shuffles.
14055 InstructionCost
14056 finalize(ArrayRef<int> ExtMask,
14057 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14058 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
14059 function_ref<void(Value *&, SmallVectorImpl<int> &,
14060 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
14061 Action = {}) {
14062 IsFinalized = true;
14063 if (Action) {
14064 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14065 if (InVectors.size() == 2)
14066 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14067 else
14068 Cost += createShuffle(Vec, nullptr, CommonMask);
14069 transformMaskAfterShuffle(CommonMask, CommonMask);
14070 assert(VF > 0 &&
14071 "Expected vector length for the final value before action.");
14072 Value *V = cast<Value *>(Vec);
14073 Action(V, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
14074 Cost += createShuffle(V1, V2, Mask);
14075 return V1;
14076 });
14077 InVectors.front() = V;
14078 }
14079 if (!SubVectors.empty()) {
14080 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14081 if (InVectors.size() == 2)
14082 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14083 else
14084 Cost += createShuffle(Vec, nullptr, CommonMask);
14085 transformMaskAfterShuffle(CommonMask, CommonMask);
14086 // Add subvectors permutation cost.
14087 if (!SubVectorsMask.empty()) {
14088 assert(SubVectorsMask.size() <= CommonMask.size() &&
14089 "Expected same size of masks for subvectors and common mask.");
14090 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
14091 copy(SubVectorsMask, SVMask.begin());
14092 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
14093 if (I2 != PoisonMaskElem) {
14094 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
14095 I1 = I2 + CommonMask.size();
14096 }
14097 }
14098 Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
14099 getWidenedType(ScalarTy, CommonMask.size()),
14100 SVMask, CostKind);
14101 }
14102 for (auto [E, Idx] : SubVectors) {
14103 Type *EScalarTy = E->Scalars.front()->getType();
14104 bool IsSigned = true;
14105 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
14106 EScalarTy =
14107 IntegerType::get(EScalarTy->getContext(), It->second.first);
14108 IsSigned = It->second.second;
14109 }
14110 if (ScalarTy != EScalarTy) {
14111 unsigned CastOpcode = Instruction::Trunc;
14112 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
14113 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
14114 if (DstSz > SrcSz)
14115 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14116 Cost += TTI.getCastInstrCost(
14117 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
14118 getWidenedType(EScalarTy, E->getVectorFactor()),
14119 TTI::CastContextHint::None, CostKind);
14120 }
14121 Cost += ::getShuffleCost(
14122 TTI, TTI::SK_InsertSubvector,
14123 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
14124 getWidenedType(ScalarTy, E->getVectorFactor()));
14125 if (!CommonMask.empty()) {
14126 std::iota(std::next(CommonMask.begin(), Idx),
14127 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
14128 Idx);
14129 }
14130 }
14131 }
14132
14133 if (!ExtMask.empty()) {
14134 if (CommonMask.empty()) {
14135 CommonMask.assign(ExtMask.begin(), ExtMask.end());
14136 } else {
14137 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
14138 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
14139 if (ExtMask[I] == PoisonMaskElem)
14140 continue;
14141 NewMask[I] = CommonMask[ExtMask[I]];
14142 }
14143 CommonMask.swap(NewMask);
14144 }
14145 }
14146 if (CommonMask.empty()) {
14147 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
14148 return Cost;
14149 }
14150 return Cost +
14151 createShuffle(InVectors.front(),
14152 InVectors.size() == 2 ? InVectors.back() : nullptr,
14153 CommonMask);
14154 }
14155
14156 ~ShuffleCostEstimator() {
14157 assert((IsFinalized || CommonMask.empty()) &&
14158 "Shuffle construction must be finalized.");
14159 }
14160};
14161
14162const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
14163 unsigned Idx) const {
14164 TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
14165 assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
14166 return Op;
14167}
14168
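/// Returns the cast context hint matching how this entry's memory access will
/// be lowered (gather/scatter, masked, reversed or plain), used when costing
/// casts fed by loads.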
14169TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
14170 if (TE.State == TreeEntry::ScatterVectorize ||
14171 TE.State == TreeEntry::StridedVectorize)
14172 return TTI::CastContextHint::GatherScatter;
14173 if (TE.State == TreeEntry::CompressVectorize)
14174 return TTI::CastContextHint::Masked;
14175 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
14176 !TE.isAltShuffle()) {
14177 if (TE.ReorderIndices.empty())
14178 return TTI::CastContextHint::Normal;
14179 SmallVector<int> Mask;
14180 inversePermutation(TE.ReorderIndices, Mask);
14181 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
14182 return TTI::CastContextHint::Reversed;
14183 }
14184 return TTI::CastContextHint::None;
14185 }
14186
14187 InstructionCost
14188 BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
14189 SmallPtrSetImpl<Value *> &CheckedExtracts) {
14190 ArrayRef<Value *> VL = E->Scalars;
14191
14192 Type *ScalarTy = getValueType(VL[0]);
14193 if (!isValidElementType(ScalarTy))
14194 return InstructionCost::getInvalid();
14196
14197 // If we have computed a smaller type for the expression, update VecTy so
14198 // that the costs will be accurate.
14199 auto It = MinBWs.find(E);
14200 Type *OrigScalarTy = ScalarTy;
14201 if (It != MinBWs.end()) {
14202 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
14203 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
14204 if (VecTy)
14205 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
14206 }
14207 auto *VecTy = getWidenedType(ScalarTy, VL.size());
14208 unsigned EntryVF = E->getVectorFactor();
14209 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
14210
14211 if (E->isGather()) {
14212 if (allConstant(VL))
14213 return 0;
14214 if (isa<InsertElementInst>(VL[0]))
14215 return InstructionCost::getInvalid();
14216 if (isa<CmpInst>(VL.front()))
14217 ScalarTy = VL.front()->getType();
14218 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
14219 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
14220 }
14221 if (E->State == TreeEntry::SplitVectorize) {
14222 assert(E->CombinedEntriesWithIndices.size() == 2 &&
14223 "Expected exactly 2 combined entries.");
14224 assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
14225 InstructionCost VectorCost = 0;
14226 if (E->ReorderIndices.empty()) {
14227 VectorCost = ::getShuffleCost(
14228 *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
14229 E->CombinedEntriesWithIndices.back().second,
14230 getWidenedType(
14231 ScalarTy,
14232 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14233 ->getVectorFactor()));
14234 } else {
14235 unsigned CommonVF =
14236 std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
14237 ->getVectorFactor(),
14238 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14239 ->getVectorFactor());
14240 VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
14241 getWidenedType(ScalarTy, CommonVF),
14242 E->getSplitMask(), CostKind);
14243 }
14244 LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
14245 return VectorCost;
14246 }
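// CommonCost models the reorder/reuse shuffle applied on top of the
// vectorized value; it is folded into the per-opcode costs computed below.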
14247 InstructionCost CommonCost = 0;
14248 SmallVector<int> Mask;
14249 if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
14250 (E->State != TreeEntry::StridedVectorize ||
14251 !isReverseOrder(E->ReorderIndices))) {
14252 SmallVector<int> NewMask;
14253 if (E->getOpcode() == Instruction::Store) {
14254 // For stores the order is actually a mask.
14255 NewMask.resize(E->ReorderIndices.size());
14256 copy(E->ReorderIndices, NewMask.begin());
14257 } else {
14258 inversePermutation(E->ReorderIndices, NewMask);
14259 }
14260 ::addMask(Mask, NewMask);
14261 }
14262 if (!E->ReuseShuffleIndices.empty())
14263 ::addMask(Mask, E->ReuseShuffleIndices);
14264 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
14265 CommonCost =
14266 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
14267 assert((E->State == TreeEntry::Vectorize ||
14268 E->State == TreeEntry::ScatterVectorize ||
14269 E->State == TreeEntry::StridedVectorize ||
14270 E->State == TreeEntry::CompressVectorize) &&
14271 "Unhandled state");
14272 assert(E->getOpcode() &&
14273 ((allSameType(VL) && allSameBlock(VL)) ||
14274 (E->getOpcode() == Instruction::GetElementPtr &&
14275 E->getMainOp()->getType()->isPointerTy()) ||
14276 E->hasCopyableElements()) &&
14277 "Invalid VL");
14278 Instruction *VL0 = E->getMainOp();
14279 unsigned ShuffleOrOp =
14280 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
14281 if (E->CombinedOp != TreeEntry::NotCombinedOp)
14282 ShuffleOrOp = E->CombinedOp;
14283 SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
14284 const unsigned Sz = UniqueValues.size();
14285 SmallBitVector UsedScalars(Sz, false);
14286 for (unsigned I = 0; I < Sz; ++I) {
14287 if (isa<Instruction>(UniqueValues[I]) &&
14288 !E->isCopyableElement(UniqueValues[I]) &&
14289 getTreeEntries(UniqueValues[I]).front() == E)
14290 continue;
14291 UsedScalars.set(I);
14292 }
14293 auto GetCastContextHint = [&](Value *V) {
14294 if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
14295 return getCastContextHint(*OpTEs.front());
14296 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
14297 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
14298 !SrcState.isAltShuffle())
14299 return TTI::CastContextHint::GatherScatter;
14300 return TTI::CastContextHint::None;
14301 };
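// GetCostDiff returns the vector cost (the callback already folds in
// CommonCost) minus the summed cost of the scalar instructions replaced by
// the vectorized node; a negative result means vectorization is profitable.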
14302 auto GetCostDiff =
14303 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
14304 function_ref<InstructionCost(InstructionCost)> VectorCost) {
14305 // Calculate the cost of this instruction.
14306 InstructionCost ScalarCost = 0;
14307 if (isa<CastInst, CallInst>(VL0)) {
14308 // For some instructions there is no need to calculate the cost of each
14309 // particular instance; the cost of a single instruction multiplied by the
14310 // total number of scalar instructions can be used instead.
14311 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
14312 } else {
14313 for (unsigned I = 0; I < Sz; ++I) {
14314 if (UsedScalars.test(I))
14315 continue;
14316 ScalarCost += ScalarEltCost(I);
14317 }
14318 }
14319
14320 InstructionCost VecCost = VectorCost(CommonCost);
14321 // Check if the current node must be resized, if the parent node is not
14322 // resized.
14323 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
14324 E->Idx != 0 &&
14325 (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
14326 const EdgeInfo &EI = E->UserTreeIndex;
14327 if (!EI.UserTE->hasState() ||
14328 EI.UserTE->getOpcode() != Instruction::Select ||
14329 EI.EdgeIdx != 0) {
14330 auto UserBWIt = MinBWs.find(EI.UserTE);
14331 Type *UserScalarTy =
14332 (EI.UserTE->isGather() ||
14333 EI.UserTE->State == TreeEntry::SplitVectorize)
14334 ? EI.UserTE->Scalars.front()->getType()
14335 : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
14336 if (UserBWIt != MinBWs.end())
14337 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
14338 UserBWIt->second.first);
14339 if (ScalarTy != UserScalarTy) {
14340 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
14341 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
14342 unsigned VecOpcode;
14343 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
14344 if (BWSz > SrcBWSz)
14345 VecOpcode = Instruction::Trunc;
14346 else
14347 VecOpcode =
14348 It->second.second ? Instruction::SExt : Instruction::ZExt;
14349 TTI::CastContextHint CCH = GetCastContextHint(VL0);
14350 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
14351 CostKind);
14352 }
14353 }
14354 }
14355 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
14356 ScalarCost, "Calculated costs for Tree"));
14357 return VecCost - ScalarCost;
14358 };
14359 // Calculate cost difference from vectorizing set of GEPs.
14360 // Negative value means vectorizing is profitable.
14361 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
14362 assert((E->State == TreeEntry::Vectorize ||
14363 E->State == TreeEntry::StridedVectorize ||
14364 E->State == TreeEntry::CompressVectorize) &&
14365 "Entry state expected to be Vectorize, StridedVectorize or "
14366 "MaskedLoadCompressVectorize here.");
14367 InstructionCost ScalarCost = 0;
14368 InstructionCost VecCost = 0;
14369 std::tie(ScalarCost, VecCost) = getGEPCosts(
14370 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
14371 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
14372 "Calculated GEPs cost for Tree"));
14373
14374 return VecCost - ScalarCost;
14375 };
14376
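// GetMinMaxCost estimates the cost of lowering a cmp+select pair as the
// matching min/max intrinsic; callers only use the result when it is valid.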
14377 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
14378 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
14379 if (MinMaxID == Intrinsic::not_intrinsic)
14380 return InstructionCost::getInvalid();
14381 Type *CanonicalType = Ty;
14382 if (CanonicalType->isPtrOrPtrVectorTy())
14383 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
14384 CanonicalType->getContext(),
14385 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
14386
14387 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
14388 {CanonicalType, CanonicalType});
14389 InstructionCost IntrinsicCost =
14390 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
14391 // If the selects are the only uses of the compares, they will be
14392 // dead and we can adjust the cost by removing their cost.
14393 if (VI && SelectOnly) {
14394 assert((!Ty->isVectorTy() || SLPReVec) &&
14395 "Expected only for scalar type.");
14396 auto *CI = cast<CmpInst>(VI->getOperand(0));
14397 IntrinsicCost -= TTI->getCmpSelInstrCost(
14398 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
14399 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
14400 {TTI::OK_AnyValue, TTI::OP_None}, CI);
14401 }
14402 return IntrinsicCost;
14403 };
14404 auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
14405 Instruction *VI) {
14406 InstructionCost Cost = canConvertToFMA(VI, S, *DT, *DL, TTI, *TLI);
14407 return Cost;
14408 };
14409 switch (ShuffleOrOp) {
14410 case Instruction::PHI: {
14411 // Count reused scalars.
14412 InstructionCost ScalarCost = 0;
14413 SmallPtrSet<const TreeEntry *, 4> CountedOps;
14414 for (Value *V : UniqueValues) {
14415 auto *PHI = dyn_cast<PHINode>(V);
14416 if (!PHI)
14417 continue;
14418
14419 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
14420 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
14421 Value *Op = PHI->getIncomingValue(I);
14422 Operands[I] = Op;
14423 }
14424 if (const TreeEntry *OpTE =
14425 getSameValuesTreeEntry(Operands.front(), Operands))
14426 if (CountedOps.insert(OpTE).second &&
14427 !OpTE->ReuseShuffleIndices.empty())
14428 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
14429 OpTE->Scalars.size());
14430 }
14431
14432 return CommonCost - ScalarCost;
14433 }
14434 case Instruction::ExtractValue:
14435 case Instruction::ExtractElement: {
14436 APInt DemandedElts;
14437 VectorType *SrcVecTy = nullptr;
14438 auto GetScalarCost = [&](unsigned Idx) {
14439 if (isa<PoisonValue>(UniqueValues[Idx]))
14440 return InstructionCost(TTI::TCC_Free);
14441
14442 auto *I = cast<Instruction>(UniqueValues[Idx]);
14443 if (!SrcVecTy) {
14444 if (ShuffleOrOp == Instruction::ExtractElement) {
14445 auto *EE = cast<ExtractElementInst>(I);
14446 SrcVecTy = EE->getVectorOperandType();
14447 } else {
14448 auto *EV = cast<ExtractValueInst>(I);
14449 Type *AggregateTy = EV->getAggregateOperand()->getType();
14450 unsigned NumElts;
14451 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
14452 NumElts = ATy->getNumElements();
14453 else
14454 NumElts = AggregateTy->getStructNumElements();
14455 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
14456 }
14457 }
14458 if (I->hasOneUse()) {
14459 Instruction *Ext = I->user_back();
14460 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
14461 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
14462 // Use getExtractWithExtendCost() to calculate the cost of
14463 // extractelement/ext pair.
14464 InstructionCost Cost = TTI->getExtractWithExtendCost(
14465 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I),
14466 CostKind);
14467 // Subtract the cost of s|zext which is subtracted separately.
14468 Cost -= TTI->getCastInstrCost(
14469 Ext->getOpcode(), Ext->getType(), I->getType(),
14470 TTI::CastContextHint::None, CostKind);
14471 return Cost;
14472 }
14473 }
14474 if (DemandedElts.isZero())
14475 DemandedElts = APInt::getZero(getNumElements(SrcVecTy));
14476 DemandedElts.setBit(*getExtractIndex(I));
14477 return InstructionCost(TTI::TCC_Free);
14478 };
14479 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
14480 return CommonCost - (DemandedElts.isZero()
14481 ? TTI::TCC_Free
14482 : TTI.getScalarizationOverhead(
14483 SrcVecTy, DemandedElts, /*Insert=*/false,
14484 /*Extract=*/true, CostKind));
14485 };
14486 return GetCostDiff(GetScalarCost, GetVectorCost);
14487 }
14488 case Instruction::InsertElement: {
14489 assert(E->ReuseShuffleIndices.empty() &&
14490 "Unique insertelements only are expected.");
14491 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
14492 unsigned const NumElts = SrcVecTy->getNumElements();
14493 unsigned const NumScalars = VL.size();
14494
14495 unsigned NumOfParts = ::getNumberOfParts(*TTI, SrcVecTy);
14496
14497 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
14498 unsigned OffsetBeg = *getElementIndex(VL.front());
14499 unsigned OffsetEnd = OffsetBeg;
14500 InsertMask[OffsetBeg] = 0;
14501 for (auto [I, V] : enumerate(VL.drop_front())) {
14502 unsigned Idx = *getElementIndex(V);
14503 if (OffsetBeg > Idx)
14504 OffsetBeg = Idx;
14505 else if (OffsetEnd < Idx)
14506 OffsetEnd = Idx;
14507 InsertMask[Idx] = I + 1;
14508 }
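// Round the touched lane range [OffsetBeg, OffsetEnd] out to whole
// register-sized chunks so the inserted scalars can be costed as a subvector
// insert or a two-source permute over that region.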
14509 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
14510 if (NumOfParts > 0 && NumOfParts < NumElts)
14511 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
14512 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
14513 VecScalarsSz;
14514 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
14515 unsigned InsertVecSz = std::min<unsigned>(
14516 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
14517 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
14518 bool IsWholeSubvector =
14519 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
14520 // Check if we can safely insert a subvector. If it is not possible, just
14521 // generate a whole-sized vector and shuffle the source vector and the new
14522 // subvector.
14523 if (OffsetBeg + InsertVecSz > VecSz) {
14524 // Align OffsetBeg to generate correct mask.
14525 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
14526 InsertVecSz = VecSz;
14527 }
14528
14529 APInt DemandedElts = APInt::getZero(NumElts);
14530 // TODO: Add support for Instruction::InsertValue.
14531 SmallVector<int> Mask;
14532 if (!E->ReorderIndices.empty()) {
14533 inversePermutation(E->ReorderIndices, Mask);
14534 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
14535 } else {
14536 Mask.assign(VecSz, PoisonMaskElem);
14537 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
14538 }
14539 bool IsIdentity = true;
14540 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
14541 Mask.swap(PrevMask);
14542 for (unsigned I = 0; I < NumScalars; ++I) {
14543 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
14544 DemandedElts.setBit(InsertIdx);
14545 IsIdentity &= InsertIdx - OffsetBeg == I;
14546 Mask[InsertIdx - OffsetBeg] = I;
14547 }
14548 assert(Offset < NumElts && "Failed to find vector index offset");
14549
14550 InstructionCost Cost = 0;
14551 Cost -=
14552 getScalarizationOverhead(*TTI, ScalarTy, SrcVecTy, DemandedElts,
14553 /*Insert*/ true, /*Extract*/ false, CostKind);
14554
14555 // First cost - resize to actual vector size if not identity shuffle or
14556 // need to shift the vector.
14557 // Do not calculate the cost if the actual size is the register size and
14558 // we can merge this shuffle with the following SK_Select.
14559 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
14560 if (!IsIdentity)
14561 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
14562 InsertVecTy, Mask);
14563 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
14564 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
14565 }));
14566 // Second cost - permutation with subvector, if some elements are from the
14567 // initial vector or inserting a subvector.
14568 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
14569 // subvector of ActualVecTy.
14570 SmallBitVector InMask =
14571 isUndefVector(FirstInsert->getOperand(0),
14572 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
14573 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
14574 if (InsertVecSz != VecSz) {
14575 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
14576 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
14577 CostKind, OffsetBeg - Offset, InsertVecTy);
14578 } else {
14579 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
14580 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
14581 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
14582 I <= End; ++I)
14583 if (Mask[I] != PoisonMaskElem)
14584 Mask[I] = I + VecSz;
14585 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
14586 Mask[I] =
14587 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
14588 Cost +=
14589 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
14590 }
14591 }
14592 return Cost;
14593 }
14594 case Instruction::ZExt:
14595 case Instruction::SExt:
14596 case Instruction::FPToUI:
14597 case Instruction::FPToSI:
14598 case Instruction::FPExt:
14599 case Instruction::PtrToInt:
14600 case Instruction::IntToPtr:
14601 case Instruction::SIToFP:
14602 case Instruction::UIToFP:
14603 case Instruction::Trunc:
14604 case Instruction::FPTrunc:
14605 case Instruction::BitCast: {
14606 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
14607 Type *SrcScalarTy = VL0->getOperand(0)->getType();
14608 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
14609 unsigned Opcode = ShuffleOrOp;
14610 unsigned VecOpcode = Opcode;
14611 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
14612 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
14613 // Check if the values are candidates to demote.
14614 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
14615 if (SrcIt != MinBWs.end()) {
14616 SrcBWSz = SrcIt->second.first;
14617 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
14618 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
14619 SrcVecTy =
14620 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
14621 }
14622 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
14623 if (BWSz == SrcBWSz) {
14624 VecOpcode = Instruction::BitCast;
14625 } else if (BWSz < SrcBWSz) {
14626 VecOpcode = Instruction::Trunc;
14627 } else if (It != MinBWs.end()) {
14628 assert(BWSz > SrcBWSz && "Invalid cast!");
14629 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
14630 } else if (SrcIt != MinBWs.end()) {
14631 assert(BWSz > SrcBWSz && "Invalid cast!");
14632 VecOpcode =
14633 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
14634 }
14635 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
14636 !SrcIt->second.second) {
14637 VecOpcode = Instruction::UIToFP;
14638 }
14639 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
14640 assert(Idx == 0 && "Expected 0 index only");
14641 return TTI->getCastInstrCost(Opcode, VL0->getType(),
14642 VL0->getOperand(0)->getType(),
14643 TTI::CastContextHint::None, CostKind);
14644 };
14645 auto GetVectorCost = [=](InstructionCost CommonCost) {
14646 // Do not count cost here if minimum bitwidth is in effect and it is just
14647 // a bitcast (here it is just a noop).
14648 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
14649 return CommonCost;
14650 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
14651 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
14652
14653 bool IsArithmeticExtendedReduction =
14654 E->Idx == 0 && UserIgnoreList &&
14655 all_of(*UserIgnoreList, [](Value *V) {
14656 auto *I = cast<Instruction>(V);
14657 return is_contained({Instruction::Add, Instruction::FAdd,
14658 Instruction::Mul, Instruction::FMul,
14659 Instruction::And, Instruction::Or,
14660 Instruction::Xor},
14661 I->getOpcode());
14662 });
14663 if (IsArithmeticExtendedReduction &&
14664 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
14665 return CommonCost;
14666 return CommonCost +
14667 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
14668 VecOpcode == Opcode ? VI : nullptr);
14669 };
14670 return GetCostDiff(GetScalarCost, GetVectorCost);
14671 }
14672 case Instruction::FCmp:
14673 case Instruction::ICmp:
14674 case Instruction::Select: {
14675 CmpPredicate VecPred, SwappedVecPred;
14676 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
14677 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
14678 match(VL0, MatchCmp))
14679 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
14680 else
14681 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
14682 ? CmpInst::BAD_FCMP_PREDICATE
14683 : CmpInst::BAD_ICMP_PREDICATE;
14684 auto GetScalarCost = [&](unsigned Idx) {
14685 if (isa<PoisonValue>(UniqueValues[Idx]))
14686 return InstructionCost(TTI::TCC_Free);
14687
14688 auto *VI = cast<Instruction>(UniqueValues[Idx]);
14689 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
14690 ? CmpInst::BAD_FCMP_PREDICATE
14691 : CmpInst::BAD_ICMP_PREDICATE;
14692 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
14693 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
14694 !match(VI, MatchCmp)) ||
14695 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
14696 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
14697 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
14698 ? CmpInst::BAD_FCMP_PREDICATE
14699 : CmpInst::BAD_ICMP_PREDICATE;
14700
14701 InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
14702 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
14703 CostKind, getOperandInfo(VI->getOperand(0)),
14704 getOperandInfo(VI->getOperand(1)), VI);
14705 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
14706 if (IntrinsicCost.isValid())
14707 ScalarCost = IntrinsicCost;
14708
14709 return ScalarCost;
14710 };
14711 auto GetVectorCost = [&](InstructionCost CommonCost) {
14712 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
14713
14714 InstructionCost VecCost =
14715 TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
14716 CostKind, getOperandInfo(E->getOperand(0)),
14717 getOperandInfo(E->getOperand(1)), VL0);
14718 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
14719 auto *CondType =
14720 getWidenedType(SI->getCondition()->getType(), VL.size());
14721 unsigned CondNumElements = CondType->getNumElements();
14722 unsigned VecTyNumElements = getNumElements(VecTy);
14723 assert(VecTyNumElements >= CondNumElements &&
14724 VecTyNumElements % CondNumElements == 0 &&
14725 "Cannot vectorize Instruction::Select");
14726 if (CondNumElements != VecTyNumElements) {
14727 // When the return type is i1 but the source is fixed vector type, we
14728 // need to duplicate the condition value.
14729 VecCost += ::getShuffleCost(
14730 *TTI, TTI::SK_PermuteSingleSrc, CondType,
14731 createReplicatedMask(VecTyNumElements / CondNumElements,
14732 CondNumElements));
14733 }
14734 }
14735 return VecCost + CommonCost;
14736 };
14737 return GetCostDiff(GetScalarCost, GetVectorCost);
14738 }
14739 case TreeEntry::MinMax: {
14740 auto GetScalarCost = [&](unsigned Idx) {
14741 return GetMinMaxCost(OrigScalarTy);
14742 };
14743 auto GetVectorCost = [&](InstructionCost CommonCost) {
14744 InstructionCost VecCost = GetMinMaxCost(VecTy);
14745 return VecCost + CommonCost;
14746 };
14747 return GetCostDiff(GetScalarCost, GetVectorCost);
14748 }
14749 case TreeEntry::FMulAdd: {
14750 auto GetScalarCost = [&](unsigned Idx) {
14751 if (isa<PoisonValue>(UniqueValues[Idx]))
14752 return InstructionCost(TTI::TCC_Free);
14753 return GetFMulAddCost(E->getOperations(),
14754 cast<Instruction>(UniqueValues[Idx]));
14755 };
14756 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
14757 FastMathFlags FMF;
14758 FMF.set();
14759 for (Value *V : E->Scalars) {
14760 if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
14761 FMF &= FPCI->getFastMathFlags();
14762 if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
14763 FMF &= FPCIOp->getFastMathFlags();
14764 }
14765 }
14766 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
14767 {VecTy, VecTy, VecTy}, FMF);
14768 InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
14769 return VecCost + CommonCost;
14770 };
14771 return GetCostDiff(GetScalarCost, GetVectorCost);
14772 }
14773 case Instruction::FNeg:
14774 case Instruction::Add:
14775 case Instruction::FAdd:
14776 case Instruction::Sub:
14777 case Instruction::FSub:
14778 case Instruction::Mul:
14779 case Instruction::FMul:
14780 case Instruction::UDiv:
14781 case Instruction::SDiv:
14782 case Instruction::FDiv:
14783 case Instruction::URem:
14784 case Instruction::SRem:
14785 case Instruction::FRem:
14786 case Instruction::Shl:
14787 case Instruction::LShr:
14788 case Instruction::AShr:
14789 case Instruction::And:
14790 case Instruction::Or:
14791 case Instruction::Xor: {
14792 auto GetScalarCost = [&](unsigned Idx) {
14793 if (isa<PoisonValue>(UniqueValues[Idx]))
14794 return InstructionCost(TTI::TCC_Free);
14795
14796 // We cannot retrieve the operand from UniqueValues[Idx] because an
14797 // interchangeable instruction may be used. The order and the actual
14798 // operand might differ from what is retrieved from UniqueValues[Idx].
14799 Value *Op1 = E->getOperand(0)[Idx];
14800 Value *Op2;
14801 SmallVector<const Value *, 2> Operands(1, Op1);
14802 if (isa<UnaryOperator>(UniqueValues[Idx])) {
14803 Op2 = Op1;
14804 } else {
14805 Op2 = E->getOperand(1)[Idx];
14806 Operands.push_back(Op2);
14807 }
14808 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(Op1);
14809 TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(Op2);
14810 InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
14811 ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
14812 if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
14813 I && (ShuffleOrOp == Instruction::FAdd ||
14814 ShuffleOrOp == Instruction::FSub)) {
14815 InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
14816 if (IntrinsicCost.isValid())
14817 ScalarCost = IntrinsicCost;
14818 }
14819 return ScalarCost;
14820 };
14821 auto GetVectorCost = [=](InstructionCost CommonCost) {
14822 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
14823 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
14824 ArrayRef<Value *> Ops = E->getOperand(I);
14825 if (all_of(Ops, [&](Value *Op) {
14826 auto *CI = dyn_cast<ConstantInt>(Op);
14827 return CI && CI->getValue().countr_one() >= It->second.first;
14828 }))
14829 return CommonCost;
14830 }
14831 }
14832 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
14833 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
14834 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
14835 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
14836 Op2Info, {}, nullptr, TLI) +
14837 CommonCost;
14838 };
14839 return GetCostDiff(GetScalarCost, GetVectorCost);
14840 }
14841 case Instruction::GetElementPtr: {
14842 return CommonCost + GetGEPCostDiff(VL, VL0);
14843 }
14844 case Instruction::Load: {
14845 auto GetScalarCost = [&](unsigned Idx) {
14846 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
14847 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
14848 VI->getAlign(), VI->getPointerAddressSpace(),
14849 CostKind, TTI::OperandValueInfo(), VI);
14850 };
14851 auto *LI0 = cast<LoadInst>(VL0);
14852 auto GetVectorCost = [&](InstructionCost CommonCost) {
14853 InstructionCost VecLdCost;
14854 switch (E->State) {
14855 case TreeEntry::Vectorize:
14856 if (unsigned Factor = E->getInterleaveFactor()) {
14857 VecLdCost = TTI->getInterleavedMemoryOpCost(
14858 Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
14859 LI0->getPointerAddressSpace(), CostKind);
14860
14861 } else {
14862 VecLdCost = TTI->getMemoryOpCost(
14863 Instruction::Load, VecTy, LI0->getAlign(),
14864 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
14865 }
14866 break;
14867 case TreeEntry::StridedVectorize: {
14868 Align CommonAlignment =
14869 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
14870 VecLdCost = TTI->getStridedMemoryOpCost(
14871 Instruction::Load, VecTy, LI0->getPointerOperand(),
14872 /*VariableMask=*/false, CommonAlignment, CostKind);
14873 break;
14874 }
14875 case TreeEntry::CompressVectorize: {
14876 bool IsMasked;
14877 unsigned InterleaveFactor;
14878 SmallVector<int> CompressMask;
14879 VectorType *LoadVecTy;
14880 SmallVector<Value *> Scalars(VL);
14881 if (!E->ReorderIndices.empty()) {
14882 SmallVector<int> Mask(E->ReorderIndices.begin(),
14883 E->ReorderIndices.end());
14884 reorderScalars(Scalars, Mask);
14885 }
14886 SmallVector<Value *> PointerOps(Scalars.size());
14887 for (auto [I, V] : enumerate(Scalars))
14888 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
14889 [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
14890 Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
14891 *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
14892 CompressMask, LoadVecTy);
14893 assert(IsVectorized && "Failed to vectorize load");
14894 CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
14895 InterleaveFactor, IsMasked);
14896 Align CommonAlignment = LI0->getAlign();
14897 if (InterleaveFactor) {
14898 VecLdCost = TTI->getInterleavedMemoryOpCost(
14899 Instruction::Load, LoadVecTy, InterleaveFactor, {},
14900 CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
14901 } else if (IsMasked) {
14902 VecLdCost = TTI->getMaskedMemoryOpCost(
14903 Instruction::Load, LoadVecTy, CommonAlignment,
14904 LI0->getPointerAddressSpace(), CostKind);
14905 // TODO: include this cost into CommonCost.
14906 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
14907 LoadVecTy, CompressMask, CostKind);
14908 } else {
14909 VecLdCost = TTI->getMemoryOpCost(
14910 Instruction::Load, LoadVecTy, CommonAlignment,
14911 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
14912 // TODO: include this cost into CommonCost.
14913 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
14914 LoadVecTy, CompressMask, CostKind);
14915 }
14916 break;
14917 }
14918 case TreeEntry::ScatterVectorize: {
14919 Align CommonAlignment =
14920 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
14921 VecLdCost = TTI->getGatherScatterOpCost(
14922 Instruction::Load, VecTy, LI0->getPointerOperand(),
14923 /*VariableMask=*/false, CommonAlignment, CostKind);
14924 break;
14925 }
14926 case TreeEntry::CombinedVectorize:
14927 case TreeEntry::SplitVectorize:
14928 case TreeEntry::NeedToGather:
14929 llvm_unreachable("Unexpected vectorization state.");
14930 }
14931 return VecLdCost + CommonCost;
14932 };
14933
14934 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
14935 // If this node generates a masked gather load, it is not a terminal node;
14936 // hence the address operand cost is estimated separately.
14937 if (E->State == TreeEntry::ScatterVectorize)
14938 return Cost;
14939
14940 // Estimate the cost of the GEPs, since this tree node is a terminal node.
14941 SmallVector<Value *> PointerOps(VL.size());
14942 for (auto [I, V] : enumerate(VL))
14943 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
14944 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
14945 }
14946 case Instruction::Store: {
14947 bool IsReorder = !E->ReorderIndices.empty();
14948 auto GetScalarCost = [=](unsigned Idx) {
14949 auto *VI = cast<StoreInst>(VL[Idx]);
14950 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
14951 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
14952 VI->getAlign(), VI->getPointerAddressSpace(),
14953 CostKind, OpInfo, VI);
14954 };
14955 auto *BaseSI =
14956 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
14957 auto GetVectorCost = [=](InstructionCost CommonCost) {
14958 // We know that we can merge the stores. Calculate the cost.
14959 InstructionCost VecStCost;
14960 if (E->State == TreeEntry::StridedVectorize) {
14961 Align CommonAlignment =
14962 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
14963 VecStCost = TTI->getStridedMemoryOpCost(
14964 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
14965 /*VariableMask=*/false, CommonAlignment, CostKind);
14966 } else {
14967 assert(E->State == TreeEntry::Vectorize &&
14968 "Expected either strided or consecutive stores.");
14969 if (unsigned Factor = E->getInterleaveFactor()) {
14970 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
14971 "No reused shuffles expected");
14972 CommonCost = 0;
14973 VecStCost = TTI->getInterleavedMemoryOpCost(
14974 Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
14975 BaseSI->getPointerAddressSpace(), CostKind);
14976 } else {
14977 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
14978 VecStCost = TTI->getMemoryOpCost(
14979 Instruction::Store, VecTy, BaseSI->getAlign(),
14980 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
14981 }
14982 }
14983 return VecStCost + CommonCost;
14984 };
14985 SmallVector<Value *> PointerOps(VL.size());
14986 for (auto [I, V] : enumerate(VL)) {
14987 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
14988 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
14989 }
14990
14991 return GetCostDiff(GetScalarCost, GetVectorCost) +
14992 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
14993 }
14994 case Instruction::Call: {
14995 auto GetScalarCost = [&](unsigned Idx) {
14996 auto *CI = cast<CallInst>(UniqueValues[Idx]);
14997 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
14998 if (ID != Intrinsic::not_intrinsic) {
14999 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
15000 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
15001 }
15002 return TTI->getCallInstrCost(CI->getCalledFunction(),
15003 CI->getFunctionType()->getReturnType(),
15004 CI->getFunctionType()->params(), CostKind);
15005 };
15006 auto GetVectorCost = [=](InstructionCost CommonCost) {
15007 auto *CI = cast<CallInst>(VL0);
15008 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
15009 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
15010 CI, ID, VecTy->getNumElements(),
15011 It != MinBWs.end() ? It->second.first : 0, TTI);
15012 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
15013 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
15014 };
15015 return GetCostDiff(GetScalarCost, GetVectorCost);
15016 }
15017 case Instruction::ShuffleVector: {
15018 if (!SLPReVec || E->isAltShuffle())
15019 assert(E->isAltShuffle() &&
15020 ((Instruction::isBinaryOp(E->getOpcode()) &&
15021 Instruction::isBinaryOp(E->getAltOpcode())) ||
15022 (Instruction::isCast(E->getOpcode()) &&
15023 Instruction::isCast(E->getAltOpcode())) ||
15024 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
15025 "Invalid Shuffle Vector Operand");
15026 // Try to find the previous shuffle node with the same operands and same
15027 // main/alternate ops.
15028 auto TryFindNodeWithEqualOperands = [=]() {
15029 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
15030 if (TE.get() == E)
15031 break;
15032 if (TE->hasState() && TE->isAltShuffle() &&
15033 ((TE->getOpcode() == E->getOpcode() &&
15034 TE->getAltOpcode() == E->getAltOpcode()) ||
15035 (TE->getOpcode() == E->getAltOpcode() &&
15036 TE->getAltOpcode() == E->getOpcode())) &&
15037 TE->hasEqualOperands(*E))
15038 return true;
15039 }
15040 return false;
15041 };
15042 auto GetScalarCost = [&](unsigned Idx) {
15043 if (isa<PoisonValue>(UniqueValues[Idx]))
15044 return InstructionCost(TTI::TCC_Free);
15045
15046 auto *VI = cast<Instruction>(UniqueValues[Idx]);
15047 assert(E->getMatchingMainOpOrAltOp(VI) &&
15048 "Unexpected main/alternate opcode");
15049 (void)E;
15050 return TTI->getInstructionCost(VI, CostKind);
15051 };
15052 // Need to clear CommonCost since the final shuffle cost is included into
15053 // vector cost.
15054 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
15055 // VecCost is equal to sum of the cost of creating 2 vectors
15056 // and the cost of creating shuffle.
15057 InstructionCost VecCost = 0;
15058 if (TryFindNodeWithEqualOperands()) {
15059 LLVM_DEBUG({
15060 dbgs() << "SLP: diamond match for alternate node found.\n";
15061 E->dump();
15062 });
15063 // No need to add new vector costs here since we're going to reuse
15064 // same main/alternate vector ops, just do different shuffling.
15065 } else if (Instruction::isBinaryOp(E->getOpcode())) {
15066 VecCost =
15067 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
15068 VecCost +=
15069 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
15070 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
15071 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
15072 VecCost = TTIRef.getCmpSelInstrCost(
15073 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
15074 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15075 VL0);
15076 VecCost += TTIRef.getCmpSelInstrCost(
15077 E->getOpcode(), VecTy, MaskTy,
15078 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
15079 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15080 E->getAltOp());
15081 } else {
15082 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
15083 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
15084 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
15085 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15086 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
15087 unsigned SrcBWSz =
15088 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
15089 if (SrcIt != MinBWs.end()) {
15090 SrcBWSz = SrcIt->second.first;
15091 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
15092 SrcTy = getWidenedType(SrcSclTy, VL.size());
15093 }
15094 if (BWSz <= SrcBWSz) {
15095 if (BWSz < SrcBWSz)
15096 VecCost =
15097 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
15098 TTI::CastContextHint::None, CostKind);
15099 LLVM_DEBUG({
15100 dbgs()
15101 << "SLP: alternate extension, which should be truncated.\n";
15102 E->dump();
15103 });
15104 return VecCost;
15105 }
15106 }
15107 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
15108 TTI::CastContextHint::None, CostKind);
15109 VecCost +=
15110 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
15111 TTI::CastContextHint::None, CostKind);
15112 }
15113 SmallVector<int> Mask;
15114 E->buildAltOpShuffleMask(
15115 [&](Instruction *I) {
15116 assert(E->getMatchingMainOpOrAltOp(I) &&
15117 "Unexpected main/alternate opcode");
15118 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
15119 *TLI);
15120 },
15121 Mask);
15122 VecCost += ::getShuffleCost(TTIRef, TTI::SK_PermuteTwoSrc,
15123 FinalVecTy, Mask, CostKind);
15124 // Patterns like [fadd,fsub] can be combined into a single instruction
15125 // on x86. Reordering them into [fsub,fadd] blocks this pattern. So we
15126 // need to take into account their order when looking for the most used
15127 // order.
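 // (For example, an alternating [fadd,fsub] lane pattern can map to x86's
 // addsub instructions when the target reports it as a legal alt pattern.)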
15128 unsigned Opcode0 = E->getOpcode();
15129 unsigned Opcode1 = E->getAltOpcode();
15130 SmallBitVector OpcodeMask(
15131 getAltInstrMask(E->Scalars, ScalarTy, Opcode0, Opcode1));
15132 // If this pattern is supported by the target then we consider the
15133 // order.
15134 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
15135 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
15136 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
15137 return AltVecCost < VecCost ? AltVecCost : VecCost;
15138 }
15139 // TODO: Check the reverse order too.
15140 return VecCost;
15141 };
15142 if (SLPReVec && !E->isAltShuffle())
15143 return GetCostDiff(
15144 GetScalarCost, [&](InstructionCost) -> InstructionCost {
15145 // If a group uses mask in order, the shufflevector can be
15146 // eliminated by instcombine. Then the cost is 0.
15148 "Not supported shufflevector usage.");
15149 auto *SV = cast<ShuffleVectorInst>(VL.front());
15150 unsigned SVNumElements =
15151 cast<FixedVectorType>(SV->getOperand(0)->getType())
15152 ->getNumElements();
15153 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
15154 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
15155 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
15156 int NextIndex = 0;
15157 if (!all_of(Group, [&](Value *V) {
15159 "Not supported shufflevector usage.");
15160 auto *SV = cast<ShuffleVectorInst>(V);
15161 int Index;
15162 [[maybe_unused]] bool IsExtractSubvectorMask =
15163 SV->isExtractSubvectorMask(Index);
15164 assert(IsExtractSubvectorMask &&
15165 "Not supported shufflevector usage.");
15166 if (NextIndex != Index)
15167 return false;
15168 NextIndex += SV->getShuffleMask().size();
15169 return true;
15170 }))
15171 return ::getShuffleCost(
15172 *TTI, TTI::SK_PermuteSingleSrc, VecTy,
15173 calculateShufflevectorMask(E->Scalars));
15174 }
15175 return TTI::TCC_Free;
15176 });
15177 return GetCostDiff(GetScalarCost, GetVectorCost);
15178 }
15179 case Instruction::Freeze:
15180 return CommonCost;
15181 default:
15182 llvm_unreachable("Unknown instruction");
15183 }
15184}
15185
15186bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
15187 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
15188 << VectorizableTree.size() << " is fully vectorizable.\n");
15189
15190 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
15191 SmallVector<int> Mask;
15192 return TE->isGather() &&
15193 !any_of(TE->Scalars,
15194 [this](Value *V) { return EphValues.contains(V); }) &&
15195 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
15196 TE->Scalars.size() < Limit ||
15197 (((TE->hasState() &&
15198 TE->getOpcode() == Instruction::ExtractElement) ||
15199 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
15200 isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
15201 (TE->hasState() && TE->getOpcode() == Instruction::Load &&
15202 !TE->isAltShuffle()) ||
15203 any_of(TE->Scalars, IsaPred<LoadInst>));
15204 };
15205
15206 // We only handle trees of heights 1 and 2.
15207 if (VectorizableTree.size() == 1 &&
15208 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
15209 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
15210 VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
15211 (ForReduction &&
15212 AreVectorizableGathers(VectorizableTree[0].get(),
15213 VectorizableTree[0]->Scalars.size()) &&
15214 VectorizableTree[0]->getVectorFactor() > 2)))
15215 return true;
15216
15217 if (VectorizableTree.size() != 2)
15218 return false;
15219
15220 // Handle splat and all-constants stores. Also try to vectorize tiny trees
15221 // with the second gather nodes if they have fewer scalar operands than
15222 // the initial tree element (may be profitable to shuffle the second gather)
15223 // or they are extractelements, which form a shuffle.
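 // For example, a vectorizable store node fed by a gather of extractelements
 // taken from one source vector can still pay off, since that gather lowers
 // to a single shuffle.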
15224 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
15225 AreVectorizableGathers(VectorizableTree[1].get(),
15226 VectorizableTree[0]->Scalars.size()))
15227 return true;
15228
15229 // Gathering cost would be too much for tiny trees.
15230 if (VectorizableTree[0]->isGather() ||
15231 (VectorizableTree[1]->isGather() &&
15232 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
15233 VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
15234 VectorizableTree[0]->State != TreeEntry::CompressVectorize))
15235 return false;
15236
15237 return true;
15238}
15239
15240static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
15241 TargetTransformInfo *TTI,
15242 bool MustMatchOrInst) {
15243 // Look past the root to find a source value. Arbitrarily follow the
15244 // path through operand 0 of any 'or'. Also, peek through optional
15245 // shift-left-by-multiple-of-8-bits.
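 // For example, starting from a root like
 //   or (shl (zext i8 %hi to i32), 8), (zext i8 %lo to i32)
 // the walk peels the 'or' and 'shl' layers (with byte-aligned shift amounts)
 // and lands on the zero-extended load feeding them.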
15246 Value *ZextLoad = Root;
15247 const APInt *ShAmtC;
15248 bool FoundOr = false;
15249 while (!isa<ConstantExpr>(ZextLoad) &&
15250 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
15251 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
15252 ShAmtC->urem(8) == 0))) {
15253 auto *BinOp = cast<BinaryOperator>(ZextLoad);
15254 ZextLoad = BinOp->getOperand(0);
15255 if (BinOp->getOpcode() == Instruction::Or)
15256 FoundOr = true;
15257 }
15258 // Check if the input is an extended load of the required or/shift expression.
15259 Value *Load;
15260 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
15261 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
15262 return false;
15263
15264 // Require that the total load bit width is a legal integer type.
15265 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
15266 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
15267 Type *SrcTy = Load->getType();
15268 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
15269 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
15270 return false;
15271
15272 // Everything matched - assume that we can fold the whole sequence using
15273 // load combining.
15274 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
15275 << *(cast<Instruction>(Root)) << "\n");
15276
15277 return true;
15278}
15279
15280 bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
15281 if (RdxKind != RecurKind::Or)
15282 return false;
15283
15284 unsigned NumElts = VectorizableTree[0]->Scalars.size();
15285 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
15286 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
15287 /* MatchOr */ false);
15288}
15289
15290 bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
15291 // Peek through a final sequence of stores and check if all operations are
15292 // likely to be load-combined.
15293 unsigned NumElts = Stores.size();
15294 for (Value *Scalar : Stores) {
15295 Value *X;
15296 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
15297 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
15298 return false;
15299 }
15300 return true;
15301}
15302
15303bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
15304 if (!DebugCounter::shouldExecute(VectorizedGraphs))
15305 return true;
15306
15307 // Graph is empty - do nothing.
15308 if (VectorizableTree.empty()) {
15309 assert(ExternalUses.empty() && "We shouldn't have any external users");
15310
15311 return true;
15312 }
15313
15314 // No need to vectorize inserts of gathered values.
15315 if (VectorizableTree.size() == 2 &&
15316 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
15317 VectorizableTree[1]->isGather() &&
15318 (VectorizableTree[1]->getVectorFactor() <= 2 ||
15319 !(isSplat(VectorizableTree[1]->Scalars) ||
15320 allConstant(VectorizableTree[1]->Scalars))))
15321 return true;
15322
15323 // If the graph includes only PHI nodes and gathers, it is definitely not
15324 // profitable for the vectorization, we can skip it, if the cost threshold is
15325 // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
15326 // gathers/buildvectors.
15327 constexpr int Limit = 4;
15328 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15329 !VectorizableTree.empty() &&
15330 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15331 return (TE->isGather() &&
15332 (!TE->hasState() ||
15333 TE->getOpcode() != Instruction::ExtractElement) &&
15334 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
15335 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
15336 }))
15337 return true;
15338
15339 // Do not vectorize small tree of phis only, if all vector phis are also
15340 // gathered.
15341 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15342 VectorizableTree.size() <= Limit &&
15343 all_of(VectorizableTree,
15344 [&](const std::unique_ptr<TreeEntry> &TE) {
15345 return (TE->isGather() &&
15346 (!TE->hasState() ||
15347 TE->getOpcode() != Instruction::ExtractElement) &&
15348 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <=
15349 Limit) ||
15350 (TE->hasState() &&
15351 (TE->getOpcode() == Instruction::InsertElement ||
15352 (TE->getOpcode() == Instruction::PHI &&
15353 all_of(TE->Scalars, [&](Value *V) {
15354 return isa<PoisonValue>(V) || MustGather.contains(V);
15355 }))));
15356 }) &&
15357 any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15358 return TE->State == TreeEntry::Vectorize &&
15359 TE->getOpcode() == Instruction::PHI;
15360 }))
15361 return true;
15362
15363 // If the tree contains only phis, buildvectors, split nodes and
15364 // small nodes with reuses, we can skip it.
15365 SmallVector<const TreeEntry *> StoreLoadNodes;
15366 unsigned NumGathers = 0;
15367 constexpr int LimitTreeSize = 36;
15368 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15369 all_of(VectorizableTree,
15370 [&](const std::unique_ptr<TreeEntry> &TE) {
15371 if (!TE->isGather() && TE->hasState() &&
15372 (TE->getOpcode() == Instruction::Load ||
15373 TE->getOpcode() == Instruction::Store)) {
15374 StoreLoadNodes.push_back(TE.get());
15375 return true;
15376 }
15377 if (TE->isGather())
15378 ++NumGathers;
15379 return TE->State == TreeEntry::SplitVectorize ||
15380 (TE->Idx == 0 && TE->Scalars.size() == 2 &&
15381 TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
15382 VectorizableTree.size() > LimitTreeSize) ||
15383 (TE->isGather() &&
15384 none_of(TE->Scalars, IsaPred<ExtractElementInst>)) ||
15385 (TE->hasState() &&
15386 (TE->getOpcode() == Instruction::PHI ||
15387 (TE->hasCopyableElements() &&
15388 static_cast<unsigned>(count_if(
15389 TE->Scalars, IsaPred<PHINode, Constant>)) >=
15390 TE->Scalars.size() / 2) ||
15391 ((!TE->ReuseShuffleIndices.empty() ||
15392 !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
15393 TE->Scalars.size() == 2)));
15394 }) &&
15395 (StoreLoadNodes.empty() ||
15396 (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
15397 (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
15398 return TE->getOpcode() == Instruction::Store ||
15399 all_of(TE->Scalars, [&](Value *V) {
15400 return !isa<LoadInst>(V) ||
15401 areAllUsersVectorized(cast<Instruction>(V));
15402 });
15403 })))))
15404 return true;
15405
15406 // If the tree contains only a root, 2 non-buildvector nodes (whose user is
15407 // the root tree node) and other buildvectors, we can skip it.
15408 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15409 VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
15410 VectorizableTree.size() >= Limit &&
15411 count_if(ArrayRef(VectorizableTree).drop_front(),
15412 [&](const std::unique_ptr<TreeEntry> &TE) {
15413 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
15414 TE->UserTreeIndex.UserTE->Idx == 0;
15415 }) == 2)
15416 return true;
15417
15418 // If the tree contains only vectorization of the phi node from the
15419 // buildvector - skip it.
15420 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15421 VectorizableTree.size() > 2 &&
15422 VectorizableTree.front()->State == TreeEntry::Vectorize &&
15423 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
15424 VectorizableTree[1]->State == TreeEntry::Vectorize &&
15425 VectorizableTree[1]->getOpcode() == Instruction::PHI &&
15426 all_of(
15427 ArrayRef(VectorizableTree).drop_front(2),
15428 [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
15429 return true;
15430
15431 // We can vectorize the tree if its size is greater than or equal to the
15432 // minimum size specified by the MinTreeSize command line option.
15433 if (VectorizableTree.size() >= MinTreeSize)
15434 return false;
15435
15436 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
15437 // can vectorize it if we can prove it fully vectorizable.
15438 if (isFullyVectorizableTinyTree(ForReduction))
15439 return false;
15440
15441 // Check if any of the gather node forms an insertelement buildvector
15442 // somewhere.
15443 bool IsAllowedSingleBVNode =
15444 VectorizableTree.size() > 1 ||
15445 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
15446 !VectorizableTree.front()->isAltShuffle() &&
15447 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
15448 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
15449 allSameBlock(VectorizableTree.front()->Scalars));
15450 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15451 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
15452 return isa<ExtractElementInst, Constant>(V) ||
15453 (IsAllowedSingleBVNode &&
15454 !V->hasNUsesOrMore(UsesLimit) &&
15455 any_of(V->users(), IsaPred<InsertElementInst>));
15456 });
15457 }))
15458 return false;
15459
15460 if (VectorizableTree.back()->isGather() &&
15461 VectorizableTree.back()->hasState() &&
15462 VectorizableTree.back()->isAltShuffle() &&
15463 VectorizableTree.back()->getVectorFactor() > 2 &&
15464 allSameBlock(VectorizableTree.back()->Scalars) &&
15465 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
15466 TTI->getScalarizationOverhead(
15467 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
15468 VectorizableTree.back()->getVectorFactor()),
15469 APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
15470 /*Insert=*/true, /*Extract=*/false,
15471 TTI::TCK_RecipThroughput) == TTI::TCC_Free)
15472 return false;
15473
15474 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
15475 // vectorizable.
15476 return true;
15477}
15478
15479 bool BoUpSLP::isTreeNotExtendable() const {
15480 if (getCanonicalGraphSize() != getTreeSize()) {
15481 constexpr unsigned SmallTree = 3;
15482 if (VectorizableTree.front()->isNonPowOf2Vec() &&
15483 getCanonicalGraphSize() <= SmallTree &&
15484 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
15485 [](const std::unique_ptr<TreeEntry> &TE) {
15486 return TE->isGather() && TE->hasState() &&
15487 TE->getOpcode() == Instruction::Load &&
15488 !allSameBlock(TE->Scalars);
15489 }) == 1)
15490 return true;
15491 return false;
15492 }
15493 bool Res = false;
15494 for (unsigned Idx : seq<unsigned>(getTreeSize())) {
15495 TreeEntry &E = *VectorizableTree[Idx];
15496 if (E.State == TreeEntry::SplitVectorize)
15497 return false;
15498 if (!E.isGather())
15499 continue;
15500 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
15501 (!E.hasState() &&
15503 (isa<ExtractElementInst>(E.Scalars.front()) &&
15504 getSameOpcode(ArrayRef(E.Scalars).drop_front(), *TLI).valid()))
15505 return false;
15506 if (isSplat(E.Scalars) || allConstant(E.Scalars))
15507 continue;
15508 Res = true;
15509 }
15510 return Res;
15511}
15512
15513 InstructionCost BoUpSLP::getSpillCost() {
15514 // Walk from the bottom of the tree to the top, tracking which values are
15515 // live. When we see a call instruction that is not part of our tree,
15516 // query TTI to see if there is a cost to keeping values live over it
15517 // (for example, if spills and fills are required).
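 // Concretely, the walk below visits vectorized entries from the root and,
 // for each operand entry, checks whether a non-vectorized call lies between
 // the operand's last instruction and its user's last instruction; if so, the
 // operand's vector value is charged the cost of staying live across a call.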
15518
15519 const TreeEntry *Root = VectorizableTree.front().get();
15520 if (Root->isGather())
15521 return 0;
15522
15523 InstructionCost Cost = 0;
15524 SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
15525 EntriesToOperands;
15526 SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
15527 SmallPtrSet<const Instruction *, 8> LastInstructions;
15528 for (const auto &TEPtr : VectorizableTree) {
15529 if (!TEPtr->isGather()) {
15530 Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
15531 EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
15532 LastInstructions.insert(LastInst);
15533 }
15534 if (TEPtr->UserTreeIndex)
15535 EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
15536 }
15537
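 // An intrinsic call is ignored for spill purposes if it is assume-like or if
 // TTI rates the intrinsic cheaper than an equivalent real call, since such
 // calls are expected to be lowered without a full call sequence.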
15538 auto NoCallIntrinsic = [this](const Instruction *I) {
15539 const auto *II = dyn_cast<IntrinsicInst>(I);
15540 if (!II)
15541 return false;
15542 if (II->isAssumeLikeIntrinsic())
15543 return true;
15544 IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
15545 InstructionCost IntrCost =
15546 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
15547 InstructionCost CallCost = TTI->getCallInstrCost(
15548 nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
15549 return IntrCost < CallCost;
15550 };
15551
15552 // Maps the last instruction in an entry to the last instruction of one of
15553 // its operand entries and a flag. If the flag is true, there are no calls in
15554 // between these instructions.
15555 SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
15556 CheckedInstructions;
15557 unsigned Budget = 0;
15558 const unsigned BudgetLimit =
15559 ScheduleRegionSizeBudget / VectorizableTree.size();
15560 auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
15561 const Instruction *Last) {
15562 assert(First->getParent() == Last->getParent() &&
15563 "Expected instructions in same block.");
15564 if (auto It = CheckedInstructions.find(Last);
15565 It != CheckedInstructions.end()) {
15566 const Instruction *Checked = It->second.getPointer();
15567 if (Checked == First || Checked->comesBefore(First))
15568 return It->second.getInt() != 0;
15569 Last = Checked;
15570 } else if (Last == First || Last->comesBefore(First)) {
15571 return true;
15572 }
15573 BasicBlock::reverse_iterator InstIt =
15574 ++First->getIterator().getReverse(),
15575 PrevInstIt =
15576 Last->getIterator().getReverse();
15577 SmallVector<const Instruction *> LastInstsInRange;
15578 while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
15579 // Debug information does not impact spill cost.
15580 // Vectorized calls, represented as vector intrinsics, do not impact spill
15581 // cost.
15582 if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
15583 CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
15584 for (const Instruction *LastInst : LastInstsInRange)
15585 CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
15586 return false;
15587 }
15588 if (LastInstructions.contains(&*PrevInstIt))
15589 LastInstsInRange.push_back(&*PrevInstIt);
15590
15591 ++PrevInstIt;
15592 ++Budget;
15593 }
15594 for (const Instruction *LastInst : LastInstsInRange)
15595 CheckedInstructions.try_emplace(
15596 LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
15597 Budget <= BudgetLimit ? 1 : 0);
15598 return Budget <= BudgetLimit;
15599 };
15600 auto AddCosts = [&](const TreeEntry *Op) {
15601 Type *ScalarTy = Op->Scalars.front()->getType();
15602 auto It = MinBWs.find(Op);
15603 if (It != MinBWs.end())
15604 ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
15605 auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
15606 Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
15607 if (ScalarTy->isVectorTy()) {
15608 // Handle revec dead vector instructions.
15609 Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
15610 }
15611 };
15612 // Memoize the relationship between blocks, i.e. if there is (at least one)
15613 // non-vectorized call between the blocks. This allows skipping the analysis
15614 // of the same block paths multiple times.
15615 SmallDenseMap<std::pair<BasicBlock *, BasicBlock *>, bool>
15616 ParentOpParentToPreds;
15617 auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
15618 BasicBlock *OpParent) {
15619 auto Key = std::make_pair(Root, OpParent);
15620 if (auto It = ParentOpParentToPreds.find(Key);
15621 It != ParentOpParentToPreds.end())
15622 return It->second;
15623 SmallVector<BasicBlock *> Worklist;
15624 if (Pred)
15625 Worklist.push_back(Pred);
15626 else
15627 Worklist.append(pred_begin(Root), pred_end(Root));
15628 SmallPtrSet<const BasicBlock *, 16> Visited;
15629 SmallDenseSet<std::pair<BasicBlock *, BasicBlock *>>
15630 ParentsPairsToAdd;
15631 bool Res = false;
15632 auto Cleanup = make_scope_exit([&]() {
15633 for (const auto &KeyPair : ParentsPairsToAdd) {
15634 assert(!ParentOpParentToPreds.contains(KeyPair) &&
15635 "Should not have been added before.");
15636 ParentOpParentToPreds.try_emplace(KeyPair, Res);
15637 }
15638 });
15639 while (!Worklist.empty()) {
15640 BasicBlock *BB = Worklist.pop_back_val();
15641 if (BB == OpParent || !Visited.insert(BB).second)
15642 continue;
15643 auto Pair = std::make_pair(BB, OpParent);
15644 if (auto It = ParentOpParentToPreds.find(Pair);
15645 It != ParentOpParentToPreds.end()) {
15646 Res = It->second;
15647 return Res;
15648 }
15649 ParentsPairsToAdd.insert(Pair);
15650 unsigned BlockSize = BB->size();
15651 if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
15652 return Res;
15653 Budget += BlockSize;
15654 if (Budget > BudgetLimit)
15655 return Res;
15656 if (!isa<CatchSwitchInst>(BB->getTerminator()) &&
15657 !CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
15658 BB->getTerminator()))
15659 return Res;
15660 Worklist.append(pred_begin(BB), pred_end(BB));
15661 }
15662 Res = true;
15663 return Res;
15664 };
15665 SmallVector<const TreeEntry *> LiveEntries(1, Root);
15666 while (!LiveEntries.empty()) {
15667 const TreeEntry *Entry = LiveEntries.pop_back_val();
15668 SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
15669 if (Operands.empty())
15670 continue;
15671 Instruction *LastInst = EntriesToLastInstruction.at(Entry);
15672 BasicBlock *Parent = LastInst->getParent();
15673 for (const TreeEntry *Op : Operands) {
15674 if (!Op->isGather())
15675 LiveEntries.push_back(Op);
15676 if (Entry->State == TreeEntry::SplitVectorize ||
15677 (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
15678 (Op->isGather() && allConstant(Op->Scalars)))
15679 continue;
15680 Budget = 0;
15681 BasicBlock *Pred = nullptr;
15682 if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
15683 Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
15684 BasicBlock *OpParent;
15685 Instruction *OpLastInst;
15686 if (Op->isGather()) {
15687 assert(Entry->getOpcode() == Instruction::PHI &&
15688 "Expected phi node only.");
15689 OpParent = cast<PHINode>(Entry->getMainOp())
15690 ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
15691 OpLastInst = OpParent->getTerminator();
15692 for (Value *V : Op->Scalars) {
15693 auto *Inst = dyn_cast<Instruction>(V);
15694 if (!Inst)
15695 continue;
15696 if (isVectorized(V)) {
15697 OpParent = Inst->getParent();
15698 OpLastInst = Inst;
15699 break;
15700 }
15701 }
15702 } else {
15703 OpLastInst = EntriesToLastInstruction.at(Op);
15704 OpParent = OpLastInst->getParent();
15705 }
15706 // Check the call instructions within the same basic blocks.
15707 if (OpParent == Parent) {
15708 if (Entry->getOpcode() == Instruction::PHI) {
15709 if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
15710 AddCosts(Op);
15711 continue;
15712 }
15713 if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
15714 AddCosts(Op);
15715 continue;
15716 }
15717 // Check for call instruction in between blocks.
15718 // 1. Check entry's block to the head.
15719 if (Entry->getOpcode() != Instruction::PHI &&
15720 !CheckForNonVecCallsInSameBlock(
15721 &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
15722 LastInst)) {
15723 AddCosts(Op);
15724 continue;
15725 }
15726 // 2. Check op's block from the end.
15727 if (!CheckForNonVecCallsInSameBlock(OpLastInst,
15728 OpParent->getTerminator())) {
15729 AddCosts(Op);
15730 continue;
15731 }
15732 // 3. Check the predecessors of entry's block till op's block.
15733 if (!CheckPredecessors(Parent, Pred, OpParent)) {
15734 AddCosts(Op);
15735 continue;
15736 }
15737 }
15738 }
15739
15740 return Cost;
15741}
15742
15743/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
15744/// the buildvector sequence.
15745 static bool isFirstInsertElement(const InsertElementInst *IE1,
15746 const InsertElementInst *IE2) {
15747 if (IE1 == IE2)
15748 return false;
15749 const auto *I1 = IE1;
15750 const auto *I2 = IE2;
15751 const InsertElementInst *PrevI1;
15752 const InsertElementInst *PrevI2;
15753 unsigned Idx1 = *getElementIndex(IE1);
15754 unsigned Idx2 = *getElementIndex(IE2);
15755 do {
15756 if (I2 == IE1)
15757 return true;
15758 if (I1 == IE2)
15759 return false;
15760 PrevI1 = I1;
15761 PrevI2 = I2;
15762 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
15763 getElementIndex(I1).value_or(Idx2) != Idx2)
15764 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
15765 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
15766 getElementIndex(I2).value_or(Idx1) != Idx1)
15767 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
15768 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
15769 llvm_unreachable("Two different buildvectors not expected.");
15770}
15771
15772namespace {
15773/// Returns incoming Value *, if the requested type is Value * too, or a default
15774/// value, otherwise.
15775struct ValueSelect {
15776 template <typename U>
15777 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
15778 return V;
15779 }
15780 template <typename U>
15781 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
15782 return U();
15783 }
15784};
15785} // namespace
15786
15787/// Does the analysis of the provided shuffle masks and performs the requested
15788/// actions on the vectors with the given shuffle masks. It tries to do it in
15789/// several steps.
15790/// 1. If the Base vector is not an undef vector, resize the very first mask to
15791/// have a common VF and perform the action for 2 input vectors (including the
15792/// non-undef Base). Other shuffle masks are combined with the result of the
15793/// first stage and processed as a shuffle of 2 elements.
15794/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
15795/// the action only for 1 vector with the given mask, if it is not the identity
15796/// mask.
15797/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
15798/// vectors, combining the masks properly between the steps.
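/// For example, with an undef Base and two shuffle masks over inputs of the
/// same VF, the second mask's indices are rebased by VF and merged into the
/// first mask, so a single two-input shuffle action covers both vectors.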
15799template <typename T>
15800static T *performExtractsShuffleAction(
15801 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
15802 function_ref<unsigned(T *)> GetVF,
15803 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
15804 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
15805 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
15806 SmallVector<int> Mask(ShuffleMask.begin()->second);
15807 auto VMIt = std::next(ShuffleMask.begin());
15808 T *Prev = nullptr;
15809 SmallBitVector UseMask =
15810 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
15811 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
15812 if (!IsBaseUndef.all()) {
15813 // Base is not undef, need to combine it with the next subvectors.
15814 std::pair<T *, bool> Res =
15815 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
15816 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
15817 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
15818 if (Mask[Idx] == PoisonMaskElem)
15819 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
15820 else
15821 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
15822 }
15823 [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
15824 assert((!V || GetVF(V) == Mask.size()) &&
15825 "Expected base vector of VF number of elements.");
15826 Prev = Action(Mask, {nullptr, Res.first});
15827 } else if (ShuffleMask.size() == 1) {
15828 // Base is undef and only 1 vector is shuffled - perform the action only for
15829 // single vector, if the mask is not the identity mask.
15830 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
15831 /*ForSingleMask=*/true);
15832 if (Res.second)
15833 // Identity mask is found.
15834 Prev = Res.first;
15835 else
15836 Prev = Action(Mask, {ShuffleMask.begin()->first});
15837 } else {
15838 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
15839 // shuffles step by step, combining shuffle between the steps.
15840 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
15841 unsigned Vec2VF = GetVF(VMIt->first);
15842 if (Vec1VF == Vec2VF) {
15843 // No need to resize the input vectors since they are of the same size, we
15844 // can shuffle them directly.
15845 ArrayRef<int> SecMask = VMIt->second;
15846 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
15847 if (SecMask[I] != PoisonMaskElem) {
15848 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
15849 Mask[I] = SecMask[I] + Vec1VF;
15850 }
15851 }
15852 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
15853 } else {
15854 // Vectors of different sizes - resize and reshuffle.
15855 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
15856 /*ForSingleMask=*/false);
15857 std::pair<T *, bool> Res2 =
15858 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
15859 ArrayRef<int> SecMask = VMIt->second;
15860 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
15861 if (Mask[I] != PoisonMaskElem) {
15862 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
15863 if (Res1.second)
15864 Mask[I] = I;
15865 } else if (SecMask[I] != PoisonMaskElem) {
15866 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
15867 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
15868 }
15869 }
15870 Prev = Action(Mask, {Res1.first, Res2.first});
15871 }
15872 VMIt = std::next(VMIt);
15873 }
15874 [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
15875 // Perform requested actions for the remaining masks/vectors.
15876 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
15877 // Shuffle other input vectors, if any.
15878 std::pair<T *, bool> Res =
15879 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
15880 ArrayRef<int> SecMask = VMIt->second;
15881 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
15882 if (SecMask[I] != PoisonMaskElem) {
15883 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
15884 "Multiple uses of scalars.");
15885 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
15886 } else if (Mask[I] != PoisonMaskElem) {
15887 Mask[I] = I;
15888 }
15889 }
15890 Prev = Action(Mask, {Prev, Res.first});
15891 }
15892 return Prev;
15893}
15894
15895namespace {
15896/// Data type for handling buildvector sequences with the reused scalars from
15897/// other tree entries.
15898template <typename T> struct ShuffledInsertData {
15899 /// List of insertelements to be replaced by shuffles.
15900 SmallVector<InsertElementInst *> InsertElements;
15901 /// The parent vectors and shuffle mask for the given list of inserts.
15902 MapVector<T, SmallVector<int>> ValueMasks;
15903};
15904} // namespace
15905
15906 InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
15907 InstructionCost ReductionCost) {
15908 InstructionCost Cost = ReductionCost;
15909 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
15910 << VectorizableTree.size() << ".\n");
15911
15912 SmallPtrSet<Value *, 4> CheckedExtracts;
15913 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
15914 TreeEntry &TE = *VectorizableTree[I];
15915 // No need to count the cost for combined entries, they are combined and
15916 // just skip their cost.
15917 if (TE.State == TreeEntry::CombinedVectorize) {
15918 LLVM_DEBUG(
15919 dbgs() << "SLP: Skipping cost for combined node that starts with "
15920 << *TE.Scalars[0] << ".\n";
15921 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
15922 continue;
15923 }
15924 if (TE.hasState() &&
15925 (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
15926 if (const TreeEntry *E =
15927 getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
15928 E && E->getVectorFactor() == TE.getVectorFactor()) {
15929 // Some gather nodes might be absolutely the same as some vectorizable
15930 // nodes after reordering, need to handle it.
15931 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
15932 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
15933 << "SLP: Current total cost = " << Cost << "\n");
15934 continue;
15935 }
15936 }
15937
15938 // Exclude cost of gather loads nodes which are not used. These nodes were
15939 // built as part of the final attempt to vectorize gathered loads.
15940 assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
15941 "Expected gather nodes with users only.");
15942
15943 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
15944 Cost += C;
15945 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
15946 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
15947 << "SLP: Current total cost = " << Cost << "\n");
15948 }
15949
15950 if (Cost >= -SLPCostThreshold &&
15951 none_of(ExternalUses, [](const ExternalUser &EU) {
15952 return isa_and_nonnull<InsertElementInst>(EU.User);
15953 }))
15954 return Cost;
15955
15956 SmallPtrSet<Value *, 16> ExtractCostCalculated;
15957 InstructionCost ExtractCost = 0;
15958 SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
15959 SmallVector<APInt> DemandedElts;
15960 SmallDenseSet<Value *, 4> UsedInserts;
15961 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
15962 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
15963 DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
15964 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
15965 // Keep track of each {Scalar, Index, User} tuple.
15966 // On AArch64, this helps in fusing a mov instruction, associated with an
15967 // extractelement, with fmul in the backend so that the extractelement is free.
15968 SmallVector<std::tuple<Value *, User *, int>> ScalarUserAndIdx;
15969 for (ExternalUser &EU : ExternalUses) {
15970 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
15971 }
15972 SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
15973 for (ExternalUser &EU : ExternalUses) {
15974 LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
15975 << EU.E.Idx << " in lane " << EU.Lane << "\n");
15976 LLVM_DEBUG(if (EU.User) dbgs() << " User:" << *EU.User << "\n";
15977 else dbgs() << " User: nullptr\n");
15978 LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
15979
15980 // Uses by ephemeral values are free (because the ephemeral value will be
15981 // removed prior to code generation, and so the extraction will be
15982 // removed as well).
15983 if (EphValues.count(EU.User))
15984 continue;
15985
15986 // Check if the scalar for the given user or all users is accounted already.
15987 if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
15988 (EU.User &&
15989 CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
15990 continue;
15991
15992 // Skip users in unreachable blocks or in EH pads (rarely executed) or in
15993 // blocks terminated with an unreachable instruction.
15994 if (BasicBlock *UserParent =
15995 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
15996 UserParent &&
15997 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
15998 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
15999 continue;
16000
16001 // We only add extract cost once for the same scalar.
16002 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
16003 !ExtractCostCalculated.insert(EU.Scalar).second)
16004 continue;
16005
16006 // No extract cost for vector "scalar" if REVEC is disabled
16007 if (!SLPReVec && isa<FixedVectorType>(EU.Scalar->getType()))
16008 continue;
16009
16010 // If found user is an insertelement, do not calculate extract cost but try
16011 // to detect it as a final shuffled/identity match.
16012 // TODO: what if a user is insertvalue when REVEC is enabled?
16013 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
16014 VU && VU->getOperand(1) == EU.Scalar) {
16015 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
16016 if (!UsedInserts.insert(VU).second)
16017 continue;
16018 std::optional<unsigned> InsertIdx = getElementIndex(VU);
16019 if (InsertIdx) {
16020 const TreeEntry *ScalarTE = &EU.E;
16021 auto *It = find_if(
16022 ShuffledInserts,
16023 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
16024 // Checks if 2 insertelements are from the same buildvector.
16025 InsertElementInst *VecInsert = Data.InsertElements.front();
16026 return areTwoInsertFromSameBuildVector(
16027 VU, VecInsert, [this](InsertElementInst *II) -> Value * {
16028 Value *Op0 = II->getOperand(0);
16029 if (isVectorized(II) && !isVectorized(Op0))
16030 return nullptr;
16031 return Op0;
16032 });
16033 });
16034 int VecId = -1;
16035 if (It == ShuffledInserts.end()) {
16036 auto &Data = ShuffledInserts.emplace_back();
16037 Data.InsertElements.emplace_back(VU);
16038 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
16039 VecId = ShuffledInserts.size() - 1;
16040 auto It = MinBWs.find(ScalarTE);
16041 if (It != MinBWs.end() &&
16042 VectorCasts
16043 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
16044 .second) {
16045 unsigned BWSz = It->second.first;
16046 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
16047 unsigned VecOpcode;
16048 if (DstBWSz < BWSz)
16049 VecOpcode = Instruction::Trunc;
16050 else
16051 VecOpcode =
16052 It->second.second ? Instruction::SExt : Instruction::ZExt;
16053 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
16054 InstructionCost C = TTI->getCastInstrCost(
16055 VecOpcode, FTy,
16056 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
16057 FTy->getNumElements()),
16058 TTI::CastContextHint::None, CostKind);
16059 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16060 << " for extending externally used vector with "
16061 "non-equal minimum bitwidth.\n");
16062 Cost += C;
16063 }
16064 } else {
16065 if (isFirstInsertElement(VU, It->InsertElements.front()))
16066 It->InsertElements.front() = VU;
16067 VecId = std::distance(ShuffledInserts.begin(), It);
16068 }
16069 int InIdx = *InsertIdx;
16070 SmallVectorImpl<int> &Mask =
16071 ShuffledInserts[VecId].ValueMasks[ScalarTE];
16072 if (Mask.empty())
16073 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
16074 Mask[InIdx] = EU.Lane;
16075 DemandedElts[VecId].setBit(InIdx);
16076 continue;
16077 }
16078 }
16079 }
16080
16081 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
16082 // If we plan to rewrite the tree in a smaller type, we will need to sign
16083 // extend the extracted value back to the original type. Here, we account
16084 // for the extract and the added cost of the sign extend if needed.
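 // For example, if an i32 chain was narrowed to i16 vectors, an external i32
 // user pays for the lane extract plus an i16-to-i32 sext/zext of that lane.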
16085 InstructionCost ExtraCost = TTI::TCC_Free;
16086 auto *ScalarTy = EU.Scalar->getType();
16087 const unsigned BundleWidth = EU.E.getVectorFactor();
16088 assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
16089 auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
16090 const TreeEntry *Entry = &EU.E;
16091 auto It = MinBWs.find(Entry);
16092 if (It != MinBWs.end()) {
16093 Type *MinTy = IntegerType::get(F->getContext(), It->second.first);
16094 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy))
16095 MinTy = getWidenedType(MinTy, VecTy->getNumElements());
16096 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
16097 ? Instruction::ZExt
16098 : Instruction::SExt;
16099 VecTy = getWidenedType(MinTy, BundleWidth);
16100 ExtraCost =
16101 getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
16102 LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
16103 << ExtraCost << "\n");
16104 } else {
16105 ExtraCost =
16106 getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy,
16107 CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
16108 LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
16109 << *VecTy << ": " << ExtraCost << "\n");
16110 }
16111 // Leave the scalar instructions as is if they are cheaper than extracts.
16112 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
16113 Entry->getOpcode() == Instruction::Load) {
16114 // Checks if the user of the external scalar is a phi in a loop body.
16115 auto IsPhiInLoop = [&](const ExternalUser &U) {
16116 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
16117 auto *I = cast<Instruction>(U.Scalar);
16118 const Loop *L = LI->getLoopFor(Phi->getParent());
16119 return L && (Phi->getParent() == I->getParent() ||
16120 L == LI->getLoopFor(I->getParent()));
16121 }
16122 return false;
16123 };
16124 if (!ValueToExtUses) {
16125 ValueToExtUses.emplace();
16126 for (const auto &P : enumerate(ExternalUses)) {
16127 // Ignore phis in loops.
16128 if (IsPhiInLoop(P.value()))
16129 continue;
16130
16131 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
16132 }
16133 }
16134 // Can use the original instruction if no operands are vectorized or they
16135 // are already marked as externally used.
16136 auto *Inst = cast<Instruction>(EU.Scalar);
16137 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
16138 auto OperandIsScalar = [&](Value *V) {
16139 if (!isVectorized(V)) {
16140 // Some extractelements might be not vectorized, but
16141 // transformed into shuffle and removed from the function,
16142 // consider it here.
16143 if (auto *EE = dyn_cast<ExtractElementInst>(V))
16144 return !EE->hasOneUse() || !MustGather.contains(EE);
16145 return true;
16146 }
16147 return ValueToExtUses->contains(V);
16148 };
16149 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
16150 bool CanBeUsedAsScalarCast = false;
16151 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
16152 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
16153 Op && all_of(Op->operands(), OperandIsScalar)) {
16154 InstructionCost OpCost =
16155 (isVectorized(Op) && !ValueToExtUses->contains(Op))
16156 ? TTI->getInstructionCost(Op, CostKind)
16157 : 0;
16158 if (ScalarCost + OpCost <= ExtraCost) {
16159 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
16160 ScalarCost += OpCost;
16161 }
16162 }
16163 }
16164 if (CanBeUsedAsScalar) {
16165 bool KeepScalar = ScalarCost <= ExtraCost;
16166 // Try to keep the original scalar if the user is a phi node from the same
16167 // block as the root phis currently being vectorized. This keeps better
16168 // ordering info for the PHIs being vectorized.
16169 bool IsProfitablePHIUser =
16170 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
16171 VectorizableTree.front()->Scalars.size() > 2)) &&
16172 VectorizableTree.front()->hasState() &&
16173 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
16174 !Inst->hasNUsesOrMore(UsesLimit) &&
16175 none_of(Inst->users(),
16176 [&](User *U) {
16177 auto *PHIUser = dyn_cast<PHINode>(U);
16178 return (!PHIUser ||
16179 PHIUser->getParent() !=
16180 cast<Instruction>(
16181 VectorizableTree.front()->getMainOp())
16182 ->getParent()) &&
16183 !isVectorized(U);
16184 }) &&
16185 count_if(Entry->Scalars, [&](Value *V) {
16186 return ValueToExtUses->contains(V);
16187 }) <= 2;
16188 if (IsProfitablePHIUser) {
16189 KeepScalar = true;
16190 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
16191 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
16192 (!GatheredLoadsEntriesFirst.has_value() ||
16193 Entry->Idx < *GatheredLoadsEntriesFirst)) {
16194 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
16195 return ValueToExtUses->contains(V);
16196 });
16197 auto It = ExtractsCount.find(Entry);
16198 if (It != ExtractsCount.end()) {
16199 assert(ScalarUsesCount >= It->getSecond().size() &&
16200 "Expected total number of external uses not less than "
16201 "number of scalar uses.");
16202 ScalarUsesCount -= It->getSecond().size();
16203 }
16204 // Keep the original scalar if the number of externally used instructions in
16205 // the same entry is not a power of 2. It may help to do some extra
16206 // vectorization for now.
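 // For example, with 3 of 4 lanes used externally the scalars are kept, while
 // with exactly 2 or 4 such lanes the values are extracted from the vector.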
16207 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
16208 }
16209 if (KeepScalar) {
16210 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
16211 for (Value *V : Inst->operands()) {
16212 auto It = ValueToExtUses->find(V);
16213 if (It != ValueToExtUses->end()) {
16214 // Replace all uses to avoid compiler crash.
16215 ExternalUses[It->second].User = nullptr;
16216 }
16217 }
16218 ExtraCost = ScalarCost;
16219 if (!IsPhiInLoop(EU))
16220 ExtractsCount[Entry].insert(Inst);
16221 if (CanBeUsedAsScalarCast) {
16222 ScalarOpsFromCasts.insert(Inst->getOperand(0));
16223 // Update the users of the operands of the cast operand to avoid
16224 // compiler crash.
16225 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
16226 for (Value *V : IOp->operands()) {
16227 auto It = ValueToExtUses->find(V);
16228 if (It != ValueToExtUses->end()) {
16229 // Replace all uses to avoid compiler crash.
16230 ExternalUses[It->second].User = nullptr;
16231 }
16232 }
16233 }
16234 }
16235 }
16236 }
16237 }
16238
16239 ExtractCost += ExtraCost;
16240 }
16241 // Add external uses for the operands of casts that will be emitted as
16242 // scalars instead of extractelements.
16243 for (Value *V : ScalarOpsFromCasts) {
16244 ExternalUsesAsOriginalScalar.insert(V);
16245 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
16246 ExternalUses.emplace_back(V, nullptr, *TEs.front(),
16247 TEs.front()->findLaneForValue(V));
16248 }
16249 }
16250 // Add reduced value cost, if resized.
16251 if (!VectorizedVals.empty()) {
16252 const TreeEntry &Root = *VectorizableTree.front();
16253 auto BWIt = MinBWs.find(&Root);
16254 if (BWIt != MinBWs.end()) {
16255 Type *DstTy = Root.Scalars.front()->getType();
16256 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
16257 unsigned SrcSz =
16258 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
16259 if (OriginalSz != SrcSz) {
16260 unsigned Opcode = Instruction::Trunc;
16261 if (OriginalSz > SrcSz)
16262 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
16263 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
16264 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
16265 assert(SLPReVec && "Only supported by REVEC.");
16266 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
16267 }
16268 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
16269 TTI::CastContextHint::None,
16270 TTI::TCK_RecipThroughput);
16271 }
16272 }
16273 }
16274
16275 Cost += ExtractCost;
16276 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
16277 bool ForSingleMask) {
16278 InstructionCost C = 0;
16279 unsigned VF = Mask.size();
16280 unsigned VecVF = TE->getVectorFactor();
16281 bool HasLargeIndex =
16282 any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
16283 if ((VF != VecVF && HasLargeIndex) ||
16284 ShuffleVectorInst::isIdentityMask(Mask, VF)) {
16285
16286 if (HasLargeIndex) {
16287 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
16288 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
16289 OrigMask.begin());
16290 C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
16291 getWidenedType(TE->getMainOp()->getType(), VecVF),
16292 OrigMask);
16293 LLVM_DEBUG(
16294 dbgs() << "SLP: Adding cost " << C
16295 << " for final shuffle of insertelement external users.\n";
16296 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16297 Cost += C;
16298 return std::make_pair(TE, true);
16299 }
16300
16301 if (!ForSingleMask) {
16302 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
16303 for (unsigned I = 0; I < VF; ++I) {
16304 if (Mask[I] != PoisonMaskElem)
16305 ResizeMask[Mask[I]] = Mask[I];
16306 }
16307 if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF))
16308 C = ::getShuffleCost(
16309 *TTI, TTI::SK_PermuteSingleSrc,
16310 getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask);
16311 LLVM_DEBUG(
16312 dbgs() << "SLP: Adding cost " << C
16313 << " for final shuffle of insertelement external users.\n";
16314 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16315
16316 Cost += C;
16317 }
16318 }
16319 return std::make_pair(TE, false);
16320 };
16321 // Calculate the cost of the reshuffled vectors, if any.
16322 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
16323 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
16324 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
16325 unsigned VF = 0;
16326 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
16327 ArrayRef<const TreeEntry *> TEs) {
16328 assert((TEs.size() == 1 || TEs.size() == 2) &&
16329 "Expected exactly 1 or 2 tree entries.");
16330 if (TEs.size() == 1) {
16331 if (VF == 0)
16332 VF = TEs.front()->getVectorFactor();
16333 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
16334 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
16335 !all_of(enumerate(Mask), [=](const auto &Data) {
16336 return Data.value() == PoisonMaskElem ||
16337 (Data.index() < VF &&
16338 static_cast<int>(Data.index()) == Data.value());
16339 })) {
16340 InstructionCost C =
16341 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
16342 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16343 << " for final shuffle of insertelement "
16344 "external users.\n";
16345 TEs.front()->dump();
16346 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16347 Cost += C;
16348 }
16349 } else {
16350 if (VF == 0) {
16351 if (TEs.front() &&
16352 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
16353 VF = TEs.front()->getVectorFactor();
16354 else
16355 VF = Mask.size();
16356 }
16357 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
16358 InstructionCost C =
16359 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
16360 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16361 << " for final shuffle of vector node and external "
16362 "insertelement users.\n";
16363 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
16364 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16365 Cost += C;
16366 }
16367 VF = Mask.size();
16368 return TEs.back();
16369 };
16370 (void)performExtractsShuffleAction<const TreeEntry>(
16371 MutableArrayRef(Vector.data(), Vector.size()), Base,
16372 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
16373 EstimateShufflesCost);
16374 InstructionCost InsertCost = TTI->getScalarizationOverhead(
16375 cast<FixedVectorType>(
16376 ShuffledInserts[I].InsertElements.front()->getType()),
16377 DemandedElts[I],
16378 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
16379 Cost -= InsertCost;
16380 }
16381
16382 // Add the cost for reduced value resize (if required).
16383 if (ReductionBitWidth != 0) {
16384 assert(UserIgnoreList && "Expected reduction tree.");
16385 const TreeEntry &E = *VectorizableTree.front();
16386 auto It = MinBWs.find(&E);
16387 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
16388 unsigned SrcSize = It->second.first;
16389 unsigned DstSize = ReductionBitWidth;
16390 unsigned Opcode = Instruction::Trunc;
16391 if (SrcSize < DstSize) {
16392 bool IsArithmeticExtendedReduction =
16393 all_of(*UserIgnoreList, [](Value *V) {
16394 auto *I = cast<Instruction>(V);
16395 return is_contained({Instruction::Add, Instruction::FAdd,
16396 Instruction::Mul, Instruction::FMul,
16397 Instruction::And, Instruction::Or,
16398 Instruction::Xor},
16399 I->getOpcode());
16400 });
16401 if (IsArithmeticExtendedReduction)
16402 Opcode =
16403 Instruction::BitCast; // Handle it by getExtendedReductionCost
16404 else
16405 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
16406 }
16407 if (Opcode != Instruction::BitCast) {
16408 auto *SrcVecTy =
16409 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
16410 auto *DstVecTy =
16411 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
16412 TTI::CastContextHint CCH = getCastContextHint(E);
16413 InstructionCost CastCost;
16414 switch (E.getOpcode()) {
16415 case Instruction::SExt:
16416 case Instruction::ZExt:
16417 case Instruction::Trunc: {
16418 const TreeEntry *OpTE = getOperandEntry(&E, 0);
16419 CCH = getCastContextHint(*OpTE);
16420 break;
16421 }
16422 default:
16423 break;
16424 }
16425 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
16426 TTI::TCK_RecipThroughput);
16427 Cost += CastCost;
16428 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
16429 << " for final resize for reduction from " << SrcVecTy
16430 << " to " << DstVecTy << "\n";
16431 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16432 }
16433 }
16434 }
16435
16436 std::optional<InstructionCost> SpillCost;
16437 if (Cost < -SLPCostThreshold) {
16438 SpillCost = getSpillCost();
16439 Cost += *SpillCost;
16440 }
16441#ifndef NDEBUG
16442 SmallString<256> Str;
16443 {
16444 raw_svector_ostream OS(Str);
16445 OS << "SLP: Spill Cost = ";
16446 if (SpillCost)
16447 OS << *SpillCost;
16448 else
16449 OS << "<skipped>";
16450 OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
16451 << "SLP: Total Cost = " << Cost << ".\n";
16452 }
16453 LLVM_DEBUG(dbgs() << Str);
16454 if (ViewSLPTree)
16455 ViewGraph(this, "SLP" + F->getName(), false, Str);
16456#endif
16457
16458 return Cost;
16459}
16460
16461/// Tries to find extractelement instructions with constant indices from a fixed
16462/// vector type and gathers such instructions into a group, which is highly
16463/// likely to be matched as a shuffle of 1 or 2 input vectors. If this attempt
16464/// is successful, the matched scalars are replaced by poison values in \p VL
16465/// for future analysis.
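/// For illustration only (hypothetical IR, not taken from an actual test): a
/// gather list such as
/// \code
/// %e0 = extractelement <4 x i32> %v, i32 1
/// %e1 = extractelement <4 x i32> %v, i32 0
/// %e2 = extractelement <4 x i32> %v, i32 3
/// %e3 = extractelement <4 x i32> %v, i32 2
/// \endcode
/// can be modeled as a single-source shuffle of %v with mask <1, 0, 3, 2>
/// instead of being rebuilt element by element.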
16466std::optional<TTI::ShuffleKind>
16467BoUpSLP::tryToGatherSingleRegisterExtractElements(
16468 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
16469 // Scan list of gathered scalars for extractelements that can be represented
16470 // as shuffles.
16471 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
16472 SmallVector<int> UndefVectorExtracts;
16473 for (int I = 0, E = VL.size(); I < E; ++I) {
16474 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
16475 if (!EI) {
16476 if (isa<UndefValue>(VL[I]))
16477 UndefVectorExtracts.push_back(I);
16478 continue;
16479 }
16480 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
16481 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
16482 continue;
16483 std::optional<unsigned> Idx = getExtractIndex(EI);
16484 // Undefined index.
16485 if (!Idx) {
16486 UndefVectorExtracts.push_back(I);
16487 continue;
16488 }
16489 if (Idx >= VecTy->getNumElements()) {
16490 UndefVectorExtracts.push_back(I);
16491 continue;
16492 }
16493 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
16494 ExtractMask.reset(*Idx);
16495 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
16496 UndefVectorExtracts.push_back(I);
16497 continue;
16498 }
16499 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
16500 }
16501 // Sort the vector operands by the maximum number of uses in extractelements.
16502 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
16503 VectorOpToIdx.takeVector();
16504 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
16505 return P1.second.size() > P2.second.size();
16506 });
16507 // Find the best pair of the vectors or a single vector.
16508 const int UndefSz = UndefVectorExtracts.size();
16509 unsigned SingleMax = 0;
16510 unsigned PairMax = 0;
16511 if (!Vectors.empty()) {
16512 SingleMax = Vectors.front().second.size() + UndefSz;
16513 if (Vectors.size() > 1) {
16514 auto *ItNext = std::next(Vectors.begin());
16515 PairMax = SingleMax + ItNext->second.size();
16516 }
16517 }
16518 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
16519 return std::nullopt;
16520 // Check if better to perform a shuffle of 2 vectors or just of a single
16521 // vector.
16522 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
16523 SmallVector<Value *> GatheredExtracts(
16524 VL.size(), PoisonValue::get(VL.front()->getType()));
16525 if (SingleMax >= PairMax && SingleMax) {
16526 for (int Idx : Vectors.front().second)
16527 std::swap(GatheredExtracts[Idx], VL[Idx]);
16528 } else if (!Vectors.empty()) {
16529 for (unsigned Idx : {0, 1})
16530 for (int Idx : Vectors[Idx].second)
16531 std::swap(GatheredExtracts[Idx], VL[Idx]);
16532 }
16533 // Add extracts from undefs too.
16534 for (int Idx : UndefVectorExtracts)
16535 std::swap(GatheredExtracts[Idx], VL[Idx]);
16536 // Check that the gather of extractelements can be represented as just a
16537 // shuffle of one or two vectors from which the scalars are extracted.
16538 std::optional<TTI::ShuffleKind> Res =
16539 isFixedVectorShuffle(GatheredExtracts, Mask, AC);
16540 if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
16541 // TODO: try to check other subsets if possible.
16542 // Restore the original VL if attempt was not successful.
16543 copy(SavedVL, VL.begin());
16544 return std::nullopt;
16545 }
16546 // Restore unused scalars from mask, if some of the extractelements were not
16547 // selected for shuffle.
16548 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
16549 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
16550 isa<UndefValue>(GatheredExtracts[I])) {
16551 std::swap(VL[I], GatheredExtracts[I]);
16552 continue;
16553 }
16554 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
16555 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
16556 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
16557 is_contained(UndefVectorExtracts, I))
16558 continue;
16559 }
16560 return Res;
16561}
16562
16563/// Tries to find extractelement instructions with constant indices from a fixed
16564/// vector type and gathers such instructions into a group, which is highly
16565/// likely to be matched as a shuffle of 1 or 2 input vectors. If this attempt
16566/// is successful, the matched scalars are replaced by poison values in \p VL
16567/// for future analysis.
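/// Unlike the single-register variant above, this entry point splits \p VL
/// into \p NumParts register-sized slices and runs the per-register analysis
/// on each slice; slices whose result is std::nullopt stay regular gathers,
/// and if no slice matches a shuffle the whole result is cleared.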
16568SmallVector<std::optional<TTI::ShuffleKind>>
16569BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
16570 SmallVectorImpl<int> &Mask,
16571 unsigned NumParts) const {
16572 assert(NumParts > 0 && "NumParts expected to be greater than or equal to 1.");
16573 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
16574 Mask.assign(VL.size(), PoisonMaskElem);
16575 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
16576 for (unsigned Part : seq<unsigned>(NumParts)) {
16577 // Scan list of gathered scalars for extractelements that can be represented
16578 // as shuffles.
16579 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
16580 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
16581 SmallVector<int> SubMask;
16582 std::optional<TTI::ShuffleKind> Res =
16583 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
16584 ShufflesRes[Part] = Res;
16585 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
16586 }
16587 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
16588 return Res.has_value();
16589 }))
16590 ShufflesRes.clear();
16591 return ShufflesRes;
16592}
16593
16594std::optional<TargetTransformInfo::ShuffleKind>
16595BoUpSLP::isGatherShuffledSingleRegisterEntry(
16596 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
16597 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
16598 Entries.clear();
16599 // TODO: currently checking only for Scalars in the tree entry, need to count
16600 // reused elements too for better cost estimation.
16601 auto GetUserEntry = [&](const TreeEntry *TE) {
16602 while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
16603 TE = TE->UserTreeIndex.UserTE;
16604 if (TE == VectorizableTree.front().get())
16605 return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
16606 return TE->UserTreeIndex;
16607 };
16608 auto HasGatherUser = [&](const TreeEntry *TE) {
16609 while (TE->Idx != 0 && TE->UserTreeIndex) {
16610 if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
16611 return true;
16612 TE = TE->UserTreeIndex.UserTE;
16613 }
16614 return false;
16615 };
16616 const EdgeInfo TEUseEI = GetUserEntry(TE);
16617 if (!TEUseEI)
16618 return std::nullopt;
16619 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
16620 const BasicBlock *TEInsertBlock = nullptr;
16621 // Main node of PHI entries keeps the correct order of operands/incoming
16622 // blocks.
16623 if (auto *PHI = dyn_cast_or_null<PHINode>(
16624 TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
16625 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
16626 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
16627 TEInsertPt = TEInsertBlock->getTerminator();
16628 } else {
16629 TEInsertBlock = TEInsertPt->getParent();
16630 }
16631 if (!DT->isReachableFromEntry(TEInsertBlock))
16632 return std::nullopt;
16633 auto *NodeUI = DT->getNode(TEInsertBlock);
16634 assert(NodeUI && "Should only process reachable instructions");
16635 SmallPtrSet<Value *, 4> GatheredScalars(llvm::from_range, VL);
16636 auto CheckOrdering = [&](const Instruction *InsertPt) {
16637 // Argument InsertPt is an instruction where vector code for some other
16638 // tree entry (one that shares one or more scalars with TE) is going to be
16639 // generated. This lambda returns true if insertion point of vector code
16640 // for the TE dominates that point (otherwise dependency is the other way
16641 // around). The other node is not limited to be of a gather kind. Gather
16642 // nodes are not scheduled and their vector code is inserted before their
16643 // first user. If user is PHI, that is supposed to be at the end of a
16644 // predecessor block. Otherwise it is the last instruction among scalars of
16645 // the user node. So, instead of checking dependency between instructions
16646 // themselves, we check dependency between their insertion points for vector
16647 // code (since each scalar instruction ends up as a lane of a vector
16648 // instruction).
16649 const BasicBlock *InsertBlock = InsertPt->getParent();
16650 auto *NodeEUI = DT->getNode(InsertBlock);
16651 if (!NodeEUI)
16652 return false;
16653 assert((NodeUI == NodeEUI) ==
16654 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
16655 "Different nodes should have different DFS numbers");
16656 // Check the order of the gather nodes users.
16657 if (TEInsertPt->getParent() != InsertBlock &&
16658 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
16659 return false;
16660 if (TEInsertPt->getParent() == InsertBlock &&
16661 TEInsertPt->comesBefore(InsertPt))
16662 return false;
16663 return true;
16664 };
16665 // Find all tree entries used by the gathered values. If no common entries
16666 // found - not a shuffle.
16667 // Here we build a set of tree nodes for each gathered value and try to
16668 // find the intersection between these sets. If we have at least one common
16669 // tree node for each gathered value - we have just a permutation of the
16670 // single vector. If we have 2 different sets, we're in a situation where we
16671 // have a permutation of 2 input vectors.
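 // Illustrative sketch (hypothetical values): for VL = {a, b, c, d} where
 // {a, b} are only used by tree entry TE1 and {c, d} only by TE2, the
 // per-value sets are {TE1}, {TE1}, {TE2}, {TE2}; there is no common entry,
 // so UsedTEs keeps the two sets {TE1} and {TE2} and the gather is modeled
 // as a permutation of 2 input vectors.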
16672 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
16673 SmallDenseMap<Value *, int> UsedValuesEntry;
16674 SmallPtrSet<const Value *, 16> VisitedValue;
16675 auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
16676 // The node is reused - exit.
16677 if ((TEPtr->getVectorFactor() != VL.size() &&
16678 TEPtr->Scalars.size() != VL.size()) ||
16679 (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
16680 return false;
16681 UsedTEs.clear();
16682 UsedTEs.emplace_back().insert(TEPtr);
16683 for (Value *V : VL) {
16684 if (isConstant(V))
16685 continue;
16686 UsedValuesEntry.try_emplace(V, 0);
16687 }
16688 return true;
16689 };
16690 auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
16691 unsigned EdgeIdx) {
16692 const TreeEntry *Ptr1 = User1;
16693 const TreeEntry *Ptr2 = User2;
16694 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
16695 while (Ptr2) {
16696 PtrToIdx.try_emplace(Ptr2, EdgeIdx);
16697 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
16698 Ptr2 = Ptr2->UserTreeIndex.UserTE;
16699 }
16700 while (Ptr1) {
16701 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
16702 Ptr1 = Ptr1->UserTreeIndex.UserTE;
16703 if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
16704 return Idx < It->second;
16705 }
16706 return false;
16707 };
16708 for (Value *V : VL) {
16709 if (isConstant(V) || !VisitedValue.insert(V).second)
16710 continue;
16711 // Build a list of tree entries where V is used.
16712 SmallPtrSet<const TreeEntry *, 4> VToTEs;
16713 for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
16714 if (TEPtr == TE || TEPtr->Idx == 0)
16715 continue;
16716 assert(any_of(TEPtr->Scalars,
16717 [&](Value *V) { return GatheredScalars.contains(V); }) &&
16718 "Must contain at least single gathered value.");
16719 assert(TEPtr->UserTreeIndex &&
16720 "Expected only single user of a gather node.");
16721 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
16722
16723 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
16724 UseEI.UserTE->hasState())
16725 ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
16726 : nullptr;
16727 Instruction *InsertPt =
16728 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
16729 : &getLastInstructionInBundle(UseEI.UserTE);
16730 if (TEInsertPt == InsertPt) {
16731 // Check nodes, which might be emitted first.
16732 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
16733 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
16734 TEUseEI.UserTE->isAltShuffle()) &&
16735 all_of(TEUseEI.UserTE->Scalars, isUsedOutsideBlock)) {
16736 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
16737 (UseEI.UserTE->hasState() &&
16738 UseEI.UserTE->getOpcode() == Instruction::PHI &&
16739 !UseEI.UserTE->isAltShuffle()) ||
16740 !all_of(UseEI.UserTE->Scalars, isUsedOutsideBlock))
16741 continue;
16742 }
16743
16744 // If the schedulable insertion point is used in multiple entries - just
16745 // exit, no known ordering at this point, available only after real
16746 // scheduling.
16747 if (!doesNotNeedToBeScheduled(InsertPt) &&
16748 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
16749 continue;
16750 // If the users are the PHI nodes with the same incoming blocks - skip.
16751 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
16752 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
16753 UseEI.UserTE->State == TreeEntry::Vectorize &&
16754 UseEI.UserTE->getOpcode() == Instruction::PHI &&
16755 TEUseEI.UserTE != UseEI.UserTE)
16756 continue;
16757 // If 2 gathers are operands of the same entry (regardless of whether
16758 // user is PHI or else), compare operands indices, use the earlier one
16759 // as the base.
16760 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
16761 continue;
16762 // If the user instruction is used for some reason in different
16763 // vectorized nodes - make it depend on index.
16764 if (TEUseEI.UserTE != UseEI.UserTE &&
16765 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
16766 HasGatherUser(TEUseEI.UserTE)))
16767 continue;
16768 // If the user node is the operand of the other user node - skip.
16769 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
16770 continue;
16771 }
16772
16773 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
16774 TEUseEI.UserTE->doesNotNeedToSchedule() !=
16775 UseEI.UserTE->doesNotNeedToSchedule() &&
16776 is_contained(UseEI.UserTE->Scalars, TEInsertPt))
16777 continue;
16778 // Check if the user node of the TE comes after user node of TEPtr,
16779 // otherwise TEPtr depends on TE.
16780 if ((TEInsertBlock != InsertPt->getParent() ||
16781 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
16782 !CheckOrdering(InsertPt))
16783 continue;
16784 // The node is reused - exit.
16785 if (CheckAndUseSameNode(TEPtr))
16786 break;
16787 VToTEs.insert(TEPtr);
16788 }
16789 if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
16790 const auto *It = find_if(
16791 VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
16792 if (It != VTEs.end()) {
16793 const TreeEntry *VTE = *It;
16794 if (none_of(TE->CombinedEntriesWithIndices,
16795 [&](const auto &P) { return P.first == VTE->Idx; })) {
16796 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
16797 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
16798 continue;
16799 }
16800 // The node is reused - exit.
16801 if (CheckAndUseSameNode(VTE))
16802 break;
16803 VToTEs.insert(VTE);
16804 }
16805 }
16806 if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
16807 const TreeEntry *VTE = VTEs.front();
16808 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
16809 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
16810 VTEs = VTEs.drop_front();
16811 // Iterate through all vectorized nodes.
16812 const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
16813 return MTE->State == TreeEntry::Vectorize;
16814 });
16815 if (MIt == VTEs.end())
16816 continue;
16817 VTE = *MIt;
16818 }
16819 if (none_of(TE->CombinedEntriesWithIndices,
16820 [&](const auto &P) { return P.first == VTE->Idx; })) {
16821 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
16822 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
16823 continue;
16824 }
16825 // The node is reused - exit.
16826 if (CheckAndUseSameNode(VTE))
16827 break;
16828 VToTEs.insert(VTE);
16829 }
16830 if (VToTEs.empty())
16831 continue;
16832 if (UsedTEs.empty()) {
16833 // The first iteration, just insert the list of nodes to vector.
16834 UsedTEs.push_back(VToTEs);
16835 UsedValuesEntry.try_emplace(V, 0);
16836 } else {
16837 // Need to check if there are any previously used tree nodes which use V.
16838 // If there are no such nodes, consider that we have one more input
16839 // vector.
16840 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
16841 unsigned Idx = 0;
16842 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
16843 // Do we have a non-empty intersection of previously listed tree entries
16844 // and tree entries using current V?
16845 set_intersect(VToTEs, Set);
16846 if (!VToTEs.empty()) {
16847 // Yes, write the new subset and continue analysis for the next
16848 // scalar.
16849 Set.swap(VToTEs);
16850 break;
16851 }
16852 VToTEs = SavedVToTEs;
16853 ++Idx;
16854 }
16855 // No non-empty intersection found - need to add a second set of possible
16856 // source vectors.
16857 if (Idx == UsedTEs.size()) {
16858 // If the number of input vectors is greater than 2 - not a permutation,
16859 // fall back to the regular gather.
16860 // TODO: support multiple reshuffled nodes.
16861 if (UsedTEs.size() == 2)
16862 continue;
16863 UsedTEs.push_back(SavedVToTEs);
16864 Idx = UsedTEs.size() - 1;
16865 }
16866 UsedValuesEntry.try_emplace(V, Idx);
16867 }
16868 }
16869
16870 if (UsedTEs.empty()) {
16871 Entries.clear();
16872 return std::nullopt;
16873 }
16874
16875 unsigned VF = 0;
16876 if (UsedTEs.size() == 1) {
16877 // Keep the order to avoid non-determinism.
16878 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
16879 UsedTEs.front().end());
16880 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
16881 return TE1->Idx < TE2->Idx;
16882 });
16883 // Try to find the perfect match in another gather node at first.
16884 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
16885 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
16886 });
16887 if (It != FirstEntries.end() &&
16888 ((*It)->getVectorFactor() == VL.size() ||
16889 ((*It)->getVectorFactor() == TE->Scalars.size() &&
16890 TE->ReuseShuffleIndices.size() == VL.size() &&
16891 (*It)->isSame(TE->Scalars)))) {
16892 Entries.push_back(*It);
16893 if ((*It)->getVectorFactor() == VL.size()) {
16894 std::iota(std::next(Mask.begin(), Part * VL.size()),
16895 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
16896 } else {
16897 SmallVector<int> CommonMask = TE->getCommonMask();
16898 copy(CommonMask, Mask.begin());
16899 }
16900 // Clear undef scalars.
16901 for (unsigned I : seq<unsigned>(VL.size()))
16902 if (isa<PoisonValue>(VL[I]))
16903 Mask[Part * VL.size() + I] = PoisonMaskElem;
16904 return TargetTransformInfo::SK_PermuteSingleSrc;
16905 }
16906 // No perfect match, just shuffle, so choose the first tree node from the
16907 // tree.
16908 Entries.push_back(FirstEntries.front());
16909 // Update mapping between values and corresponding tree entries.
16910 for (auto &P : UsedValuesEntry)
16911 P.second = 0;
16912 VF = FirstEntries.front()->getVectorFactor();
16913 } else {
16914 // Try to find nodes with the same vector factor.
16915 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
16916 // Keep the order of tree nodes to avoid non-determinism.
16917 DenseMap<int, const TreeEntry *> VFToTE;
16918 for (const TreeEntry *TE : UsedTEs.front()) {
16919 unsigned VF = TE->getVectorFactor();
16920 auto It = VFToTE.find(VF);
16921 if (It != VFToTE.end()) {
16922 if (It->second->Idx > TE->Idx)
16923 It->getSecond() = TE;
16924 continue;
16925 }
16926 VFToTE.try_emplace(VF, TE);
16927 }
16928 // Same, keep the order to avoid non-determinism.
16929 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
16930 UsedTEs.back().end());
16931 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
16932 return TE1->Idx < TE2->Idx;
16933 });
16934 for (const TreeEntry *TE : SecondEntries) {
16935 auto It = VFToTE.find(TE->getVectorFactor());
16936 if (It != VFToTE.end()) {
16937 VF = It->first;
16938 Entries.push_back(It->second);
16939 Entries.push_back(TE);
16940 break;
16941 }
16942 }
16943 // No 2 source vectors with the same vector factor - just choose 2 with max
16944 // index.
16945 if (Entries.empty()) {
16946 Entries.push_back(*llvm::max_element(
16947 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
16948 return TE1->Idx < TE2->Idx;
16949 }));
16950 Entries.push_back(SecondEntries.front());
16951 VF = std::max(Entries.front()->getVectorFactor(),
16952 Entries.back()->getVectorFactor());
16953 } else {
16954 VF = Entries.front()->getVectorFactor();
16955 }
16956 SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries;
16957 for (const TreeEntry *E : Entries)
16958 ValuesToEntries.emplace_back().insert(E->Scalars.begin(),
16959 E->Scalars.end());
16960 // Update mapping between values and corresponding tree entries.
16961 for (auto &P : UsedValuesEntry) {
16962 for (unsigned Idx : seq<unsigned>(ValuesToEntries.size()))
16963 if (ValuesToEntries[Idx].contains(P.first)) {
16964 P.second = Idx;
16965 break;
16966 }
16967 }
16968 }
16969
16970 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
16971 // Checks if the 2 PHIs are compatible in terms of how likely they are to be
16972 // vectorized.
16973 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
16974 auto *PHI = cast<PHINode>(V);
16975 auto *PHI1 = cast<PHINode>(V1);
16976 // Check that all incoming values are compatible/from same parent (if they
16977 // are instructions).
16978 // The incoming values are compatible if they are all constants, or
16979 // instructions with the same/alternate opcodes from the same basic block.
16980 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
16981 Value *In = PHI->getIncomingValue(I);
16982 Value *In1 = PHI1->getIncomingValue(I);
16983 if (isConstant(In) && isConstant(In1))
16984 continue;
16985 if (!getSameOpcode({In, In1}, *TLI))
16986 return false;
16987 if (cast<Instruction>(In)->getParent() !=
16988 cast<Instruction>(In1)->getParent())
16989 return false;
16990 }
16991 return true;
16992 };
16993 // Check if the value can be ignored during analysis for shuffled gathers.
16994 // We suppose it is better to ignore instructions which do not form splats,
16995 // are not vectorized and are not extractelements (those are handled by the
16996 // extractelement processing), or which may form a vector node in the future.
16997 auto MightBeIgnored = [=](Value *V) {
16998 auto *I = dyn_cast<Instruction>(V);
16999 return I && !IsSplatOrUndefs && !isVectorized(I) &&
17000 !isVectorLikeInstWithConstOps(I) &&
17001 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
17002 };
17003 // Check that the neighbor instruction may form a full vector node with the
17004 // current instruction V. This is possible if they have the same/alternate
17005 // opcode and the same parent basic block.
17006 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
17007 Value *V1 = VL[Idx];
17008 bool UsedInSameVTE = false;
17009 auto It = UsedValuesEntry.find(V1);
17010 if (It != UsedValuesEntry.end())
17011 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
17012 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
17013 getSameOpcode({V, V1}, *TLI) &&
17014 cast<Instruction>(V)->getParent() ==
17015 cast<Instruction>(V1)->getParent() &&
17016 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
17017 };
17018 // Build a shuffle mask for better cost estimation and vector emission.
17019 SmallBitVector UsedIdxs(Entries.size());
17020 SmallVector<std::pair<unsigned, int>> EntryLanes;
17021 for (int I = 0, E = VL.size(); I < E; ++I) {
17022 Value *V = VL[I];
17023 auto It = UsedValuesEntry.find(V);
17024 if (It == UsedValuesEntry.end())
17025 continue;
17026 // Do not try to shuffle scalars if they are constants, or instructions
17027 // that can be vectorized as a result of the subsequent buildvector
17028 // vectorization.
17029 if (isConstant(V) || (MightBeIgnored(V) &&
17030 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
17031 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
17032 continue;
17033 unsigned Idx = It->second;
17034 EntryLanes.emplace_back(Idx, I);
17035 UsedIdxs.set(Idx);
17036 }
17037 // Iterate through all shuffled scalars and select entries, which can be used
17038 // for final shuffle.
17039 SmallVector<const TreeEntry *> TempEntries;
17040 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
17041 if (!UsedIdxs.test(I))
17042 continue;
17043 // Fix the entry number for the given scalar. If it is the first entry, set
17044 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
17045 // These indices are used when calculating final shuffle mask as the vector
17046 // offset.
17047 for (std::pair<unsigned, int> &Pair : EntryLanes)
17048 if (Pair.first == I)
17049 Pair.first = TempEntries.size();
17050 TempEntries.push_back(Entries[I]);
17051 }
17052 Entries.swap(TempEntries);
17053 if (EntryLanes.size() == Entries.size() &&
17054 !VL.equals(ArrayRef(TE->Scalars)
17055 .slice(Part * VL.size(),
17056 std::min<int>(VL.size(), TE->Scalars.size())))) {
17057 // We may have here 1 or 2 entries only. If the number of scalars is equal
17058 // to the number of entries, no need to do the analysis, it is not very
17059 // profitable. Since VL is not the same as TE->Scalars, it means we already
17060 // have some shuffles before. Cut off this unprofitable case.
17061 Entries.clear();
17062 return std::nullopt;
17063 }
17064 // Build the final mask, check for the identity shuffle, if possible.
17065 bool IsIdentity = Entries.size() == 1;
17066 // Pair.first is the offset to the vector, while Pair.second is the index of
17067 // scalar in the list.
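 // Illustrative sketch: with two selected entries of VF == 4, a scalar at
 // list position 2 that lives in lane 1 of the second entry (Pair.first == 1)
 // produces Mask[Part * VL.size() + 2] = 1 * 4 + 1 = 5.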
17068 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
17069 unsigned Idx = Part * VL.size() + Pair.second;
17070 Mask[Idx] =
17071 Pair.first * VF +
17072 (ForOrder ? std::distance(
17073 Entries[Pair.first]->Scalars.begin(),
17074 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
17075 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
17076 IsIdentity &= Mask[Idx] == Pair.second;
17077 }
17078 if (ForOrder || IsIdentity || Entries.empty()) {
17079 switch (Entries.size()) {
17080 case 1:
17081 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
17082 return TargetTransformInfo::SK_PermuteSingleSrc;
17083 break;
17084 case 2:
17085 if (EntryLanes.size() > 2 || VL.size() <= 2)
17086 return TargetTransformInfo::SK_PermuteTwoSrc;
17087 break;
17088 default:
17089 break;
17090 }
17091 } else if (!isa<VectorType>(VL.front()->getType()) &&
17092 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
17093 // Do the cost estimation if the shuffle is more beneficial than a buildvector.
17094 SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
17095 std::next(Mask.begin(), (Part + 1) * VL.size()));
17096 int MinElement = SubMask.front(), MaxElement = SubMask.front();
17097 for (int Idx : SubMask) {
17098 if (Idx == PoisonMaskElem)
17099 continue;
17100 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
17101 MinElement = Idx;
17102 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
17103 MaxElement = Idx;
17104 }
17105 assert(MaxElement >= 0 && MinElement >= 0 &&
17106 MaxElement % VF >= MinElement % VF &&
17107 "Expected at least single element.");
17108 unsigned NewVF = std::max<unsigned>(
17109 VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
17110 (MaxElement % VF) -
17111 (MinElement % VF) + 1));
17112 if (NewVF < VF) {
17113 for (int &Idx : SubMask) {
17114 if (Idx == PoisonMaskElem)
17115 continue;
17116 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
17117 (Idx >= static_cast<int>(VF) ? NewVF : 0);
17118 }
17119 } else {
17120 NewVF = VF;
17121 }
17122
17123 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17124 auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
17125 auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
17126 auto GetShuffleCost = [&,
17127 &TTI = *TTI](ArrayRef<int> Mask,
17128 ArrayRef<const TreeEntry *> Entries,
17129 VectorType *VecTy) -> InstructionCost {
17130 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
17131 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
17132 Mask, Entries.front()->getInterleaveFactor()))
17133 return TTI::TCC_Free;
17134 return ::getShuffleCost(TTI,
17135 Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
17136 : TTI::SK_PermuteSingleSrc,
17137 VecTy, Mask, CostKind);
17138 };
17139 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
17140 InstructionCost FirstShuffleCost = 0;
17141 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
17142 if (Entries.size() == 1 || !Entries[0]->isGather()) {
17143 FirstShuffleCost = ShuffleCost;
17144 } else {
17145 // Transform the mask to include only the first entry.
17146 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17147 bool IsIdentity = true;
17148 for (auto [I, Idx] : enumerate(FirstMask)) {
17149 if (Idx >= static_cast<int>(NewVF)) {
17150 Idx = PoisonMaskElem;
17151 } else {
17152 DemandedElts.clearBit(I);
17153 if (Idx != PoisonMaskElem)
17154 IsIdentity &= static_cast<int>(I) == Idx;
17155 }
17156 }
17157 if (!IsIdentity)
17158 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
17159 FirstShuffleCost += getScalarizationOverhead(
17160 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17161 /*Extract=*/false, CostKind);
17162 }
17163 InstructionCost SecondShuffleCost = 0;
17164 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
17165 if (Entries.size() == 1 || !Entries[1]->isGather()) {
17166 SecondShuffleCost = ShuffleCost;
17167 } else {
17168 // Transform the mask to include only the second entry.
17169 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17170 bool IsIdentity = true;
17171 for (auto [I, Idx] : enumerate(SecondMask)) {
17172 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
17173 Idx = PoisonMaskElem;
17174 } else {
17175 DemandedElts.clearBit(I);
17176 if (Idx != PoisonMaskElem) {
17177 Idx -= NewVF;
17178 IsIdentity &= static_cast<int>(I) == Idx;
17179 }
17180 }
17181 }
17182 if (!IsIdentity)
17183 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
17184 SecondShuffleCost += getScalarizationOverhead(
17185 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17186 /*Extract=*/false, CostKind);
17187 }
17188 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17189 for (auto [I, Idx] : enumerate(SubMask))
17190 if (Idx == PoisonMaskElem)
17191 DemandedElts.clearBit(I);
17192 InstructionCost BuildVectorCost = getScalarizationOverhead(
17193 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17194 /*Extract=*/false, CostKind);
17195 const TreeEntry *BestEntry = nullptr;
17196 if (FirstShuffleCost < ShuffleCost) {
17197 std::for_each(std::next(Mask.begin(), Part * VL.size()),
17198 std::next(Mask.begin(), (Part + 1) * VL.size()),
17199 [&](int &Idx) {
17200 if (Idx >= static_cast<int>(VF))
17201 Idx = PoisonMaskElem;
17202 });
17203 BestEntry = Entries.front();
17204 ShuffleCost = FirstShuffleCost;
17205 }
17206 if (SecondShuffleCost < ShuffleCost) {
17207 std::for_each(std::next(Mask.begin(), Part * VL.size()),
17208 std::next(Mask.begin(), (Part + 1) * VL.size()),
17209 [&](int &Idx) {
17210 if (Idx < static_cast<int>(VF))
17211 Idx = PoisonMaskElem;
17212 else
17213 Idx -= VF;
17214 });
17215 BestEntry = Entries[1];
17216 ShuffleCost = SecondShuffleCost;
17217 }
17218 if (BuildVectorCost >= ShuffleCost) {
17219 if (BestEntry) {
17220 Entries.clear();
17221 Entries.push_back(BestEntry);
17222 }
17223 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
17224 : TargetTransformInfo::SK_PermuteSingleSrc;
17225 }
17226 }
17227 Entries.clear();
17228 // Clear the corresponding mask elements.
17229 std::fill(std::next(Mask.begin(), Part * VL.size()),
17230 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
17231 return std::nullopt;
17232}
17233
17234SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
17235BoUpSLP::isGatherShuffledEntry(
17236 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
17237 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
17238 bool ForOrder) {
17239 assert(NumParts > 0 && NumParts < VL.size() &&
17240 "Expected positive number of registers.");
17241 Entries.clear();
17242 // No need to check for the topmost gather node.
17243 if (TE == VectorizableTree.front().get() &&
17244 (!GatheredLoadsEntriesFirst.has_value() ||
17245 none_of(ArrayRef(VectorizableTree).drop_front(),
17246 [](const std::unique_ptr<TreeEntry> &TE) {
17247 return !TE->isGather();
17248 })))
17249 return {};
17250 // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
17251 // implemented yet.
17252 if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
17253 return {};
17254 Mask.assign(VL.size(), PoisonMaskElem);
17255 assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
17256 "Expected only single user of the gather node.");
17257 assert(VL.size() % NumParts == 0 &&
17258 "Number of scalars must be divisible by NumParts.");
17259 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
17260 TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
17261 (TE->Idx == 0 ||
17262 (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
17263 isSplat(TE->Scalars) ||
17264 (TE->hasState() &&
17265 getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars))))
17266 return {};
17267 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
17268 SmallVector<std::optional<TTI::ShuffleKind>> Res;
17269 for (unsigned Part : seq<unsigned>(NumParts)) {
17270 ArrayRef<Value *> SubVL =
17271 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
17272 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
17273 std::optional<TTI::ShuffleKind> SubRes =
17274 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
17275 ForOrder);
17276 if (!SubRes)
17277 SubEntries.clear();
17278 Res.push_back(SubRes);
17279 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
17280 SubEntries.front()->getVectorFactor() == VL.size() &&
17281 (SubEntries.front()->isSame(TE->Scalars) ||
17282 SubEntries.front()->isSame(VL))) {
17283 SmallVector<const TreeEntry *> LocalSubEntries;
17284 LocalSubEntries.swap(SubEntries);
17285 Entries.clear();
17286 Res.clear();
17287 std::iota(Mask.begin(), Mask.end(), 0);
17288 // Clear undef scalars.
17289 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
17290 if (isa<PoisonValue>(VL[I]))
17291 Mask[I] = PoisonMaskElem;
17292 Entries.emplace_back(1, LocalSubEntries.front());
17293 Res.push_back(TTI::SK_PermuteSingleSrc);
17294 return Res;
17295 }
17296 }
17297 if (all_of(Res,
17298 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
17299 Entries.clear();
17300 return {};
17301 }
17302 return Res;
17303}
17304
17305InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
17306 Type *ScalarTy) const {
17307 const unsigned VF = VL.size();
17308 auto *VecTy = getWidenedType(ScalarTy, VF);
17309 // Find the cost of inserting/extracting values from the vector.
17310 // Check if the same elements are inserted several times and count them as
17311 // shuffle candidates.
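 // Illustrative sketch (hypothetical scalars, ForPoisonSrc == false): for
 // VL = {%a, 3, poison, %b} the non-constants %a and %b become demanded
 // insert positions {0, 3}, the constant 3 is blended in from a constant
 // vector via the two-source mask <0, 5, 2, 3> (elements >= VF select from
 // the constant vector), and the poison lane costs nothing.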
17312 APInt DemandedElements = APInt::getZero(VF);
17313 InstructionCost Cost;
17314 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17315 auto EstimateInsertCost = [&](unsigned I, Value *V) {
17316 DemandedElements.setBit(I);
17317 if (V->getType() != ScalarTy)
17318 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
17319 TTI::CastContextHint::None, CostKind);
17320 };
17321 SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
17322 std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
17323 for (auto [I, V] : enumerate(VL)) {
17324 // No need to shuffle duplicates for constants.
17325 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V))
17326 continue;
17327
17328 if (isConstant(V)) {
17329 ConstantShuffleMask[I] = I + VF;
17330 continue;
17331 }
17332 EstimateInsertCost(I, V);
17333 }
17334 // FIXME: add a cost for constant vector materialization.
17335 bool IsAnyNonUndefConst =
17336 any_of(VL, [](Value *V) { return !isa<UndefValue>(V) && isConstant(V); });
17337 // 1. Shuffle input source vector and constant vector.
17338 if (!ForPoisonSrc && IsAnyNonUndefConst) {
17339 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
17340 ConstantShuffleMask);
17341 }
17342
17343 // 2. Insert unique non-constants.
17344 if (!DemandedElements.isZero())
17345 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
17346 /*Insert=*/true,
17347 /*Extract=*/false, CostKind,
17348 ForPoisonSrc && !IsAnyNonUndefConst, VL);
17349 return Cost;
17350}
17351
17352Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
17353 auto It = EntryToLastInstruction.find(E);
17354 if (It != EntryToLastInstruction.end())
17355 return *cast<Instruction>(It->second);
17356 Instruction *Res = nullptr;
17357 // Get the basic block this bundle is in. All instructions in the bundle
17358 // should be in this block (except for extractelement-like instructions with
17359 // constant indices or gathered loads or copyables).
17360 Instruction *Front;
17361 unsigned Opcode;
17362 if (E->hasState()) {
17363 Front = E->getMainOp();
17364 Opcode = E->getOpcode();
17365 } else {
17366 Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>));
17367 Opcode = Front->getOpcode();
17368 }
17369 auto *BB = Front->getParent();
17370 assert(
17371 ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
17372 E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
17373 E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
17374 all_of(E->Scalars,
17375 [=](Value *V) -> bool {
17376 if (Opcode == Instruction::GetElementPtr &&
17377 !isa<GetElementPtrInst>(V))
17378 return true;
17379 auto *I = dyn_cast<Instruction>(V);
17380 return !I || !E->getMatchingMainOpOrAltOp(I) ||
17381 I->getParent() == BB || isVectorLikeInstWithConstOps(I);
17382 })) &&
17383 "Expected gathered loads or GEPs or instructions from same basic "
17384 "block.");
17385
17386 auto FindLastInst = [&]() {
17387 Instruction *LastInst = Front;
17388 for (Value *V : E->Scalars) {
17389 auto *I = dyn_cast<Instruction>(V);
17390 if (!I)
17391 continue;
17392 if (E->isCopyableElement(I))
17393 continue;
17394 if (LastInst->getParent() == I->getParent()) {
17395 if (LastInst->comesBefore(I))
17396 LastInst = I;
17397 continue;
17398 }
17399 assert(((Opcode == Instruction::GetElementPtr &&
17400 !isa<GetElementPtrInst>(I)) ||
17401 E->State == TreeEntry::SplitVectorize ||
17402 (isVectorLikeInstWithConstOps(LastInst) &&
17403 isVectorLikeInstWithConstOps(I)) ||
17404 (GatheredLoadsEntriesFirst.has_value() &&
17405 Opcode == Instruction::Load && E->isGather() &&
17406 E->Idx < *GatheredLoadsEntriesFirst)) &&
17407 "Expected vector-like or non-GEP in GEP node insts only.");
17408 if (!DT->isReachableFromEntry(LastInst->getParent())) {
17409 LastInst = I;
17410 continue;
17411 }
17412 if (!DT->isReachableFromEntry(I->getParent()))
17413 continue;
17414 auto *NodeA = DT->getNode(LastInst->getParent());
17415 auto *NodeB = DT->getNode(I->getParent());
17416 assert(NodeA && "Should only process reachable instructions");
17417 assert(NodeB && "Should only process reachable instructions");
17418 assert((NodeA == NodeB) ==
17419 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
17420 "Different nodes should have different DFS numbers");
17421 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
17422 LastInst = I;
17423 }
17424 BB = LastInst->getParent();
17425 return LastInst;
17426 };
17427
17428 auto FindFirstInst = [&]() {
17429 Instruction *FirstInst = Front;
17430 for (Value *V : E->Scalars) {
17431 auto *I = dyn_cast<Instruction>(V);
17432 if (!I)
17433 continue;
17434 if (E->isCopyableElement(I))
17435 continue;
17436 if (FirstInst->getParent() == I->getParent()) {
17437 if (I->comesBefore(FirstInst))
17438 FirstInst = I;
17439 continue;
17440 }
17441 assert(((Opcode == Instruction::GetElementPtr &&
17442 !isa<GetElementPtrInst>(I)) ||
17443 (isVectorLikeInstWithConstOps(FirstInst) &&
17444 isVectorLikeInstWithConstOps(I))) &&
17445 "Expected vector-like or non-GEP in GEP node insts only.");
17446 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
17447 FirstInst = I;
17448 continue;
17449 }
17450 if (!DT->isReachableFromEntry(I->getParent()))
17451 continue;
17452 auto *NodeA = DT->getNode(FirstInst->getParent());
17453 auto *NodeB = DT->getNode(I->getParent());
17454 assert(NodeA && "Should only process reachable instructions");
17455 assert(NodeB && "Should only process reachable instructions");
17456 assert((NodeA == NodeB) ==
17457 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
17458 "Different nodes should have different DFS numbers");
17459 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
17460 FirstInst = I;
17461 }
17462 return FirstInst;
17463 };
17464
17465 if (E->State == TreeEntry::SplitVectorize) {
17466 Res = FindLastInst();
17467 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(Res); !Entries.empty()) {
17468 for (auto *E : Entries) {
17469 auto *I = dyn_cast_or_null<Instruction>(E->VectorizedValue);
17470 if (!I)
17471 I = &getLastInstructionInBundle(E);
17472 if (Res->getParent() == I->getParent() && Res->comesBefore(I))
17473 Res = I;
17474 }
17475 }
17476 EntryToLastInstruction.try_emplace(E, Res);
17477 return *Res;
17478 }
17479
17480 // Set the insert point for gathered loads to the very first load.
17481 if (GatheredLoadsEntriesFirst.has_value() &&
17482 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
17483 Opcode == Instruction::Load) {
17484 Res = FindFirstInst();
17485 EntryToLastInstruction.try_emplace(E, Res);
17486 return *Res;
17487 }
17488
17489 // Set the insert point to the beginning of the basic block if the entry
17490 // should not be scheduled.
17491 auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
17492 if (E->isGather())
17493 return nullptr;
17494 // It was found previously that the instructions do not need to be scheduled.
17495 const auto *It = BlocksSchedules.find(BB);
17496 if (It == BlocksSchedules.end())
17497 return nullptr;
17498 for (Value *V : E->Scalars) {
17499 auto *I = dyn_cast<Instruction>(V);
17500 if (!I || isa<PHINode>(I) ||
17501 (!E->isCopyableElement(I) && doesNotNeedToBeScheduled(I)))
17502 continue;
17503 ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
17504 if (Bundles.empty())
17505 continue;
17506 const auto *It = find_if(
17507 Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
17508 if (It != Bundles.end())
17509 return *It;
17510 }
17511 return nullptr;
17512 };
17513 const ScheduleBundle *Bundle = FindScheduleBundle(E);
17514 if (!E->isGather() && !Bundle) {
17515 if ((Opcode == Instruction::GetElementPtr &&
17516 any_of(E->Scalars,
17517 [](Value *V) {
17518 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
17519 })) ||
17520 all_of(E->Scalars, [&](Value *V) {
17521 return isa<PoisonValue>(V) ||
17522 (E->Idx == 0 && isa<InsertElementInst>(V)) ||
17523 E->isCopyableElement(V) ||
17524 (!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V));
17525 }))
17526 Res = FindLastInst();
17527 else
17528 Res = FindFirstInst();
17529 EntryToLastInstruction.try_emplace(E, Res);
17530 return *Res;
17531 }
17532
17533 // Find the last instruction. The common case should be that BB has been
17534 // scheduled, and the last instruction is VL.back(). So we start with
17535 // VL.back() and iterate over schedule data until we reach the end of the
17536 // bundle. The end of the bundle is marked by null ScheduleData.
17537 if (Bundle) {
17538 assert(!E->isGather() && "Gathered instructions should not be scheduled");
17539 Res = Bundle->getBundle().back()->getInst();
17540 EntryToLastInstruction.try_emplace(E, Res);
17541 return *Res;
17542 }
17543
17544 // LastInst can still be null at this point if there's either not an entry
17545 // for BB in BlocksSchedules or there's no ScheduleData available for
17546 // VL.back(). This can be the case if buildTreeRec aborts for various
17547 // reasons (e.g., the maximum recursion depth is reached, the maximum region
17548 // size is reached, etc.). ScheduleData is initialized in the scheduling
17549 // "dry-run".
17550 //
17551 // If this happens, we can still find the last instruction by brute force. We
17552 // iterate forwards from Front (inclusive) until we either see all
17553 // instructions in the bundle or reach the end of the block. If Front is the
17554 // last instruction in program order, LastInst will be set to Front, and we
17555 // will visit all the remaining instructions in the block.
17556 //
17557 // One of the reasons we exit early from buildTreeRec is to place an upper
17558 // bound on compile-time. Thus, taking an additional compile-time hit here is
17559 // not ideal. However, this should be exceedingly rare since it requires that
17560 // we both exit early from buildTreeRec and that the bundle be out-of-order
17561 // (causing us to iterate all the way to the end of the block).
17562 if (!Res)
17563 Res = FindLastInst();
17564 assert(Res && "Failed to find last instruction in bundle");
17565 EntryToLastInstruction.try_emplace(E, Res);
17566 return *Res;
17567}
17568
17569void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
17570 auto *Front = E->getMainOp();
17571 Instruction *LastInst = &getLastInstructionInBundle(E);
17572 assert(LastInst && "Failed to find last instruction in bundle");
17573 BasicBlock::iterator LastInstIt = LastInst->getIterator();
17574 // If the instruction is PHI, set the insert point after all the PHIs.
17575 bool IsPHI = isa<PHINode>(LastInst);
17576 if (IsPHI) {
17577 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
17578 if (LastInstIt != LastInst->getParent()->end() &&
17579 LastInstIt->getParent()->isLandingPad())
17580 LastInstIt = std::next(LastInstIt);
17581 }
17582 if (IsPHI ||
17583 (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
17584 E->doesNotNeedToSchedule()) ||
17585 (GatheredLoadsEntriesFirst.has_value() &&
17586 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
17587 E->getOpcode() == Instruction::Load)) {
17588 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
17589 } else {
17590 // Set the insertion point after the last instruction in the bundle. Set the
17591 // debug location to Front.
17592 Builder.SetInsertPoint(
17593 LastInst->getParent(),
17594 LastInst->getNextNode()->getIterator());
17595 }
17596 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
17597}
17598
17599Value *BoUpSLP::gather(
17600 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
17601 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
17602 // List of instructions/lanes from the current block and/or the blocks which
17603 // are part of the current loop. These instructions will be inserted at the
17604 // end to make it possible to optimize loops and to hoist invariant
17605 // instructions out of the loop body with better chances for success.
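 // Illustrative sketch: if the buildvector mixes loop-invariant scalars with
 // a value defined inside the current loop, the loop-defined scalar is
 // appended last so that the invariant part of the insertelement chain can
 // still be hoisted out of the loop.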
17606 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
17607 SmallSet<int, 4> PostponedIndices;
17608 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
17609 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
17610 SmallPtrSet<BasicBlock *, 4> Visited;
17611 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
17612 InsertBB = InsertBB->getSinglePredecessor();
17613 return InsertBB && InsertBB == InstBB;
17614 };
17615 for (int I = 0, E = VL.size(); I < E; ++I) {
17616 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
17617 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
17618 isVectorized(Inst) ||
17619 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
17620 PostponedIndices.insert(I).second)
17621 PostponedInsts.emplace_back(Inst, I);
17622 }
17623
17624 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
17625 Type *Ty) {
17626 Value *Scalar = V;
17627 if (Scalar->getType() != Ty) {
17628 assert(Scalar->getType()->isIntOrIntVectorTy() &&
17629 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
17630 Value *V = Scalar;
17631 if (auto *CI = dyn_cast<CastInst>(Scalar);
17632 isa_and_present<SExtInst, ZExtInst>(CI)) {
17633 Value *Op = CI->getOperand(0);
17634 if (auto *IOp = dyn_cast<Instruction>(Op);
17635 !IOp || !(isDeleted(IOp) || isVectorized(IOp)))
17636 V = Op;
17637 }
17638 Scalar = Builder.CreateIntCast(
17639 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
17640 }
17641
17642 Instruction *InsElt;
17643 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
17644 assert(SLPReVec && "FixedVectorType is not expected.");
17645 Vec =
17646 createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy));
17647 auto *II = dyn_cast<Instruction>(Vec);
17648 if (!II)
17649 return Vec;
17650 InsElt = II;
17651 } else {
17652 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
17653 InsElt = dyn_cast<InsertElementInst>(Vec);
17654 if (!InsElt)
17655 return Vec;
17656 }
17657 GatherShuffleExtractSeq.insert(InsElt);
17658 CSEBlocks.insert(InsElt->getParent());
17659 // Add to our 'need-to-extract' list.
17660 if (isa<Instruction>(V)) {
17661 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V); !Entries.empty()) {
17662 // Find which lane we need to extract.
17663 User *UserOp = nullptr;
17664 if (Scalar != V) {
17665 if (auto *SI = dyn_cast<Instruction>(Scalar))
17666 UserOp = SI;
17667 } else {
17668 if (V->getType()->isVectorTy()) {
17669 if (auto *SV = dyn_cast<ShuffleVectorInst>(InsElt);
17670 SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
17671 // Find shufflevector, caused by resize.
17672 auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
17673 if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
17674 if (SV->getOperand(0) == V)
17675 return SV;
17676 if (SV->getOperand(1) == V)
17677 return SV;
17678 }
17679 return nullptr;
17680 };
17681 InsElt = nullptr;
17682 if (Instruction *User = FindOperand(SV->getOperand(0), V))
17683 InsElt = User;
17684 else if (Instruction *User = FindOperand(SV->getOperand(1), V))
17685 InsElt = User;
17686 assert(InsElt &&
17687 "Failed to find shufflevector, caused by resize.");
17688 }
17689 }
17690 UserOp = InsElt;
17691 }
17692 if (UserOp) {
17693 unsigned FoundLane = Entries.front()->findLaneForValue(V);
17694 ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
17695 }
17696 }
17697 }
17698 return Vec;
17699 };
17700 auto *VecTy = getWidenedType(ScalarTy, VL.size());
17701 Value *Vec = PoisonValue::get(VecTy);
17702 SmallVector<int> NonConsts;
17703 SmallVector<int> Mask(VL.size());
17704 std::iota(Mask.begin(), Mask.end(), 0);
17705 Value *OriginalRoot = Root;
17706 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
17707 SV && isa<PoisonValue>(SV->getOperand(1)) &&
17708 SV->getOperand(0)->getType() == VecTy) {
17709 Root = SV->getOperand(0);
17710 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
17711 }
17712 // Insert constant values at first.
17713 for (int I = 0, E = VL.size(); I < E; ++I) {
17714 if (PostponedIndices.contains(I))
17715 continue;
17716 if (!isConstant(VL[I])) {
17717 NonConsts.push_back(I);
17718 continue;
17719 }
17720 if (isa<PoisonValue>(VL[I]))
17721 continue;
17722 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
17723 Mask[I] = I + E;
17724 }
17725 if (Root) {
17726 if (isa<PoisonValue>(Vec)) {
17727 Vec = OriginalRoot;
17728 } else {
17729 Vec = CreateShuffle(Root, Vec, Mask);
17730 if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
17731 OI && OI->use_empty() &&
17732 none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
17733 return TE->VectorizedValue == OI;
17734 }))
17735 eraseInstruction(OI);
17736 }
17737 }
17738 // Insert non-constant values.
17739 for (int I : NonConsts)
17740 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
17741 // Append instructions which are/may be part of the loop at the end to make
17742 // it possible to hoist non-loop-based instructions.
17743 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
17744 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
17745
17746 return Vec;
17747}
17748
17749/// Merges shuffle masks and emits final shuffle instruction, if required. It
17750/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
17751/// when the actual shuffle instruction is generated only if this is actually
17752/// required. Otherwise, the shuffle instruction emission is delayed till the
17753/// end of the process, to reduce the number of emitted instructions and further
17754/// analysis/transformations.
17755/// The class also will look through the previously emitted shuffle instructions
17756/// and properly mark indices in mask as undef.
17757/// For example, given the code
17758/// \code
17759/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
17760/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
17761/// \endcode
17762/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
17763/// look through %s1 and %s2 and emit
17764/// \code
17765/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
17766/// \endcode
17767/// instead.
17768/// If 2 operands are of different size, the smallest one will be resized and
17769/// the mask recalculated properly.
17770/// For example, given the code
17771/// \code
17772/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
17773/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
17774/// \endcode
17775/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
17776/// look through %s1 and %s2 and emit
17777/// \code
17778/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
17779/// \endcode
17780/// instead.
17781class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
17782 bool IsFinalized = false;
17783 /// Combined mask for all applied operands and masks. It is built during
17784 /// analysis and actual emission of shuffle vector instructions.
17785 SmallVector<int> CommonMask;
17786 /// List of operands for the shuffle vector instruction. It holds at most 2
17787 /// operands; if a 3rd one is going to be added, the first 2 are combined into
17788 /// a shuffle with the \p CommonMask mask, the first operand is set to be the
17789 /// resulting shuffle and the second operand is set to be the newly added
17790 /// operand. The \p CommonMask is transformed in the proper way after that.
17791 SmallVector<Value *, 2> InVectors;
17792 IRBuilderBase &Builder;
17793 BoUpSLP &R;
17794
17795 class ShuffleIRBuilder {
17796 IRBuilderBase &Builder;
17797 /// Holds all of the instructions that we gathered.
17798 SetVector<Instruction *> &GatherShuffleExtractSeq;
17799 /// A list of blocks that we are going to CSE.
17800 DenseSet<BasicBlock *> &CSEBlocks;
17801 /// Data layout.
17802 const DataLayout &DL;
17803
17804 public:
17805 ShuffleIRBuilder(IRBuilderBase &Builder,
17806 SetVector<Instruction *> &GatherShuffleExtractSeq,
17807 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
17808 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
17809 CSEBlocks(CSEBlocks), DL(DL) {}
17810 ~ShuffleIRBuilder() = default;
17811 /// Creates shufflevector for the 2 operands with the given mask.
17812 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
17813 if (V1->getType() != V2->getType()) {
17814 assert(V1->getType()->isIntOrIntVectorTy() &&
17815 V2->getType()->isIntOrIntVectorTy() &&
17816 "Expected integer vector types only.");
17817 if (V1->getType() != V2->getType()) {
17818 if (cast<VectorType>(V2->getType())
17819 ->getElementType()
17820 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
17821 ->getElementType()
17822 ->getIntegerBitWidth())
17823 V2 = Builder.CreateIntCast(
17824 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
17825 else
17826 V1 = Builder.CreateIntCast(
17827 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
17828 }
17829 }
17830 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
17831 if (auto *I = dyn_cast<Instruction>(Vec)) {
17832 GatherShuffleExtractSeq.insert(I);
17833 CSEBlocks.insert(I->getParent());
17834 }
17835 return Vec;
17836 }
17837 /// Creates a permutation of the single vector operand with the given mask,
17838 /// unless it is an identity mask.
17839 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
17840 if (Mask.empty())
17841 return V1;
17842 unsigned VF = Mask.size();
17843 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
17844 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
17845 return V1;
17846 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
17847 if (auto *I = dyn_cast<Instruction>(Vec)) {
17848 GatherShuffleExtractSeq.insert(I);
17849 CSEBlocks.insert(I->getParent());
17850 }
17851 return Vec;
17852 }
17853 Value *createIdentity(Value *V) { return V; }
17854 Value *createPoison(Type *Ty, unsigned VF) {
17855 return PoisonValue::get(getWidenedType(Ty, VF));
17856 }
17857 /// Resizes the 2 input vectors to matching sizes, if they are not equal
17858 /// yet. The smaller vector is resized to the size of the larger one.
17859 void resizeToMatch(Value *&V1, Value *&V2) {
17860 if (V1->getType() == V2->getType())
17861 return;
17862 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
17863 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
17864 int VF = std::max(V1VF, V2VF);
17865 int MinVF = std::min(V1VF, V2VF);
17866 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
17867 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
17868 0);
17869 Value *&Op = MinVF == V1VF ? V1 : V2;
17870 Op = Builder.CreateShuffleVector(Op, IdentityMask);
17871 if (auto *I = dyn_cast<Instruction>(Op)) {
17872 GatherShuffleExtractSeq.insert(I);
17873 CSEBlocks.insert(I->getParent());
17874 }
17875 if (MinVF == V1VF)
17876 V1 = Op;
17877 else
17878 V2 = Op;
17879 }
17880 };
17881
17882 /// Smart shuffle instruction emission, walks through shuffles trees and
17883 /// tries to find the best matching vector for the actual shuffle
17884 /// instruction.
17885 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
17886 assert(V1 && "Expected at least one vector value.");
17887 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
17888 R.CSEBlocks, *R.DL);
17889 return BaseShuffleAnalysis::createShuffle<Value *>(
17890 V1, V2, Mask, ShuffleBuilder, ScalarTy);
17891 }
17892
17893 /// Cast value \p V to the vector type with the same number of elements, but
17894 /// the base type \p ScalarTy.
17895 Value *castToScalarTyElem(Value *V,
17896 std::optional<bool> IsSigned = std::nullopt) {
17897 auto *VecTy = cast<VectorType>(V->getType());
17898 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
17899 if (VecTy->getElementType() == ScalarTy->getScalarType())
17900 return V;
17901 return Builder.CreateIntCast(
17902 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
17903 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
17904 }
17905
17906 Value *getVectorizedValue(const TreeEntry &E) {
17907 Value *Vec = E.VectorizedValue;
17908 if (!Vec->getType()->isIntOrIntVectorTy())
17909 return Vec;
17910 return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
17911 return !isa<PoisonValue>(V) &&
17912 !isKnownNonNegative(
17913 V, SimplifyQuery(*R.DL));
17914 }));
17915 }
17916
17917public:
17918 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
17919 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
17920
17921 /// Adjusts extractelements after reusing them.
17922 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
17923 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
17924 unsigned NumParts, bool &UseVecBaseAsInput) {
17925 UseVecBaseAsInput = false;
17926 SmallPtrSet<Value *, 4> UniqueBases;
17927 Value *VecBase = nullptr;
17928 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
17929 if (!E->ReorderIndices.empty()) {
17930 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
17931 E->ReorderIndices.end());
17932 reorderScalars(VL, ReorderMask);
17933 }
17934 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
17935 int Idx = Mask[I];
17936 if (Idx == PoisonMaskElem)
17937 continue;
17938 auto *EI = cast<ExtractElementInst>(VL[I]);
17939 VecBase = EI->getVectorOperand();
17940 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
17941 VecBase = TEs.front()->VectorizedValue;
17942 assert(VecBase && "Expected vectorized value.");
17943 UniqueBases.insert(VecBase);
17944 // If the only use is vectorized, the extractelement itself can be
17945 // deleted.
17946 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
17947 (NumParts != 1 && count(VL, EI) > 1) ||
17948 any_of(EI->users(), [&](User *U) {
17949 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
17950 return UTEs.empty() || UTEs.size() > 1 ||
17951 (isa<GetElementPtrInst>(U) &&
17952 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
17953 (!UTEs.empty() &&
17954 count_if(R.VectorizableTree,
17955 [&](const std::unique_ptr<TreeEntry> &TE) {
17956 return TE->UserTreeIndex.UserTE ==
17957 UTEs.front() &&
17958 is_contained(VL, EI);
17959 }) != 1);
17960 }))
17961 continue;
17962 R.eraseInstruction(EI);
17963 }
17964 if (NumParts == 1 || UniqueBases.size() == 1) {
17965 assert(VecBase && "Expected vectorized value.");
17966 return castToScalarTyElem(VecBase);
17967 }
17968 UseVecBaseAsInput = true;
17969 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
17970 for (auto [I, Idx] : enumerate(Mask))
17971 if (Idx != PoisonMaskElem)
17972 Idx = I;
17973 };
17974 // Perform a multi-register vector shuffle, joining the parts into a single
17975 // virtual long vector.
17976 // Each part is shuffled independently and then all the parts are inserted
17977 // into a long virtual vector register, forming the original vector.
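// For example, with NumParts == 2 and an 8-lane mask, lanes [0, 4) are
// shuffled from the extract bases of the first half and lanes [4, 8) from the
// bases of the second half; each sub-shuffle is then folded into VecMask and a
// final two-source shuffle produces the combined vector.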
17978 Value *Vec = nullptr;
17979 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
17980 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
17981 for (unsigned Part : seq<unsigned>(NumParts)) {
17982 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
17983 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
17984 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
17985 constexpr int MaxBases = 2;
17986 SmallVector<Value *, MaxBases> Bases(MaxBases);
17987 auto VLMask = zip(SubVL, SubMask);
17988 const unsigned VF = std::accumulate(
17989 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
17990 if (std::get<1>(D) == PoisonMaskElem)
17991 return S;
17992 Value *VecOp =
17993 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
17994 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
17995 !TEs.empty())
17996 VecOp = TEs.front()->VectorizedValue;
17997 assert(VecOp && "Expected vectorized value.");
17998 const unsigned Size =
17999 cast<FixedVectorType>(VecOp->getType())->getNumElements();
18000 return std::max(S, Size);
18001 });
18002 for (const auto [V, I] : VLMask) {
18003 if (I == PoisonMaskElem)
18004 continue;
18005 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
18006 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
18007 VecOp = TEs.front()->VectorizedValue;
18008 assert(VecOp && "Expected vectorized value.");
18009 VecOp = castToScalarTyElem(VecOp);
18010 Bases[I / VF] = VecOp;
18011 }
18012 if (!Bases.front())
18013 continue;
18014 Value *SubVec;
18015 if (Bases.back()) {
18016 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
18017 TransformToIdentity(SubMask);
18018 } else {
18019 SubVec = Bases.front();
18020 }
18021 if (!Vec) {
18022 Vec = SubVec;
18023 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
18024 [&](unsigned P) {
18025 ArrayRef<int> SubMask =
18026 Mask.slice(P * SliceSize,
18027 getNumElems(Mask.size(),
18028 SliceSize, P));
18029 return all_of(SubMask, [](int Idx) {
18030 return Idx == PoisonMaskElem;
18031 });
18032 })) &&
18033 "Expected first part or all previous parts masked.");
18034 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18035 } else {
18036 unsigned NewVF =
18037 cast<FixedVectorType>(Vec->getType())->getNumElements();
18038 if (Vec->getType() != SubVec->getType()) {
18039 unsigned SubVecVF =
18040 cast<FixedVectorType>(SubVec->getType())->getNumElements();
18041 NewVF = std::max(NewVF, SubVecVF);
18042 }
18043 // Adjust SubMask.
18044 for (int &Idx : SubMask)
18045 if (Idx != PoisonMaskElem)
18046 Idx += NewVF;
18047 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18048 Vec = createShuffle(Vec, SubVec, VecMask);
18049 TransformToIdentity(VecMask);
18050 }
18051 }
18052 copy(VecMask, Mask.begin());
18053 return Vec;
18054 }
18055 /// Checks if the specified entry \p E needs to be delayed because of its
18056 /// dependency nodes.
18057 std::optional<Value *>
18058 needToDelay(const TreeEntry *E,
18059 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
18060 // No need to delay emission if all deps are ready.
18061 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
18062 return all_of(
18063 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
18064 }))
18065 return std::nullopt;
18066 // Postpone gather emission, will be emitted after the end of the
18067 // process to keep correct order.
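// The placeholder below is a load of the right vector type from a poison
// pointer: it only reserves a value of the correct type and is replaced by
// the real gather once all of its dependencies have been vectorized.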
18068 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
18069 return Builder.CreateAlignedLoad(
18070 ResVecTy,
18071 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
18072 MaybeAlign());
18073 }
18074 /// Reset the builder to handle perfect diamond match.
18075 void resetForSameNode() {
18076 IsFinalized = false;
18077 CommonMask.clear();
18078 InVectors.clear();
18079 }
18080 /// Adds 2 input vectors (in form of tree entries) and the mask for their
18081 /// shuffling.
18082 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
18083 Value *V1 = getVectorizedValue(E1);
18084 Value *V2 = getVectorizedValue(E2);
18085 add(V1, V2, Mask);
18086 }
18087 /// Adds single input vector (in form of tree entry) and the mask for its
18088 /// shuffling.
18089 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
18090 Value *V1 = getVectorizedValue(E1);
18091 add(V1, Mask);
18092 }
18093 /// Adds 2 input vectors and the mask for their shuffling.
18094 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
18095 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
18096 assert(isa<FixedVectorType>(V1->getType()) &&
18097 isa<FixedVectorType>(V2->getType()) &&
18098 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
18099 V1 = castToScalarTyElem(V1);
18100 V2 = castToScalarTyElem(V2);
18101 if (InVectors.empty()) {
18102 InVectors.push_back(V1);
18103 InVectors.push_back(V2);
18104 CommonMask.assign(Mask.begin(), Mask.end());
18105 return;
18106 }
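// Two inputs are already pending (or the single pending vector does not
// match the incoming mask width), so fold the pending inputs through
// CommonMask first; this frees a slot for the vector produced from the new
// V1/V2 pair below.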
18107 Value *Vec = InVectors.front();
18108 if (InVectors.size() == 2) {
18109 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18110 transformMaskAfterShuffle(CommonMask, CommonMask);
18111 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
18112 Mask.size()) {
18113 Vec = createShuffle(Vec, nullptr, CommonMask);
18114 transformMaskAfterShuffle(CommonMask, CommonMask);
18115 }
18116 V1 = createShuffle(V1, V2, Mask);
18117 unsigned VF = std::max(getVF(V1), getVF(Vec));
18118 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18119 if (Mask[Idx] != PoisonMaskElem)
18120 CommonMask[Idx] = Idx + VF;
18121 InVectors.front() = Vec;
18122 if (InVectors.size() == 2)
18123 InVectors.back() = V1;
18124 else
18125 InVectors.push_back(V1);
18126 }
18127 /// Adds one more input vector and the mask for its shuffling.
18128 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
18129 assert(isa<FixedVectorType>(V1->getType()) &&
18130 "castToScalarTyElem expects V1 to be FixedVectorType");
18131 V1 = castToScalarTyElem(V1);
18132 if (InVectors.empty()) {
18133 InVectors.push_back(V1);
18134 CommonMask.assign(Mask.begin(), Mask.end());
18135 return;
18136 }
18137 const auto *It = find(InVectors, V1);
18138 if (It == InVectors.end()) {
18139 if (InVectors.size() == 2 ||
18140 InVectors.front()->getType() != V1->getType()) {
18141 Value *V = InVectors.front();
18142 if (InVectors.size() == 2) {
18143 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
18144 transformMaskAfterShuffle(CommonMask, CommonMask);
18145 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
18146 CommonMask.size()) {
18147 V = createShuffle(InVectors.front(), nullptr, CommonMask);
18148 transformMaskAfterShuffle(CommonMask, CommonMask);
18149 }
18150 unsigned VF = std::max(CommonMask.size(), Mask.size());
18151 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18152 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
18153 CommonMask[Idx] = V->getType() != V1->getType()
18154 ? Idx + VF
18155 : Mask[Idx] + getVF(V1);
18156 if (V->getType() != V1->getType())
18157 V1 = createShuffle(V1, nullptr, Mask);
18158 InVectors.front() = V;
18159 if (InVectors.size() == 2)
18160 InVectors.back() = V1;
18161 else
18162 InVectors.push_back(V1);
18163 return;
18164 }
18165 // Check if second vector is required if the used elements are already
18166 // used from the first one.
18167 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18168 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
18169 InVectors.push_back(V1);
18170 break;
18171 }
18172 }
18173 unsigned VF = 0;
18174 for (Value *V : InVectors)
18175 VF = std::max(VF, getVF(V));
18176 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18177 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
18178 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
18179 }
18180 /// Adds one more input vector and the reordering to be applied to it.
18181 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
18182 SmallVector<int> NewMask;
18183 inversePermutation(Order, NewMask);
18184 add(V1, NewMask);
18185 }
18186 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
18187 Value *Root = nullptr) {
18188 return R.gather(VL, Root, ScalarTy,
18189 [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
18190 return createShuffle(V1, V2, Mask);
18191 });
18192 }
18193 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
18194 /// Finalize emission of the shuffles.
18195 /// \param Action the action (if any) to be performed before the final
18196 /// application of the \p ExtMask mask.
18197 Value *finalize(
18198 ArrayRef<int> ExtMask,
18199 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
18200 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
18201 function_ref<void(Value *&, SmallVectorImpl<int> &,
18202 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
18203 Action = {}) {
18204 IsFinalized = true;
18205 if (Action) {
18206 Value *Vec = InVectors.front();
18207 if (InVectors.size() == 2) {
18208 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18209 InVectors.pop_back();
18210 } else {
18211 Vec = createShuffle(Vec, nullptr, CommonMask);
18212 }
18213 transformMaskAfterShuffle(CommonMask, CommonMask);
18214 assert(VF > 0 &&
18215 "Expected vector length for the final value before action.");
18216 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
18217 if (VecVF < VF) {
18218 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
18219 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
18220 Vec = createShuffle(Vec, nullptr, ResizeMask);
18221 }
18222 Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
18223 return createShuffle(V1, V2, Mask);
18224 });
18225 InVectors.front() = Vec;
18226 }
18227 if (!SubVectors.empty()) {
18228 Value *Vec = InVectors.front();
18229 if (InVectors.size() == 2) {
18230 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18231 InVectors.pop_back();
18232 } else {
18233 Vec = createShuffle(Vec, nullptr, CommonMask);
18234 }
18235 transformMaskAfterShuffle(CommonMask, CommonMask);
18236 auto CreateSubVectors = [&](Value *Vec,
18237 SmallVectorImpl<int> &CommonMask) {
18238 for (auto [E, Idx] : SubVectors) {
18239 Value *V = getVectorizedValue(*E);
18240 unsigned InsertionIndex = Idx * getNumElements(ScalarTy);
18241 // Use the scalar version of ScalarTy to correctly handle shuffles
18242 // for revectorization. The revectorization mode operates on the
18243 // vectors, but here we need to operate on the scalars, because the
18244 // masks were already transformed for the vector elements and we don't
18245 // need to do this transformation again.
18246 Type *OrigScalarTy = ScalarTy;
18247 ScalarTy = ScalarTy->getScalarType();
18248 Vec = createInsertVector(
18249 Builder, Vec, V, InsertionIndex,
18250 std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
18251 _3));
18252 ScalarTy = OrigScalarTy;
18253 if (!CommonMask.empty()) {
18254 std::iota(std::next(CommonMask.begin(), Idx),
18255 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
18256 Idx);
18257 }
18258 }
18259 return Vec;
18260 };
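// When a dedicated subvectors mask is given, the subvectors are first
// inserted into a poison vector and then blended with the already built
// vector, so lanes owned by the subvectors and lanes selected by the common
// mask cannot clash.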
18261 if (SubVectorsMask.empty()) {
18262 Vec = CreateSubVectors(Vec, CommonMask);
18263 } else {
18264 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
18265 copy(SubVectorsMask, SVMask.begin());
18266 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
18267 if (I2 != PoisonMaskElem) {
18268 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
18269 I1 = I2 + CommonMask.size();
18270 }
18271 }
18272 Value *InsertVec =
18273 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
18274 Vec = createShuffle(InsertVec, Vec, SVMask);
18275 transformMaskAfterShuffle(CommonMask, SVMask);
18276 }
18277 InVectors.front() = Vec;
18278 }
18279
18280 if (!ExtMask.empty()) {
18281 if (CommonMask.empty()) {
18282 CommonMask.assign(ExtMask.begin(), ExtMask.end());
18283 } else {
18284 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
18285 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
18286 if (ExtMask[I] == PoisonMaskElem)
18287 continue;
18288 NewMask[I] = CommonMask[ExtMask[I]];
18289 }
18290 CommonMask.swap(NewMask);
18291 }
18292 }
18293 if (CommonMask.empty()) {
18294 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
18295 return InVectors.front();
18296 }
18297 if (InVectors.size() == 2)
18298 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
18299 return createShuffle(InVectors.front(), nullptr, CommonMask);
18300 }
18301
18302 ~ShuffleInstructionBuilder() {
18303 assert((IsFinalized || CommonMask.empty()) &&
18304 "Shuffle construction must be finalized.");
18305 }
18306};
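// Illustrative usage sketch (not part of the original source; names are
// placeholders):
//   ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
//   ShuffleBuilder.add(V1, V2, Mask);     // operands are recorded lazily
//   ShuffleBuilder.addOrdered(V3, Order); // folds in one more input
//   Value *Res =
//       ShuffleBuilder.finalize(ReuseMask, SubVectors, SubVectorsMask);
// Actual shufflevector instructions are only materialized when an operand
// slot has to be freed or in finalize(), keeping the emitted sequence short.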
18307
18308Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
18309 return vectorizeTree(getOperandEntry(E, NodeIdx));
18310}
18311
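// processBuildVector is instantiated both for cost estimation and for actual
// IR emission: BVTy provides the add/gather/finalize primitives, so the same
// logic that decides how a gather node is assembled (reusing extract sources,
// reusing already vectorized entries, packing constants, splatting) serves
// both clients.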
18312template <typename BVTy, typename ResTy, typename... Args>
18313ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
18314 Args &...Params) {
18315 assert(E->isGather() && "Expected gather node.");
18316 unsigned VF = E->getVectorFactor();
18317
18318 bool NeedFreeze = false;
18319 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
18320 // Clear values, to be replaced by insertvector instructions.
18321 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
18322 for_each(MutableArrayRef(GatheredScalars)
18323 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
18324 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
18325 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
18326 E->CombinedEntriesWithIndices.size());
18327 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
18328 [&](const auto &P) {
18329 return std::make_pair(VectorizableTree[P.first].get(), P.second);
18330 });
18331 // Build a mask out of the reorder indices and reorder scalars per this
18332 // mask.
18333 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
18334 E->ReorderIndices.end());
18335 if (!ReorderMask.empty())
18336 reorderScalars(GatheredScalars, ReorderMask);
18337 SmallVector<int> SubVectorsMask;
18338 inversePermutation(E->ReorderIndices, SubVectorsMask);
18339 // Transform non-clustered elements in the mask to poison (-1).
18340 // "Clustered" operations will be reordered using this mask later.
18341 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
18342 for (unsigned I : seq<unsigned>(GatheredScalars.size()))
18343 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
18344 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
18345 } else {
18346 SubVectorsMask.clear();
18347 }
18348 SmallVector<Value *> StoredGS(GatheredScalars);
18349 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
18350 unsigned I, unsigned SliceSize,
18351 bool IsNotPoisonous) {
18352 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
18353 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
18354 }))
18355 return false;
18356 TreeEntry *UserTE = E->UserTreeIndex.UserTE;
18357 unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
18358 if (UserTE->getNumOperands() != 2)
18359 return false;
18360 if (!IsNotPoisonous) {
18361 auto *It = find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
18362 [=](const std::unique_ptr<TreeEntry> &TE) {
18363 return TE->UserTreeIndex.UserTE == UserTE &&
18364 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
18365 });
18366 if (It == VectorizableTree.end())
18367 return false;
18368 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
18369 if (!(*It)->ReorderIndices.empty()) {
18370 inversePermutation((*It)->ReorderIndices, ReorderMask);
18371 reorderScalars(GS, ReorderMask);
18372 }
18373 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
18374 Value *V0 = std::get<0>(P);
18375 Value *V1 = std::get<1>(P);
18376 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
18377 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
18378 is_contained(E->Scalars, V1));
18379 }))
18380 return false;
18381 }
18382 int Idx;
18383 if ((Mask.size() < InputVF &&
18384 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
18385 Idx == 0) ||
18386 (Mask.size() == InputVF &&
18387 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
18388 std::iota(
18389 std::next(Mask.begin(), I * SliceSize),
18390 std::next(Mask.begin(),
18391 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
18392 0);
18393 } else {
18394 unsigned IVal =
18395 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
18396 std::fill(
18397 std::next(Mask.begin(), I * SliceSize),
18398 std::next(Mask.begin(),
18399 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
18400 IVal);
18401 }
18402 return true;
18403 };
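// With the helpers in place, construct the BVTy shuffle builder and try, in
// order: reusing the vectors feeding gathered extractelements, reusing
// already vectorized tree entries, and finally packing the leftover scalars
// into explicit build vectors.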
18404 BVTy ShuffleBuilder(ScalarTy, Params...);
18405 ResTy Res = ResTy();
18406 SmallVector<int> Mask;
18407 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
18408 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
18409 Value *ExtractVecBase = nullptr;
18410 bool UseVecBaseAsInput = false;
18411 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
18412 SmallVector<SmallVector<const TreeEntry *>> Entries;
18413 Type *OrigScalarTy = GatheredScalars.front()->getType();
18414 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
18415 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
18416 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
18417 // Check for gathered extracts.
18418 bool Resized = false;
18419 ExtractShuffles =
18420 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
18421 if (!ExtractShuffles.empty()) {
18422 SmallVector<const TreeEntry *> ExtractEntries;
18423 for (auto [Idx, I] : enumerate(ExtractMask)) {
18424 if (I == PoisonMaskElem)
18425 continue;
18426 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
18427 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand());
18428 !TEs.empty())
18429 ExtractEntries.append(TEs.begin(), TEs.end());
18430 }
18431 if (std::optional<ResTy> Delayed =
18432 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
18433 // Delay emission of gathers which are not ready yet.
18434 PostponedGathers.insert(E);
18435 // Postpone gather emission, will be emitted after the end of the
18436 // process to keep correct order.
18437 return *Delayed;
18438 }
18439 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
18440 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
18441 ExtractVecBase = VecBase;
18442 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
18443 if (VF == VecBaseTy->getNumElements() &&
18444 GatheredScalars.size() != VF) {
18445 Resized = true;
18446 GatheredScalars.append(VF - GatheredScalars.size(),
18447 PoisonValue::get(OrigScalarTy));
18448 NumParts =
18449 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF), VF);
18450 }
18451 }
18452 }
18453 // Gather extracts only after checking for fully matched gathers.
18454 if (!ExtractShuffles.empty() || !E->hasState() ||
18455 E->getOpcode() != Instruction::Load ||
18456 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
18457 any_of(E->Scalars, IsaPred<LoadInst>)) &&
18458 any_of(E->Scalars,
18459 [this](Value *V) {
18460 return isa<LoadInst>(V) && isVectorized(V);
18461 })) ||
18462 (E->hasState() && E->isAltShuffle()) ||
18463 all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
18464 isSplat(E->Scalars) ||
18465 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
18466 GatherShuffles =
18467 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
18468 }
18469 if (!GatherShuffles.empty()) {
18470 if (std::optional<ResTy> Delayed =
18471 ShuffleBuilder.needToDelay(E, Entries)) {
18472 // Delay emission of gathers which are not ready yet.
18473 PostponedGathers.insert(E);
18474 // Postpone gather emission, will be emitted after the end of the
18475 // process to keep correct order.
18476 return *Delayed;
18477 }
18478 if (GatherShuffles.size() == 1 &&
18479 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
18480 Entries.front().front()->isSame(E->Scalars)) {
18481 // Perfect match in the graph, will reuse the previously vectorized
18482 // node. Cost is 0.
18483 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
18484 << shortBundleName(E->Scalars, E->Idx) << ".\n");
18485 // Restore the mask for previous partially matched values.
18486 Mask.resize(E->Scalars.size());
18487 const TreeEntry *FrontTE = Entries.front().front();
18488 if (FrontTE->ReorderIndices.empty() &&
18489 ((FrontTE->ReuseShuffleIndices.empty() &&
18490 E->Scalars.size() == FrontTE->Scalars.size()) ||
18491 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
18492 std::iota(Mask.begin(), Mask.end(), 0);
18493 } else {
18494 for (auto [I, V] : enumerate(E->Scalars)) {
18495 if (isa<PoisonValue>(V)) {
18496 Mask[I] = PoisonMaskElem;
18497 continue;
18498 }
18499 Mask[I] = FrontTE->findLaneForValue(V);
18500 }
18501 }
18502 // Reset the builder(s) to correctly handle perfect diamond matched
18503 // nodes.
18504 ShuffleBuilder.resetForSameNode();
18505 ShuffleBuilder.add(*FrontTE, Mask);
18506 // Full matched entry found, no need to insert subvectors.
18507 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
18508 return Res;
18509 }
18510 if (!Resized) {
18511 if (GatheredScalars.size() != VF &&
18512 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
18513 return any_of(TEs, [&](const TreeEntry *TE) {
18514 return TE->getVectorFactor() == VF;
18515 });
18516 }))
18517 GatheredScalars.append(VF - GatheredScalars.size(),
18518 PoisonValue::get(OrigScalarTy));
18519 }
18520 // Remove shuffled elements from list of gathers.
18521 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
18522 if (Mask[I] != PoisonMaskElem)
18523 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
18524 }
18525 }
18526 }
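// TryPackScalars compacts the remaining scalars before emitting a build
// vector: constants keep their lanes, repeated non-constant values are
// collapsed onto a single lane, and the reuse mask is filled so a later
// shuffle restores the original lane order. Non-poison undefs are either
// replaced by a scalar known not to be poison or turned into poison plus a
// trailing freeze.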
18527 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
18528 SmallVectorImpl<int> &ReuseMask,
18529 bool IsRootPoison) {
18530 // For splats we can emit broadcasts instead of gathers, so try to find
18531 // such sequences.
18532 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
18533 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
18534 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
18535 SmallVector<int> UndefPos;
18536 DenseMap<Value *, unsigned> UniquePositions;
18537 // Gather unique non-const values and all constant values.
18538 // For repeated values, just shuffle them.
18539 int NumNonConsts = 0;
18540 int SinglePos = 0;
18541 for (auto [I, V] : enumerate(Scalars)) {
18542 if (isa<UndefValue>(V)) {
18543 if (!isa<PoisonValue>(V)) {
18544 ReuseMask[I] = I;
18545 UndefPos.push_back(I);
18546 }
18547 continue;
18548 }
18549 if (isConstant(V)) {
18550 ReuseMask[I] = I;
18551 continue;
18552 }
18553 ++NumNonConsts;
18554 SinglePos = I;
18555 Value *OrigV = V;
18556 Scalars[I] = PoisonValue::get(OrigScalarTy);
18557 if (IsSplat) {
18558 Scalars.front() = OrigV;
18559 ReuseMask[I] = 0;
18560 } else {
18561 const auto Res = UniquePositions.try_emplace(OrigV, I);
18562 Scalars[Res.first->second] = OrigV;
18563 ReuseMask[I] = Res.first->second;
18564 }
18565 }
18566 if (NumNonConsts == 1) {
18567 // Restore single insert element.
18568 if (IsSplat) {
18569 ReuseMask.assign(VF, PoisonMaskElem);
18570 std::swap(Scalars.front(), Scalars[SinglePos]);
18571 if (!UndefPos.empty() && UndefPos.front() == 0)
18572 Scalars.front() = UndefValue::get(OrigScalarTy);
18573 }
18574 ReuseMask[SinglePos] = SinglePos;
18575 } else if (!UndefPos.empty() && IsSplat) {
18576 // For undef values, try to replace them with the simple broadcast.
18577 // We can do it if the broadcasted value is guaranteed to be
18578 // non-poisonous, or by freezing the incoming scalar value first.
18579 auto *It = find_if(Scalars, [this, E](Value *V) {
18580 return !isa<UndefValue>(V) &&
18581 (isGuaranteedNotToBePoison(V, AC) ||
18582 (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
18583 // Check if the value already used in the same operation in
18584 // one of the nodes already.
18585 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
18586 is_contained(E->UserTreeIndex.UserTE->Scalars,
18587 U.getUser());
18588 })));
18589 });
18590 if (It != Scalars.end()) {
18591 // Replace undefs by the non-poisoned scalars and emit broadcast.
18592 int Pos = std::distance(Scalars.begin(), It);
18593 for (int I : UndefPos) {
18594 // Set the undef position to the non-poisoned scalar.
18595 ReuseMask[I] = Pos;
18596 // Replace the undef by the poison, in the mask it is replaced by
18597 // non-poisoned scalar already.
18598 if (I != Pos)
18599 Scalars[I] = PoisonValue::get(OrigScalarTy);
18600 }
18601 } else {
18602 // Replace undefs by the poisons, emit broadcast and then emit
18603 // freeze.
18604 for (int I : UndefPos) {
18605 ReuseMask[I] = PoisonMaskElem;
18606 if (isa<UndefValue>(Scalars[I]))
18607 Scalars[I] = PoisonValue::get(OrigScalarTy);
18608 }
18609 NeedFreeze = true;
18610 }
18611 }
18612 };
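// Combine the prepared inputs: shuffles of the extractelement sources and/or
// reused tree entries are added to the builder first; the remaining constant
// and non-constant scalars are packed and blended in afterwards.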
18613 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
18614 bool IsNonPoisoned = true;
18615 bool IsUsedInExpr = true;
18616 Value *Vec1 = nullptr;
18617 if (!ExtractShuffles.empty()) {
18618 // A gather of extractelements can be represented as just a shuffle of
18619 // the one or two vectors the scalars are extracted from.
18620 // Find input vectors.
18621 Value *Vec2 = nullptr;
18622 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
18623 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
18624 ExtractMask[I] = PoisonMaskElem;
18625 }
18626 if (UseVecBaseAsInput) {
18627 Vec1 = ExtractVecBase;
18628 } else {
18629 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
18630 if (ExtractMask[I] == PoisonMaskElem)
18631 continue;
18632 if (isa<UndefValue>(StoredGS[I]))
18633 continue;
18634 auto *EI = cast<ExtractElementInst>(StoredGS[I]);
18635 Value *VecOp = EI->getVectorOperand();
18636 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
18637 !TEs.empty() && TEs.front()->VectorizedValue)
18638 VecOp = TEs.front()->VectorizedValue;
18639 if (!Vec1) {
18640 Vec1 = VecOp;
18641 } else if (Vec1 != VecOp) {
18642 assert((!Vec2 || Vec2 == VecOp) &&
18643 "Expected only 1 or 2 vectors shuffle.");
18644 Vec2 = VecOp;
18645 }
18646 }
18647 }
18648 if (Vec2) {
18649 IsUsedInExpr = false;
18650 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
18651 isGuaranteedNotToBePoison(Vec2, AC);
18652 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
18653 } else if (Vec1) {
18654 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
18655 IsUsedInExpr &= FindReusedSplat(
18656 ExtractMask,
18657 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
18658 ExtractMask.size(), IsNotPoisonedVec);
18659 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
18660 IsNonPoisoned &= IsNotPoisonedVec;
18661 } else {
18662 IsUsedInExpr = false;
18663 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
18664 /*ForExtracts=*/true);
18665 }
18666 }
18667 if (!GatherShuffles.empty()) {
18668 unsigned SliceSize =
18669 getPartNumElems(E->Scalars.size(),
18670 ::getNumberOfParts(*TTI, VecTy, E->Scalars.size()));
18671 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
18672 for (const auto [I, TEs] : enumerate(Entries)) {
18673 if (TEs.empty()) {
18674 assert(!GatherShuffles[I] &&
18675 "No shuffles with empty entries list expected.");
18676 continue;
18677 }
18678 assert((TEs.size() == 1 || TEs.size() == 2) &&
18679 "Expected shuffle of 1 or 2 entries.");
18680 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
18681 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
18682 VecMask.assign(VecMask.size(), PoisonMaskElem);
18683 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
18684 if (TEs.size() == 1) {
18685 bool IsNotPoisonedVec =
18686 TEs.front()->VectorizedValue
18687 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
18688 : true;
18689 IsUsedInExpr &=
18690 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
18691 SliceSize, IsNotPoisonedVec);
18692 ShuffleBuilder.add(*TEs.front(), VecMask);
18693 IsNonPoisoned &= IsNotPoisonedVec;
18694 } else {
18695 IsUsedInExpr = false;
18696 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
18697 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
18698 IsNonPoisoned &=
18699 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
18700 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
18701 }
18702 }
18703 }
18704 // Try to figure out the best way to combine the values: build a shuffle
18705 // and insert elements, or just build several shuffles.
18706 // Insert non-constant scalars.
18707 SmallVector<Value *> NonConstants(GatheredScalars);
18708 int EMSz = ExtractMask.size();
18709 int MSz = Mask.size();
18710 // Try to build a constant vector and shuffle with it only if currently we
18711 // have a single permutation and more than 1 scalar constant.
18712 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
18713 bool IsIdentityShuffle =
18714 ((UseVecBaseAsInput ||
18715 all_of(ExtractShuffles,
18716 [](const std::optional<TTI::ShuffleKind> &SK) {
18717 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
18718 TTI::SK_PermuteSingleSrc;
18719 })) &&
18720 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
18721 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
18722 (!GatherShuffles.empty() &&
18723 all_of(GatherShuffles,
18724 [](const std::optional<TTI::ShuffleKind> &SK) {
18725 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
18726 TTI::SK_PermuteSingleSrc;
18727 }) &&
18728 none_of(Mask, [&](int I) { return I >= MSz; }) &&
18729 ShuffleVectorInst::isIdentityMask(Mask, MSz));
18730 bool EnoughConstsForShuffle =
18731 IsSingleShuffle &&
18732 (none_of(GatheredScalars,
18733 [](Value *V) {
18734 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
18735 }) ||
18736 any_of(GatheredScalars,
18737 [](Value *V) {
18738 return isa<Constant>(V) && !isa<UndefValue>(V);
18739 })) &&
18740 (!IsIdentityShuffle ||
18741 (GatheredScalars.size() == 2 &&
18742 any_of(GatheredScalars,
18743 [](Value *V) { return !isa<UndefValue>(V); })) ||
18744 count_if(GatheredScalars, [](Value *V) {
18745 return isa<Constant>(V) && !isa<PoisonValue>(V);
18746 }) > 1);
18747 // The NonConstants array contains just the non-constant values; GatheredScalars
18748 // contains only the constants used to build the final vector, which is then shuffled.
18749 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
18750 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
18751 NonConstants[I] = PoisonValue::get(OrigScalarTy);
18752 else
18753 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
18754 }
18755 // Generate constants for final shuffle and build a mask for them.
18756 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
18757 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
18758 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
18759 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
18760 ShuffleBuilder.add(BV, BVMask);
18761 }
18762 if (all_of(NonConstants, [=](Value *V) {
18763 return isa<PoisonValue>(V) ||
18764 (IsSingleShuffle && ((IsIdentityShuffle &&
18765 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
18766 }))
18767 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
18768 SubVectorsMask);
18769 else
18770 Res = ShuffleBuilder.finalize(
18771 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
18772 [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
18773 bool IsSplat = isSplat(NonConstants);
18774 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
18775 TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
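// Two ways to add the remaining non-constant scalars are compared below:
// broadcast a single scalar and blend it in with a two-source shuffle, or
// insert it into the already built vector directly (plus a single-source
// shuffle if it is needed in more than one lane). The cheaper one is used.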
18776 auto CheckIfSplatIsProfitable = [&]() {
18777 // Estimate the cost of splatting + shuffle and compare with
18778 // insert + shuffle.
18779 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
18780 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
18781 if (isa<ExtractElementInst>(V) || isVectorized(V))
18782 return false;
18783 InstructionCost SplatCost = TTI->getVectorInstrCost(
18784 Instruction::InsertElement, VecTy, CostKind, /*Index=*/0,
18785 PoisonValue::get(VecTy), V);
18786 SmallVector<int> NewMask(Mask.begin(), Mask.end());
18787 for (auto [Idx, I] : enumerate(BVMask))
18788 if (I != PoisonMaskElem)
18789 NewMask[Idx] = Mask.size();
18790 SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
18791 NewMask, CostKind);
18792 InstructionCost BVCost = TTI->getVectorInstrCost(
18793 Instruction::InsertElement, VecTy, CostKind,
18794 *find_if(Mask, [](int I) { return I != PoisonMaskElem; }),
18795 Vec, V);
18796 // Shuffle required?
18797 if (count(BVMask, PoisonMaskElem) <
18798 static_cast<int>(BVMask.size() - 1)) {
18799 SmallVector<int> NewMask(Mask.begin(), Mask.end());
18800 for (auto [Idx, I] : enumerate(BVMask))
18801 if (I != PoisonMaskElem)
18802 NewMask[Idx] = I;
18803 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
18804 VecTy, NewMask, CostKind);
18805 }
18806 return SplatCost <= BVCost;
18807 };
18808 if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
18809 for (auto [Idx, I] : enumerate(BVMask))
18810 if (I != PoisonMaskElem)
18811 Mask[Idx] = I;
18812 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
18813 } else {
18814 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
18815 SmallVector<Value *> Values(NonConstants.size(),
18816 PoisonValue::get(ScalarTy));
18817 Values[0] = V;
18818 Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
18819 SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
18820 transform(BVMask, SplatMask.begin(), [](int I) {
18821 return I == PoisonMaskElem ? PoisonMaskElem : 0;
18822 });
18823 if (!ShuffleVectorInst::isIdentityMask(SplatMask, VF))
18824 BV = CreateShuffle(BV, nullptr, SplatMask);
18825 for (auto [Idx, I] : enumerate(BVMask))
18826 if (I != PoisonMaskElem)
18827 Mask[Idx] = BVMask.size() + Idx;
18828 Vec = CreateShuffle(Vec, BV, Mask);
18829 for (auto [Idx, I] : enumerate(Mask))
18830 if (I != PoisonMaskElem)
18831 Mask[Idx] = Idx;
18832 }
18833 });
18834 } else if (!allConstant(GatheredScalars)) {
18835 // Gather unique scalars and all constants.
18836 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
18837 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
18838 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
18839 ShuffleBuilder.add(BV, ReuseMask);
18840 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
18841 SubVectorsMask);
18842 } else {
18843 // Gather all constants.
18844 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
18845 for (auto [I, V] : enumerate(GatheredScalars)) {
18846 if (!isa<PoisonValue>(V))
18847 Mask[I] = I;
18848 }
18849 Value *BV = ShuffleBuilder.gather(GatheredScalars);
18850 ShuffleBuilder.add(BV, Mask);
18851 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
18852 SubVectorsMask);
18853 }
18854
18855 if (NeedFreeze)
18856 Res = ShuffleBuilder.createFreeze(Res);
18857 return Res;
18858}
18859
18860Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
18861 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
18862 (void)vectorizeTree(VectorizableTree[EIdx].get());
18863 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
18864 Builder, *this);
18865}
18866
18867/// \returns \p I after propagating metadata from \p VL only for instructions in
18868/// \p VL.
18869static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
18870 SmallVector<Value *> Insts;
18871 for (Value *V : VL)
18872 if (isa<Instruction>(V))
18873 Insts.push_back(V);
18874 return llvm::propagateMetadata(Inst, Insts);
18875}
18876
18877static DebugLoc getDebugLocFromPHI(PHINode &PN) {
18878 if (DebugLoc DL = PN.getDebugLoc())
18879 return DL;
18880 return DebugLoc::getUnknown();
18881}
18882
18883Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
18884 IRBuilderBase::InsertPointGuard Guard(Builder);
18885
18886 Value *V = E->Scalars.front();
18887 Type *ScalarTy = V->getType();
18888 if (!isa<CmpInst>(V))
18889 ScalarTy = getValueType(V);
18890 auto It = MinBWs.find(E);
18891 if (It != MinBWs.end()) {
18892 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
18893 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
18894 if (VecTy)
18895 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
18896 }
18897 if (E->VectorizedValue)
18898 return E->VectorizedValue;
18899 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
18900 if (E->isGather()) {
18901 // Set insert point for non-reduction initial nodes.
18902 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
18903 setInsertPointAfterBundle(E);
18904 Value *Vec = createBuildVector(E, ScalarTy);
18905 E->VectorizedValue = Vec;
18906 return Vec;
18907 }
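// Split nodes are emitted as two independently vectorized halves that are
// recombined either with an insert-subvector (when no reordering is needed)
// or with a two-source shuffle built from the split mask.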
18908 if (E->State == TreeEntry::SplitVectorize) {
18909 assert(E->CombinedEntriesWithIndices.size() == 2 &&
18910 "Expected exactly 2 combined entries.");
18911 setInsertPointAfterBundle(E);
18912 TreeEntry &OpTE1 =
18913 *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
18914 assert(OpTE1.isSame(
18915 ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
18916 "Expected same first part of scalars.");
18917 Value *Op1 = vectorizeTree(&OpTE1);
18918 TreeEntry &OpTE2 =
18919 *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
18920 assert(
18921 OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
18922 "Expected same second part of scalars.");
18923 Value *Op2 = vectorizeTree(&OpTE2);
18924 auto GetOperandSignedness = [&](const TreeEntry *OpE) {
18925 bool IsSigned = false;
18926 auto It = MinBWs.find(OpE);
18927 if (It != MinBWs.end())
18928 IsSigned = It->second.second;
18929 else
18930 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
18931 if (isa<PoisonValue>(V))
18932 return false;
18933 return !isKnownNonNegative(R, SimplifyQuery(*DL));
18934 });
18935 return IsSigned;
18936 };
18937 if (cast<VectorType>(Op1->getType())->getElementType() !=
18938 ScalarTy->getScalarType()) {
18939 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
18940 Op1 = Builder.CreateIntCast(
18941 Op1,
18942 getWidenedType(
18943 ScalarTy,
18944 cast<FixedVectorType>(Op1->getType())->getNumElements()),
18945 GetOperandSignedness(&OpTE1));
18946 }
18947 if (cast<VectorType>(Op2->getType())->getElementType() !=
18948 ScalarTy->getScalarType()) {
18949 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
18950 Op2 = Builder.CreateIntCast(
18951 Op2,
18952 getWidenedType(
18953 ScalarTy,
18954 cast<FixedVectorType>(Op2->getType())->getNumElements()),
18955 GetOperandSignedness(&OpTE2));
18956 }
18957 if (E->ReorderIndices.empty()) {
18958 SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
18959 std::iota(
18960 Mask.begin(),
18961 std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
18962 0);
18963 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
18964 if (ScalarTyNumElements != 1) {
18965 assert(SLPReVec && "Only supported by REVEC.");
18966 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, Mask);
18967 }
18968 Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
18969 Vec = createInsertVector(Builder, Vec, Op2,
18970 E->CombinedEntriesWithIndices.back().second *
18971 ScalarTyNumElements);
18972 E->VectorizedValue = Vec;
18973 return Vec;
18974 }
18975 unsigned CommonVF =
18976 std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
18977 if (getNumElements(Op1->getType()) != CommonVF) {
18978 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
18979 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()),
18980 0);
18981 Op1 = Builder.CreateShuffleVector(Op1, Mask);
18982 }
18983 if (getNumElements(Op2->getType()) != CommonVF) {
18984 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
18985 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()),
18986 0);
18987 Op2 = Builder.CreateShuffleVector(Op2, Mask);
18988 }
18989 Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
18990 E->VectorizedValue = Vec;
18991 return Vec;
18992 }
18993
18994 bool IsReverseOrder =
18995 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
18996 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
18997 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
18998 if (E->getOpcode() == Instruction::Store &&
18999 E->State == TreeEntry::Vectorize) {
19000 ArrayRef<int> Mask =
19001 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
19002 E->ReorderIndices.size());
19003 ShuffleBuilder.add(V, Mask);
19004 } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
19005 E->State == TreeEntry::CompressVectorize) {
19006 ShuffleBuilder.addOrdered(V, {});
19007 } else {
19008 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
19009 }
19010 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
19011 E->CombinedEntriesWithIndices.size());
19012 transform(
19013 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
19014 return std::make_pair(VectorizableTree[P.first].get(), P.second);
19015 });
19016 assert(
19017 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
19018 "Expected either combined subnodes or reordering");
19019 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
19020 };
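// FinalShuffle applies the entry's reordering/reuse masks (and any combined
// subnodes) to a freshly created vector before it is recorded as the
// vectorized value of the entry.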
19021
19022 assert(!E->isGather() && "Unhandled state");
19023 unsigned ShuffleOrOp =
19024 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
19025 Instruction *VL0 = E->getMainOp();
19026 auto GetOperandSignedness = [&](unsigned Idx) {
19027 const TreeEntry *OpE = getOperandEntry(E, Idx);
19028 bool IsSigned = false;
19029 auto It = MinBWs.find(OpE);
19030 if (It != MinBWs.end())
19031 IsSigned = It->second.second;
19032 else
19033 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
19034 if (isa<PoisonValue>(V))
19035 return false;
19036 return !isKnownNonNegative(R, SimplifyQuery(*DL));
19037 });
19038 return IsSigned;
19039 };
19040 switch (ShuffleOrOp) {
19041 case Instruction::PHI: {
19042 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
19043 E != VectorizableTree.front().get() || E->UserTreeIndex) &&
19044 "PHI reordering is free.");
19045 auto *PH = cast<PHINode>(VL0);
19046 Builder.SetInsertPoint(PH->getParent(),
19047 PH->getParent()->getFirstNonPHIIt());
19048 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19049 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
19050 Value *V = NewPhi;
19051
19052 // Adjust insertion point once all PHI's have been generated.
19053 Builder.SetInsertPoint(PH->getParent(),
19054 PH->getParent()->getFirstInsertionPt());
19055 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19056
19057 V = FinalShuffle(V, E);
19058
19059 E->VectorizedValue = V;
19060 // If phi node is fully emitted - exit.
19061 if (NewPhi->getNumIncomingValues() != 0)
19062 return NewPhi;
19063
19064 // PHINodes may have multiple entries from the same block. We want to
19065 // visit every block once.
19066 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
19067
19068 for (unsigned I : seq<unsigned>(PH->getNumIncomingValues())) {
19069 BasicBlock *IBB = PH->getIncomingBlock(I);
19070
19071 // Stop emission if all incoming values are generated.
19072 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
19073 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
19074 return NewPhi;
19075 }
19076
19077 if (!VisitedBBs.insert(IBB).second) {
19078 Value *VecOp = NewPhi->getIncomingValueForBlock(IBB);
19079 NewPhi->addIncoming(VecOp, IBB);
19080 TreeEntry *OpTE = getOperandEntry(E, I);
19081 assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
19082 OpTE->VectorizedValue = VecOp;
19083 continue;
19084 }
19085
19086 Builder.SetInsertPoint(IBB->getTerminator());
19087 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19088 Value *Vec = vectorizeOperand(E, I);
19089 if (VecTy != Vec->getType()) {
19090 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
19091 MinBWs.contains(getOperandEntry(E, I))) &&
19092 "Expected item in MinBWs.");
19093 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
19094 }
19095 NewPhi->addIncoming(Vec, IBB);
19096 }
19097
19098 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
19099 "Invalid number of incoming values");
19100 assert(E->VectorizedValue && "Expected vectorized value.");
19101 return E->VectorizedValue;
19102 }
19103
19104 case Instruction::ExtractElement: {
19105 Value *V = E->getSingleOperand(0);
19106 setInsertPointAfterBundle(E);
19107 V = FinalShuffle(V, E);
19108 E->VectorizedValue = V;
19109 return V;
19110 }
19111 case Instruction::ExtractValue: {
19112 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
19113 Builder.SetInsertPoint(LI);
19114 Value *Ptr = LI->getPointerOperand();
19115 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
19116 Value *NewV = ::propagateMetadata(V, E->Scalars);
19117 NewV = FinalShuffle(NewV, E);
19118 E->VectorizedValue = NewV;
19119 return NewV;
19120 }
19121 case Instruction::InsertElement: {
19122 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
19123 if (const TreeEntry *OpE = getOperandEntry(E, 1);
19124 OpE && !OpE->isGather() && OpE->hasState() &&
19125 !OpE->hasCopyableElements())
19126 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
19127 else
19128 setInsertPointAfterBundle(E);
19129 Value *V = vectorizeOperand(E, 1);
19130 ArrayRef<Value *> Op = E->getOperand(1);
19131 Type *ScalarTy = Op.front()->getType();
19132 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
19133 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19134 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
19135 assert(Res.first > 0 && "Expected item in MinBWs.");
19136 V = Builder.CreateIntCast(
19137 V,
19138 getWidenedType(
19139 ScalarTy,
19140 cast<FixedVectorType>(V->getType())->getNumElements()),
19141 Res.second);
19142 }
19143
19144 // Create InsertVector shuffle if necessary
19145 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
19146 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
19147 }));
19148 const unsigned NumElts =
19149 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
19150 const unsigned NumScalars = E->Scalars.size();
19151
19152 unsigned Offset = *getElementIndex(VL0);
19153 assert(Offset < NumElts && "Failed to find vector index offset");
19154
19155 // Create shuffle to resize vector
19156 SmallVector<int> Mask;
19157 if (!E->ReorderIndices.empty()) {
19158 inversePermutation(E->ReorderIndices, Mask);
19159 Mask.append(NumElts - NumScalars, PoisonMaskElem);
19160 } else {
19161 Mask.assign(NumElts, PoisonMaskElem);
19162 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
19163 }
19164 // Create InsertVector shuffle if necessary
19165 bool IsIdentity = true;
19166 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
19167 Mask.swap(PrevMask);
19168 for (unsigned I = 0; I < NumScalars; ++I) {
19169 Value *Scalar = E->Scalars[PrevMask[I]];
19170 unsigned InsertIdx = *getElementIndex(Scalar);
19171 IsIdentity &= InsertIdx - Offset == I;
19172 Mask[InsertIdx - Offset] = I;
19173 }
19174 if (!IsIdentity || NumElts != NumScalars) {
19175 Value *V2 = nullptr;
19176 bool IsVNonPoisonous =
19177 isGuaranteedNotToBePoison(V, AC);
19178 SmallVector<int> InsertMask(Mask);
19179 if (NumElts != NumScalars && Offset == 0) {
19180 // Follow all insert element instructions from the current buildvector
19181 // sequence.
19182 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
19183 do {
19184 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
19185 if (!InsertIdx)
19186 break;
19187 if (InsertMask[*InsertIdx] == PoisonMaskElem)
19188 InsertMask[*InsertIdx] = *InsertIdx;
19189 if (!Ins->hasOneUse())
19190 break;
19191 Ins = dyn_cast_or_null<InsertElementInst>(
19192 Ins->getUniqueUndroppableUser());
19193 } while (Ins);
19194 SmallBitVector UseMask =
19195 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19196 SmallBitVector IsFirstPoison =
19197 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19198 SmallBitVector IsFirstUndef =
19199 isUndefVector(FirstInsert->getOperand(0), UseMask);
19200 if (!IsFirstPoison.all()) {
19201 unsigned Idx = 0;
19202 for (unsigned I = 0; I < NumElts; I++) {
19203 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
19204 IsFirstUndef.test(I)) {
19205 if (IsVNonPoisonous) {
19206 InsertMask[I] = I < NumScalars ? I : 0;
19207 continue;
19208 }
19209 if (!V2)
19210 V2 = UndefValue::get(V->getType());
19211 if (Idx >= NumScalars)
19212 Idx = NumScalars - 1;
19213 InsertMask[I] = NumScalars + Idx;
19214 ++Idx;
19215 } else if (InsertMask[I] != PoisonMaskElem &&
19216 Mask[I] == PoisonMaskElem) {
19217 InsertMask[I] = PoisonMaskElem;
19218 }
19219 }
19220 } else {
19221 InsertMask = Mask;
19222 }
19223 }
19224 if (!V2)
19225 V2 = PoisonValue::get(V->getType());
19226 V = Builder.CreateShuffleVector(V, V2, InsertMask);
19227 if (auto *I = dyn_cast<Instruction>(V)) {
19228 GatherShuffleExtractSeq.insert(I);
19229 CSEBlocks.insert(I->getParent());
19230 }
19231 }
19232
19233 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
19234 for (unsigned I = 0; I < NumElts; I++) {
19235 if (Mask[I] != PoisonMaskElem)
19236 InsertMask[Offset + I] = I;
19237 }
19238 SmallBitVector UseMask =
19239 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19240 SmallBitVector IsFirstUndef =
19241 isUndefVector(FirstInsert->getOperand(0), UseMask);
19242 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
19243 NumElts != NumScalars) {
19244 if (IsFirstUndef.all()) {
19245 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
19246 SmallBitVector IsFirstPoison =
19247 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19248 if (!IsFirstPoison.all()) {
19249 for (unsigned I = 0; I < NumElts; I++) {
19250 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
19251 InsertMask[I] = I + NumElts;
19252 }
19253 }
19254 V = Builder.CreateShuffleVector(
19255 V,
19256 IsFirstPoison.all() ? PoisonValue::get(V->getType())
19257 : FirstInsert->getOperand(0),
19258 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
19259 if (auto *I = dyn_cast<Instruction>(V)) {
19260 GatherShuffleExtractSeq.insert(I);
19261 CSEBlocks.insert(I->getParent());
19262 }
19263 }
19264 } else {
19265 SmallBitVector IsFirstPoison =
19266 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19267 for (unsigned I = 0; I < NumElts; I++) {
19268 if (InsertMask[I] == PoisonMaskElem)
19269 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
19270 else
19271 InsertMask[I] += NumElts;
19272 }
19273 V = Builder.CreateShuffleVector(
19274 FirstInsert->getOperand(0), V, InsertMask,
19275 cast<Instruction>(E->Scalars.back())->getName());
19276 if (auto *I = dyn_cast<Instruction>(V)) {
19277 GatherShuffleExtractSeq.insert(I);
19278 CSEBlocks.insert(I->getParent());
19279 }
19280 }
19281 }
19282
19283 ++NumVectorInstructions;
19284 E->VectorizedValue = V;
19285 return V;
19286 }
19287 case Instruction::ZExt:
19288 case Instruction::SExt:
19289 case Instruction::FPToUI:
19290 case Instruction::FPToSI:
19291 case Instruction::FPExt:
19292 case Instruction::PtrToInt:
19293 case Instruction::IntToPtr:
19294 case Instruction::SIToFP:
19295 case Instruction::UIToFP:
19296 case Instruction::Trunc:
19297 case Instruction::FPTrunc:
19298 case Instruction::BitCast: {
19299 setInsertPointAfterBundle(E);
19300
19301 Value *InVec = vectorizeOperand(E, 0);
19302
19303 auto *CI = cast<CastInst>(VL0);
19304 Instruction::CastOps VecOpcode = CI->getOpcode();
19305 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
19306 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
19307 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
19308 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
19309 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
19310 // Check if the values are candidates to demote.
19311 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
19312 if (SrcIt != MinBWs.end())
19313 SrcBWSz = SrcIt->second.first;
19314 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
19315 if (BWSz == SrcBWSz) {
19316 VecOpcode = Instruction::BitCast;
19317 } else if (BWSz < SrcBWSz) {
19318 VecOpcode = Instruction::Trunc;
19319 } else if (It != MinBWs.end()) {
19320 assert(BWSz > SrcBWSz && "Invalid cast!");
19321 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
19322 } else if (SrcIt != MinBWs.end()) {
19323 assert(BWSz > SrcBWSz && "Invalid cast!");
19324 VecOpcode =
19325 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
19326 }
19327 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
19328 !SrcIt->second.second) {
19329 VecOpcode = Instruction::UIToFP;
19330 }
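// In short: after bitwidth demotion the cast opcode is recomputed from the
// demoted widths, e.g. an original extension can become a trunc (narrower
// result), a bitcast (equal widths), or a sext/zext chosen by the recorded
// signedness.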
19331 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
19332 ? InVec
19333 : Builder.CreateCast(VecOpcode, InVec, VecTy);
19334 V = FinalShuffle(V, E);
19335
19336 E->VectorizedValue = V;
19337 ++NumVectorInstructions;
19338 return V;
19339 }
19340 case Instruction::FCmp:
19341 case Instruction::ICmp: {
19342 setInsertPointAfterBundle(E);
19343
19344 Value *L = vectorizeOperand(E, 0);
19345 Value *R = vectorizeOperand(E, 1);
19346 if (L->getType() != R->getType()) {
19347 assert((getOperandEntry(E, 0)->isGather() ||
19348 getOperandEntry(E, 1)->isGather() ||
19349 MinBWs.contains(getOperandEntry(E, 0)) ||
19350 MinBWs.contains(getOperandEntry(E, 1))) &&
19351 "Expected item in MinBWs.");
19352 if (cast<VectorType>(L->getType())
19353 ->getElementType()
19354 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
19355 ->getElementType()
19356 ->getIntegerBitWidth()) {
19357 Type *CastTy = R->getType();
19358 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
19359 } else {
19360 Type *CastTy = L->getType();
19361 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
19362 }
19363 }
19364
19365 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
19366 Value *V = Builder.CreateCmp(P0, L, R);
19367 propagateIRFlags(V, E->Scalars, VL0);
19368 if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
19369 ICmp->setSameSign(/*B=*/false);
19370 // Do not cast for cmps.
19371 VecTy = cast<FixedVectorType>(V->getType());
19372 V = FinalShuffle(V, E);
19373
19374 E->VectorizedValue = V;
19375 ++NumVectorInstructions;
19376 return V;
19377 }
19378 case Instruction::Select: {
19379 setInsertPointAfterBundle(E);
19380
19381 Value *Cond = vectorizeOperand(E, 0);
19382 Value *True = vectorizeOperand(E, 1);
19383 Value *False = vectorizeOperand(E, 2);
19384 if (True->getType() != VecTy || False->getType() != VecTy) {
19385 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
19386 getOperandEntry(E, 2)->isGather() ||
19387 MinBWs.contains(getOperandEntry(E, 1)) ||
19388 MinBWs.contains(getOperandEntry(E, 2))) &&
19389 "Expected item in MinBWs.");
19390 if (True->getType() != VecTy)
19391 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
19392 if (False->getType() != VecTy)
19393 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
19394 }
19395
19396 unsigned CondNumElements = getNumElements(Cond->getType());
19397 unsigned TrueNumElements = getNumElements(True->getType());
19398 assert(TrueNumElements >= CondNumElements &&
19399 TrueNumElements % CondNumElements == 0 &&
19400 "Cannot vectorize Instruction::Select");
19401 assert(TrueNumElements == getNumElements(False->getType()) &&
19402 "Cannot vectorize Instruction::Select");
19403 if (CondNumElements != TrueNumElements) {
19404 // When the return type is i1 but the source is a fixed vector type, we
19405 // need to duplicate the condition value.
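// Illustrative example: with CondNumElements == 2 and TrueNumElements == 4,
// createReplicatedMask(2, 2) yields <0, 0, 1, 1>, so each condition lane
// guards one replicated group of value lanes.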
19406 Cond = Builder.CreateShuffleVector(
19407 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
19408 CondNumElements));
19409 }
19410 assert(getNumElements(Cond->getType()) == TrueNumElements &&
19411 "Cannot vectorize Instruction::Select");
19412 Value *V = Builder.CreateSelect(Cond, True, False);
19413 V = FinalShuffle(V, E);
19414
19415 E->VectorizedValue = V;
19416 ++NumVectorInstructions;
19417 return V;
19418 }
19419 case Instruction::FNeg: {
19420 setInsertPointAfterBundle(E);
19421
19422 Value *Op = vectorizeOperand(E, 0);
19423
19424 Value *V = Builder.CreateUnOp(
19425 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
19426 propagateIRFlags(V, E->Scalars, VL0);
19427 if (auto *I = dyn_cast<Instruction>(V))
19428 V = ::propagateMetadata(I, E->Scalars);
19429
19430 V = FinalShuffle(V, E);
19431
19432 E->VectorizedValue = V;
19433 ++NumVectorInstructions;
19434
19435 return V;
19436 }
19437 case Instruction::Freeze: {
19438 setInsertPointAfterBundle(E);
19439
19440 Value *Op = vectorizeOperand(E, 0);
19441
19442 if (Op->getType() != VecTy) {
19443 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
19444 MinBWs.contains(getOperandEntry(E, 0))) &&
19445 "Expected item in MinBWs.");
19446 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
19447 }
19448 Value *V = Builder.CreateFreeze(Op);
19449 V = FinalShuffle(V, E);
19450
19451 E->VectorizedValue = V;
19452 ++NumVectorInstructions;
19453
19454 return V;
19455 }
19456 case Instruction::Add:
19457 case Instruction::FAdd:
19458 case Instruction::Sub:
19459 case Instruction::FSub:
19460 case Instruction::Mul:
19461 case Instruction::FMul:
19462 case Instruction::UDiv:
19463 case Instruction::SDiv:
19464 case Instruction::FDiv:
19465 case Instruction::URem:
19466 case Instruction::SRem:
19467 case Instruction::FRem:
19468 case Instruction::Shl:
19469 case Instruction::LShr:
19470 case Instruction::AShr:
19471 case Instruction::And:
19472 case Instruction::Or:
19473 case Instruction::Xor: {
19474 setInsertPointAfterBundle(E);
19475
19476 Value *LHS = vectorizeOperand(E, 0);
19477 Value *RHS = vectorizeOperand(E, 1);
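// The check below looks for an 'and' whose constant operand has at least
// It->second.first trailing one bits; on the demoted width such a mask is a
// no-op, so the other, already vectorized operand is reused directly
// (illustrative: masking a value demoted to i8 with 0xFF).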
19478 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
19479 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
19480 ArrayRef<Value *> Ops = E->getOperand(I);
19481 if (all_of(Ops, [&](Value *Op) {
19482 auto *CI = dyn_cast<ConstantInt>(Op);
19483 return CI && CI->getValue().countr_one() >= It->second.first;
19484 })) {
19485 V = FinalShuffle(I == 0 ? RHS : LHS, E);
19486 E->VectorizedValue = V;
19487 ++NumVectorInstructions;
19488 return V;
19489 }
19490 }
19491 }
19492 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
19493 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
19494 getOperandEntry(E, 1)->isGather() ||
19495 MinBWs.contains(getOperandEntry(E, 0)) ||
19496 MinBWs.contains(getOperandEntry(E, 1))) &&
19497 "Expected item in MinBWs.");
19498 if (LHS->getType() != VecTy)
19499 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
19500 if (RHS->getType() != VecTy)
19501 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
19502 }
19503
19504 Value *V = Builder.CreateBinOp(
19505 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
19506 RHS);
19507 propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end());
19508 if (auto *I = dyn_cast<Instruction>(V)) {
19509 V = ::propagateMetadata(I, E->Scalars);
19510 // Drop nuw flags for abs(sub(commutative), true).
19511 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
19512 any_of(E->Scalars, [](Value *V) {
19513 return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
19514 }))
19515 I->setHasNoUnsignedWrap(/*b=*/false);
19516 }
19517
19518 V = FinalShuffle(V, E);
19519
19520 E->VectorizedValue = V;
19521 ++NumVectorInstructions;
19522
19523 return V;
19524 }
19525 case Instruction::Load: {
19526 // Loads are inserted at the head of the tree because we don't want to
19527 // sink them all the way down past store instructions.
19528 setInsertPointAfterBundle(E);
19529
19530 LoadInst *LI = cast<LoadInst>(VL0);
19531 Instruction *NewLI;
19532 FixedVectorType *StridedLoadTy = nullptr;
19533 Value *PO = LI->getPointerOperand();
19534 if (E->State == TreeEntry::Vectorize) {
19535 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
19536 } else if (E->State == TreeEntry::CompressVectorize) {
19537 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
19538 CompressEntryToData.at(E);
19539 Align CommonAlignment = LI->getAlign();
19540 if (IsMasked) {
19541 unsigned VF = getNumElements(LoadVecTy);
19542 SmallVector<Constant *> MaskValues(
19543 VF / getNumElements(LI->getType()),
19544 ConstantInt::getFalse(VecTy->getContext()));
19545 for (int I : CompressMask)
19546 MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
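// Illustrative example (scalar loads): with an 8-element LoadVecTy and
// CompressMask = {0, 2, 5}, MaskValues becomes <1, 0, 1, 0, 0, 1, 0, 0>,
// i.e. only the lanes that feed the compressed result are loaded.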
19547 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
19548 assert(SLPReVec && "Only supported by REVEC.");
19549 MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
19550 }
19551 Constant *MaskValue = ConstantVector::get(MaskValues);
19552 NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
19553 MaskValue);
19554 } else {
19555 NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
19556 }
19557 NewLI = ::propagateMetadata(NewLI, E->Scalars);
19558 // TODO: include this cost into CommonCost.
19559 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
19560 assert(SLPReVec && "FixedVectorType is not expected.");
19561 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
19562 CompressMask);
19563 }
19564 NewLI =
19565 cast<Instruction>(Builder.CreateShuffleVector(NewLI, CompressMask));
19566 } else if (E->State == TreeEntry::StridedVectorize) {
19567 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
19568 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
19569 PO = IsReverseOrder ? PtrN : Ptr0;
19570 Type *StrideTy = DL->getIndexType(PO->getType());
19571 Value *StrideVal;
19572 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
19573 StridedLoadTy = SPtrInfo.Ty;
19574 assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
19575 unsigned StridedLoadEC =
19576 StridedLoadTy->getElementCount().getKnownMinValue();
19577
19578 Value *Stride = SPtrInfo.StrideVal;
19579 if (!Stride) {
19580 const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
19581 assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
19582 SCEVExpander Expander(*SE, *DL, "strided-load-vec");
19583 Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
19584 &*Builder.GetInsertPoint());
19585 }
19586 Value *NewStride =
19587 Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
19588 StrideVal = Builder.CreateMul(
19589 NewStride, ConstantInt::get(
19590 StrideTy, (IsReverseOrder ? -1 : 1) *
19591 static_cast<int>(
19592 DL->getTypeAllocSize(ScalarTy))));
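// Illustrative example: for i32 elements loaded in reverse order, the
// element stride is scaled by -4 here to form the byte stride passed to the
// strided-load intrinsic emitted below.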
19593 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
19594 auto *Inst = Builder.CreateIntrinsic(
19595 Intrinsic::experimental_vp_strided_load,
19596 {StridedLoadTy, PO->getType(), StrideTy},
19597 {PO, StrideVal,
19598 Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)),
19599 Builder.getInt32(StridedLoadEC)});
19600 Inst->addParamAttr(
19601 /*ArgNo=*/0,
19602 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
19603 NewLI = Inst;
19604 } else {
19605 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
19606 Value *VecPtr = vectorizeOperand(E, 0);
19607 if (isa<FixedVectorType>(ScalarTy)) {
19608 assert(SLPReVec && "FixedVectorType is not expected.");
19609 // CreateMaskedGather expects VecTy and VecPtr to have the same size. We need
19610 // to expand VecPtr if ScalarTy is a vector type.
19611 unsigned ScalarTyNumElements =
19612 cast<FixedVectorType>(ScalarTy)->getNumElements();
19613 unsigned VecTyNumElements =
19614 cast<FixedVectorType>(VecTy)->getNumElements();
19615 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
19616 "Cannot expand getelementptr.");
19617 unsigned VF = VecTyNumElements / ScalarTyNumElements;
19618 SmallVector<Constant *> Indices(VecTyNumElements);
19619 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
19620 return Builder.getInt64(I % ScalarTyNumElements);
19621 });
19622 VecPtr = Builder.CreateGEP(
19623 VecTy->getElementType(),
19624 Builder.CreateShuffleVector(
19625 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
19626 ConstantVector::get(Indices));
19627 }
19628 // Use the minimum alignment of the gathered loads.
19629 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
19630 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
19631 }
19632 Value *V = E->State == TreeEntry::CompressVectorize
19633 ? NewLI
19634 : ::propagateMetadata(NewLI, E->Scalars);
19635
19636 V = FinalShuffle(V, E);
19637 E->VectorizedValue = V;
19638 ++NumVectorInstructions;
19639 return V;
19640 }
19641 case Instruction::Store: {
19642 auto *SI = cast<StoreInst>(VL0);
19643
19644 setInsertPointAfterBundle(E);
19645
19646 Value *VecValue = vectorizeOperand(E, 0);
19647 if (VecValue->getType() != VecTy)
19648 VecValue =
19649 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
19650 VecValue = FinalShuffle(VecValue, E);
19651
19652 Value *Ptr = SI->getPointerOperand();
19653 Instruction *ST;
19654 if (E->State == TreeEntry::Vectorize) {
19655 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
19656 } else {
19657 assert(E->State == TreeEntry::StridedVectorize &&
19658 "Expected either strided or consecutive stores.");
19659 if (!E->ReorderIndices.empty()) {
19660 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
19661 Ptr = SI->getPointerOperand();
19662 }
19663 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
19664 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
19665 auto *Inst = Builder.CreateIntrinsic(
19666 Intrinsic::experimental_vp_strided_store,
19667 {VecTy, Ptr->getType(), StrideTy},
19668 {VecValue, Ptr,
19669 ConstantInt::get(
19670 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
19671 Builder.getAllOnesMask(VecTy->getElementCount()),
19672 Builder.getInt32(E->Scalars.size())});
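// Illustrative example: for i64 scalars this emits
// llvm.experimental.vp.strided.store with a byte stride of -8, i.e. the
// lanes are written at descending addresses starting from Ptr.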
19673 Inst->addParamAttr(
19674 /*ArgNo=*/1,
19675 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
19676 ST = Inst;
19677 }
19678
19679 Value *V = ::propagateMetadata(ST, E->Scalars);
19680
19681 E->VectorizedValue = V;
19682 ++NumVectorInstructions;
19683 return V;
19684 }
19685 case Instruction::GetElementPtr: {
19686 auto *GEP0 = cast<GetElementPtrInst>(VL0);
19687 setInsertPointAfterBundle(E);
19688
19689 Value *Op0 = vectorizeOperand(E, 0);
19690
19691 SmallVector<Value *> OpVecs;
19692 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
19693 Value *OpVec = vectorizeOperand(E, J);
19694 OpVecs.push_back(OpVec);
19695 }
19696
19697 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
19698 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
19699 SmallVector<Value *> GEPs;
19700 for (Value *V : E->Scalars) {
19702 GEPs.push_back(V);
19703 }
19704 V = ::propagateMetadata(I, GEPs);
19705 }
19706
19707 V = FinalShuffle(V, E);
19708
19709 E->VectorizedValue = V;
19710 ++NumVectorInstructions;
19711
19712 return V;
19713 }
19714 case Instruction::Call: {
19715 CallInst *CI = cast<CallInst>(VL0);
19716 setInsertPointAfterBundle(E);
19717
19719
19721 CI, ID, VecTy->getNumElements(),
19722 It != MinBWs.end() ? It->second.first : 0, TTI);
19723 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
19724 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
19725 VecCallCosts.first <= VecCallCosts.second;
19726
19727 Value *ScalarArg = nullptr;
19728 SmallVector<Value *> OpVecs;
19729 SmallVector<Type *, 2> TysForDecl;
19730 // Add return type if intrinsic is overloaded on it.
19731 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
19732 TysForDecl.push_back(VecTy);
19733 auto *CEI = cast<CallInst>(VL0);
19734 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
19735 // Some intrinsics have scalar arguments. Such arguments should not be
19736 // vectorized.
19737 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
19738 ScalarArg = CEI->getArgOperand(I);
19739 // If we decided to reduce the bitwidth of the abs intrinsic, its second
19740 // argument must be set to false (do not return poison if the value is the signed min).
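// Illustrative rationale: after demoting e.g. llvm.abs.i32 to i16, the
// original "INT_MIN is poison" guarantee no longer holds for the narrower
// value, so the flag is conservatively cleared.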
19741 if (ID == Intrinsic::abs && It != MinBWs.end() &&
19742 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
19743 ScalarArg = Builder.getFalse();
19744 OpVecs.push_back(ScalarArg);
19745 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
19746 TysForDecl.push_back(ScalarArg->getType());
19747 continue;
19748 }
19749
19750 Value *OpVec = vectorizeOperand(E, I);
19751 ScalarArg = CEI->getArgOperand(I);
19752 if (cast<VectorType>(OpVec->getType())->getElementType() !=
19753 ScalarArg->getType()->getScalarType() &&
19754 It == MinBWs.end()) {
19755 auto *CastTy =
19756 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
19757 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
19758 } else if (It != MinBWs.end()) {
19759 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
19760 }
19761 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
19762 OpVecs.push_back(OpVec);
19763 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
19764 TysForDecl.push_back(OpVec->getType());
19765 }
19766
19767 Function *CF;
19768 if (!UseIntrinsic) {
19769 VFShape Shape =
19770 VFShape::get(CI->getFunctionType(),
19771 ElementCount::getFixed(VecTy->getNumElements()),
19772 false /*HasGlobalPred*/);
19773 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
19774 } else {
19775 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
19776 }
19777
19779 CI->getOperandBundlesAsDefs(OpBundles);
19780 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
19781
19782 propagateIRFlags(V, E->Scalars, VL0);
19783 V = FinalShuffle(V, E);
19784
19785 E->VectorizedValue = V;
19786 ++NumVectorInstructions;
19787 return V;
19788 }
19789 case Instruction::ShuffleVector: {
19790 Value *V;
19791 if (SLPReVec && !E->isAltShuffle()) {
19792 setInsertPointAfterBundle(E);
19793 Value *Src = vectorizeOperand(E, 0);
19794 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
19795 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
19796 SmallVector<int> NewMask(ThisMask.size());
19797 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
19798 return SVSrc->getShuffleMask()[Mask];
19799 });
19800 V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
19801 SVSrc->getOperand(1), NewMask);
19802 } else {
19803 V = Builder.CreateShuffleVector(Src, ThisMask);
19804 }
19805 propagateIRFlags(V, E->Scalars, VL0);
19806 if (auto *I = dyn_cast<Instruction>(V))
19807 V = ::propagateMetadata(I, E->Scalars);
19808 V = FinalShuffle(V, E);
19809 } else {
19810 assert(E->isAltShuffle() &&
19811 ((Instruction::isBinaryOp(E->getOpcode()) &&
19812 Instruction::isBinaryOp(E->getAltOpcode())) ||
19813 (Instruction::isCast(E->getOpcode()) &&
19814 Instruction::isCast(E->getAltOpcode())) ||
19815 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
19816 "Invalid Shuffle Vector Operand");
19817
19818 Value *LHS = nullptr, *RHS = nullptr;
19819 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
19820 setInsertPointAfterBundle(E);
19821 LHS = vectorizeOperand(E, 0);
19822 RHS = vectorizeOperand(E, 1);
19823 } else {
19824 setInsertPointAfterBundle(E);
19825 LHS = vectorizeOperand(E, 0);
19826 }
19827 if (LHS && RHS &&
19828 ((Instruction::isBinaryOp(E->getOpcode()) &&
19829 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
19830 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
19831 assert((It != MinBWs.end() ||
19832 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
19833 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
19834 MinBWs.contains(getOperandEntry(E, 0)) ||
19835 MinBWs.contains(getOperandEntry(E, 1))) &&
19836 "Expected item in MinBWs.");
19837 Type *CastTy = VecTy;
19838 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
19839 if (cast<VectorType>(LHS->getType())
19840 ->getElementType()
19841 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
19842 ->getElementType()
19843 ->getIntegerBitWidth())
19844 CastTy = RHS->getType();
19845 else
19846 CastTy = LHS->getType();
19847 }
19848 if (LHS->getType() != CastTy)
19849 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
19850 if (RHS->getType() != CastTy)
19851 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
19852 }
19853
19854 Value *V0, *V1;
19855 if (Instruction::isBinaryOp(E->getOpcode())) {
19856 V0 = Builder.CreateBinOp(
19857 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
19858 V1 = Builder.CreateBinOp(
19859 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
19860 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
19861 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
19862 auto *AltCI = cast<CmpInst>(E->getAltOp());
19863 CmpInst::Predicate AltPred = AltCI->getPredicate();
19864 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
19865 } else {
19866 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
19867 unsigned SrcBWSz = DL->getTypeSizeInBits(
19868 cast<VectorType>(LHS->getType())->getElementType());
19869 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
19870 if (BWSz <= SrcBWSz) {
19871 if (BWSz < SrcBWSz)
19872 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
19873 assert(LHS->getType() == VecTy &&
19874 "Expected same type as operand.");
19875 if (auto *I = dyn_cast<Instruction>(LHS))
19876 LHS = ::propagateMetadata(I, E->Scalars);
19877 LHS = FinalShuffle(LHS, E);
19878 E->VectorizedValue = LHS;
19879 ++NumVectorInstructions;
19880 return LHS;
19881 }
19882 }
19883 V0 = Builder.CreateCast(
19884 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
19885 V1 = Builder.CreateCast(
19886 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
19887 }
19888 // Add V0 and V1 to later analysis to try to find and remove matching
19889 // instruction, if any.
19890 for (Value *V : {V0, V1}) {
19891 if (auto *I = dyn_cast<Instruction>(V)) {
19892 GatherShuffleExtractSeq.insert(I);
19893 CSEBlocks.insert(I->getParent());
19894 }
19895 }
19896
19897 // Create shuffle to take alternate operations from the vector.
19898 // Also, gather up main and alt scalar ops to propagate IR flags to
19899 // each vector operation.
19900 ValueList OpScalars, AltScalars;
19901 SmallVector<int> Mask;
19902 E->buildAltOpShuffleMask(
19903 [E, this](Instruction *I) {
19904 assert(E->getMatchingMainOpOrAltOp(I) &&
19905 "Unexpected main/alternate opcode");
19906 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
19907 *TLI);
19908 },
19909 Mask, &OpScalars, &AltScalars);
19910
19911 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
19912 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
19913 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
19914 // Drop nuw flags for abs(sub(commutative), true).
19915 if (auto *I = dyn_cast<Instruction>(Vec);
19916 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
19917 any_of(E->Scalars, [](Value *V) {
19918 if (isa<PoisonValue>(V))
19919 return false;
19920 auto *IV = cast<Instruction>(V);
19921 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
19922 }))
19923 I->setHasNoUnsignedWrap(/*b=*/false);
19924 };
19925 DropNuwFlag(V0, E->getOpcode());
19926 DropNuwFlag(V1, E->getAltOpcode());
19927
19928 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
19929 assert(SLPReVec && "FixedVectorType is not expected.");
19930 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
19931 }
19932 V = Builder.CreateShuffleVector(V0, V1, Mask);
19933 if (auto *I = dyn_cast<Instruction>(V)) {
19934 V = ::propagateMetadata(I, E->Scalars);
19935 GatherShuffleExtractSeq.insert(I);
19936 CSEBlocks.insert(I->getParent());
19937 }
19938 }
19939
19940 E->VectorizedValue = V;
19941 ++NumVectorInstructions;
19942
19943 return V;
19944 }
19945 default:
19946 llvm_unreachable("unknown inst");
19947 }
19948 return nullptr;
19949}
19950
19951 Value *BoUpSLP::vectorizeTree() {
19952 ExtraValueToDebugLocsMap ExternallyUsedValues;
19953 return vectorizeTree(ExternallyUsedValues);
19954}
19955
19956 Value *BoUpSLP::vectorizeTree(
19957 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
19958 Instruction *ReductionRoot,
19959 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
19960 // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
19961 // need to rebuild it.
19962 EntryToLastInstruction.clear();
19963 // All blocks must be scheduled before any instructions are inserted.
19964 for (auto &BSIter : BlocksSchedules)
19965 scheduleBlock(*this, BSIter.second.get());
19966 // Cache last instructions for the nodes to avoid side effects, which may
19967 // appear during vectorization, like extra uses, etc.
19968 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
19969 if (TE->isGather())
19970 continue;
19971 (void)getLastInstructionInBundle(TE.get());
19972 }
19973
19974 if (ReductionRoot)
19975 Builder.SetInsertPoint(ReductionRoot->getParent(),
19976 ReductionRoot->getIterator());
19977 else
19978 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
19979
19980 // Vectorize gather operands of the nodes with the external uses only.
19981 SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries;
19982 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
19983 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
19984 TE->UserTreeIndex.UserTE->hasState() &&
19985 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
19986 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
19987 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
19988 !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
19989 all_of(TE->UserTreeIndex.UserTE->Scalars,
19990 [](Value *V) { return isUsedOutsideBlock(V); })) {
19991 Instruction &LastInst =
19992 getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
19993 GatherEntries.emplace_back(TE.get(), &LastInst);
19994 }
19995 }
19996 for (auto &Entry : GatherEntries) {
19997 IRBuilderBase::InsertPointGuard Guard(Builder);
19998 Builder.SetInsertPoint(Entry.second);
19999 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
20000 (void)vectorizeTree(Entry.first);
20001 }
20002 // Emit gathered loads first to emit better code for the users of those
20003 // gathered loads.
20004 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20005 if (GatheredLoadsEntriesFirst.has_value() &&
20006 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
20007 (!TE->isGather() || TE->UserTreeIndex)) {
20008 assert((TE->UserTreeIndex ||
20009 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
20010 "Expected gathered load node.");
20011 (void)vectorizeTree(TE.get());
20012 }
20013 }
20014 (void)vectorizeTree(VectorizableTree[0].get());
20015 // Run through the list of postponed gathers and emit them, replacing the temp
20016 // emitted allocas with actual vector instructions.
20017 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
20019 for (const TreeEntry *E : PostponedNodes) {
20020 auto *TE = const_cast<TreeEntry *>(E);
20021 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
20022 TE->VectorizedValue = nullptr;
20023 auto *UserI = cast<Instruction>(TE->UserTreeIndex.UserTE->VectorizedValue);
20024 // If the user is a PHI node, its vector code has to be inserted right before
20025 // the block terminator. Since the node was delayed, there were some unresolved
20026 // dependencies at the moment when the stub instruction was emitted. If any of
20027 // these dependencies turn out to be an operand of another PHI coming from this
20028 // same block, the position of the stub instruction becomes invalid. This is
20029 // because the source vector that is supposed to feed this gather node was
20030 // inserted at the end of the block [after the stub instruction]. So we need
20031 // to adjust the insertion point again to the end of the block.
20032 if (isa<PHINode>(UserI)) {
20033 // Insert before all users.
20034 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
20035 for (User *U : PrevVec->users()) {
20036 if (U == UserI)
20037 continue;
20038 auto *UI = dyn_cast<Instruction>(U);
20039 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
20040 continue;
20041 if (UI->comesBefore(InsertPt))
20042 InsertPt = UI;
20043 }
20044 Builder.SetInsertPoint(InsertPt);
20045 } else {
20046 Builder.SetInsertPoint(PrevVec);
20047 }
20048 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
20049 Value *Vec = vectorizeTree(TE);
20050 if (auto *VecI = dyn_cast<Instruction>(Vec);
20051 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
20052 Builder.GetInsertPoint()->comesBefore(VecI))
20053 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
20054 Builder.GetInsertPoint());
20055 if (Vec->getType() != PrevVec->getType()) {
20056 assert(Vec->getType()->isIntOrIntVectorTy() &&
20057 PrevVec->getType()->isIntOrIntVectorTy() &&
20058 "Expected integer vector types only.");
20059 std::optional<bool> IsSigned;
20060 for (Value *V : TE->Scalars) {
20061 if (isVectorized(V)) {
20062 for (const TreeEntry *MNTE : getTreeEntries(V)) {
20063 auto It = MinBWs.find(MNTE);
20064 if (It != MinBWs.end()) {
20065 IsSigned = IsSigned.value_or(false) || It->second.second;
20066 if (*IsSigned)
20067 break;
20068 }
20069 }
20070 if (IsSigned.value_or(false))
20071 break;
20072 // Scan through gather nodes.
20073 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
20074 auto It = MinBWs.find(BVE);
20075 if (It != MinBWs.end()) {
20076 IsSigned = IsSigned.value_or(false) || It->second.second;
20077 if (*IsSigned)
20078 break;
20079 }
20080 }
20081 if (IsSigned.value_or(false))
20082 break;
20083 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
20084 IsSigned =
20085 IsSigned.value_or(false) ||
20086 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
20087 continue;
20088 }
20089 if (IsSigned.value_or(false))
20090 break;
20091 }
20092 }
20093 if (IsSigned.value_or(false)) {
20094 // Final attempt - check user node.
20095 auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
20096 if (It != MinBWs.end())
20097 IsSigned = It->second.second;
20098 }
20099 assert(IsSigned &&
20100 "Expected user node or perfect diamond match in MinBWs.");
20101 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
20102 }
20103 PrevVec->replaceAllUsesWith(Vec);
20104 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
20105 // Replace the stub vector node, if it was used before for one of the
20106 // buildvector nodes already.
20107 auto It = PostponedValues.find(PrevVec);
20108 if (It != PostponedValues.end()) {
20109 for (TreeEntry *VTE : It->getSecond())
20110 VTE->VectorizedValue = Vec;
20111 }
20112 eraseInstruction(PrevVec);
20113 }
20114
20115 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
20116 << " values.\n");
20118 SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
20119 // Maps vector instruction to original insertelement instruction
20120 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
20121 // Maps extract Scalar to the corresponding extractelement instruction in the
20122 // basic block. Only one extractelement per block should be emitted.
20124 ScalarToEEs;
20125 SmallDenseSet<Value *, 4> UsedInserts;
20127 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
20129 // Extract all of the elements with the external uses.
20130 for (const auto &ExternalUse : ExternalUses) {
20131 Value *Scalar = ExternalUse.Scalar;
20132 llvm::User *User = ExternalUse.User;
20133
20134 // Skip users that we already RAUW. This happens when one instruction
20135 // has multiple uses of the same value.
20136 if (User && !is_contained(Scalar->users(), User))
20137 continue;
20138 const TreeEntry *E = &ExternalUse.E;
20139 assert(E && "Invalid scalar");
20140 assert(!E->isGather() && "Extracting from a gather list");
20141 // Non-instruction pointers are not deleted, just skip them.
20142 if (E->getOpcode() == Instruction::GetElementPtr &&
20143 !isa<GetElementPtrInst>(Scalar))
20144 continue;
20145
20146 Value *Vec = E->VectorizedValue;
20147 assert(Vec && "Can't find vectorizable value");
20148
20149 Value *Lane = Builder.getInt32(ExternalUse.Lane);
20150 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
20151 if (Scalar->getType() != Vec->getType()) {
20152 Value *Ex = nullptr;
20153 Value *ExV = nullptr;
20154 auto *Inst = dyn_cast<Instruction>(Scalar);
20155 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
20156 auto It = ScalarToEEs.find(Scalar);
20157 if (It != ScalarToEEs.end()) {
20158 // No need to emit many extracts, just move the only one in the
20159 // current block.
20160 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
20161 : Builder.GetInsertBlock());
20162 if (EEIt != It->second.end()) {
20163 Value *PrevV = EEIt->second.first;
20164 if (auto *I = dyn_cast<Instruction>(PrevV);
20165 I && !ReplaceInst &&
20166 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
20167 Builder.GetInsertPoint()->comesBefore(I)) {
20168 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
20169 Builder.GetInsertPoint());
20170 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
20171 CI->moveAfter(I);
20172 }
20173 Ex = PrevV;
20174 ExV = EEIt->second.second ? EEIt->second.second : Ex;
20175 }
20176 }
20177 if (!Ex) {
20178 // "Reuse" the existing extract to improve final codegen.
20179 if (ReplaceInst) {
20180 // Leave the instruction as is if it is cheaper to extract and all
20181 // operands are scalar.
20182 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
20183 IgnoredExtracts.insert(EE);
20184 Ex = EE;
20185 } else {
20186 auto *CloneInst = Inst->clone();
20187 CloneInst->insertBefore(Inst->getIterator());
20188 if (Inst->hasName())
20189 CloneInst->takeName(Inst);
20190 Ex = CloneInst;
20191 }
20192 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
20193 ES && isa<Instruction>(Vec)) {
20194 Value *V = ES->getVectorOperand();
20195 auto *IVec = cast<Instruction>(Vec);
20196 if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
20197 V = ETEs.front()->VectorizedValue;
20198 if (auto *IV = dyn_cast<Instruction>(V);
20199 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
20200 IV->comesBefore(IVec))
20201 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
20202 else
20203 Ex = Builder.CreateExtractElement(Vec, Lane);
20204 } else if (auto *VecTy =
20205 dyn_cast<FixedVectorType>(Scalar->getType())) {
20206 assert(SLPReVec && "FixedVectorType is not expected.");
20207 unsigned VecTyNumElements = VecTy->getNumElements();
20208 // When REVEC is enabled, we need to extract a vector.
20209 // Note: The element size of Scalar may be different from the
20210 // element size of Vec.
20211 Ex = createExtractVector(Builder, Vec, VecTyNumElements,
20212 ExternalUse.Lane * VecTyNumElements);
20213 } else {
20214 Ex = Builder.CreateExtractElement(Vec, Lane);
20215 }
20216 // If necessary, sign-extend or zero-extend ScalarRoot
20217 // to the larger type.
20218 ExV = Ex;
20219 if (Scalar->getType() != Ex->getType())
20220 ExV = Builder.CreateIntCast(
20221 Ex, Scalar->getType(),
20222 !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
20223 auto *I = dyn_cast<Instruction>(Ex);
20224 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
20225 : &F->getEntryBlock(),
20226 std::make_pair(Ex, ExV));
20227 }
20228 // The then branch of the previous if may produce constants, since
20229 // operand 0 might be a constant.
20230 if (auto *ExI = dyn_cast<Instruction>(Ex);
20231 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
20232 GatherShuffleExtractSeq.insert(ExI);
20233 CSEBlocks.insert(ExI->getParent());
20234 }
20235 return ExV;
20236 }
20237 assert(isa<FixedVectorType>(Scalar->getType()) &&
20238 isa<InsertElementInst>(Scalar) &&
20239 "In-tree scalar of vector type is not insertelement?");
20240 auto *IE = cast<InsertElementInst>(Scalar);
20241 VectorToInsertElement.try_emplace(Vec, IE);
20242 return Vec;
20243 };
20244 // If User == nullptr, the Scalar remains as scalar in vectorized
20245 // instructions or is used as extra arg. Generate ExtractElement instruction
20246 // and update the record for this scalar in ExternallyUsedValues.
20247 if (!User) {
20248 if (!ScalarsWithNullptrUser.insert(Scalar).second)
20249 continue;
20250 assert(
20251 (ExternallyUsedValues.count(Scalar) ||
20252 ExternalUsesWithNonUsers.count(Scalar) ||
20253 ExternalUsesAsOriginalScalar.contains(Scalar) ||
20254 any_of(
20255 Scalar->users(),
20256 [&, TTI = TTI](llvm::User *U) {
20257 if (ExternalUsesAsOriginalScalar.contains(U))
20258 return true;
20259 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
20260 return !UseEntries.empty() &&
20261 (E->State == TreeEntry::Vectorize ||
20262 E->State == TreeEntry::StridedVectorize ||
20263 E->State == TreeEntry::CompressVectorize) &&
20264 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
20265 return (UseEntry->State == TreeEntry::Vectorize ||
20266 UseEntry->State ==
20267 TreeEntry::StridedVectorize ||
20268 UseEntry->State ==
20269 TreeEntry::CompressVectorize) &&
20270 doesInTreeUserNeedToExtract(
20271 Scalar, getRootEntryInstruction(*UseEntry),
20272 TLI, TTI);
20273 });
20274 })) &&
20275 "Scalar with nullptr User must be registered in "
20276 "ExternallyUsedValues map or remain as scalar in vectorized "
20277 "instructions");
20278 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
20279 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
20280 if (PHI->getParent()->isLandingPad())
20281 Builder.SetInsertPoint(
20282 PHI->getParent(),
20283 std::next(
20284 PHI->getParent()->getLandingPadInst()->getIterator()));
20285 else
20286 Builder.SetInsertPoint(PHI->getParent(),
20287 PHI->getParent()->getFirstNonPHIIt());
20288 } else {
20289 Builder.SetInsertPoint(VecI->getParent(),
20290 std::next(VecI->getIterator()));
20291 }
20292 } else {
20293 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20294 }
20295 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20296 // Required to update internally referenced instructions.
20297 if (Scalar != NewInst) {
20298 assert((!isa<ExtractElementInst>(Scalar) ||
20299 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
20300 "Extractelements should not be replaced.");
20301 Scalar->replaceAllUsesWith(NewInst);
20302 }
20303 continue;
20304 }
20305
20306 if (auto *VU = dyn_cast<InsertElementInst>(User);
20307 VU && VU->getOperand(1) == Scalar) {
20308 // Skip if the scalar is another vector op or Vec is not an instruction.
20309 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
20310 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
20311 if (!UsedInserts.insert(VU).second)
20312 continue;
20313 // Need to use original vector, if the root is truncated.
20314 auto BWIt = MinBWs.find(E);
20315 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
20316 auto *ScalarTy = FTy->getElementType();
20317 auto Key = std::make_pair(Vec, ScalarTy);
20318 auto VecIt = VectorCasts.find(Key);
20319 if (VecIt == VectorCasts.end()) {
20320 IRBuilderBase::InsertPointGuard Guard(Builder);
20321 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
20322 if (IVec->getParent()->isLandingPad())
20323 Builder.SetInsertPoint(IVec->getParent(),
20324 std::next(IVec->getParent()
20325 ->getLandingPadInst()
20326 ->getIterator()));
20327 else
20328 Builder.SetInsertPoint(
20329 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
20330 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
20331 Builder.SetInsertPoint(IVec->getNextNode());
20332 }
20333 Vec = Builder.CreateIntCast(
20334 Vec,
20336 ScalarTy,
20337 cast<FixedVectorType>(Vec->getType())->getNumElements()),
20338 BWIt->second.second);
20339 VectorCasts.try_emplace(Key, Vec);
20340 } else {
20341 Vec = VecIt->second;
20342 }
20343 }
20344
20345 std::optional<unsigned> InsertIdx = getElementIndex(VU);
20346 if (InsertIdx) {
20347 auto *It = find_if(
20348 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
20349 // Checks if 2 insertelements are from the same buildvector.
20350 InsertElementInst *VecInsert = Data.InsertElements.front();
20351 return areTwoInsertFromSameBuildVector(
20352 VU, VecInsert,
20353 [](InsertElementInst *II) { return II->getOperand(0); });
20354 });
20355 unsigned Idx = *InsertIdx;
20356 if (It == ShuffledInserts.end()) {
20357 (void)ShuffledInserts.emplace_back();
20358 It = std::next(ShuffledInserts.begin(),
20359 ShuffledInserts.size() - 1);
20360 }
20361 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
20362 if (Mask.empty())
20363 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
20364 Mask[Idx] = ExternalUse.Lane;
20365 It->InsertElements.push_back(cast<InsertElementInst>(User));
20366 continue;
20367 }
20368 }
20369 }
20370 }
20371
20372 // Generate extracts for out-of-tree users.
20373 // Find the insertion point for the extractelement lane.
20374 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
20375 if (PHINode *PH = dyn_cast<PHINode>(User)) {
20376 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
20377 if (PH->getIncomingValue(I) == Scalar) {
20378 Instruction *IncomingTerminator =
20379 PH->getIncomingBlock(I)->getTerminator();
20380 if (isa<CatchSwitchInst>(IncomingTerminator)) {
20381 Builder.SetInsertPoint(VecI->getParent(),
20382 std::next(VecI->getIterator()));
20383 } else {
20384 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
20385 }
20386 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20387 PH->setOperand(I, NewInst);
20388 }
20389 }
20390 } else {
20391 Builder.SetInsertPoint(cast<Instruction>(User));
20392 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20393 User->replaceUsesOfWith(Scalar, NewInst);
20394 }
20395 } else {
20396 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20397 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20398 User->replaceUsesOfWith(Scalar, NewInst);
20399 }
20400
20401 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
20402 }
20403
20404 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
20405 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
20406 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
20407 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
20408 for (int I = 0, E = Mask.size(); I < E; ++I) {
20409 if (Mask[I] < VF)
20410 CombinedMask1[I] = Mask[I];
20411 else
20412 CombinedMask2[I] = Mask[I] - VF;
20413 }
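// Illustrative example: with VF == 4, a combined mask <0, 5, 2, 7> is split
// into CombinedMask1 <0, poison, 2, poison> for V1 and CombinedMask2
// <poison, 1, poison, 3> for V2.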
20414 ShuffleInstructionBuilder ShuffleBuilder(
20415 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
20416 ShuffleBuilder.add(V1, CombinedMask1);
20417 if (V2)
20418 ShuffleBuilder.add(V2, CombinedMask2);
20419 return ShuffleBuilder.finalize({}, {}, {});
20420 };
20421
20422 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
20423 bool ForSingleMask) {
20424 unsigned VF = Mask.size();
20425 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
20426 if (VF != VecVF) {
20427 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
20428 Vec = CreateShuffle(Vec, nullptr, Mask);
20429 return std::make_pair(Vec, true);
20430 }
20431 if (!ForSingleMask) {
20432 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
20433 for (unsigned I = 0; I < VF; ++I) {
20434 if (Mask[I] != PoisonMaskElem)
20435 ResizeMask[Mask[I]] = Mask[I];
20436 }
20437 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
20438 }
20439 }
20440
20441 return std::make_pair(Vec, false);
20442 };
20443 // Perform shuffling of the vectorized tree entries for better handling of
20444 // external extracts.
20445 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
20446 // Find the first and the last instruction in the list of insertelements.
20447 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
20448 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
20449 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
20450 Builder.SetInsertPoint(LastInsert);
20451 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
20452 Value *NewInst = performExtractsShuffleAction<Value>(
20453 MutableArrayRef(Vector.data(), Vector.size()),
20454 FirstInsert->getOperand(0),
20455 [](Value *Vec) {
20456 return cast<VectorType>(Vec->getType())
20457 ->getElementCount()
20458 .getKnownMinValue();
20459 },
20460 ResizeToVF,
20461 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
20462 ArrayRef<Value *> Vals) {
20463 assert((Vals.size() == 1 || Vals.size() == 2) &&
20464 "Expected exactly 1 or 2 input values.");
20465 if (Vals.size() == 1) {
20466 // Do not create shuffle if the mask is a simple identity
20467 // non-resizing mask.
20468 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
20469 ->getNumElements() ||
20470 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
20471 return CreateShuffle(Vals.front(), nullptr, Mask);
20472 return Vals.front();
20473 }
20474 return CreateShuffle(Vals.front() ? Vals.front()
20475 : FirstInsert->getOperand(0),
20476 Vals.back(), Mask);
20477 });
20478 auto It = ShuffledInserts[I].InsertElements.rbegin();
20479 // Rebuild buildvector chain.
20480 InsertElementInst *II = nullptr;
20481 if (It != ShuffledInserts[I].InsertElements.rend())
20482 II = *It;
20483 SmallVector<Instruction *> Inserts;
20484 while (It != ShuffledInserts[I].InsertElements.rend()) {
20485 assert(II && "Must be an insertelement instruction.");
20486 if (*It == II)
20487 ++It;
20488 else
20489 Inserts.push_back(cast<Instruction>(II));
20490 II = dyn_cast<InsertElementInst>(II->getOperand(0));
20491 }
20492 for (Instruction *II : reverse(Inserts)) {
20493 II->replaceUsesOfWith(II->getOperand(0), NewInst);
20494 if (auto *NewI = dyn_cast<Instruction>(NewInst))
20495 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
20496 II->moveAfter(NewI);
20497 NewInst = II;
20498 }
20499 LastInsert->replaceAllUsesWith(NewInst);
20500 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
20501 IE->replaceUsesOfWith(IE->getOperand(0),
20502 PoisonValue::get(IE->getOperand(0)->getType()));
20503 IE->replaceUsesOfWith(IE->getOperand(1),
20504 PoisonValue::get(IE->getOperand(1)->getType()));
20505 eraseInstruction(IE);
20506 }
20507 CSEBlocks.insert(LastInsert->getParent());
20508 }
20509
20510 SmallVector<Instruction *> RemovedInsts;
20511 // For each vectorized value:
20512 for (auto &TEPtr : VectorizableTree) {
20513 TreeEntry *Entry = TEPtr.get();
20514
20515 // No need to handle users of gathered values.
20516 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
20517 continue;
20518
20519 assert(Entry->VectorizedValue && "Can't find vectorizable value");
20520
20521 // For each lane:
20522 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
20523 Value *Scalar = Entry->Scalars[Lane];
20524
20525 if (Entry->getOpcode() == Instruction::GetElementPtr &&
20526 !isa<GetElementPtrInst>(Scalar))
20527 continue;
20528 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
20529 EE && IgnoredExtracts.contains(EE))
20530 continue;
20531 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
20532 continue;
20533#ifndef NDEBUG
20534 Type *Ty = Scalar->getType();
20535 if (!Ty->isVoidTy()) {
20536 for (User *U : Scalar->users()) {
20537 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
20538
20539 // It is legal to delete users in the ignorelist.
20540 assert((isVectorized(U) ||
20541 (UserIgnoreList && UserIgnoreList->contains(U)) ||
20542 (isa_and_nonnull<Instruction>(U) &&
20543 isDeleted(cast<Instruction>(U)))) &&
20544 "Deleting out-of-tree value");
20545 }
20546 }
20547#endif
20548 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
20549 auto *I = cast<Instruction>(Scalar);
20550 RemovedInsts.push_back(I);
20551 }
20552 }
20553
20554 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
20555 // new vector instruction.
20556 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
20557 V->mergeDIAssignID(RemovedInsts);
20558
20559 // Clear up reduction references, if any.
20560 if (UserIgnoreList) {
20561 for (Instruction *I : RemovedInsts) {
20562 const TreeEntry *IE = getTreeEntries(I).front();
20563 if (IE->Idx != 0 &&
20564 !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
20565 (ValueToGatherNodes.lookup(I).contains(
20566 VectorizableTree.front().get()) ||
20567 (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
20568 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
20569 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
20570 IE->UserTreeIndex &&
20571 is_contained(VectorizableTree.front()->Scalars, I)) &&
20572 !(GatheredLoadsEntriesFirst.has_value() &&
20573 IE->Idx >= *GatheredLoadsEntriesFirst &&
20574 VectorizableTree.front()->isGather() &&
20575 is_contained(VectorizableTree.front()->Scalars, I)) &&
20576 !(!VectorizableTree.front()->isGather() &&
20577 VectorizableTree.front()->isCopyableElement(I)))
20578 continue;
20579 SmallVector<SelectInst *> LogicalOpSelects;
20580 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
20581 // Do not replace the condition of a logical op in the form select <cond>.
20582 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
20583 (match(U.getUser(), m_LogicalAnd()) ||
20584 match(U.getUser(), m_LogicalOr())) &&
20585 U.getOperandNo() == 0;
20586 if (IsPoisoningLogicalOp) {
20587 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
20588 return false;
20589 }
20590 return UserIgnoreList->contains(U.getUser());
20591 });
20592 // Replace conditions of the poisoning logical ops with the non-poison
20593 // constant value.
20594 for (SelectInst *SI : LogicalOpSelects)
20595 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
20596 }
20597 }
20598 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
20599 // cache correctness.
20600 // NOTE: removeInstructionsAndOperands only marks the instructions for deletion
20601 // - instructions are not deleted until later.
20602 removeInstructionsAndOperands(ArrayRef(RemovedInsts), VectorValuesAndScales);
20603
20604 Builder.ClearInsertionPoint();
20605 InstrElementSize.clear();
20606
20607 const TreeEntry &RootTE = *VectorizableTree.front();
20608 Value *Vec = RootTE.VectorizedValue;
20609 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
20610 It != MinBWs.end() &&
20611 ReductionBitWidth != It->second.first) {
20612 IRBuilder<>::InsertPointGuard Guard(Builder);
20613 Builder.SetInsertPoint(ReductionRoot->getParent(),
20614 ReductionRoot->getIterator());
20615 Vec = Builder.CreateIntCast(
20616 Vec,
20617 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
20618 cast<VectorType>(Vec->getType())->getElementCount()),
20619 It->second.second);
20620 }
20621 return Vec;
20622}
20623
20625 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
20626 << " gather sequence instructions.\n");
20627 // LICM InsertElementInst sequences.
20628 for (Instruction *I : GatherShuffleExtractSeq) {
20629 if (isDeleted(I))
20630 continue;
20631
20632 // Check if this block is inside a loop.
20633 Loop *L = LI->getLoopFor(I->getParent());
20634 if (!L)
20635 continue;
20636
20637 // Check if it has a preheader.
20638 BasicBlock *PreHeader = L->getLoopPreheader();
20639 if (!PreHeader)
20640 continue;
20641
20642 // If the vector or the element that we insert into it are
20643 // instructions that are defined in this basic block then we can't
20644 // hoist this instruction.
20645 if (any_of(I->operands(), [L](Value *V) {
20646 auto *OpI = dyn_cast<Instruction>(V);
20647 return OpI && L->contains(OpI);
20648 }))
20649 continue;
20650
20651 // We can hoist this instruction. Move it to the pre-header.
20652 I->moveBefore(PreHeader->getTerminator()->getIterator());
20653 CSEBlocks.insert(PreHeader);
20654 }
20655
20656 // Make a list of all reachable blocks in our CSE queue.
20657 SmallVector<DomTreeNode *, 8> CSEWorkList;
20658 CSEWorkList.reserve(CSEBlocks.size());
20659 for (BasicBlock *BB : CSEBlocks)
20660 if (DomTreeNode *N = DT->getNode(BB)) {
20661 assert(DT->isReachableFromEntry(N));
20662 CSEWorkList.push_back(N);
20663 }
20664
20665 // Sort blocks by domination. This ensures we visit a block after all blocks
20666 // dominating it are visited.
20667 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
20668 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
20669 "Different nodes should have different DFS numbers");
20670 return A->getDFSNumIn() < B->getDFSNumIn();
20671 });
20672
20673 // Less defined shuffles can be replaced by the more defined copies.
20674 // Between two shuffles, one is less defined if it has the same vector operands
20675 // and its mask indices are either undefs or the same as in the other one. E.g.
20676 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
20677 // poison, <0, 0, 0, 0>.
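// When such a pair is found below, the less defined shuffle is replaced by
// the more defined one, and the surviving shuffle keeps the merged mask
// (illustrative): <0, 1, 2, poison> merged with <0, 1, poison, 3> gives
// <0, 1, 2, 3>.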
20678 auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
20679 Instruction *I2,
20680 SmallVectorImpl<int> &NewMask) {
20681 if (I1->getType() != I2->getType())
20682 return false;
20683 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
20684 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
20685 if (!SI1 || !SI2)
20686 return I1->isIdenticalTo(I2);
20687 if (SI1->isIdenticalTo(SI2))
20688 return true;
20689 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
20690 if (SI1->getOperand(I) != SI2->getOperand(I))
20691 return false;
20692 // Check if the second instruction is more defined than the first one.
20693 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
20694 ArrayRef<int> SM1 = SI1->getShuffleMask();
20695 // Count trailing undefs in the mask to check the final number of used
20696 // registers.
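// Illustrative note: if the mask of the shuffle being replaced ends in a run
// of poison lanes, that shuffle may occupy fewer vector registers than its
// replacement, so the replacement is only accepted when the number of parts
// stays the same.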
20697 unsigned LastUndefsCnt = 0;
20698 for (int I = 0, E = NewMask.size(); I < E; ++I) {
20699 if (SM1[I] == PoisonMaskElem)
20700 ++LastUndefsCnt;
20701 else
20702 LastUndefsCnt = 0;
20703 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
20704 NewMask[I] != SM1[I])
20705 return false;
20706 if (NewMask[I] == PoisonMaskElem)
20707 NewMask[I] = SM1[I];
20708 }
20709 // Check if the last undefs actually change the final number of used vector
20710 // registers.
20711 return SM1.size() - LastUndefsCnt > 1 &&
20712 ::getNumberOfParts(*TTI, SI1->getType()) ==
20713 ::getNumberOfParts(
20714 *TTI, getWidenedType(SI1->getType()->getElementType(),
20715 SM1.size() - LastUndefsCnt));
20716 };
20717 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
20718 // instructions. TODO: We can further optimize this scan if we split the
20719 // instructions into different buckets based on the insert lane.
20720 SmallVector<Instruction *, 16> Visited;
20721 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
20722 assert(*I &&
20723 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
20724 "Worklist not sorted properly!");
20725 BasicBlock *BB = (*I)->getBlock();
20726 // For all instructions in blocks containing gather sequences:
20727 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
20728 if (isDeleted(&In))
20729 continue;
20731 !GatherShuffleExtractSeq.contains(&In))
20732 continue;
20733
20734 // Check if we can replace this instruction with any of the
20735 // visited instructions.
20736 bool Replaced = false;
20737 for (Instruction *&V : Visited) {
20738 SmallVector<int> NewMask;
20739 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
20740 DT->dominates(V->getParent(), In.getParent())) {
20741 In.replaceAllUsesWith(V);
20742 eraseInstruction(&In);
20743 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
20744 if (!NewMask.empty())
20745 SI->setShuffleMask(NewMask);
20746 Replaced = true;
20747 break;
20748 }
20749 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
20750 GatherShuffleExtractSeq.contains(V) &&
20751 IsIdenticalOrLessDefined(V, &In, NewMask) &&
20752 DT->dominates(In.getParent(), V->getParent())) {
20753 In.moveAfter(V);
20754 V->replaceAllUsesWith(&In);
20755 eraseInstruction(V);
20756 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
20757 if (!NewMask.empty())
20758 SI->setShuffleMask(NewMask);
20759 V = &In;
20760 Replaced = true;
20761 break;
20762 }
20763 }
20764 if (!Replaced) {
20765 assert(!is_contained(Visited, &In));
20766 Visited.push_back(&In);
20767 }
20768 }
20769 }
20770 CSEBlocks.clear();
20771 GatherShuffleExtractSeq.clear();
20772}
20773
20774BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
20775 ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
20776 auto &BundlePtr =
20777 ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
20778 for (Value *V : VL) {
20779 if (S.isNonSchedulable(V))
20780 continue;
20781 auto *I = cast<Instruction>(V);
20782 if (S.isCopyableElement(V)) {
20783 // Add a copyable element model.
20784 ScheduleCopyableData &SD =
20785 addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
20786 // Group the instructions to a bundle.
20787 BundlePtr->add(&SD);
20788 continue;
20789 }
20790 ScheduleData *BundleMember = getScheduleData(V);
20791 assert(BundleMember && "no ScheduleData for bundle member "
20792 "(maybe not in same basic block)");
20793 // Group the instructions to a bundle.
20794 BundlePtr->add(BundleMember);
20795 ScheduledBundles.try_emplace(I).first->getSecond().push_back(
20796 BundlePtr.get());
20797 }
20798 assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
20799 return *BundlePtr;
20800}
20801
20802 // Groups the instructions into a bundle (which is then a single scheduling
20803 // entity) and schedules instructions until the bundle is ready.
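// For illustration: for a candidate bundle of two consecutive loads
//   %a = load i32, ptr %p
//   %b = load i32, ptr %q
// the ScheduleData of both loads is grouped into one ScheduleBundle, and the
// bundle only becomes ready once all def-use, memory and control dependencies
// of its members have been scheduled (i.e. no cyclic dependency remains).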
20804std::optional<BoUpSLP::ScheduleBundle *>
20805BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
20806 const InstructionsState &S,
20807 const EdgeInfo &EI) {
20808 // No need to schedule PHIs, insertelement, extractelement and extractvalue
20809 // instructions.
20810 if (isa<PHINode>(S.getMainOp()) ||
20811 isVectorLikeInstWithConstOps(S.getMainOp()))
20812 return nullptr;
20813 bool HasCopyables = S.areInstructionsWithCopyableElements();
20814 if (((!HasCopyables && doesNotNeedToSchedule(VL)) ||
20815 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) {
20816 // If all operands were replaced by copyables, the operands of this node
20817 // might not be, so we need to recalculate dependencies for the schedule
20818 // data that was replaced by copyable schedule data.
20819 SmallVector<ScheduleData *> ControlDependentMembers;
20820 for (Value *V : VL) {
20821 auto *I = dyn_cast<Instruction>(V);
20822 if (!I || (HasCopyables && S.isCopyableElement(V)))
20823 continue;
20824 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
20825 for (const Use &U : I->operands()) {
20826 unsigned &NumOps =
20827 UserOpToNumOps.try_emplace(std::make_pair(I, U.get()), 0)
20828 .first->getSecond();
20829 ++NumOps;
20830 if (auto *Op = dyn_cast<Instruction>(U.get());
20831 Op && areAllOperandsReplacedByCopyableData(I, Op, *SLP, NumOps)) {
20832 if (ScheduleData *OpSD = getScheduleData(Op);
20833 OpSD && OpSD->hasValidDependencies()) {
20834 OpSD->clearDirectDependencies();
20835 if (RegionHasStackSave ||
20836 !isGuaranteedToTransferExecutionToSuccessor(Op))
20837 ControlDependentMembers.push_back(OpSD);
20838 }
20839 }
20840 }
20841 }
20842 if (!ControlDependentMembers.empty()) {
20843 ScheduleBundle Invalid = ScheduleBundle::invalid();
20844 calculateDependencies(Invalid, /*InsertInReadyList=*/true, SLP,
20845 ControlDependentMembers);
20846 }
20847 return nullptr;
20848 }
20849
20850 // Initialize the instruction bundle.
20851 Instruction *OldScheduleEnd = ScheduleEnd;
20852 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
20853
20854 auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
20855 // Clear deps or recalculate the region, if the memory instruction is a
20856 // copyable. It may have memory deps, which must be recalculated.
20857 SmallVector<ScheduleData *> ControlDependentMembers;
20858 auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
20859 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
20860 for (ScheduleEntity *SE : Bundle.getBundle()) {
20861 if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(SE)) {
20862 if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
20863 BundleMember && BundleMember->hasValidDependencies()) {
20864 BundleMember->clearDirectDependencies();
20865 if (RegionHasStackSave ||
20866 !isGuaranteedToTransferExecutionToSuccessor(
20867 BundleMember->getInst()))
20868 ControlDependentMembers.push_back(BundleMember);
20869 }
20870 continue;
20871 }
20872 auto *SD = cast<ScheduleData>(SE);
20873 if (SD->hasValidDependencies() &&
20874 (!S.areInstructionsWithCopyableElements() ||
20875 !S.isCopyableElement(SD->getInst())) &&
20876 !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
20877 EI.UserTE->hasState() &&
20878 (!EI.UserTE->hasCopyableElements() ||
20879 !EI.UserTE->isCopyableElement(SD->getInst())))
20880 SD->clearDirectDependencies();
20881 for (const Use &U : SD->getInst()->operands()) {
20882 unsigned &NumOps =
20883 UserOpToNumOps
20884 .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
20885 .first->getSecond();
20886 ++NumOps;
20887 if (auto *Op = dyn_cast<Instruction>(U.get());
20888 Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
20889 *SLP, NumOps)) {
20890 if (ScheduleData *OpSD = getScheduleData(Op);
20891 OpSD && OpSD->hasValidDependencies()) {
20892 OpSD->clearDirectDependencies();
20893 if (RegionHasStackSave ||
20894 !isGuaranteedToTransferExecutionToSuccessor(Op))
20895 ControlDependentMembers.push_back(OpSD);
20896 }
20897 }
20898 }
20899 }
20900 };
20901 // The scheduling region got new instructions at the lower end (or it is a
20902 // new region for the first bundle). This makes it necessary to
20903 // recalculate all dependencies.
20904 // It is seldom that this needs to be done a second time after adding the
20905 // initial bundle to the region.
20906 if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
20907 for_each(ScheduleDataMap, [&](auto &P) {
20908 if (BB != P.first->getParent())
20909 return;
20910 ScheduleData *SD = P.second;
20911 if (isInSchedulingRegion(*SD))
20912 SD->clearDependencies();
20913 });
20914 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
20915 for_each(P.second, [&](ScheduleCopyableData *SD) {
20916 if (isInSchedulingRegion(*SD))
20917 SD->clearDependencies();
20918 });
20919 });
20920 ReSchedule = true;
20921 }
20922 // Check if the bundle data already has deps for copyable elements. In
20923 // that case we need to reset the deps and recalculate them.
20924 if (Bundle && !Bundle.getBundle().empty()) {
20925 if (S.areInstructionsWithCopyableElements() ||
20926 !ScheduleCopyableDataMap.empty())
20927 CheckIfNeedToClearDeps(Bundle);
20928 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
20929 << BB->getName() << "\n");
20930 calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
20931 ControlDependentMembers);
20932 } else if (!ControlDependentMembers.empty()) {
20933 ScheduleBundle Invalid = ScheduleBundle::invalid();
20934 calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
20935 ControlDependentMembers);
20936 }
20937
20938 if (ReSchedule) {
20939 resetSchedule();
20940 initialFillReadyList(ReadyInsts);
20941 }
20942
20943 // Now try to schedule the new bundle or (if no bundle) just calculate
20944 // dependencies. As soon as the bundle is "ready" it means that there are no
20945 // cyclic dependencies and we can schedule it. Note that it's important that we
20946 // don't "schedule" the bundle yet.
20947 while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
20948 !ReadyInsts.empty()) {
20949 ScheduleEntity *Picked = ReadyInsts.pop_back_val();
20950 assert(Picked->isReady() && "must be ready to schedule");
20951 schedule(*SLP, S, EI, Picked, ReadyInsts);
20952 if (Picked == &Bundle)
20953 break;
20954 }
20955 };
20956
20957 // Make sure that the scheduling region contains all
20958 // instructions of the bundle.
20959 for (Value *V : VL) {
20960 if (S.isNonSchedulable(V))
20961 continue;
20962 if (!extendSchedulingRegion(V, S)) {
20963 // The scheduling region may have got new instructions at the lower end (or
20964 // it may be a new region for the first bundle), which makes it necessary to
20965 // recalculate all dependencies.
20966 // Otherwise the compiler may crash trying to calculate dependencies
20967 // incorrectly and emit instructions in the wrong order at the actual
20968 // scheduling.
20969 ScheduleBundle Invalid = ScheduleBundle::invalid();
20970 TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
20971 return std::nullopt;
20972 }
20973 }
20974
20975 bool ReSchedule = false;
20976 for (Value *V : VL) {
20977 if (S.isNonSchedulable(V))
20978 continue;
20979 SmallVector<ScheduleCopyableData *> CopyableData =
20980 getScheduleCopyableData(cast<Instruction>(V));
20981 if (!CopyableData.empty()) {
20982 for (ScheduleCopyableData *SD : CopyableData)
20983 ReadyInsts.remove(SD);
20984 }
20985 ScheduleData *BundleMember = getScheduleData(V);
20986 assert((BundleMember || S.isCopyableElement(V)) &&
20987 "no ScheduleData for bundle member (maybe not in same basic block)");
20988 if (!BundleMember)
20989 continue;
20990
20991 // Make sure we don't leave the pieces of the bundle in the ready list when
20992 // whole bundle might not be ready.
20993 ReadyInsts.remove(BundleMember);
20994 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
20995 !Bundles.empty()) {
20996 for (ScheduleBundle *B : Bundles)
20997 ReadyInsts.remove(B);
20998 }
20999
21000 if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
21001 continue;
21002 // A bundle member was scheduled as a single instruction before and now
21003 // needs to be scheduled as part of the bundle. We just get rid of the
21004 // existing schedule.
21005 // A bundle member may also have had its deps calculated before it became a
21006 // copyable element - need to reschedule.
21007 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
21008 << " was already scheduled\n");
21009 ReSchedule = true;
21010 }
21011
21012 ScheduleBundle &Bundle = buildBundle(VL, S, EI);
21013 TryScheduleBundleImpl(ReSchedule, Bundle);
21014 if (!Bundle.isReady()) {
21015 for (ScheduleEntity *BD : Bundle.getBundle()) {
21016 // Copyable data scheduling is just removed.
21017 if (isa<ScheduleCopyableData>(BD))
21018 continue;
21019 if (BD->isReady()) {
21020 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
21021 if (Bundles.empty()) {
21022 ReadyInsts.insert(BD);
21023 continue;
21024 }
21025 for (ScheduleBundle *B : Bundles)
21026 if (B->isReady())
21027 ReadyInsts.insert(B);
21028 }
21029 }
21030 ScheduledBundlesList.pop_back();
21031 SmallVector<ScheduleData *> ControlDependentMembers;
21032 SmallPtrSet<Instruction *, 4> Visited;
21033 for (Value *V : VL) {
21034 if (S.isNonSchedulable(V))
21035 continue;
21036 auto *I = cast<Instruction>(V);
21037 if (S.isCopyableElement(I)) {
21038 // Remove the copyable data from the scheduling region and restore
21039 // previous mappings.
21040 auto KV = std::make_pair(EI, I);
21041 assert(ScheduleCopyableDataMap.contains(KV) &&
21042 "no ScheduleCopyableData for copyable element");
21043 ScheduleCopyableData *SD =
21044 ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
21045 ScheduleCopyableDataMapByUsers[I].remove(SD);
21046 if (EI.UserTE) {
21047 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
21048 const auto *It = find(Op, I);
21049 assert(It != Op.end() && "Lane not set");
21050 SmallPtrSet<Instruction *, 4> Visited;
21051 do {
21052 int Lane = std::distance(Op.begin(), It);
21053 assert(Lane >= 0 && "Lane not set");
21054 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
21055 !EI.UserTE->ReorderIndices.empty())
21056 Lane = EI.UserTE->ReorderIndices[Lane];
21057 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
21058 "Couldn't find extract lane");
21059 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
21060 if (!Visited.insert(In).second) {
21061 It = find(make_range(std::next(It), Op.end()), I);
21062 break;
21063 }
21064 ScheduleCopyableDataMapByInstUser
21065 [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
21066 .pop_back();
21067 It = find(make_range(std::next(It), Op.end()), I);
21068 } while (It != Op.end());
21069 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
21070 if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
21071 ScheduleCopyableDataMapByUsers[I].insert(UserCD);
21072 }
21073 if (ScheduleCopyableDataMapByUsers[I].empty())
21074 ScheduleCopyableDataMapByUsers.erase(I);
21075 ScheduleCopyableDataMap.erase(KV);
21076 // Need to recalculate dependencies for the actual schedule data.
21077 if (ScheduleData *OpSD = getScheduleData(I);
21078 OpSD && OpSD->hasValidDependencies()) {
21079 OpSD->clearDirectDependencies();
21080 if (RegionHasStackSave ||
21081 !isGuaranteedToTransferExecutionToSuccessor(I))
21082 ControlDependentMembers.push_back(OpSD);
21083 }
21084 continue;
21085 }
21086 ScheduledBundles.find(I)->getSecond().pop_back();
21087 }
21088 if (!ControlDependentMembers.empty()) {
21089 ScheduleBundle Invalid = ScheduleBundle::invalid();
21090 calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
21091 ControlDependentMembers);
21092 }
21093 return std::nullopt;
21094 }
21095 return &Bundle;
21096}
21097
21098BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
21099 // Allocate a new ScheduleData for the instruction.
21100 if (ChunkPos >= ChunkSize) {
21101 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
21102 ChunkPos = 0;
21103 }
21104 return &(ScheduleDataChunks.back()[ChunkPos++]);
21105}
21106
21107bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
21108 Value *V, const InstructionsState &S) {
21109 auto *I = dyn_cast<Instruction>(V);
21110 assert(I && "bundle member must be an instruction");
21111 if (getScheduleData(I))
21112 return true;
21113 if (!ScheduleStart) {
21114 // It's the first instruction in the new region.
21115 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
21116 ScheduleStart = I;
21117 ScheduleEnd = I->getNextNode();
21118 assert(ScheduleEnd && "tried to vectorize a terminator?");
21119 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
21120 return true;
21121 }
21122 // Search up and down at the same time, because we don't know if the new
21123 // instruction is above or below the existing scheduling region.
21124 // Ignore debug info (and other "AssumeLike" intrinsics) so that they are
21125 // not counted against the budget. Otherwise debug info could affect codegen.
21126 BasicBlock::reverse_iterator UpIter =
21127 ++ScheduleStart->getIterator().getReverse();
21128 BasicBlock::reverse_iterator UpperEnd = BB->rend();
21129 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
21130 BasicBlock::iterator LowerEnd = BB->end();
21131 auto IsAssumeLikeIntr = [](const Instruction &I) {
21132 if (auto *II = dyn_cast<IntrinsicInst>(&I))
21133 return II->isAssumeLikeIntrinsic();
21134 return false;
21135 };
21136 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
21137 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
21138 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
21139 &*DownIter != I) {
21140 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
21141 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
21142 return false;
21143 }
21144
21145 ++UpIter;
21146 ++DownIter;
21147
21148 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
21149 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
21150 }
21151 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
21152 assert(I->getParent() == ScheduleStart->getParent() &&
21153 "Instruction is in wrong basic block.");
21154 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
21155 ScheduleStart = I;
21156 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
21157 << "\n");
21158 return true;
21159 }
21160 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
21161 "Expected to reach top of the basic block or instruction down the "
21162 "lower end.");
21163 assert(I->getParent() == ScheduleEnd->getParent() &&
21164 "Instruction is in wrong basic block.");
21165 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
21166 nullptr);
21167 ScheduleEnd = I->getNextNode();
21168 assert(ScheduleEnd && "tried to vectorize a terminator?");
21169 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
21170 return true;
21171}
21172
21173void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
21174 Instruction *ToI,
21175 ScheduleData *PrevLoadStore,
21176 ScheduleData *NextLoadStore) {
21177 ScheduleData *CurrentLoadStore = PrevLoadStore;
21178 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
21179 // No need to allocate data for non-schedulable instructions.
21180 if (isa<PHINode>(I))
21181 continue;
21182 ScheduleData *SD = ScheduleDataMap.lookup(I);
21183 if (!SD) {
21184 SD = allocateScheduleDataChunks();
21185 ScheduleDataMap[I] = SD;
21186 }
21187 assert(!isInSchedulingRegion(*SD) &&
21188 "new ScheduleData already in scheduling region");
21189 SD->init(SchedulingRegionID, I);
21190
21191 if (I->mayReadOrWriteMemory() &&
21192 (!isa<IntrinsicInst>(I) ||
21193 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
21194 cast<IntrinsicInst>(I)->getIntrinsicID() !=
21195 Intrinsic::pseudoprobe))) {
21196 // Update the linked list of memory accessing instructions.
21197 if (CurrentLoadStore) {
21198 CurrentLoadStore->setNextLoadStore(SD);
21199 } else {
21200 FirstLoadStoreInRegion = SD;
21201 }
21202 CurrentLoadStore = SD;
21203 }
21204
21205 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
21206 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21207 RegionHasStackSave = true;
21208 }
21209 if (NextLoadStore) {
21210 if (CurrentLoadStore)
21211 CurrentLoadStore->setNextLoadStore(NextLoadStore);
21212 } else {
21213 LastLoadStoreInRegion = CurrentLoadStore;
21214 }
21215}
21216
21217void BoUpSLP::BlockScheduling::calculateDependencies(
21218 ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
21219 ArrayRef<ScheduleData *> ControlDeps) {
21220 SmallVector<ScheduleEntity *> WorkList;
21221 auto ProcessNode = [&](ScheduleEntity *SE) {
21222 if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
21223 if (CD->hasValidDependencies())
21224 return;
21225 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *CD << "\n");
21226 CD->initDependencies();
21227 CD->resetUnscheduledDeps();
21228 const EdgeInfo &EI = CD->getEdgeInfo();
21229 if (EI.UserTE) {
21230 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
21231 const auto *It = find(Op, CD->getInst());
21232 assert(It != Op.end() && "Lane not set");
21233 SmallPtrSet<Instruction *, 4> Visited;
21234 do {
21235 int Lane = std::distance(Op.begin(), It);
21236 assert(Lane >= 0 && "Lane not set");
21237 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
21238 !EI.UserTE->ReorderIndices.empty())
21239 Lane = EI.UserTE->ReorderIndices[Lane];
21240 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
21241 "Couldn't find extract lane");
21242 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
21243 if (EI.UserTE->isCopyableElement(In)) {
21244 // We may not have related copyable scheduling data if the
21245 // instruction is non-schedulable.
21246 if (ScheduleCopyableData *UseSD =
21247 getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
21248 CD->incDependencies();
21249 if (!UseSD->isScheduled())
21250 CD->incrementUnscheduledDeps(1);
21251 if (!UseSD->hasValidDependencies() ||
21252 (InsertInReadyList && UseSD->isReady()))
21253 WorkList.push_back(UseSD);
21254 }
21255 } else if (Visited.insert(In).second) {
21256 if (ScheduleData *UseSD = getScheduleData(In)) {
21257 CD->incDependencies();
21258 if (!UseSD->isScheduled())
21259 CD->incrementUnscheduledDeps(1);
21260 if (!UseSD->hasValidDependencies() ||
21261 (InsertInReadyList && UseSD->isReady()))
21262 WorkList.push_back(UseSD);
21263 }
21264 }
21265 It = find(make_range(std::next(It), Op.end()), CD->getInst());
21266 } while (It != Op.end());
21267 if (CD->isReady() && CD->getDependencies() == 0 &&
21268 (EI.UserTE->hasState() &&
21269 (EI.UserTE->getMainOp()->getParent() !=
21270 CD->getInst()->getParent() ||
21271 (isa<PHINode>(EI.UserTE->getMainOp()) &&
21272 (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
21273 any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
21274 auto *IU = dyn_cast<Instruction>(U);
21275 if (!IU)
21276 return true;
21277 return IU->getParent() == EI.UserTE->getMainOp()->getParent();
21278 })))))) {
21279 // If there are no uses in the block - mark as having a pseudo-use, which
21280 // cannot be scheduled.
21281 // This prevents incorrect def-use tracking between an external user and
21282 // the actual instruction.
21283 CD->incDependencies();
21284 CD->incrementUnscheduledDeps(1);
21285 }
21286 }
21287 return;
21288 }
21289 auto *BundleMember = cast<ScheduleData>(SE);
21290 if (BundleMember->hasValidDependencies())
21291 return;
21292 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
21293 BundleMember->initDependencies();
21294 BundleMember->resetUnscheduledDeps();
21295 // Handle def-use chain dependencies.
21296 SmallDenseMap<Value *, unsigned> UserToNumOps;
21297 for (User *U : BundleMember->getInst()->users()) {
21298 if (isa<PHINode>(U))
21299 continue;
21300 if (ScheduleData *UseSD = getScheduleData(U)) {
21301 // The operand is a copyable element - skip.
21302 unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
21303 ++NumOps;
21304 if (areAllOperandsReplacedByCopyableData(
21305 cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
21306 continue;
21307 BundleMember->incDependencies();
21308 if (!UseSD->isScheduled())
21309 BundleMember->incrementUnscheduledDeps(1);
21310 if (!UseSD->hasValidDependencies() ||
21311 (InsertInReadyList && UseSD->isReady()))
21312 WorkList.push_back(UseSD);
21313 }
21314 }
21315 for (ScheduleCopyableData *UseSD :
21316 getScheduleCopyableDataUsers(BundleMember->getInst())) {
21317 BundleMember->incDependencies();
21318 if (!UseSD->isScheduled())
21319 BundleMember->incrementUnscheduledDeps(1);
21320 if (!UseSD->hasValidDependencies() ||
21321 (InsertInReadyList && UseSD->isReady()))
21322 WorkList.push_back(UseSD);
21323 }
21324
21325 SmallPtrSet<const Instruction *, 4> Visited;
21326 auto MakeControlDependent = [&](Instruction *I) {
21327 // Do not mark control dependent twice.
21328 if (!Visited.insert(I).second)
21329 return;
21330 auto *DepDest = getScheduleData(I);
21331 assert(DepDest && "must be in schedule window");
21332 DepDest->addControlDependency(BundleMember);
21333 BundleMember->incDependencies();
21334 if (!DepDest->isScheduled())
21335 BundleMember->incrementUnscheduledDeps(1);
21336 if (!DepDest->hasValidDependencies() ||
21337 (InsertInReadyList && DepDest->isReady()))
21338 WorkList.push_back(DepDest);
21339 };
21340
21341 // Any instruction which isn't safe to speculate at the beginning of the
21342 // block is control dependent on any early exit or non-willreturn call
21343 // which precedes it.
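// For illustration (hypothetical IR): in
//   call void @may_not_return()
//   %x = load i32, ptr %p
// the load is not safe to speculate above the call, so the scheduler records
// a control dependency here to ensure the load is never reordered above it.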
21344 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->getInst())) {
21345 for (Instruction *I = BundleMember->getInst()->getNextNode();
21346 I != ScheduleEnd; I = I->getNextNode()) {
21347 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
21348 continue;
21349
21350 // Add the dependency
21351 MakeControlDependent(I);
21352
21353 if (!isGuaranteedToTransferExecutionToSuccessor(I))
21354 // Everything past here must be control dependent on I.
21355 break;
21356 }
21357 }
21358
21359 if (RegionHasStackSave) {
21360 // If we have an inalloca alloca instruction, it needs to be scheduled
21361 // after any preceding stacksave. We also need to prevent any alloca
21362 // from reordering above a preceding stackrestore.
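// For illustration (hypothetical IR):
//   %ss  = call ptr @llvm.stacksave()
//   %buf = alloca inalloca i32   ; must stay below the stacksave
//   call void @llvm.stackrestore(ptr %ss)
//   %tmp = alloca i32            ; must not move above the stackrestore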
21363 if (match(BundleMember->getInst(), m_Intrinsic<Intrinsic::stacksave>()) ||
21364 match(BundleMember->getInst(),
21365 m_Intrinsic<Intrinsic::stackrestore>())) {
21366 for (Instruction *I = BundleMember->getInst()->getNextNode();
21367 I != ScheduleEnd; I = I->getNextNode()) {
21368 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
21369 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21370 // Any allocas past here must be control dependent on I, and I
21371 // must be memory dependent on BundleMember->Inst.
21372 break;
21373
21374 if (!isa<AllocaInst>(I))
21375 continue;
21376
21377 // Add the dependency
21378 MakeControlDependent(I);
21379 }
21380 }
21381
21382 // In addition to the cases handled just above, we need to prevent
21383 // allocas and loads/stores from moving below a stacksave or a
21384 // stackrestore. Avoiding moving allocas below a stackrestore is currently
21385 // thought to be conservatism. Moving loads/stores below a stackrestore
21386 // can lead to incorrect code.
21387 if (isa<AllocaInst>(BundleMember->getInst()) ||
21388 BundleMember->getInst()->mayReadOrWriteMemory()) {
21389 for (Instruction *I = BundleMember->getInst()->getNextNode();
21390 I != ScheduleEnd; I = I->getNextNode()) {
21391 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
21392 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21393 continue;
21394
21395 // Add the dependency
21396 MakeControlDependent(I);
21397 break;
21398 }
21399 }
21400 }
21401
21402 // Handle the memory dependencies (if any).
21403 ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
21404 if (!NextLoadStore)
21405 return;
21406 Instruction *SrcInst = BundleMember->getInst();
21407 assert(SrcInst->mayReadOrWriteMemory() &&
21408 "NextLoadStore list for non memory effecting bundle?");
21409 MemoryLocation SrcLoc = getLocation(SrcInst);
21410 bool SrcMayWrite = SrcInst->mayWriteToMemory();
21411 unsigned NumAliased = 0;
21412 unsigned DistToSrc = 1;
21413 bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
21414
21415 for (ScheduleData *DepDest = NextLoadStore; DepDest;
21416 DepDest = DepDest->getNextLoadStore()) {
21417 assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
21418
21419 // We have two limits to reduce the complexity:
21420 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
21421 // SLP->isAliased (which is the expensive part in this loop).
21422 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
21423 // the whole loop (even if the loop is fast, it's quadratic).
21424 // It's important for the loop break condition (see below) to
21425 // check this limit even between two read-only instructions.
21426 if (DistToSrc >= MaxMemDepDistance ||
21427 ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
21428 (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
21429 SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
21430
21431 // We increment the counter only if the locations are aliased
21432 // (instead of counting all alias checks). This gives a better
21433 // balance between reduced runtime and accurate dependencies.
21434 NumAliased++;
21435
21436 DepDest->addMemoryDependency(BundleMember);
21437 BundleMember->incDependencies();
21438 if (!DepDest->isScheduled())
21439 BundleMember->incrementUnscheduledDeps(1);
21440 if (!DepDest->hasValidDependencies() ||
21441 (InsertInReadyList && DepDest->isReady()))
21442 WorkList.push_back(DepDest);
21443 }
21444
21445 // Example, explaining the loop break condition: Let's assume our
21446 // starting instruction is i0 and MaxMemDepDistance = 3.
21447 //
21448 // +--------v--v--v
21449 // i0,i1,i2,i3,i4,i5,i6,i7,i8
21450 // +--------^--^--^
21451 //
21452 // MaxMemDepDistance let us stop alias-checking at i3 and we add
21453 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
21454 // Previously we already added dependencies from i3 to i6,i7,i8
21455 // (because of MaxMemDepDistance). As we added a dependency from
21456 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
21457 // and we can abort this loop at i6.
21458 if (DistToSrc >= 2 * MaxMemDepDistance)
21459 break;
21460 DistToSrc++;
21461 }
21462 };
21463
21464 assert((Bundle || !ControlDeps.empty()) &&
21465 "expected at least one instruction to schedule");
21466 if (Bundle)
21467 WorkList.push_back(Bundle.getBundle().front());
21468 WorkList.append(ControlDeps.begin(), ControlDeps.end());
21469 SmallPtrSet<ScheduleBundle *, 16> Visited;
21470 while (!WorkList.empty()) {
21471 ScheduleEntity *SD = WorkList.pop_back_val();
21472 SmallVector<ScheduleBundle *, 1> CopyableBundle;
21473 ArrayRef<ScheduleBundle *> Bundles;
21474 if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) {
21475 CopyableBundle.push_back(&CD->getBundle());
21476 Bundles = CopyableBundle;
21477 } else {
21478 Bundles = getScheduleBundles(SD->getInst());
21479 }
21480 if (Bundles.empty()) {
21481 if (!SD->hasValidDependencies())
21482 ProcessNode(SD);
21483 if (InsertInReadyList && SD->isReady()) {
21484 ReadyInsts.insert(SD);
21485 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
21486 }
21487 continue;
21488 }
21489 for (ScheduleBundle *Bundle : Bundles) {
21490 if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
21491 continue;
21492 assert(isInSchedulingRegion(*Bundle) &&
21493 "ScheduleData not in scheduling region");
21494 for_each(Bundle->getBundle(), ProcessNode);
21495 }
21496 if (InsertInReadyList && SD->isReady()) {
21497 for (ScheduleBundle *Bundle : Bundles) {
21498 assert(isInSchedulingRegion(*Bundle) &&
21499 "ScheduleData not in scheduling region");
21500 if (!Bundle->isReady())
21501 continue;
21502 ReadyInsts.insert(Bundle);
21503 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *Bundle
21504 << "\n");
21505 }
21506 }
21507 }
21508}
21509
21510void BoUpSLP::BlockScheduling::resetSchedule() {
21511 assert(ScheduleStart &&
21512 "tried to reset schedule on block which has not been scheduled");
21513 for_each(ScheduleDataMap, [&](auto &P) {
21514 if (BB != P.first->getParent())
21515 return;
21516 ScheduleData *SD = P.second;
21517 if (isInSchedulingRegion(*SD)) {
21518 SD->setScheduled(/*Scheduled=*/false);
21519 SD->resetUnscheduledDeps();
21520 }
21521 });
21522 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
21523 for_each(P.second, [&](ScheduleCopyableData *SD) {
21524 if (isInSchedulingRegion(*SD)) {
21525 SD->setScheduled(/*Scheduled=*/false);
21526 SD->resetUnscheduledDeps();
21527 }
21528 });
21529 });
21530 for_each(ScheduledBundles, [&](auto &P) {
21531 for_each(P.second, [&](ScheduleBundle *Bundle) {
21532 if (isInSchedulingRegion(*Bundle))
21533 Bundle->setScheduled(/*Scheduled=*/false);
21534 });
21535 });
21536 // Reset schedule data for copyable elements.
21537 for (auto &P : ScheduleCopyableDataMap) {
21538 if (isInSchedulingRegion(*P.second)) {
21539 P.second->setScheduled(/*Scheduled=*/false);
21540 P.second->resetUnscheduledDeps();
21541 }
21542 }
21543 ReadyInsts.clear();
21544}
21545
21546void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
21547 if (!BS->ScheduleStart)
21548 return;
21549
21550 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
21551
21552 // A key point - if we got here, pre-scheduling was able to find a valid
21553 // scheduling of the sub-graph of the scheduling window which consists
21554 // of all vector bundles and their transitive users. As such, we do not
21555 // need to reschedule anything *outside of* that subgraph.
21556
21557 BS->resetSchedule();
21558
21559 // For the real scheduling we use a more sophisticated ready-list: it is
21560 // sorted by the original instruction location. This lets the final schedule
21561 // be as close as possible to the original instruction order.
21562 // WARNING: If changing this order causes a correctness issue, that means
21563 // there is some missing dependence edge in the schedule data graph.
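// For illustration: priorities are assigned below in original block order
// (0, 1, 2, ...), and the comparator that follows orders the set by
// descending priority, so *ReadyInsts.begin() is the ready entity that sat
// lowest in the original block - matching the bottom-up placement performed
// by the scheduling loop further down.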
21564 struct ScheduleDataCompare {
21565 bool operator()(const ScheduleEntity *SD1,
21566 const ScheduleEntity *SD2) const {
21567 return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
21568 }
21569 };
21570 std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;
21571
21572 // Ensure that all dependency data is updated (for nodes in the sub-graph)
21573 // and fill the ready-list with initial instructions.
21574 int Idx = 0;
21575 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
21576 I = I->getNextNode()) {
21577 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
21578 if (!Bundles.empty()) {
21579 for (ScheduleBundle *Bundle : Bundles) {
21580 Bundle->setSchedulingPriority(Idx++);
21581 if (!Bundle->hasValidDependencies())
21582 BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this);
21583 }
21584 SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
21585 for (ScheduleCopyableData *SD : reverse(SDs)) {
21586 ScheduleBundle &Bundle = SD->getBundle();
21587 Bundle.setSchedulingPriority(Idx++);
21588 if (!Bundle.hasValidDependencies())
21589 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21590 }
21591 continue;
21592 }
21593 SmallVector<ScheduleCopyableData *> CopyableData =
21594 BS->getScheduleCopyableDataUsers(I);
21595 if (ScheduleData *SD = BS->getScheduleData(I)) {
21596 [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
21597 assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
21598 SDTEs.front()->doesNotNeedToSchedule() ||
21600 "scheduler and vectorizer bundle mismatch");
21601 SD->setSchedulingPriority(Idx++);
21602 if (!SD->hasValidDependencies() &&
21603 (!CopyableData.empty() ||
21604 any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
21605 assert(TE->isGather() && "expected gather node");
21606 return TE->hasState() && TE->hasCopyableElements() &&
21607 TE->isCopyableElement(I);
21608 }))) {
21609 // Need to calculate deps for these nodes to correctly handle copyable
21610 // dependencies, even if they were cancelled.
21611 // If the copyables bundle was cancelled, the deps are cleared and we need
21612 // to recalculate them.
21613 ScheduleBundle Bundle;
21614 Bundle.add(SD);
21615 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21616 }
21617 }
21618 for (ScheduleCopyableData *SD : reverse(CopyableData)) {
21619 ScheduleBundle &Bundle = SD->getBundle();
21620 Bundle.setSchedulingPriority(Idx++);
21621 if (!Bundle.hasValidDependencies())
21622 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21623 }
21624 }
21625 BS->initialFillReadyList(ReadyInsts);
21626
21627 Instruction *LastScheduledInst = BS->ScheduleEnd;
21628
21629 // Do the "real" scheduling.
21630 SmallPtrSet<Instruction *, 16> Scheduled;
21631 while (!ReadyInsts.empty()) {
21632 auto *Picked = *ReadyInsts.begin();
21633 ReadyInsts.erase(ReadyInsts.begin());
21634
21635 // Move the scheduled instruction(s) to their dedicated places, if not
21636 // there yet.
21637 if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
21638 for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
21639 Instruction *PickedInst = BundleMember->getInst();
21640 // If a copyable must be scheduled as part of something else, skip it.
21641 bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
21642 if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
21643 (!IsCopyable && !Scheduled.insert(PickedInst).second))
21644 continue;
21645 if (PickedInst->getNextNode() != LastScheduledInst)
21646 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
21647 LastScheduledInst = PickedInst;
21648 }
21649 EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
21650 LastScheduledInst);
21651 } else {
21652 auto *SD = cast<ScheduleData>(Picked);
21653 Instruction *PickedInst = SD->getInst();
21654 if (PickedInst->getNextNode() != LastScheduledInst)
21655 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
21656 LastScheduledInst = PickedInst;
21657 }
21658 auto Invalid = InstructionsState::invalid();
21659 BS->schedule(R, Invalid, EdgeInfo(), Picked, ReadyInsts);
21660 }
21661
21662 // Check that we didn't break any of our invariants.
21663#ifdef EXPENSIVE_CHECKS
21664 BS->verify();
21665#endif
21666
21667#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
21668 // Check that all schedulable entities got scheduled
21669 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
21670 I = I->getNextNode()) {
21671 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
21672 assert(all_of(Bundles,
21673 [](const ScheduleBundle *Bundle) {
21674 return Bundle->isScheduled();
21675 }) &&
21676 "must be scheduled at this point");
21677 }
21678#endif
21679
21680 // Avoid duplicate scheduling of the block.
21681 BS->ScheduleStart = nullptr;
21682}
21683
21684 unsigned BoUpSLP::getVectorElementSize(Value *V) {
21685 // If V is a store, just return the width of the stored value (or value
21686 // truncated just before storing) without traversing the expression tree.
21687 // This is the common case.
21688 if (auto *Store = dyn_cast<StoreInst>(V))
21689 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
21690
21691 if (auto *IEI = dyn_cast<InsertElementInst>(V))
21692 return getVectorElementSize(IEI->getOperand(1));
21693
21694 auto E = InstrElementSize.find(V);
21695 if (E != InstrElementSize.end())
21696 return E->second;
21697
21698 // If V is not a store, we can traverse the expression tree to find loads
21699 // that feed it. The type of the loaded value may indicate a more suitable
21700 // width than V's type. We want to base the vector element size on the width
21701 // of memory operations where possible.
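// For illustration (hypothetical IR): given
//   %l = load i16, ptr %p
//   %e = sext i16 %l to i64
//   store i64 %e, ptr %q
// querying the store returns 64 (the width of the stored value), while
// querying %e walks its operands down to the load and returns 16.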
21702 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
21703 SmallPtrSet<Instruction *, 16> Visited;
21704 if (auto *I = dyn_cast<Instruction>(V)) {
21705 Worklist.emplace_back(I, I->getParent(), 0);
21706 Visited.insert(I);
21707 }
21708
21709 // Traverse the expression tree in bottom-up order looking for loads. If we
21710 // encounter an instruction we don't yet handle, we give up.
21711 auto Width = 0u;
21712 Value *FirstNonBool = nullptr;
21713 while (!Worklist.empty()) {
21714 auto [I, Parent, Level] = Worklist.pop_back_val();
21715
21716 // We should only be looking at scalar instructions here. If the current
21717 // instruction has a vector type, skip.
21718 auto *Ty = I->getType();
21719 if (isa<VectorType>(Ty))
21720 continue;
21721 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
21722 FirstNonBool = I;
21723 if (Level > RecursionMaxDepth)
21724 continue;
21725
21726 // If the current instruction is a load, update MaxWidth to reflect the
21727 // width of the loaded value.
21728 if (isa<LoadInst>(I))
21729 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
21730
21731 // Otherwise, we need to visit the operands of the instruction. We only
21732 // handle the interesting cases from buildTree here. If an operand is an
21733 // instruction we haven't yet visited and from the same basic block as the
21734 // user or the use is a PHI node, we add it to the worklist.
21735 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
21736 BinaryOperator, UnaryOperator>(I)) {
21737 for (Use &U : I->operands()) {
21738 if (auto *J = dyn_cast<Instruction>(U.get()))
21739 if (Visited.insert(J).second &&
21740 (isa<PHINode>(I) || J->getParent() == Parent)) {
21741 Worklist.emplace_back(J, J->getParent(), Level + 1);
21742 continue;
21743 }
21744 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
21745 FirstNonBool = U.get();
21746 }
21747 } else {
21748 break;
21749 }
21750 }
21751
21752 // If we didn't encounter a memory access in the expression tree, or if we
21753 // gave up for some reason, just return the width of V. Otherwise, return the
21754 // maximum width we found.
21755 if (!Width) {
21756 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
21757 V = FirstNonBool;
21758 Width = DL->getTypeSizeInBits(V->getType());
21759 }
21760
21761 for (Instruction *I : Visited)
21762 InstrElementSize[I] = Width;
21763
21764 return Width;
21765}
21766
21767bool BoUpSLP::collectValuesToDemote(
21768 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
21769 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
21770 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
21771 bool &IsProfitableToDemote, bool IsTruncRoot) const {
21772 // We can always demote constants.
21773 if (all_of(E.Scalars, IsaPred<Constant>))
21774 return true;
21775
21776 unsigned OrigBitWidth =
21777 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
21778 if (OrigBitWidth == BitWidth) {
21779 MaxDepthLevel = 1;
21780 return true;
21781 }
21782
21783 // Check if the node was analyzed already and must keep its original bitwidth.
21784 if (NodesToKeepBWs.contains(E.Idx))
21785 return false;
21786
21787 // If the value is not a vectorized instruction in the expression and not used
21788 // by the insertelement instruction and not used in multiple vector nodes, it
21789 // cannot be demoted.
21790 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
21791 if (isa<PoisonValue>(R))
21792 return false;
21793 return !isKnownNonNegative(R, SimplifyQuery(*DL));
21794 });
21795 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
21796 if (isa<PoisonValue>(V))
21797 return true;
21798 if (getTreeEntries(V).size() > 1)
21799 return false;
21800 // For last shuffle of sext/zext with many uses, we need to check the extra
21801 // bit for unsigned values; otherwise we may have incorrect casting for
21802 // reused scalars.
21803 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
21804 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
21805 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
21806 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
21807 return true;
21808 }
21809 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
21810 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
21811 if (IsSignedNode)
21812 ++BitWidth1;
21813 if (auto *I = dyn_cast<Instruction>(V)) {
21814 APInt Mask = DB->getDemandedBits(I);
21815 unsigned BitWidth2 =
21816 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
21817 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
21818 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
21819 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
21820 break;
21821 BitWidth2 *= 2;
21822 }
21823 BitWidth1 = std::min(BitWidth1, BitWidth2);
21824 }
21825 BitWidth = std::max(BitWidth, BitWidth1);
21826 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
21827 };
21828 auto FinalAnalysis = [&, TTI = TTI]() {
21829 if (!IsProfitableToDemote)
21830 return false;
21831 bool Res = all_of(
21832 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
21833 // Demote gathers.
21834 if (Res && E.isGather()) {
21835 if (E.hasState()) {
21836 if (const TreeEntry *SameTE =
21837 getSameValuesTreeEntry(E.getMainOp(), E.Scalars);
21838 SameTE)
21839 if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
21840 ToDemote, Visited, NodesToKeepBWs,
21841 MaxDepthLevel, IsProfitableToDemote,
21842 IsTruncRoot)) {
21843 ToDemote.push_back(E.Idx);
21844 return true;
21845 }
21846 }
21847 // Check possible extractelement instructions bases and final vector
21848 // length.
21849 SmallPtrSet<Value *, 4> UniqueBases;
21850 for (Value *V : E.Scalars) {
21851 auto *EE = dyn_cast<ExtractElementInst>(V);
21852 if (!EE)
21853 continue;
21854 UniqueBases.insert(EE->getVectorOperand());
21855 }
21856 const unsigned VF = E.Scalars.size();
21857 Type *OrigScalarTy = E.Scalars.front()->getType();
21858 if (UniqueBases.size() <= 2 ||
21859 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) >=
21860 ::getNumberOfParts(
21861 *TTI,
21862 getWidenedType(
21863 IntegerType::get(OrigScalarTy->getContext(), BitWidth),
21864 VF))) {
21865 ToDemote.push_back(E.Idx);
21866 return true;
21867 }
21868 }
21869 return Res;
21870 };
21871 if (E.isGather() || !Visited.insert(&E).second ||
21872 any_of(E.Scalars, [&](Value *V) {
21873 return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
21874 return isa<InsertElementInst>(U) && !isVectorized(U);
21875 });
21876 }))
21877 return FinalAnalysis();
21878
21879 if (any_of(E.Scalars, [&](Value *V) {
21880 return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
21881 return isVectorized(U) ||
21882 (E.Idx == 0 && UserIgnoreList &&
21883 UserIgnoreList->contains(U)) ||
21884 (!isa<CmpInst>(U) && U->getType()->isSized() &&
21885 !U->getType()->isScalableTy() &&
21886 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
21887 }) && !IsPotentiallyTruncated(V, BitWidth);
21888 }))
21889 return false;
21890
21891 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
21892 bool &NeedToExit) {
21893 NeedToExit = false;
21894 unsigned InitLevel = MaxDepthLevel;
21895 for (const TreeEntry *Op : Operands) {
21896 unsigned Level = InitLevel;
21897 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
21898 ToDemote, Visited, NodesToKeepBWs, Level,
21899 IsProfitableToDemote, IsTruncRoot)) {
21900 if (!IsProfitableToDemote)
21901 return false;
21902 NeedToExit = true;
21903 if (!FinalAnalysis())
21904 return false;
21905 continue;
21906 }
21907 MaxDepthLevel = std::max(MaxDepthLevel, Level);
21908 }
21909 return true;
21910 };
21911 auto AttemptCheckBitwidth =
21912 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
21913 // Try all bitwidth < OrigBitWidth.
21914 NeedToExit = false;
21915 unsigned BestFailBitwidth = 0;
21916 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
21917 if (Checker(BitWidth, OrigBitWidth))
21918 return true;
21919 if (BestFailBitwidth == 0 && FinalAnalysis())
21920 BestFailBitwidth = BitWidth;
21921 }
21922 if (BitWidth >= OrigBitWidth) {
21923 if (BestFailBitwidth == 0) {
21924 BitWidth = OrigBitWidth;
21925 return false;
21926 }
21927 MaxDepthLevel = 1;
21928 BitWidth = BestFailBitwidth;
21929 NeedToExit = true;
21930 return true;
21931 }
21932 return false;
21933 };
21934 auto TryProcessInstruction =
21935 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
21936 function_ref<bool(unsigned, unsigned)> Checker = {}) {
21937 if (Operands.empty()) {
21938 if (!IsTruncRoot)
21939 MaxDepthLevel = 1;
21940 for (Value *V : E.Scalars)
21941 (void)IsPotentiallyTruncated(V, BitWidth);
21942 } else {
21943 // Several vectorized uses? Check if we can truncate it, otherwise -
21944 // exit.
21945 if (any_of(E.Scalars, [&](Value *V) {
21946 return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
21947 }))
21948 return false;
21949 bool NeedToExit = false;
21950 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
21951 return false;
21952 if (NeedToExit)
21953 return true;
21954 if (!ProcessOperands(Operands, NeedToExit))
21955 return false;
21956 if (NeedToExit)
21957 return true;
21958 }
21959
21960 ++MaxDepthLevel;
21961 // Record the entry that we can demote.
21962 ToDemote.push_back(E.Idx);
21963 return IsProfitableToDemote;
21964 };
21965
21966 if (E.State == TreeEntry::SplitVectorize)
21967 return TryProcessInstruction(
21968 BitWidth,
21969 {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
21970 VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
21971
21972 switch (E.getOpcode()) {
21973
21974 // We can always demote truncations and extensions. Since truncations can
21975 // seed additional demotion, we save the truncated value.
21976 case Instruction::Trunc:
21977 if (IsProfitableToDemoteRoot)
21978 IsProfitableToDemote = true;
21979 return TryProcessInstruction(BitWidth);
21980 case Instruction::ZExt:
21981 case Instruction::SExt:
21982 if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
21983 E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
21984 E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
21985 return false;
21986 IsProfitableToDemote = true;
21987 return TryProcessInstruction(BitWidth);
21988
21989 // We can demote certain binary operations if we can demote both of their
21990 // operands.
21991 case Instruction::Add:
21992 case Instruction::Sub:
21993 case Instruction::Mul:
21994 case Instruction::And:
21995 case Instruction::Or:
21996 case Instruction::Xor: {
21997 return TryProcessInstruction(
21998 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
21999 }
22000 case Instruction::Freeze:
22001 return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
22002 case Instruction::Shl: {
22003 // If we are truncating the result of this SHL, and if it's a shift of an
22004 // in-range amount, we can always perform a SHL in a smaller type.
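// E.g. (illustrative): truncating (shl i32 %x, 3) to i8 yields the same
// value as shl i8 (trunc i32 %x to i8), 3, because the shift amount 3 is
// known to be smaller than the narrower width 8.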
22005 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
22006 return all_of(E.Scalars, [&](Value *V) {
22007 if (isa<PoisonValue>(V))
22008 return true;
22009 auto *I = cast<Instruction>(V);
22010 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
22011 return AmtKnownBits.getMaxValue().ult(BitWidth);
22012 });
22013 };
22014 return TryProcessInstruction(
22015 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
22016 }
22017 case Instruction::LShr: {
22018 // If this is a truncate of a logical shr, we can truncate it to a smaller
22019 // lshr iff we know that the bits we would otherwise be shifting in are
22020 // already zeros.
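// E.g. (illustrative): if the upper 24 bits of %x are known zero, then
// truncating (lshr i32 %x, 2) to i8 equals lshr i8 (trunc i32 %x to i8), 2,
// since no non-zero bits are shifted into the low 8 bits.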
22021 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22022 return all_of(E.Scalars, [&](Value *V) {
22023 if (isa<PoisonValue>(V))
22024 return true;
22025 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22026 if (E.isCopyableElement(V))
22027 return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
22028 auto *I = cast<Instruction>(V);
22029 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
22030 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
22031 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
22032 SimplifyQuery(*DL));
22033 });
22034 };
22035 return TryProcessInstruction(
22036 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
22037 LShrChecker);
22038 }
22039 case Instruction::AShr: {
22040 // If this is a truncate of an arithmetic shr, we can truncate it to a
22041 // smaller ashr iff we know that all the bits from the sign bit of the
22042 // original type and the sign bit of the truncate type are similar.
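// E.g. (illustrative): if %x has at least 25 sign bits in i32 (bits 7..31
// all equal the sign bit), then truncating (ashr i32 %x, 2) to i8 equals
// ashr i8 (trunc i32 %x to i8), 2.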
22043 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22044 return all_of(E.Scalars, [&](Value *V) {
22045 if (isa<PoisonValue>(V))
22046 return true;
22047 auto *I = cast<Instruction>(V);
22048 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
22049 unsigned ShiftedBits = OrigBitWidth - BitWidth;
22050 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
22051 ShiftedBits <
22052 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22053 });
22054 };
22055 return TryProcessInstruction(
22056 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
22057 AShrChecker);
22058 }
22059 case Instruction::UDiv:
22060 case Instruction::URem: {
22061 // UDiv and URem can be truncated if all the truncated bits are zero.
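// E.g. (illustrative): if both operands of a udiv i32 have their upper 24
// bits known zero, the same result is produced by a udiv i8 on the
// truncated operands.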
22062 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22063 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22064 return all_of(E.Scalars, [&](Value *V) {
22065 auto *I = cast<Instruction>(V);
22066 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22067 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
22068 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
22069 });
22070 };
22071 return TryProcessInstruction(
22072 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
22073 }
22074
22075 // We can demote selects if we can demote their true and false values.
22076 case Instruction::Select: {
22077 return TryProcessInstruction(
22078 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
22079 }
22080
22081 // We can demote phis if we can demote all their incoming operands.
22082 case Instruction::PHI: {
22083 const unsigned NumOps = E.getNumOperands();
22084 SmallVector<const TreeEntry *> Ops(NumOps);
22085 transform(seq<unsigned>(0, NumOps), Ops.begin(),
22086 [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
22087
22088 return TryProcessInstruction(BitWidth, Ops);
22089 }
22090
22091 case Instruction::Call: {
22092 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
22093 if (!IC)
22094 break;
22095 Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
22096 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
22097 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
22098 break;
22099 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
22100 function_ref<bool(unsigned, unsigned)> CallChecker;
22101 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22102 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22103 return all_of(E.Scalars, [&](Value *V) {
22104 auto *I = cast<Instruction>(V);
22105 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
22106 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22107 return MaskedValueIsZero(I->getOperand(0), Mask,
22108 SimplifyQuery(*DL)) &&
22109 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
22110 }
22111 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
22112 "Expected min/max intrinsics only.");
22113 unsigned SignBits = OrigBitWidth - BitWidth;
22114 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
22115 unsigned Op0SignBits =
22116 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22117 unsigned Op1SignBits =
22118 ComputeNumSignBits(I->getOperand(1), *DL, AC, nullptr, DT);
22119 return SignBits <= Op0SignBits &&
22120 ((SignBits != Op0SignBits &&
22121 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
22122 MaskedValueIsZero(I->getOperand(0), Mask,
22123 SimplifyQuery(*DL))) &&
22124 SignBits <= Op1SignBits &&
22125 ((SignBits != Op1SignBits &&
22126 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
22127 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
22128 });
22129 };
22130 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22131 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22132 return all_of(E.Scalars, [&](Value *V) {
22133 auto *I = cast<Instruction>(V);
22134 unsigned SignBits = OrigBitWidth - BitWidth;
22135 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
22136 unsigned Op0SignBits =
22137 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22138 return SignBits <= Op0SignBits &&
22139 ((SignBits != Op0SignBits &&
22140 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
22141 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
22142 });
22143 };
22144 if (ID != Intrinsic::abs) {
22145 Operands.push_back(getOperandEntry(&E, 1));
22146 CallChecker = CompChecker;
22147 } else {
22148 CallChecker = AbsChecker;
22149 }
22150 InstructionCost BestCost =
22151 std::numeric_limits<InstructionCost::CostType>::max();
22152 unsigned BestBitWidth = BitWidth;
22153 unsigned VF = E.Scalars.size();
22154 // Choose the best bitwidth based on cost estimations.
22155 auto Checker = [&](unsigned BitWidth, unsigned) {
22156 unsigned MinBW = PowerOf2Ceil(BitWidth);
22157 SmallVector<Type *> ArgTys =
22158 buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
22159 auto VecCallCosts = getVectorCallCosts(
22160 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
22161 TTI, TLI, ArgTys);
22162 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
22163 if (Cost < BestCost) {
22164 BestCost = Cost;
22165 BestBitWidth = BitWidth;
22166 }
22167 return false;
22168 };
22169 [[maybe_unused]] bool NeedToExit;
22170 (void)AttemptCheckBitwidth(Checker, NeedToExit);
22171 BitWidth = BestBitWidth;
22172 return TryProcessInstruction(BitWidth, Operands, CallChecker);
22173 }
22174
22175 // Otherwise, conservatively give up.
22176 default:
22177 break;
22178 }
22179 MaxDepthLevel = 1;
22180 return FinalAnalysis();
22181}
22182
22183static RecurKind getRdxKind(Value *V);
22184
22185 void BoUpSLP::computeMinimumValueSizes() {
22186 // We only attempt to truncate integer expressions.
22187 bool IsStoreOrInsertElt =
22188 VectorizableTree.front()->hasState() &&
22189 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
22190 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
22191 if ((IsStoreOrInsertElt || UserIgnoreList) &&
22192 ExtraBitWidthNodes.size() <= 1 &&
22193 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
22194 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
22195 return;
22196
22197 unsigned NodeIdx = 0;
22198 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
22199 NodeIdx = 1;
22200
22201 // Ensure the roots of the vectorizable tree don't form a cycle.
22202 assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
22203 !VectorizableTree[NodeIdx]->UserTreeIndex) &&
22204 "Unexpected tree is graph.");
22205
22206 // If the first value node for store/insertelement is sext/zext/trunc, skip
22207 // it and resize to the final type.
22208 bool IsTruncRoot = false;
22209 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
22210 SmallVector<unsigned> RootDemotes;
22211 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
22212 if (NodeIdx != 0 &&
22213 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
22214 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
22215 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
22216 IsTruncRoot = true;
22217 RootDemotes.push_back(NodeIdx);
22218 IsProfitableToDemoteRoot = true;
22219 ++NodeIdx;
22220 }
22221
22222 // The reduction has already been analyzed and found not profitable - exit.
22223 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
22224 return;
22225
22226 SmallVector<unsigned> ToDemote;
22227 auto ComputeMaxBitWidth =
22228 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
22229 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
22230 ToDemote.clear();
22231 // Check if the root is trunc and the next node is gather/buildvector, then
22232 // keep trunc in scalars, which is free in most cases.
22233 if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
22234 !NodesToKeepBWs.contains(E.Idx) &&
22235 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
22236 all_of(E.Scalars, [&](Value *V) {
22237 return V->hasOneUse() || isa<Constant>(V) ||
22238 (!V->hasNUsesOrMore(UsesLimit) &&
22239 none_of(V->users(), [&](User *U) {
22240 ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
22241 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
22242 if (TEs.empty() || is_contained(TEs, UserTE))
22243 return false;
22244 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
22245 SelectInst>(U) ||
22246 isa<SIToFPInst, UIToFPInst>(U) ||
22247 (UserTE->hasState() &&
22248 (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
22249 SelectInst>(UserTE->getMainOp()) ||
22250 isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
22251 return true;
22252 unsigned UserTESz = DL->getTypeSizeInBits(
22253 UserTE->Scalars.front()->getType());
22254 if (all_of(TEs, [&](const TreeEntry *TE) {
22255 auto It = MinBWs.find(TE);
22256 return It != MinBWs.end() &&
22257 It->second.first > UserTESz;
22258 }))
22259 return true;
22260 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
22261 }));
22262 })) {
22263 ToDemote.push_back(E.Idx);
22264 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
22265 auto It = MinBWs.find(UserTE);
22266 if (It != MinBWs.end())
22267 return It->second.first;
22268 unsigned MaxBitWidth =
22269 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
22270 MaxBitWidth = bit_ceil(MaxBitWidth);
22271 if (MaxBitWidth < 8 && MaxBitWidth > 1)
22272 MaxBitWidth = 8;
22273 return MaxBitWidth;
22274 }
22275
22276 if (!E.hasState())
22277 return 0u;
22278
22279 unsigned VF = E.getVectorFactor();
22280 Type *ScalarTy = E.Scalars.front()->getType();
22281 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
22282 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
22283 if (!TreeRootIT)
22284 return 0u;
22285
22286 if (any_of(E.Scalars,
22287 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
22288 return 0u;
22289
22290 unsigned NumParts = ::getNumberOfParts(
22291 *TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
22292
22293 // The maximum bit width required to represent all the values that can be
22294 // demoted without loss of precision. It would be safe to truncate the roots
22295 // of the expression to this width.
22296 unsigned MaxBitWidth = 1u;
22297
22298 // True if the roots can be zero-extended back to their original type,
22299 // rather than sign-extended. We know that if the leading bits are not
22300 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
22301 // True.
22302 // Determine if the sign bit of all the roots is known to be zero. If not,
22303 // IsKnownPositive is set to False.
22304 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
22305 if (isa<PoisonValue>(R))
22306 return true;
22307 KnownBits Known = computeKnownBits(R, *DL);
22308 return Known.isNonNegative();
22309 });
22310
22311 if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
22312 E.UserTreeIndex.UserTE->hasState() &&
22313 E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
22314 MaxBitWidth =
22315 std::min(DL->getTypeSizeInBits(
22316 E.UserTreeIndex.UserTE->Scalars.front()->getType()),
22317 DL->getTypeSizeInBits(ScalarTy));
22318
22319 // We first check if all the bits of the roots are demanded. If they're not,
22320 // we can truncate the roots to this narrower type.
22321 for (Value *Root : E.Scalars) {
22322 if (isa<PoisonValue>(Root))
22323 continue;
22324 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, AC, nullptr, DT);
22325 TypeSize NumTypeBits =
22326 DL->getTypeSizeInBits(Root->getType()->getScalarType());
22327 unsigned BitWidth1 = NumTypeBits - NumSignBits;
22328 // If we can't prove that the sign bit is zero, we must add one to the
22329 // maximum bit width to account for the unknown sign bit. This preserves
22330 // the existing sign bit so we can safely sign-extend the root back to the
22331 // original type. Otherwise, if we know the sign bit is zero, we will
22332 // zero-extend the root instead.
22333 //
22334 // FIXME: This is somewhat suboptimal, as there will be cases where adding
22335 // one to the maximum bit width will yield a larger-than-necessary
22336 // type. In general, we need to add an extra bit only if we can't
22337 // prove that the upper bit of the original type is equal to the
22338 // upper bit of the proposed smaller type. If these two bits are
22339 // the same (either zero or one) we know that sign-extending from
22340 // the smaller type will result in the same value. Here, since we
22341 // can't yet prove this, we are just making the proposed smaller
22342 // type larger to ensure correctness.
22343 if (!IsKnownPositive)
22344 ++BitWidth1;
22345
22346 auto *I = dyn_cast<Instruction>(Root);
22347 if (!I) {
22348 MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
22349 continue;
22350 }
22351 APInt Mask = DB->getDemandedBits(I);
22352 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
22353 MaxBitWidth =
22354 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
22355 }
22356
22357 if (MaxBitWidth < 8 && MaxBitWidth > 1)
22358 MaxBitWidth = 8;
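// Illustrative walk-through (not part of the upstream source; the concrete
// numbers are hypothetical): for an i32 root with 24 known sign bits,
// BitWidth1 starts at 32 - 24 = 8 and becomes 9 if the value is not known to
// be non-negative. If DemandedBits reports that only the low 6 bits are used
// (Mask.countl_zero() == 26), BitWidth2 is 6, so the root contributes
// min(9, 6) = 6 bits; the clamp above then rounds any width in (1, 8) up to 8.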
22359
22360 // If the original type is large, but the reduced type does not improve
22361 // register usage - ignore it.
22362 if (NumParts > 1 &&
22363 NumParts ==
22364 ::getNumberOfParts(
22365 *TTI, getWidenedType(IntegerType::get(F->getContext(),
22366 bit_ceil(MaxBitWidth)),
22367 VF)))
22368 return 0u;
22369
22370 unsigned Opcode = E.getOpcode();
22371 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
22372 Opcode == Instruction::SExt ||
22373 Opcode == Instruction::ZExt || NumParts > 1;
22374 // Conservatively determine if we can actually truncate the roots of the
22375 // expression. Collect the values that can be demoted in ToDemote and
22376 // additional roots that require investigating in Roots.
22377 DenseSet<const TreeEntry *> Visited;
22378 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
22379 bool NeedToDemote = IsProfitableToDemote;
22380
22381 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
22382 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
22383 NeedToDemote, IsTruncRoot) ||
22384 (MaxDepthLevel <= Limit &&
22385 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
22386 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
22387 DL->getTypeSizeInBits(TreeRootIT) /
22388 DL->getTypeSizeInBits(
22389 E.getMainOp()->getOperand(0)->getType()) >
22390 2)))))
22391 return 0u;
22392 // Round MaxBitWidth up to the next power-of-two.
22393 MaxBitWidth = bit_ceil(MaxBitWidth);
22394
22395 return MaxBitWidth;
22396 };
22397
22398 // If we can truncate the root, we must collect additional values that might
22399 // be demoted as a result. That is, those seeded by truncations we will
22400 // modify.
22401 // Add reduction ops sizes, if any.
22402 if (UserIgnoreList &&
22403 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
22404 // Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n
22405 // x i1> to iN)).
22406 if (all_of(*UserIgnoreList,
22407 [](Value *V) {
22408 return isa<PoisonValue>(V) ||
22409 cast<Instruction>(V)->getOpcode() == Instruction::Add;
22410 }) &&
22411 VectorizableTree.front()->State == TreeEntry::Vectorize &&
22412 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
22413 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
22414 Builder.getInt1Ty()) {
22415 ReductionBitWidth = 1;
22416 } else {
22417 for (Value *V : *UserIgnoreList) {
22418 if (isa<PoisonValue>(V))
22419 continue;
22420 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
22421 TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
22422 unsigned BitWidth1 = NumTypeBits - NumSignBits;
22423 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
22424 ++BitWidth1;
22425 unsigned BitWidth2 = BitWidth1;
22426 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
22427 APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
22428 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
22429 }
22430 ReductionBitWidth =
22431 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
22432 }
22433 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
22434 ReductionBitWidth = 8;
22435
22436 ReductionBitWidth = bit_ceil(ReductionBitWidth);
22437 }
22438 }
22439 bool IsTopRoot = NodeIdx == 0;
22440 while (NodeIdx < VectorizableTree.size() &&
22441 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
22442 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
22443 RootDemotes.push_back(NodeIdx);
22444 ++NodeIdx;
22445 IsTruncRoot = true;
22446 }
22447 bool IsSignedCmp = false;
22448 if (UserIgnoreList &&
22449 all_of(*UserIgnoreList,
22450 match_fn(m_CombineOr(m_SMin(m_Value(), m_Value()),
22451 m_SMax(m_Value(), m_Value())))))
22452 IsSignedCmp = true;
22453 while (NodeIdx < VectorizableTree.size()) {
22454 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
22455 unsigned Limit = 2;
22456 if (IsTopRoot &&
22457 ReductionBitWidth ==
22458 DL->getTypeSizeInBits(
22459 VectorizableTree.front()->Scalars.front()->getType()))
22460 Limit = 3;
22461 unsigned MaxBitWidth = ComputeMaxBitWidth(
22462 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
22463 IsTruncRoot, IsSignedCmp);
22464 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
22465 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
22466 ReductionBitWidth = bit_ceil(MaxBitWidth);
22467 else if (MaxBitWidth == 0)
22468 ReductionBitWidth = 0;
22469 }
22470
22471 for (unsigned Idx : RootDemotes) {
22472 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
22473 uint32_t OrigBitWidth =
22474 DL->getTypeSizeInBits(V->getType()->getScalarType());
22475 if (OrigBitWidth > MaxBitWidth) {
22476 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
22477 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
22478 }
22479 return false;
22480 }))
22481 ToDemote.push_back(Idx);
22482 }
22483 RootDemotes.clear();
22484 IsTopRoot = false;
22485 IsProfitableToDemoteRoot = true;
22486
22487 if (ExtraBitWidthNodes.empty()) {
22488 NodeIdx = VectorizableTree.size();
22489 } else {
22490 unsigned NewIdx = 0;
22491 do {
22492 NewIdx = *ExtraBitWidthNodes.begin();
22493 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
22494 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
22495 NodeIdx = NewIdx;
22496 IsTruncRoot =
22497 NodeIdx < VectorizableTree.size() &&
22498 VectorizableTree[NodeIdx]->UserTreeIndex &&
22499 VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
22500 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
22501 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
22502 Instruction::Trunc &&
22503 !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
22504 IsSignedCmp =
22505 NodeIdx < VectorizableTree.size() &&
22506 VectorizableTree[NodeIdx]->UserTreeIndex &&
22507 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
22508 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
22509 Instruction::ICmp &&
22510 any_of(
22511 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
22512 [&](Value *V) {
22513 auto *IC = dyn_cast<ICmpInst>(V);
22514 return IC && (IC->isSigned() ||
22515 !isKnownNonNegative(IC->getOperand(0),
22516 SimplifyQuery(*DL)) ||
22517 !isKnownNonNegative(IC->getOperand(1),
22518 SimplifyQuery(*DL)));
22519 });
22520 }
22521
22522 // If the maximum bit width we compute is less than the width of the roots'
22523 // type, we can proceed with the narrowing. Otherwise, do nothing.
22524 if (MaxBitWidth == 0 ||
22525 MaxBitWidth >=
22526 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
22527 ->getBitWidth()) {
22528 if (UserIgnoreList)
22529 AnalyzedMinBWVals.insert_range(TreeRoot);
22530 NodesToKeepBWs.insert_range(ToDemote);
22531 continue;
22532 }
22533
22534 // Finally, map the values we can demote to the maximum bitwidth we
22535 // computed.
22536 for (unsigned Idx : ToDemote) {
22537 TreeEntry *TE = VectorizableTree[Idx].get();
22538 if (MinBWs.contains(TE))
22539 continue;
22540 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
22541 if (isa<PoisonValue>(R))
22542 return false;
22543 return !isKnownNonNegative(R, SimplifyQuery(*DL));
22544 });
22545 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
22546 }
22547 }
22548}
22549
22550 PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
22551 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
22552 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
22553 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
22554 auto *AA = &AM.getResult<AAManager>(F);
22555 auto *LI = &AM.getResult<LoopAnalysis>(F);
22556 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
22557 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
22558 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
22559 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
22560 
22561 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
22562 if (!Changed)
22563 return PreservedAnalyses::all();
22564
22565 PreservedAnalyses PA;
22566 PA.preserveSet<CFGAnalyses>();
22567 return PA;
22568}
22569
22570 bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
22571 TargetTransformInfo *TTI_,
22572 TargetLibraryInfo *TLI_, AAResults *AA_,
22573 LoopInfo *LI_, DominatorTree *DT_,
22574 AssumptionCache *AC_, DemandedBits *DB_,
22575 OptimizationRemarkEmitter *ORE_) {
22576 if (!RunSLPVectorization)
22577 return false;
22578 SE = SE_;
22579 TTI = TTI_;
22580 TLI = TLI_;
22581 AA = AA_;
22582 LI = LI_;
22583 DT = DT_;
22584 AC = AC_;
22585 DB = DB_;
22586 DL = &F.getDataLayout();
22587
22588 Stores.clear();
22589 GEPs.clear();
22590 bool Changed = false;
22591
22592 // If the target claims to have no vector registers don't attempt
22593 // vectorization.
22594 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
22595 LLVM_DEBUG(
22596 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
22597 return false;
22598 }
22599
22600 // Don't vectorize when the attribute NoImplicitFloat is used.
22601 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
22602 return false;
22603
22604 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
22605
22606 // Use the bottom up slp vectorizer to construct chains that start with
22607 // store instructions.
22608 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
22609
22610 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
22611 // delete instructions.
22612
22613 // Update DFS numbers now so that we can use them for ordering.
22614 DT->updateDFSNumbers();
22615
22616 // Scan the blocks in the function in post order.
22617 for (auto *BB : post_order(&F.getEntryBlock())) {
22618 if (!DT->isReachableFromEntry(BB))
22619 continue;
22620
22621 // Start new block - clear the list of reduction roots.
22622 R.clearReductionData();
22623 collectSeedInstructions(BB);
22624
22625 // Vectorize trees that end at stores.
22626 if (!Stores.empty()) {
22627 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
22628 << " underlying objects.\n");
22629 Changed |= vectorizeStoreChains(R);
22630 }
22631
22632 // Vectorize trees that end at reductions.
22633 Changed |= vectorizeChainsInBlock(BB, R);
22634
22635 // Vectorize the index computations of getelementptr instructions. This
22636 // is primarily intended to catch gather-like idioms ending at
22637 // non-consecutive loads.
22638 if (!GEPs.empty()) {
22639 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
22640 << " underlying objects.\n");
22641 Changed |= vectorizeGEPIndices(BB, R);
22642 }
22643 }
22644
22645 if (Changed) {
22646 R.optimizeGatherSequence();
22647 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
22648 }
22649 return Changed;
22650}
22651
22652std::optional<bool>
22653SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
22654 unsigned Idx, unsigned MinVF,
22655 unsigned &Size) {
22656 Size = 0;
22657 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
22658 << "\n");
22659 const unsigned Sz = R.getVectorElementSize(Chain[0]);
22660 unsigned VF = Chain.size();
22661
22662 if (!has_single_bit(Sz) ||
22663 !hasFullVectorsOrPowerOf2(
22664 *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
22665 VF) ||
22666 VF < 2 || VF < MinVF) {
22667 // Check if vectorizing with a non-power-of-2 VF should be considered. At
22668 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
22669 // all vector lanes are used.
22670 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
22671 return false;
22672 }
22673
22674 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
22675 << "\n");
22676
22677 SetVector<Value *> ValOps;
22678 for (Value *V : Chain)
22679 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
22680 // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
22681 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
22682 InstructionsState S = Analysis.buildInstructionsState(
22683 ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
22684 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
22685 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
22686 bool IsAllowedSize =
22687 hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
22688 ValOps.size()) ||
22689 (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
22690 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
22691 (!S.getMainOp()->isSafeToRemove() ||
22692 any_of(ValOps.getArrayRef(),
22693 [&](Value *V) {
22694 return !isa<ExtractElementInst>(V) &&
22695 (V->getNumUses() > Chain.size() ||
22696 any_of(V->users(), [&](User *U) {
22697 return !Stores.contains(U);
22698 }));
22699 }))) ||
22700 (ValOps.size() > Chain.size() / 2 && !S)) {
22701 Size = (!IsAllowedSize && S) ? 1 : 2;
22702 return false;
22703 }
22704 }
22705 if (R.isLoadCombineCandidate(Chain))
22706 return true;
22707 R.buildTree(Chain);
22708 // Check if the tree is tiny and the store itself or its value is not vectorized.
22709 if (R.isTreeTinyAndNotFullyVectorizable()) {
22710 if (R.isGathered(Chain.front()) ||
22711 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
22712 return std::nullopt;
22713 Size = R.getCanonicalGraphSize();
22714 return false;
22715 }
22716 if (R.isProfitableToReorder()) {
22717 R.reorderTopToBottom();
22718 R.reorderBottomToTop();
22719 }
22720 R.transformNodes();
22721 R.buildExternalUses();
22722
22723 R.computeMinimumValueSizes();
22724
22725 Size = R.getCanonicalGraphSize();
22726 if (S && S.getOpcode() == Instruction::Load)
22727 Size = 2; // cut off masked gather small trees
22728 InstructionCost Cost = R.getTreeCost();
22729
22730 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
22731 if (Cost < -SLPCostThreshold) {
22732 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
22733
22734 using namespace ore;
22735
22736 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
22737 cast<StoreInst>(Chain[0]))
22738 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
22739 << " and with tree size "
22740 << NV("TreeSize", R.getTreeSize()));
22741
22742 R.vectorizeTree();
22743 return true;
22744 }
22745
22746 return false;
22747}
22748
22749/// Checks if the quadratic mean deviation is less than 90% of the mean size.
22750static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
22751 bool First) {
22752 unsigned Num = 0;
22753 uint64_t Sum = std::accumulate(
22754 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
22755 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
22756 unsigned Size = First ? Val.first : Val.second;
22757 if (Size == 1)
22758 return V;
22759 ++Num;
22760 return V + Size;
22761 });
22762 if (Num == 0)
22763 return true;
22764 uint64_t Mean = Sum / Num;
22765 if (Mean == 0)
22766 return true;
22767 uint64_t Dev = std::accumulate(
22768 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
22769 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
22770 unsigned P = First ? Val.first : Val.second;
22771 if (P == 1)
22772 return V;
22773 return V + (P - Mean) * (P - Mean);
22774 }) /
22775 Num;
22776 return Dev * 96 / (Mean * Mean) == 0;
22777}
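// Worked example (illustrative, not in the upstream source): for tree sizes
// {4, 4, 4, 4}, Mean = 4 and Dev = 0, so Dev * 96 / (Mean * Mean) == 0 holds
// and the sizes are treated as uniform. For sizes {2, 8}, Mean = 5 and
// Dev = ((2 - 5)^2 + (8 - 5)^2) / 2 = 9, so 9 * 96 / 25 = 34 (integer
// division) is non-zero and the check fails. Entries of size 1 are skipped.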
22778
22779namespace {
22780
22781/// A group of stores that we'll try to bundle together using vector ops.
22782/// They are ordered using the signed distance of their address operand to the
22783/// address of this group's BaseInstr.
22784class RelatedStoreInsts {
22785public:
22786 RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
22787 : AllStores(AllStores) {
22788 reset(BaseInstrIdx);
22789 }
22790
22791 void reset(unsigned NewBaseInstr) {
22792 assert(NewBaseInstr < AllStores.size() &&
22793 "Instruction index out of bounds");
22794 BaseInstrIdx = NewBaseInstr;
22795 Instrs.clear();
22796 insertOrLookup(NewBaseInstr, 0);
22797 }
22798
22799 /// Tries to insert \p InstrIdx as the store with a pointer distance of
22800 /// \p PtrDist.
22801 /// Does nothing if there is already a store with that \p PtrDist.
22802 /// \returns The previously associated Instruction index, or std::nullopt if the insertion succeeded.
22803 std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
22804 auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
22805 return Inserted ? std::nullopt : std::make_optional(It->second);
22806 }
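// Usage sketch (illustrative; the pointers and values are placeholders): for
// i32 stores to %p, %p+4 and %p+8, the element distances from the base store
// at %p are 0, 1 and 2, so Instrs holds {0 -> base index, 1 -> ..., 2 -> ...}.
// Calling insertOrLookup() with a distance that is already present leaves the
// map unchanged and returns the previously recorded instruction index, which
// the caller treats as a signal to try vectorizing the current group before
// rebasing it (see FillStoresSet in vectorizeStores()).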
22807
22808 using DistToInstMap = std::map<int64_t, unsigned>;
22809 const DistToInstMap &getStores() const { return Instrs; }
22810
22811 /// If \p SI is related to this group of stores, return the distance of its
22812 /// pointer operand to that of the group's BaseInstr.
22813 std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
22814 ScalarEvolution &SE) const {
22815 StoreInst &BaseStore = *AllStores[BaseInstrIdx];
22816 return getPointersDiff(
22817 BaseStore.getValueOperand()->getType(), BaseStore.getPointerOperand(),
22818 SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
22819 /*StrictCheck=*/true);
22820 }
22821
22822 /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
22823 /// Stores whose index is less than \p MinSafeIdx will be dropped.
22824 void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
22825 int64_t DistFromCurBase) {
22826 DistToInstMap PrevSet = std::move(Instrs);
22827 reset(NewBaseInstIdx);
22828
22829 // Re-insert stores that come after MinSafeIdx to try and vectorize them
22830 // again. Their distance will be "rebased" to use NewBaseInstIdx as
22831 // reference.
22832 for (auto [Dist, InstIdx] : PrevSet) {
22833 if (InstIdx >= MinSafeIdx)
22834 insertOrLookup(InstIdx, Dist - DistFromCurBase);
22835 }
22836 }
22837
22838 /// Remove all stores that have been vectorized from this group.
22839 void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) {
22840 DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
22841 reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
22842 return VectorizedStores.contains(AllStores[DistAndIdx.second]);
22843 });
22844
22845 // Get a forward iterator pointing after the last vectorized store and erase
22846 // all stores before it so we don't try to vectorize them again.
22847 DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
22848 Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
22849 }
22850
22851private:
22852 /// The index of the Base instruction, i.e. the one with a 0 pointer distance.
22853 unsigned BaseInstrIdx;
22854
22855 /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
22856 DistToInstMap Instrs;
22857
22858 /// Reference to all the stores in the BB being analyzed.
22859 ArrayRef<StoreInst *> AllStores;
22860};
22861
22862} // end anonymous namespace
22863
22864bool SLPVectorizerPass::vectorizeStores(
22865 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
22866 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
22867 &Visited) {
22868 // We may run into multiple chains that merge into a single chain. We mark the
22869 // stores that we vectorized so that we don't visit the same store twice.
22870 BoUpSLP::ValueSet VectorizedStores;
22871 bool Changed = false;
22872
22873 auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
22874 int64_t PrevDist = -1;
22875 BoUpSLP::ValueList Operands;
22876 // Collect the chain into a list.
22877 for (auto [Idx, Data] : enumerate(StoreSeq)) {
22878 auto &[Dist, InstIdx] = Data;
22879 if (Operands.empty() || Dist - PrevDist == 1) {
22880 Operands.push_back(Stores[InstIdx]);
22881 PrevDist = Dist;
22882 if (Idx != StoreSeq.size() - 1)
22883 continue;
22884 }
22885 auto E = make_scope_exit([&, &Dist = Dist, &InstIdx = InstIdx]() {
22886 Operands.clear();
22887 Operands.push_back(Stores[InstIdx]);
22888 PrevDist = Dist;
22889 });
22890
22891 if (Operands.size() <= 1 ||
22892 !Visited
22893 .insert({Operands.front(),
22894 cast<StoreInst>(Operands.front())->getValueOperand(),
22895 Operands.back(),
22896 cast<StoreInst>(Operands.back())->getValueOperand(),
22897 Operands.size()})
22898 .second)
22899 continue;
22900
22901 unsigned MaxVecRegSize = R.getMaxVecRegSize();
22902 unsigned EltSize = R.getVectorElementSize(Operands[0]);
22903 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
22904
22905 unsigned MaxVF =
22906 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
22907 auto *Store = cast<StoreInst>(Operands[0]);
22908 Type *StoreTy = Store->getValueOperand()->getType();
22909 Type *ValueTy = StoreTy;
22910 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
22911 ValueTy = Trunc->getSrcTy();
22912 // When REVEC is enabled, StoreTy and ValueTy may be FixedVectorType. But
22913 // getStoreMinimumVF only supports scalar types as arguments. As a result,
22914 // we need to use the element types of StoreTy and ValueTy to retrieve the
22915 // VF and then transform it back.
22916 // Remember: VF is defined as the number of stores we want to vectorize, not
22917 // the number of elements in the final vector.
22918 Type *StoreScalarTy = StoreTy->getScalarType();
22919 unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
22920 R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
22921 ValueTy->getScalarType()));
22922 MinVF /= getNumElements(StoreTy);
22923 MinVF = std::max<unsigned>(2, MinVF);
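// Worked example (illustrative; the return value of getStoreMinimumVF() here
// is hypothetical): under REVEC, if each store writes a <4 x i16> and
// getStoreMinimumVF() yields 8 for the scalar i16 type, PowerOf2Ceil keeps 8
// and dividing by getNumElements(StoreTy) == 4 gives MinVF = 2, i.e. at least
// two <4 x i16> stores must be combined. For plain scalar stores
// getNumElements(StoreTy) is 1 and the value is only clamped to at least 2.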
22924
22925 if (MaxVF < MinVF) {
22926 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
22927 << ") < "
22928 << "MinVF (" << MinVF << ")\n");
22929 continue;
22930 }
22931
22932 unsigned NonPowerOf2VF = 0;
22933 if (VectorizeNonPowerOf2) {
22934 // First try vectorizing with a non-power-of-2 VF. At the moment, only
22935 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
22936 // lanes are used.
22937 unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
22938 if (has_single_bit(CandVF + 1)) {
22939 NonPowerOf2VF = CandVF;
22940 assert(NonPowerOf2VF != MaxVF &&
22941 "Non-power-of-2 VF should not be equal to MaxVF");
22942 }
22943 }
22944
22945 // MaxRegVF represents the number of instructions (scalar, or vector in
22946 // case of revec) that can be vectorized to naturally fit in a vector
22947 // register.
22948 unsigned MaxRegVF = MaxVF;
22949
22950 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
22951 if (MaxVF < MinVF) {
22952 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
22953 << ") < "
22954 << "MinVF (" << MinVF << ")\n");
22955 continue;
22956 }
22957
22958 SmallVector<unsigned> CandidateVFs;
22959 for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
22960 VF = divideCeil(VF, 2))
22961 CandidateVFs.push_back(VF);
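// Example (illustrative): with 7 stores, MinVF = 2 and a register-limited
// MaxVF of 16, the non-power-of-2 path (guarded by VectorizeNonPowerOf2) sets
// NonPowerOf2VF = 7 (7 + 1 is a power of two) while MaxVF is reduced to
// bit_floor(7) = 4 above, so the loop produces CandidateVFs = {7, 4, 2}.
// With 6 stores instead, NonPowerOf2VF stays 0 and the candidates are {4, 2}.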
22962
22963 unsigned End = Operands.size();
22964 unsigned Repeat = 0;
22965 constexpr unsigned MaxAttempts = 4;
22966 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
22967 for (std::pair<unsigned, unsigned> &P : RangeSizes)
22968 P.first = P.second = 1;
22969 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
22970 auto IsNotVectorized = [](bool First,
22971 const std::pair<unsigned, unsigned> &P) {
22972 return First ? P.first > 0 : P.second > 0;
22973 };
22974 auto IsVectorized = [](bool First,
22975 const std::pair<unsigned, unsigned> &P) {
22976 return First ? P.first == 0 : P.second == 0;
22977 };
22978 auto VFIsProfitable = [](bool First, unsigned Size,
22979 const std::pair<unsigned, unsigned> &P) {
22980 return First ? Size >= P.first : Size >= P.second;
22981 };
22982 auto FirstSizeSame = [](unsigned Size,
22983 const std::pair<unsigned, unsigned> &P) {
22984 return Size == P.first;
22985 };
22986 while (true) {
22987 ++Repeat;
22988 bool RepeatChanged = false;
22989 bool AnyProfitableGraph = false;
22990 for (unsigned VF : CandidateVFs) {
22991 AnyProfitableGraph = false;
22992 unsigned FirstUnvecStore =
22993 std::distance(RangeSizes.begin(),
22994 find_if(RangeSizes, std::bind(IsNotVectorized,
22995 VF >= MaxRegVF, _1)));
22996
22997 // Form slices of size VF starting from FirstUnvecStore and try to
22998 // vectorize them.
22999 while (FirstUnvecStore < End) {
23000 unsigned FirstVecStore = std::distance(
23001 RangeSizes.begin(),
23002 find_if(RangeSizes.drop_front(FirstUnvecStore),
23003 std::bind(IsVectorized, VF >= MaxRegVF, _1)));
23004 unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
23005 for (unsigned SliceStartIdx = FirstUnvecStore;
23006 SliceStartIdx + VF <= MaxSliceEnd;) {
23007 if (!checkTreeSizes(RangeSizes.slice(SliceStartIdx, VF),
23008 VF >= MaxRegVF)) {
23009 ++SliceStartIdx;
23010 continue;
23011 }
23012 ArrayRef<Value *> Slice =
23013 ArrayRef(Operands).slice(SliceStartIdx, VF);
23014 assert(all_of(Slice,
23015 [&](Value *V) {
23016 return cast<StoreInst>(V)
23017 ->getValueOperand()
23018 ->getType() ==
23019 cast<StoreInst>(Slice.front())
23020 ->getValueOperand()
23021 ->getType();
23022 }) &&
23023 "Expected all operands of same type.");
23024 if (!NonSchedulable.empty()) {
23025 auto [NonSchedSizeMax, NonSchedSizeMin] =
23026 NonSchedulable.lookup(Slice.front());
23027 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
23028 // VF is too ambitious. Try to vectorize another slice before
23029 // trying a smaller VF.
23030 SliceStartIdx += NonSchedSizeMax;
23031 continue;
23032 }
23033 }
23034 unsigned TreeSize;
23035 std::optional<bool> Res =
23036 vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
23037 if (!Res) {
23038 // Update the range of non schedulable VFs for slices starting
23039 // at SliceStartIdx.
23040 NonSchedulable
23041 .try_emplace(Slice.front(), std::make_pair(VF, VF))
23042 .first->getSecond()
23043 .second = VF;
23044 } else if (*Res) {
23045 // Mark the vectorized stores so that we don't vectorize them
23046 // again.
23047 VectorizedStores.insert_range(Slice);
23050 AnyProfitableGraph = RepeatChanged = Changed = true;
23051 // If we vectorized initial block, no need to try to vectorize
23052 // it again.
23053 for (std::pair<unsigned, unsigned> &P :
23054 RangeSizes.slice(SliceStartIdx, VF))
23055 P.first = P.second = 0;
23056 if (SliceStartIdx < FirstUnvecStore + MinVF) {
23057 for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
23058 FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
23059 P.first = P.second = 0;
23060 FirstUnvecStore = SliceStartIdx + VF;
23061 }
23062 if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
23063 for (std::pair<unsigned, unsigned> &P :
23064 RangeSizes.slice(SliceStartIdx + VF,
23065 MaxSliceEnd - (SliceStartIdx + VF)))
23066 P.first = P.second = 0;
23067 if (MaxSliceEnd == End)
23068 End = SliceStartIdx;
23069 MaxSliceEnd = SliceStartIdx;
23070 }
23071 SliceStartIdx += VF;
23072 continue;
23073 }
23074 if (VF > 2 && Res &&
23075 !all_of(RangeSizes.slice(SliceStartIdx, VF),
23076 std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
23077 _1))) {
23078 SliceStartIdx += VF;
23079 continue;
23080 }
23081 // Check for the very big VFs that we're not rebuilding same
23082 // trees, just with larger number of elements.
23083 if (VF > MaxRegVF && TreeSize > 1 &&
23084 all_of(RangeSizes.slice(SliceStartIdx, VF),
23085 std::bind(FirstSizeSame, TreeSize, _1))) {
23086 SliceStartIdx += VF;
23087 while (SliceStartIdx != MaxSliceEnd &&
23088 RangeSizes[SliceStartIdx].first == TreeSize)
23089 ++SliceStartIdx;
23090 continue;
23091 }
23092 if (TreeSize > 1) {
23093 for (std::pair<unsigned, unsigned> &P :
23094 RangeSizes.slice(SliceStartIdx, VF)) {
23095 if (VF >= MaxRegVF)
23096 P.second = std::max(P.second, TreeSize);
23097 else
23098 P.first = std::max(P.first, TreeSize);
23099 }
23100 }
23101 ++SliceStartIdx;
23102 AnyProfitableGraph = true;
23103 }
23104 if (FirstUnvecStore >= End)
23105 break;
23106 if (MaxSliceEnd - FirstUnvecStore < VF &&
23107 MaxSliceEnd - FirstUnvecStore >= MinVF)
23108 AnyProfitableGraph = true;
23109 FirstUnvecStore = std::distance(
23110 RangeSizes.begin(),
23111 find_if(RangeSizes.drop_front(MaxSliceEnd),
23112 std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
23113 }
23114 if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
23115 break;
23116 }
23117 // All values vectorized - exit.
23118 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
23119 return P.first == 0 && P.second == 0;
23120 }))
23121 break;
23122 // Check if tried all attempts or no need for the last attempts at all.
23123 if (Repeat >= MaxAttempts ||
23124 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
23125 break;
23126 constexpr unsigned StoresLimit = 64;
23127 const unsigned MaxTotalNum = std::min<unsigned>(
23128 Operands.size(),
23129 static_cast<unsigned>(
23130 End -
23131 std::distance(
23132 RangeSizes.begin(),
23133 find_if(RangeSizes, std::bind(IsNotVectorized, true, _1))) +
23134 1));
23135 unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
23136 unsigned Limit =
23137 getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
23138 CandidateVFs.clear();
23139 if (bit_floor(Limit) == VF)
23140 CandidateVFs.push_back(Limit);
23141 if (VF > MaxTotalNum || VF >= StoresLimit)
23142 break;
23143 for (std::pair<unsigned, unsigned> &P : RangeSizes) {
23144 if (P.first != 0)
23145 P.first = std::max(P.second, P.first);
23146 }
23147 // Last attempt to vectorize max number of elements, if all previous
23148 // attempts were unsuccessful because of the cost issues.
23149 CandidateVFs.push_back(VF);
23150 }
23151 }
23152 };
23153
23154 /// Groups of stores to vectorize
23155 SmallVector<RelatedStoreInsts> SortedStores;
23156
23157 // Inserts the specified store SI with the given index Idx to the set of the
23158 // stores. If the store with the same distance is found already - stop
23159 // insertion, try to vectorize already found stores. If some stores from this
23160 // sequence were not vectorized - try to vectorize them with the new store
23161 // later. But this logic is applied only to the stores, that come before the
23162 // previous store with the same distance.
23163 // Example:
23164 // 1. store x, %p
23165 // 2. store y, %p+1
23166 // 3. store z, %p+2
23167 // 4. store a, %p
23168 // 5. store b, %p+3
23169 // - Scan this from the last to first store. The very first bunch of stores is
23170 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
23171 // vector).
23172 // - The next store in the list - #1 - has the same distance from store #5 as
23173 // the store #4.
23174 // - Try to vectorize sequence of stores 4,2,3,5.
23175 // - If all these stores are vectorized - just drop them.
23176 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
23177 // - Start new stores sequence.
23178 // The new bunch of stores is {1, {1, 0}}.
23179 // - Add the stores from previous sequence, that were not vectorized.
23180 // Here we consider the stores in reverse order, rather than the order in which
23181 // they are used in the IR (Stores are reversed already, see vectorizeStoreChains()).
23182 // Store #3 can be added -> comes after store #4 with the same distance as
23183 // store #1.
23184 // Store #5 cannot be added - comes before store #4.
23185 // This logic helps to improve compile time: stores that come after a previous
23186 // store with the same distance most likely have memory dependencies, so there
23187 // is no need to waste compile time trying to vectorize them.
23188 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
23189 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
23190 std::optional<int64_t> PtrDist;
23191 auto *RelatedStores = find_if(
23192 SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
23193 PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
23194 return PtrDist.has_value();
23195 });
23196
23197 // We did not find a comparable store, start a new group.
23198 if (RelatedStores == SortedStores.end()) {
23199 SortedStores.emplace_back(Idx, Stores);
23200 return;
23201 }
23202
23203 // If there is already a store in the group with the same PtrDiff, try to
23204 // vectorize the existing instructions before adding the current store.
23205 // Otherwise, insert this store and keep collecting.
23206 if (std::optional<unsigned> PrevInst =
23207 RelatedStores->insertOrLookup(Idx, *PtrDist)) {
23208 TryToVectorize(RelatedStores->getStores());
23209 RelatedStores->clearVectorizedStores(VectorizedStores);
23210 RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
23211 /*NewBaseInstIdx=*/Idx,
23212 /*DistFromCurBase=*/*PtrDist);
23213 }
23214 };
23215 Type *PrevValTy = nullptr;
23216 for (auto [I, SI] : enumerate(Stores)) {
23217 if (R.isDeleted(SI))
23218 continue;
23219 if (!PrevValTy)
23220 PrevValTy = SI->getValueOperand()->getType();
23221 // Check that we do not try to vectorize stores of different types.
23222 if (PrevValTy != SI->getValueOperand()->getType()) {
23223 for (RelatedStoreInsts &StoreSeq : SortedStores)
23224 TryToVectorize(StoreSeq.getStores());
23225 SortedStores.clear();
23226 PrevValTy = SI->getValueOperand()->getType();
23227 }
23228 FillStoresSet(I, SI);
23229 }
23230
23231 // Final vectorization attempt.
23232 for (RelatedStoreInsts &StoreSeq : SortedStores)
23233 TryToVectorize(StoreSeq.getStores());
23234
23235 return Changed;
23236}
23237
23238void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
23239 // Initialize the collections. We will make a single pass over the block.
23240 Stores.clear();
23241 GEPs.clear();
23242
23243 // Visit the store and getelementptr instructions in BB and organize them in
23244 // Stores and GEPs according to the underlying objects of their pointer
23245 // operands.
23246 for (Instruction &I : *BB) {
23247 // Ignore store instructions that are volatile or have a pointer operand
23248 // that doesn't point to a scalar type.
23249 if (auto *SI = dyn_cast<StoreInst>(&I)) {
23250 if (!SI->isSimple())
23251 continue;
23252 if (!isValidElementType(SI->getValueOperand()->getType()))
23253 continue;
23254 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
23255 }
23256
23257 // Ignore getelementptr instructions that have more than one index, a
23258 // constant index, or a pointer operand that doesn't point to a scalar
23259 // type.
23260 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
23261 if (GEP->getNumIndices() != 1)
23262 continue;
23263 Value *Idx = GEP->idx_begin()->get();
23264 if (isa<Constant>(Idx))
23265 continue;
23266 if (!isValidElementType(Idx->getType()))
23267 continue;
23268 if (GEP->getType()->isVectorTy())
23269 continue;
23270 GEPs[GEP->getPointerOperand()].push_back(GEP);
23271 }
23272 }
23273}
23274
23275bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
23276 bool MaxVFOnly) {
23277 if (VL.size() < 2)
23278 return false;
23279
23280 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
23281 << VL.size() << ".\n");
23282
23283 // Check that all of the parts are instructions of the same type;
23284 // we permit an alternate opcode via InstructionsState.
23285 InstructionsState S = getSameOpcode(VL, *TLI);
23286 if (!S)
23287 return false;
23288
23289 Instruction *I0 = S.getMainOp();
23290 // Make sure invalid types (including vector type) are rejected before
23291 // determining vectorization factor for scalar instructions.
23292 for (Value *V : VL) {
23293 Type *Ty = V->getType();
23294 if (!isValidElementType(Ty)) {
23295 // NOTE: the following will give user internal llvm type name, which may
23296 // not be useful.
23297 R.getORE()->emit([&]() {
23298 std::string TypeStr;
23299 llvm::raw_string_ostream OS(TypeStr);
23300 Ty->print(OS);
23301 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
23302 << "Cannot SLP vectorize list: type "
23303 << TypeStr + " is unsupported by vectorizer";
23304 });
23305 return false;
23306 }
23307 }
23308
23309 Type *ScalarTy = getValueType(VL[0]);
23310 unsigned Sz = R.getVectorElementSize(I0);
23311 unsigned MinVF = R.getMinVF(Sz);
23312 unsigned MaxVF = std::max<unsigned>(
23313 getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF);
23314 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
23315 if (MaxVF < 2) {
23316 R.getORE()->emit([&]() {
23317 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
23318 << "Cannot SLP vectorize list: vectorization factor "
23319 << "less than 2 is not supported";
23320 });
23321 return false;
23322 }
23323
23324 bool Changed = false;
23325 bool CandidateFound = false;
23326 InstructionCost MinCost = SLPCostThreshold.getValue();
23327
23328 unsigned NextInst = 0, MaxInst = VL.size();
23329 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
23330 VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
23331 // No actual vectorization should happen if the number of parts is the same as
23332 // the provided vectorization factor (i.e. the scalar type is used for vector
23333 // code during codegen).
23334 auto *VecTy = getWidenedType(ScalarTy, VF);
23335 if (TTI->getNumberOfParts(VecTy) == VF)
23336 continue;
23337 for (unsigned I = NextInst; I < MaxInst; ++I) {
23338 unsigned ActualVF = std::min(MaxInst - I, VF);
23339
23340 if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
23341 continue;
23342
23343 if (MaxVFOnly && ActualVF < MaxVF)
23344 break;
23345 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
23346 break;
23347
23348 SmallVector<Value *> Ops(ActualVF, nullptr);
23349 unsigned Idx = 0;
23350 for (Value *V : VL.drop_front(I)) {
23351 // Check that a previous iteration of this loop did not delete the
23352 // Value.
23353 if (auto *Inst = dyn_cast<Instruction>(V);
23354 !Inst || !R.isDeleted(Inst)) {
23355 Ops[Idx] = V;
23356 ++Idx;
23357 if (Idx == ActualVF)
23358 break;
23359 }
23360 }
23361 // Not enough vectorizable instructions - exit.
23362 if (Idx != ActualVF)
23363 break;
23364
23365 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
23366 << "\n");
23367
23368 R.buildTree(Ops);
23369 if (R.isTreeTinyAndNotFullyVectorizable())
23370 continue;
23371 if (R.isProfitableToReorder()) {
23372 R.reorderTopToBottom();
23373 R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front()));
23374 }
23375 R.transformNodes();
23376 R.buildExternalUses();
23377
23378 R.computeMinimumValueSizes();
23379 InstructionCost Cost = R.getTreeCost();
23380 CandidateFound = true;
23381 MinCost = std::min(MinCost, Cost);
23382
23383 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
23384 << " for VF=" << ActualVF << "\n");
23385 if (Cost < -SLPCostThreshold) {
23386 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
23387 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
23389 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
23390 << " and with tree size "
23391 << ore::NV("TreeSize", R.getTreeSize()));
23392
23393 R.vectorizeTree();
23394 // Move to the next bundle.
23395 I += VF - 1;
23396 NextInst = I + 1;
23397 Changed = true;
23398 }
23399 }
23400 }
23401
23402 if (!Changed && CandidateFound) {
23403 R.getORE()->emit([&]() {
23404 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
23405 << "List vectorization was possible but not beneficial with cost "
23406 << ore::NV("Cost", MinCost) << " >= "
23407 << ore::NV("Treshold", -SLPCostThreshold);
23408 });
23409 } else if (!Changed) {
23410 R.getORE()->emit([&]() {
23411 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
23412 << "Cannot SLP vectorize list: vectorization was impossible"
23413 << " with available vectorization factors";
23414 });
23415 }
23416 return Changed;
23417}
23418
23419namespace {
23420
23421/// Model horizontal reductions.
23422///
23423/// A horizontal reduction is a tree of reduction instructions that has values
23424/// that can be put into a vector as its leaves. For example:
23425///
23426/// mul mul mul mul
23427/// \ / \ /
23428/// + +
23429/// \ /
23430/// +
23431/// This tree has "mul" as its leaf values and "+" as its reduction
23432/// instructions. A reduction can feed into a store or a binary operation
23433/// feeding a phi.
23434/// ...
23435/// \ /
23436/// +
23437/// |
23438/// phi +=
23439///
23440/// Or:
23441/// ...
23442/// \ /
23443/// +
23444/// |
23445/// *p =
23446///
23447class HorizontalReduction {
23448 using ReductionOpsType = SmallVector<Value *, 16>;
23449 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
23450 ReductionOpsListType ReductionOps;
23451 /// List of possibly reduced values.
23452 SmallVector<SmallVector<Value *>> ReducedVals;
23453 /// Maps reduced value to the corresponding reduction operation.
23454 SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
23455 WeakTrackingVH ReductionRoot;
23456 /// The type of reduction operation.
23457 RecurKind RdxKind;
23458 /// Checks if the optimization of original scalar identity operations on
23459 /// matched horizontal reductions is enabled and allowed.
23460 bool IsSupportedHorRdxIdentityOp = false;
23461 /// The minimum number of the reduced values.
23462 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
23463 /// Contains vector values for reduction including their scale factor and
23464 /// signedness.
23466
23467 static bool isCmpSelMinMax(Instruction *I) {
23468 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
23469 RecurrenceDescriptor::isIntMinMaxRecurrenceKind(getRdxKind(I));
23470 }
23471
23472 // And/or are potentially poison-safe logical patterns like:
23473 // select x, y, false
23474 // select x, true, y
23475 static bool isBoolLogicOp(Instruction *I) {
23476 return isa<SelectInst>(I) &&
23477 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
23478 }
23479
23480 /// Checks if instruction is associative and can be vectorized.
23481 static bool isVectorizable(RecurKind Kind, Instruction *I,
23482 bool TwoElementReduction = false) {
23483 if (Kind == RecurKind::None)
23484 return false;
23485
23486 // Integer ops that map to select instructions or intrinsics are fine.
23487 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
23488 isBoolLogicOp(I))
23489 return true;
23490
23491 // No need to check for associativity, if 2 reduced values.
23492 if (TwoElementReduction)
23493 return true;
23494
23495 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
23496 // FP min/max are associative except for NaN and -0.0. We do not
23497 // have to rule out -0.0 here because the intrinsic semantics do not
23498 // specify a fixed result for it.
23499 return I->getFastMathFlags().noNaNs();
23500 }
23501
23502 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
23503 return true;
23504
23505 return I->isAssociative();
23506 }
23507
23508 static Value *getRdxOperand(Instruction *I, unsigned Index) {
23509 // Poison-safe 'or' takes the form: select X, true, Y
23510 // To make that work with the normal operand processing, we skip the
23511 // true value operand.
23512 // TODO: Change the code and data structures to handle this without a hack.
23513 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
23514 return I->getOperand(2);
23515 return I->getOperand(Index);
23516 }
23517
23518 /// Creates reduction operation with the current opcode.
23519 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
23520 Value *RHS, const Twine &Name, bool UseSelect) {
23521 Type *OpTy = LHS->getType();
23522 assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
23523 switch (Kind) {
23524 case RecurKind::Or: {
23525 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
23526 return Builder.CreateSelect(
23527 LHS, ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)),
23528 RHS, Name);
23529 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23530 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23531 Name);
23532 }
23533 case RecurKind::And: {
23534 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
23535 return Builder.CreateSelect(
23536 LHS, RHS,
23537 ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)), Name);
23538 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23539 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23540 Name);
23541 }
23542 case RecurKind::Add:
23543 case RecurKind::Mul:
23544 case RecurKind::Xor:
23545 case RecurKind::FAdd:
23546 case RecurKind::FMul: {
23547 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23548 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23549 Name);
23550 }
23551 case RecurKind::SMax:
23552 case RecurKind::SMin:
23553 case RecurKind::UMax:
23554 case RecurKind::UMin:
23555 if (UseSelect) {
23556 CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind);
23557 Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
23558 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
23559 }
23560 [[fallthrough]];
23561 case RecurKind::FMax:
23562 case RecurKind::FMin:
23563 case RecurKind::FMaximum:
23564 case RecurKind::FMinimum:
23565 case RecurKind::FMaximumNum:
23566 case RecurKind::FMinimumNum: {
23567 Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(Kind);
23568 return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
23569 }
23570 default:
23571 llvm_unreachable("Unknown reduction operation.");
23572 }
23573 }
23574
23575 /// Creates reduction operation with the current opcode with the IR flags
23576 /// from \p ReductionOps, dropping nuw/nsw flags.
23577 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
23578 Value *RHS, const Twine &Name,
23579 const ReductionOpsListType &ReductionOps) {
23580 bool UseSelect = ReductionOps.size() == 2 ||
23581 // Logical or/and.
23582 (ReductionOps.size() == 1 &&
23583 any_of(ReductionOps.front(), IsaPred<SelectInst>));
23584 assert((!UseSelect || ReductionOps.size() != 2 ||
23585 isa<SelectInst>(ReductionOps[1][0])) &&
23586 "Expected cmp + select pairs for reduction");
23587 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
23588 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
23589 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
23590 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
23591 /*IncludeWrapFlags=*/false);
23592 propagateIRFlags(Op, ReductionOps[1], nullptr,
23593 /*IncludeWrapFlags=*/false);
23594 return Op;
23595 }
23596 }
23597 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
23598 return Op;
23599 }
23600
23601public:
23602 static RecurKind getRdxKind(Value *V) {
23603 auto *I = dyn_cast<Instruction>(V);
23604 if (!I)
23605 return RecurKind::None;
23606 if (match(I, m_Add(m_Value(), m_Value())))
23607 return RecurKind::Add;
23608 if (match(I, m_Mul(m_Value(), m_Value())))
23609 return RecurKind::Mul;
23610 if (match(I, m_And(m_Value(), m_Value())) ||
23611 match(I, m_LogicalAnd(m_Value(), m_Value())))
23612 return RecurKind::And;
23613 if (match(I, m_Or(m_Value(), m_Value())) ||
23614 match(I, m_LogicalOr(m_Value(), m_Value())))
23615 return RecurKind::Or;
23616 if (match(I, m_Xor(m_Value(), m_Value())))
23617 return RecurKind::Xor;
23618 if (match(I, m_FAdd(m_Value(), m_Value())))
23619 return RecurKind::FAdd;
23620 if (match(I, m_FMul(m_Value(), m_Value())))
23621 return RecurKind::FMul;
23622
23623 if (match(I, m_OrdOrUnordFMax(m_Value(), m_Value())))
23624 return RecurKind::FMax;
23625 if (match(I, m_OrdOrUnordFMin(m_Value(), m_Value())))
23626 return RecurKind::FMin;
23627
23628 if (match(I, m_FMaximum(m_Value(), m_Value())))
23629 return RecurKind::FMaximum;
23630 if (match(I, m_FMinimum(m_Value(), m_Value())))
23631 return RecurKind::FMinimum;
23632 // This matches either cmp+select or intrinsics. SLP is expected to handle
23633 // either form.
23634 // TODO: If we are canonicalizing to intrinsics, we can remove several
23635 // special-case paths that deal with selects.
23636 if (match(I, m_SMax(m_Value(), m_Value())))
23637 return RecurKind::SMax;
23638 if (match(I, m_SMin(m_Value(), m_Value())))
23639 return RecurKind::SMin;
23640 if (match(I, m_UMax(m_Value(), m_Value())))
23641 return RecurKind::UMax;
23642 if (match(I, m_UMin(m_Value(), m_Value())))
23643 return RecurKind::UMin;
23644
23645 if (auto *Select = dyn_cast<SelectInst>(I)) {
23646 // Try harder: look for min/max pattern based on instructions producing
23647 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
23648 // During the intermediate stages of SLP, it's very common to have
23649 // pattern like this (since optimizeGatherSequence is run only once
23650 // at the end):
23651 // %1 = extractelement <2 x i32> %a, i32 0
23652 // %2 = extractelement <2 x i32> %a, i32 1
23653 // %cond = icmp sgt i32 %1, %2
23654 // %3 = extractelement <2 x i32> %a, i32 0
23655 // %4 = extractelement <2 x i32> %a, i32 1
23656 // %select = select i1 %cond, i32 %3, i32 %4
23657 CmpPredicate Pred;
23658 Instruction *L1;
23659 Instruction *L2;
23660
23661 Value *LHS = Select->getTrueValue();
23662 Value *RHS = Select->getFalseValue();
23663 Value *Cond = Select->getCondition();
23664
23665 // TODO: Support inverse predicates.
23666 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
23667 if (!isa<ExtractElementInst>(RHS) ||
23668 !L2->isIdenticalTo(cast<Instruction>(RHS)))
23669 return RecurKind::None;
23670 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
23671 if (!isa<ExtractElementInst>(LHS) ||
23672 !L1->isIdenticalTo(cast<Instruction>(LHS)))
23673 return RecurKind::None;
23674 } else {
23675 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
23676 return RecurKind::None;
23677 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
23678 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
23679 !L2->isIdenticalTo(cast<Instruction>(RHS)))
23680 return RecurKind::None;
23681 }
23682
23683 switch (Pred) {
23684 default:
23685 return RecurKind::None;
23686 case CmpInst::ICMP_SGT:
23687 case CmpInst::ICMP_SGE:
23688 return RecurKind::SMax;
23689 case CmpInst::ICMP_SLT:
23690 case CmpInst::ICMP_SLE:
23691 return RecurKind::SMin;
23692 case CmpInst::ICMP_UGT:
23693 case CmpInst::ICMP_UGE:
23694 return RecurKind::UMax;
23695 case CmpInst::ICMP_ULT:
23696 case CmpInst::ICMP_ULE:
23697 return RecurKind::UMin;
23698 }
23699 }
23700 return RecurKind::None;
23701 }
23702
23703 /// Get the index of the first operand.
23704 static unsigned getFirstOperandIndex(Instruction *I) {
23705 return isCmpSelMinMax(I) ? 1 : 0;
23706 }
23707
23708private:
23709 /// Total number of operands in the reduction operation.
23710 static unsigned getNumberOfOperands(Instruction *I) {
23711 return isCmpSelMinMax(I) ? 3 : 2;
23712 }
23713
23714 /// Checks if the instruction is in basic block \p BB.
23715 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
23716 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
23717 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
23718 auto *Sel = cast<SelectInst>(I);
23719 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
23720 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
23721 }
23722 return I->getParent() == BB;
23723 }
23724
23725 /// Expected number of uses for reduction operations/reduced values.
23726 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
23727 if (IsCmpSelMinMax) {
23728 // The SelectInst must be used twice, while the condition op must have a
23729 // single use only.
23730 if (auto *Sel = dyn_cast<SelectInst>(I))
23731 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
23732 return I->hasNUses(2);
23733 }
23734
23735 // Arithmetic reduction operation must be used once only.
23736 return I->hasOneUse();
23737 }
23738
23739 /// Initializes the list of reduction operations.
23740 void initReductionOps(Instruction *I) {
23741 if (isCmpSelMinMax(I))
23742 ReductionOps.assign(2, ReductionOpsType());
23743 else
23744 ReductionOps.assign(1, ReductionOpsType());
23745 }
23746
23747 /// Add all reduction operations for the reduction instruction \p I.
23748 void addReductionOps(Instruction *I) {
23749 if (isCmpSelMinMax(I)) {
23750 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
23751 ReductionOps[1].emplace_back(I);
23752 } else {
23753 ReductionOps[0].emplace_back(I);
23754 }
23755 }
23756
23757 static bool isGoodForReduction(ArrayRef<Value *> Data) {
23758 int Sz = Data.size();
23759 auto *I = dyn_cast<Instruction>(Data.front());
23760 return Sz > 1 || isConstant(Data.front()) ||
23761 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
23762 }
23763
23764public:
23765 HorizontalReduction() = default;
23766 HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
23767 : ReductionRoot(I), ReductionLimit(2) {
23768 RdxKind = HorizontalReduction::getRdxKind(I);
23769 ReductionOps.emplace_back().push_back(I);
23770 ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
23771 for (Value *V : Ops)
23772 ReducedValsToOps[V].push_back(I);
23773 }
23774
23775 bool matchReductionForOperands() const {
23776 // Analyze "regular" integer/FP types for reductions - no target-specific
23777 // types or pointers.
23778 assert(ReductionRoot && "Reduction root is not set!");
23779 if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
23780 all_of(ReducedVals, [](ArrayRef<Value *> Ops) {
23781 return Ops.size() == 2;
23782 })))
23783 return false;
23784
23785 return true;
23786 }
23787
23788 /// Try to find a reduction tree.
23789 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
23790 ScalarEvolution &SE, const DataLayout &DL,
23791 const TargetLibraryInfo &TLI) {
23792 RdxKind = HorizontalReduction::getRdxKind(Root);
23793 if (!isVectorizable(RdxKind, Root))
23794 return false;
23795
23796 // Analyze "regular" integer/FP types for reductions - no target-specific
23797 // types or pointers.
23798 Type *Ty = Root->getType();
23799 if (!isValidElementType(Ty) || Ty->isPointerTy())
23800 return false;
23801
23802 // Though the ultimate reduction may have multiple uses, its condition must
23803 // have only a single use.
23804 if (auto *Sel = dyn_cast<SelectInst>(Root))
23805 if (!Sel->getCondition()->hasOneUse())
23806 return false;
23807
23808 ReductionRoot = Root;
23809
23810 // Iterate through all the operands of the possible reduction tree and
23811 // gather all the reduced values, sorting them by their value id.
23812 BasicBlock *BB = Root->getParent();
23813 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
23814 SmallVector<std::pair<Instruction *, unsigned>> Worklist(
23815 1, std::make_pair(Root, 0));
23816 // Checks if the operands of the \p TreeN instruction are also reduction
23817 // operations or should be treated as reduced values or an extra argument,
23818 // which is not part of the reduction.
23819 auto CheckOperands = [&](Instruction *TreeN,
23820 SmallVectorImpl<Value *> &PossibleReducedVals,
23821 SmallVectorImpl<Instruction *> &ReductionOps,
23822 unsigned Level) {
23823 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
23824 getNumberOfOperands(TreeN)))) {
23825 Value *EdgeVal = getRdxOperand(TreeN, I);
23826 ReducedValsToOps[EdgeVal].push_back(TreeN);
23827 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
23828 // If the edge is not an instruction, or it is different from the main
23829 // reduction opcode or has too many uses, treat it as a possible reduced
23830 // value. Also, do not try to reduce constant values if the operation is
23831 // not foldable.
23832 if (!EdgeInst || Level > RecursionMaxDepth ||
23833 getRdxKind(EdgeInst) != RdxKind ||
23834 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
23835 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
23836 !isVectorizable(RdxKind, EdgeInst) ||
23837 (R.isAnalyzedReductionRoot(EdgeInst) &&
23838 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
23839 PossibleReducedVals.push_back(EdgeVal);
23840 continue;
23841 }
23842 ReductionOps.push_back(EdgeInst);
23843 }
23844 };
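// For illustration, given a hypothetical reduction chain such as:
// %r0 = add i32 %a, %b
// %r1 = add i32 %r0, %c
// %r2 = add i32 %r1, %d
// the walk above treats %r1 and %r0 as further reduction operations and
// collects %a, %b, %c and %d as possible reduced values.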
23845 // Try to regroup the reduced values so that it becomes more profitable
23846 // to reduce them. Values are grouped by their value ids, instructions by
23847 // instruction opcode id and/or alternate opcode id, with extra analysis
23848 // for loads (grouping them by the distance between pointers) and cmp
23849 // instructions (grouping them by the predicate).
23850 SmallMapVector<
23851 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
23852 8>
23853 PossibleReducedVals;
23854 initReductionOps(Root);
23855 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
23856 SmallSet<size_t, 2> LoadKeyUsed;
23857
23858 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
23859 Key = hash_combine(hash_value(LI->getParent()), Key);
23860 Value *Ptr =
23861 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
23862 if (!LoadKeyUsed.insert(Key).second) {
23863 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
23864 if (LIt != LoadsMap.end()) {
23865 for (LoadInst *RLI : LIt->second) {
23866 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
23867 LI->getType(), LI->getPointerOperand(), DL, SE,
23868 /*StrictCheck=*/true))
23869 return hash_value(RLI->getPointerOperand());
23870 }
23871 for (LoadInst *RLI : LIt->second) {
23872 if (arePointersCompatible(RLI->getPointerOperand(),
23873 LI->getPointerOperand(), TLI)) {
23874 hash_code SubKey = hash_value(RLI->getPointerOperand());
23875 return SubKey;
23876 }
23877 }
23878 if (LIt->second.size() > 2) {
23879 hash_code SubKey =
23880 hash_value(LIt->second.back()->getPointerOperand());
23881 return SubKey;
23882 }
23883 }
23884 }
23885 LoadsMap.try_emplace(std::make_pair(Key, Ptr))
23886 .first->second.push_back(LI);
23887 return hash_value(LI->getPointerOperand());
23888 };
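// For illustration (hypothetical IR): two loads at a constant distance,
// %p1 = getelementptr i32, ptr %base, i64 1
// %l0 = load i32, ptr %base
// %l1 = load i32, ptr %p1
// hash to the same subkey in the lambda above, so they are grouped together
// as candidates for a single vector load.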
23889
23890 while (!Worklist.empty()) {
23891 auto [TreeN, Level] = Worklist.pop_back_val();
23892 SmallVector<Value *> PossibleRedVals;
23893 SmallVector<Instruction *> PossibleReductionOps;
23894 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
23895 addReductionOps(TreeN);
23896 // Add reduction values. The values are sorted for better vectorization
23897 // results.
23898 for (Value *V : PossibleRedVals) {
23899 size_t Key, Idx;
23900 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
23901 /*AllowAlternate=*/false);
23902 ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
23903 }
23904 for (Instruction *I : reverse(PossibleReductionOps))
23905 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
23906 }
23907 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
23908 // Sort values by the total number of value kinds so that the reduction
23909 // starts from the longest possible sequences of reduced values.
23910 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
23911 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
23912 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
23913 for (auto &Slice : PossibleRedVals) {
23914 PossibleRedValsVect.emplace_back();
23915 auto RedValsVect = Slice.second.takeVector();
23916 stable_sort(RedValsVect, llvm::less_second());
23917 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
23918 PossibleRedValsVect.back().append(Data.second, Data.first);
23919 }
23920 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
23921 return P1.size() > P2.size();
23922 });
23923 bool First = true;
23924 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
23925 if (First) {
23926 First = false;
23927 ReducedVals.emplace_back();
23928 } else if (!isGoodForReduction(Data)) {
23929 auto *LI = dyn_cast<LoadInst>(Data.front());
23930 auto *LastLI = dyn_cast<LoadInst>(ReducedVals.back().front());
23931 if (!LI || !LastLI ||
23932 getUnderlyingObject(LI->getPointerOperand()) !=
23933 getUnderlyingObject(LastLI->getPointerOperand()))
23934 ReducedVals.emplace_back();
23935 }
23936 ReducedVals.back().append(Data.rbegin(), Data.rend());
23937 }
23938 }
23939 // Sort the reduced values by number of same/alternate opcode and/or pointer
23940 // operand.
23941 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
23942 return P1.size() > P2.size();
23943 });
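// For example, if one group holds four compatible loads and another holds
// two unrelated scalars, the sort above places the group of four first, so
// vectorization is attempted on the longest sequence of reduced values
// before the shorter ones.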
23944 return true;
23945 }
23946
23947 /// Attempt to vectorize the tree found by matchAssociativeReduction.
23948 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
23949 const TargetLibraryInfo &TLI, AssumptionCache *AC,
23950 DominatorTree &DT) {
23951 constexpr unsigned RegMaxNumber = 4;
23952 constexpr unsigned RedValsMaxNumber = 128;
23953 // If there are a sufficient number of reduction values, reduce
23954 // to a nearby power-of-2. We can safely generate oversized
23955 // vectors and rely on the backend to split them to legal sizes.
23956 if (unsigned NumReducedVals = std::accumulate(
23957 ReducedVals.begin(), ReducedVals.end(), 0,
23958 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
23959 if (!isGoodForReduction(Vals))
23960 return Num;
23961 return Num + Vals.size();
23962 });
23963 NumReducedVals < ReductionLimit &&
23964 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
23965 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
23966 })) {
23967 for (ReductionOpsType &RdxOps : ReductionOps)
23968 for (Value *RdxOp : RdxOps)
23969 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
23970 return nullptr;
23971 }
23972
23973 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
23974 TargetFolder(DL));
23975 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
23976
23977 // Track the reduced values in case they are replaced by extractelement
23978 // instructions because of the vectorization.
23979 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
23980 ReducedVals.front().size());
23981
23982 // The compare instruction of a min/max is the insertion point for new
23983 // instructions and may be replaced with a new compare instruction.
23984 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
23985 assert(isa<SelectInst>(RdxRootInst) &&
23986 "Expected min/max reduction to have select root instruction");
23987 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
23988 assert(isa<Instruction>(ScalarCond) &&
23989 "Expected min/max reduction to have compare condition");
23990 return cast<Instruction>(ScalarCond);
23991 };
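// For illustration, for a min/max idiom such as:
// %cond = icmp sgt i32 %x, %y
// %root = select i1 %cond, i32 %x, i32 %y
// the compare %cond, not the select %root, is returned as the insertion
// point for the new vector instructions.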
23992
23993 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
23994 return isBoolLogicOp(cast<Instruction>(V));
23995 });
23996 // Return new VectorizedTree, based on previous value.
23997 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
23998 if (VectorizedTree) {
23999 // Update the final value in the reduction.
24000 Builder.SetCurrentDebugLocation(
24001 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
24002 if (AnyBoolLogicOp) {
24003 auto It = ReducedValsToOps.find(VectorizedTree);
24004 auto It1 = ReducedValsToOps.find(Res);
24005 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
24006 isGuaranteedNotToBePoison(VectorizedTree, AC) ||
24007 (It != ReducedValsToOps.end() &&
24008 any_of(It->getSecond(), [&](Instruction *I) {
24009 return isBoolLogicOp(I) &&
24010 getRdxOperand(I, 0) == VectorizedTree;
24011 }))) {
24012 ;
24013 } else if (isGuaranteedNotToBePoison(Res, AC) ||
24014 (It1 != ReducedValsToOps.end() &&
24015 any_of(It1->getSecond(), [&](Instruction *I) {
24016 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
24017 }))) {
24018 std::swap(VectorizedTree, Res);
24019 } else {
24020 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
24021 }
24022 }
24023
24024 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
24025 ReductionOps);
24026 }
24027 // Initialize the final value in the reduction.
24028 return Res;
24029 };
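// For illustration, when two partial results %part0 and %part1 of an add
// reduction are combined, the lambda above emits something like:
// %op.rdx = add i32 %part0, %part1
// For boolean and/or reductions one operand may additionally be frozen
// first to avoid propagating poison.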
24030 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
24031 ReductionOps.front().size());
24032 for (ReductionOpsType &RdxOps : ReductionOps)
24033 for (Value *RdxOp : RdxOps) {
24034 if (!RdxOp)
24035 continue;
24036 IgnoreList.insert(RdxOp);
24037 }
24038 // Intersect the fast-math-flags from all reduction operations.
24039 FastMathFlags RdxFMF;
24040 RdxFMF.set();
24041 for (Value *U : IgnoreList)
24042 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
24043 RdxFMF &= FPMO->getFastMathFlags();
24044 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
24045
24046 // Need to track reduced vals, they may be changed during vectorization of
24047 // subvectors.
24048 for (ArrayRef<Value *> Candidates : ReducedVals)
24049 for (Value *V : Candidates)
24050 TrackedVals.try_emplace(V, V);
24051
24052 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
24053 Value *V) -> unsigned & {
24054 auto *It = MV.find(V);
24055 assert(It != MV.end() && "Unable to find given key.");
24056 return It->second;
24057 };
24058
24059 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
24060 // List of the values that were reduced in other trees as part of gather
24061 // nodes and thus require an extract if fully vectorized in other trees.
24062 SmallPtrSet<Value *, 4> RequiredExtract;
24063 WeakTrackingVH VectorizedTree = nullptr;
24064 bool CheckForReusedReductionOps = false;
24065 // Try to vectorize elements based on their type.
24066 SmallVector<InstructionsState> States;
24067 for (ArrayRef<Value *> RV : ReducedVals)
24068 States.push_back(getSameOpcode(RV, TLI));
24069 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
24070 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
24071 InstructionsState S = States[I];
24072 SmallVector<Value *> Candidates;
24073 Candidates.reserve(2 * OrigReducedVals.size());
24074 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
24075 for (Value *ReducedVal : OrigReducedVals) {
24076 Value *RdxVal = TrackedVals.at(ReducedVal);
24077 // Check if the reduction value was not overridden by an extractelement
24078 // instruction because of the vectorization and exclude it, if it is not
24079 // compatible with other values.
24080 // Also check if the instruction was folded to constant/other value.
24081 auto *Inst = dyn_cast<Instruction>(RdxVal);
24082 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
24083 (!S || !S.getMatchingMainOpOrAltOp(Inst))) ||
24084 (S && !Inst))
24085 continue;
24086 Candidates.push_back(RdxVal);
24087 TrackedToOrig.try_emplace(RdxVal, ReducedVal);
24088 }
24089 bool ShuffledExtracts = false;
24090 // Try to handle shuffled extractelements.
24091 if (S && S.getOpcode() == Instruction::ExtractElement &&
24092 !S.isAltShuffle() && I + 1 < E) {
24093 SmallVector<Value *> CommonCandidates(Candidates);
24094 for (Value *RV : ReducedVals[I + 1]) {
24095 Value *RdxVal = TrackedVals.at(RV);
24096 // Check if the reduction value was not overridden by the
24097 // extractelement instruction because of the vectorization and
24098 // exclude it, if it is not compatible with other values.
24099 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
24100 if (!Inst)
24101 continue;
24102 CommonCandidates.push_back(RdxVal);
24103 TrackedToOrig.try_emplace(RdxVal, RV);
24104 }
24105 SmallVector<int> Mask;
24106 if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
24107 ++I;
24108 Candidates.swap(CommonCandidates);
24109 ShuffledExtracts = true;
24110 }
24111 }
24112
24113 // Emit code for constant values.
24114 if (Candidates.size() > 1 && allConstant(Candidates)) {
24115 Value *Res = Candidates.front();
24116 Value *OrigV = TrackedToOrig.at(Candidates.front());
24117 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24118 for (Value *VC : ArrayRef(Candidates).drop_front()) {
24119 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
24120 Value *OrigV = TrackedToOrig.at(VC);
24121 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24122 if (auto *ResI = dyn_cast<Instruction>(Res))
24123 V.analyzedReductionRoot(ResI);
24124 }
24125 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
24126 continue;
24127 }
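// E.g. for constant candidates {2, 3, 4} with an add reduction, the
// TargetFolder-based builder above can fold the chain to the constant 9
// without emitting any new instructions.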
24128
24129 unsigned NumReducedVals = Candidates.size();
24130 if (NumReducedVals < ReductionLimit &&
24131 (NumReducedVals < 2 || !isSplat(Candidates)))
24132 continue;
24133
24134 // Check if we support repeated scalar values processing (optimization of
24135 // original scalar identity operations on matched horizontal reductions).
24136 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
24137 RdxKind != RecurKind::FMul &&
24138 RdxKind != RecurKind::FMulAdd;
24139 // Gather same values.
24140 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
24141 if (IsSupportedHorRdxIdentityOp)
24142 for (Value *V : Candidates) {
24143 Value *OrigV = TrackedToOrig.at(V);
24144 ++SameValuesCounter.try_emplace(OrigV).first->second;
24145 }
24146 // Used to check if the reduced values are used the same number of times.
24147 // In that case the compiler may produce better code. E.g. if reduced values are
24148 // aabbccdd (8 x values), then the first node of the tree will have a node
24149 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
24150 // Plus, the final reduction will be performed on <8 x aabbccdd>.
24151 // Instead compiler may build <4 x abcd> tree immediately, + reduction (4
24152 // x abcd) * 2.
24153 // Currently it only handles add/fadd/xor. and/or/min/max do not require
24154 // this analysis, other operations may require an extra estimation of
24155 // the profitability.
24156 bool SameScaleFactor = false;
24157 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
24158 SameValuesCounter.size() != Candidates.size();
24159 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
24160 if (OptReusedScalars) {
24161 SameScaleFactor =
24162 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
24163 RdxKind == RecurKind::Xor) &&
24164 all_of(drop_begin(SameValuesCounter),
24165 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
24166 return P.second == SameValuesCounter.front().second;
24167 });
24168 Candidates.resize(SameValuesCounter.size());
24169 transform(SameValuesCounter, Candidates.begin(),
24170 [&](const auto &P) { return TrackedVals.at(P.first); });
24171 NumReducedVals = Candidates.size();
24172 // Have a reduction of the same element.
24173 if (NumReducedVals == 1) {
24174 Value *OrigV = TrackedToOrig.at(Candidates.front());
24175 unsigned Cnt = At(SameValuesCounter, OrigV);
24176 Value *RedVal =
24177 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
24178 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24179 VectorizedVals.try_emplace(OrigV, Cnt);
24180 ExternallyUsedValues.insert(OrigV);
24181 continue;
24182 }
24183 }
24184
24185 unsigned MaxVecRegSize = V.getMaxVecRegSize();
24186 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
24187 const unsigned MaxElts = std::clamp<unsigned>(
24188 llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
24189 RegMaxNumber * RedValsMaxNumber);
24190
24191 unsigned ReduxWidth = NumReducedVals;
24192 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
24193 unsigned NumParts, NumRegs;
24194 Type *ScalarTy = Candidates.front()->getType();
24195 ReduxWidth =
24196 getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
24197 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
24198 NumParts = ::getNumberOfParts(TTI, Tp);
24199 NumRegs =
24200 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
24201 while (NumParts > NumRegs) {
24202 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
24203 ReduxWidth = bit_floor(ReduxWidth - 1);
24204 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
24205 NumParts = ::getNumberOfParts(TTI, Tp);
24206 NumRegs =
24207 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
24208 }
24209 if (NumParts > NumRegs / 2)
24210 ReduxWidth = bit_floor(ReduxWidth);
24211 return ReduxWidth;
24212 };
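// Illustrative sketch: if the widened type needs more parts than there are
// registers, the loop above shrinks the width to the next lower power of
// two, e.g. 24 -> 16 -> 8, until the split vector fits the register budget.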
24213 if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
24214 ReduxWidth = GetVectorFactor(ReduxWidth);
24215 ReduxWidth = std::min(ReduxWidth, MaxElts);
24216
24217 unsigned Start = 0;
24218 unsigned Pos = Start;
24219 // Restarts vectorization attempt with lower vector factor.
24220 unsigned PrevReduxWidth = ReduxWidth;
24221 bool CheckForReusedReductionOpsLocal = false;
24222 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
24223 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
24224 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
24225 // Check if any of the reduction ops are gathered. If so, it is worth
24226 // trying again with a smaller number of reduction ops.
24227 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
24228 }
24229 ++Pos;
24230 if (Pos < NumReducedVals - ReduxWidth + 1)
24231 return IsAnyRedOpGathered;
24232 Pos = Start;
24233 --ReduxWidth;
24234 if (ReduxWidth > 1)
24235 ReduxWidth = GetVectorFactor(ReduxWidth);
24236 return IsAnyRedOpGathered;
24237 };
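// For illustration, with 8 candidates and ReduxWidth == 4, a failed attempt
// first slides the window (Pos) across the candidates; once every start
// position has been tried, Pos is reset and ReduxWidth is lowered for the
// next round of attempts.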
24238 bool AnyVectorized = false;
24239 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
24240 while (Pos < NumReducedVals - ReduxWidth + 1 &&
24241 ReduxWidth >= ReductionLimit) {
24242 // Dependency in tree of the reduction ops - drop this attempt, try
24243 // later.
24244 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
24245 Start == 0) {
24246 CheckForReusedReductionOps = true;
24247 break;
24248 }
24249 PrevReduxWidth = ReduxWidth;
24250 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
24251 // Been analyzed already - skip.
24252 if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
24253 (!has_single_bit(ReduxWidth) &&
24254 (IgnoredCandidates.contains(
24255 std::make_pair(Pos, bit_floor(ReduxWidth))) ||
24256 IgnoredCandidates.contains(
24257 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
24258 bit_floor(ReduxWidth))))) ||
24259 V.areAnalyzedReductionVals(VL)) {
24260 (void)AdjustReducedVals(/*IgnoreVL=*/true);
24261 continue;
24262 }
24263 // Early exit if any of the reduction values were deleted during
24264 // previous vectorization attempts.
24265 if (any_of(VL, [&V](Value *RedVal) {
24266 auto *RedValI = dyn_cast<Instruction>(RedVal);
24267 return RedValI && V.isDeleted(RedValI);
24268 }))
24269 break;
24270 V.buildTree(VL, IgnoreList);
24271 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
24272 if (!AdjustReducedVals())
24273 V.analyzedReductionVals(VL);
24274 continue;
24275 }
24276 if (V.isLoadCombineReductionCandidate(RdxKind)) {
24277 if (!AdjustReducedVals())
24278 V.analyzedReductionVals(VL);
24279 continue;
24280 }
24281 V.reorderTopToBottom();
24282 // No need to reorder the root node at all for reassociative reduction.
24283 V.reorderBottomToTop(/*IgnoreReorder=*/RdxFMF.allowReassoc() ||
24284 VL.front()->getType()->isIntOrIntVectorTy() ||
24285 ReductionLimit > 2);
24286 // Keep extracted other reduction values, if they are used in the
24287 // vectorization trees.
24288 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
24289 ExternallyUsedValues);
24290 // The reduction root is used as the insertion point for new
24291 // instructions, so set it as externally used to prevent it from being
24292 // deleted.
24293 LocalExternallyUsedValues.insert(ReductionRoot);
24294 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
24295 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
24296 continue;
24297 for (Value *V : ReducedVals[Cnt])
24298 if (isa<Instruction>(V))
24299 LocalExternallyUsedValues.insert(TrackedVals[V]);
24300 }
24301 if (!IsSupportedHorRdxIdentityOp) {
24302 // Number of uses of the candidates in the vector of values.
24303 assert(SameValuesCounter.empty() &&
24304 "Reused values counter map is not empty");
24305 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24306 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24307 continue;
24308 Value *V = Candidates[Cnt];
24309 Value *OrigV = TrackedToOrig.at(V);
24310 ++SameValuesCounter.try_emplace(OrigV).first->second;
24311 }
24312 }
24313 V.transformNodes();
24314 SmallPtrSet<Value *, 4> VLScalars(llvm::from_range, VL);
24315 // Gather externally used values.
24316 SmallPtrSet<Value *, 4> Visited;
24317 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24318 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24319 continue;
24320 Value *RdxVal = Candidates[Cnt];
24321 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
24322 RdxVal = It->second;
24323 if (!Visited.insert(RdxVal).second)
24324 continue;
24325 // Check if the scalar was vectorized as part of the vectorization
24326 // tree but not the top node.
24327 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
24328 LocalExternallyUsedValues.insert(RdxVal);
24329 continue;
24330 }
24331 Value *OrigV = TrackedToOrig.at(RdxVal);
24332 unsigned NumOps =
24333 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
24334 if (NumOps != ReducedValsToOps.at(OrigV).size())
24335 LocalExternallyUsedValues.insert(RdxVal);
24336 }
24337 // Do not need the list of reused scalars in regular mode anymore.
24338 if (!IsSupportedHorRdxIdentityOp)
24339 SameValuesCounter.clear();
24340 for (Value *RdxVal : VL)
24341 if (RequiredExtract.contains(RdxVal))
24342 LocalExternallyUsedValues.insert(RdxVal);
24343 V.buildExternalUses(LocalExternallyUsedValues);
24344
24345 V.computeMinimumValueSizes();
24346
24347 // Estimate cost.
24348 InstructionCost ReductionCost =
24349 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
24350 InstructionCost Cost = V.getTreeCost(VL, ReductionCost);
24351 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
24352 << " for reduction\n");
24353 if (!Cost.isValid())
24354 break;
24355 if (Cost >= -SLPCostThreshold) {
24356 V.getORE()->emit([&]() {
24357 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
24358 ReducedValsToOps.at(VL[0]).front())
24359 << "Vectorizing horizontal reduction is possible "
24360 << "but not beneficial with cost " << ore::NV("Cost", Cost)
24361 << " and threshold "
24362 << ore::NV("Threshold", -SLPCostThreshold);
24363 });
24364 if (!AdjustReducedVals()) {
24365 V.analyzedReductionVals(VL);
24366 unsigned Offset = Pos == Start ? Pos : Pos - 1;
24367 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
24368 // Add subvectors of VL to the list of the analyzed values.
24369 for (unsigned VF = getFloorFullVectorNumberOfElements(
24370 *TTI, VL.front()->getType(), ReduxWidth - 1);
24371 VF >= ReductionLimit;
24372 VF = getFloorFullVectorNumberOfElements(
24373 *TTI, VL.front()->getType(), VF - 1)) {
24374 if (has_single_bit(VF) &&
24375 V.getCanonicalGraphSize() != V.getTreeSize())
24376 continue;
24377 for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
24378 IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
24379 }
24380 }
24381 }
24382 continue;
24383 }
24384
24385 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
24386 << Cost << ". (HorRdx)\n");
24387 V.getORE()->emit([&]() {
24388 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
24389 ReducedValsToOps.at(VL[0]).front())
24390 << "Vectorized horizontal reduction with cost "
24391 << ore::NV("Cost", Cost) << " and with tree size "
24392 << ore::NV("TreeSize", V.getTreeSize());
24393 });
24394
24395 Builder.setFastMathFlags(RdxFMF);
24396
24397 // Emit a reduction. If the root is a select (min/max idiom), the insert
24398 // point is the compare condition of that select.
24399 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
24400 Instruction *InsertPt = RdxRootInst;
24401 if (IsCmpSelMinMax)
24402 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
24403
24404 // Vectorize a tree.
24405 Value *VectorizedRoot = V.vectorizeTree(
24406 LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
24407 // Update TrackedToOrig mapping, since the tracked values might be
24408 // updated.
24409 for (Value *RdxVal : Candidates) {
24410 Value *OrigVal = TrackedToOrig.at(RdxVal);
24411 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
24412 if (TransformedRdxVal != RdxVal)
24413 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
24414 }
24415
24416 Builder.SetInsertPoint(InsertPt);
24417
24418 // To prevent poison from leaking across what used to be sequential,
24419 // safe, scalar boolean logic operations, the reduction operand must be
24420 // frozen.
24421 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
24422 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
24423
24424 // Emit code to correctly handle reused reduced values, if required.
24425 if (OptReusedScalars && !SameScaleFactor) {
24426 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
24427 SameValuesCounter, TrackedToOrig);
24428 }
24429
24430 Type *ScalarTy = VL.front()->getType();
24431 Type *VecTy = VectorizedRoot->getType();
24432 Type *RedScalarTy = VecTy->getScalarType();
24433 VectorValuesAndScales.emplace_back(
24434 VectorizedRoot,
24435 OptReusedScalars && SameScaleFactor
24436 ? SameValuesCounter.front().second
24437 : 1,
24438 RedScalarTy != ScalarTy->getScalarType()
24439 ? V.isSignedMinBitwidthRootNode()
24440 : true);
24441
24442 // Count vectorized reduced values to exclude them from final reduction.
24443 for (Value *RdxVal : VL) {
24444 Value *OrigV = TrackedToOrig.at(RdxVal);
24445 if (IsSupportedHorRdxIdentityOp) {
24446 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
24447 continue;
24448 }
24449 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24450 if (!V.isVectorized(RdxVal))
24451 RequiredExtract.insert(RdxVal);
24452 }
24453 Pos += ReduxWidth;
24454 Start = Pos;
24455 ReduxWidth = NumReducedVals - Pos;
24456 if (ReduxWidth > 1)
24457 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
24458 AnyVectorized = true;
24459 }
24460 if (OptReusedScalars && !AnyVectorized) {
24461 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
24462 Value *RdxVal = TrackedVals.at(P.first);
24463 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
24464 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24465 VectorizedVals.try_emplace(P.first, P.second);
24466 }
24467 continue;
24468 }
24469 }
24470 if (!VectorValuesAndScales.empty())
24471 VectorizedTree = GetNewVectorizedTree(
24472 VectorizedTree,
24473 emitReduction(Builder, *TTI, ReductionRoot->getType()));
24474
24475 if (!VectorizedTree) {
24476 if (!CheckForReusedReductionOps) {
24477 for (ReductionOpsType &RdxOps : ReductionOps)
24478 for (Value *RdxOp : RdxOps)
24479 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
24480 }
24481 return nullptr;
24482 }
24483
24484 // Reorder the operands of a bool logical op into the natural order to
24485 // avoid possible problems with poison propagation. If it is not possible
24486 // to reorder (both operands are originally RHS), emit an extra freeze
24487 // instruction for the LHS operand.
24488 // I.e., if we have original code like this:
24489 // RedOp1 = select i1 ?, i1 LHS, i1 false
24490 // RedOp2 = select i1 RHS, i1 ?, i1 false
24491
24492 // Then, we swap LHS/RHS to create a new op that matches the poison
24493 // semantics of the original code.
24494
24495 // If we have original code like this and both values could be poison:
24496 // RedOp1 = select i1 ?, i1 LHS, i1 false
24497 // RedOp2 = select i1 ?, i1 RHS, i1 false
24498
24499 // Then, we must freeze LHS in the new op.
24500 auto FixBoolLogicalOps =
24501 [&, VectorizedTree](Value *&LHS, Value *&RHS, Instruction *RedOp1,
24502 Instruction *RedOp2, bool InitStep) {
24503 if (!AnyBoolLogicOp)
24504 return;
24505 if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
24506 getRdxOperand(RedOp1, 0) == LHS ||
24507 isGuaranteedNotToBePoison(LHS, AC)))
24508 return;
24509 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
24510 getRdxOperand(RedOp2, 0) == RHS ||
24511 isGuaranteedNotToBePoison(RHS, AC))) {
24512 std::swap(LHS, RHS);
24513 return;
24514 }
24515 if (LHS != VectorizedTree)
24516 LHS = Builder.CreateFreeze(LHS);
24517 };
24518 // Finish the reduction.
24520 // Need to add the extra arguments and the possible reduction values that were not vectorized.
24520 // Try to avoid dependencies between the scalar remainders after reductions.
24521 auto FinalGen = [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
24522 bool InitStep) {
24523 unsigned Sz = InstVals.size();
24524 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 + Sz % 2);
24525 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
24526 Instruction *RedOp = InstVals[I + 1].first;
24527 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
24528 Value *RdxVal1 = InstVals[I].second;
24529 Value *StableRdxVal1 = RdxVal1;
24530 auto It1 = TrackedVals.find(RdxVal1);
24531 if (It1 != TrackedVals.end())
24532 StableRdxVal1 = It1->second;
24533 Value *RdxVal2 = InstVals[I + 1].second;
24534 Value *StableRdxVal2 = RdxVal2;
24535 auto It2 = TrackedVals.find(RdxVal2);
24536 if (It2 != TrackedVals.end())
24537 StableRdxVal2 = It2->second;
24538 // To prevent poison from leaking across what used to be sequential,
24539 // safe, scalar boolean logic operations, the reduction operand must be
24540 // frozen.
24541 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
24542 RedOp, InitStep);
24543 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
24544 StableRdxVal2, "op.rdx", ReductionOps);
24545 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
24546 }
24547 if (Sz % 2 == 1)
24548 ExtraReds[Sz / 2] = InstVals.back();
24549 return ExtraReds;
24550 };
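// For illustration, with remaining scalar values r1, r2, r3, r4 and an add
// reduction, one FinalGen step emits roughly:
// %op.rdx1 = add i32 r1, r2
// %op.rdx2 = add i32 r3, r4
// and the next step combines %op.rdx1 with %op.rdx2, keeping the remainder
// chain shallow instead of fully sequential.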
24551 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
24552 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
24553 VectorizedTree);
24554 SmallPtrSet<Value *, 8> Visited;
24555 for (ArrayRef<Value *> Candidates : ReducedVals) {
24556 for (Value *RdxVal : Candidates) {
24557 if (!Visited.insert(RdxVal).second)
24558 continue;
24559 unsigned NumOps = VectorizedVals.lookup(RdxVal);
24560 for (Instruction *RedOp :
24561 ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
24562 ExtraReductions.emplace_back(RedOp, RdxVal);
24563 }
24564 }
24565 // Iterate through all not-vectorized reduction values/extra arguments.
24566 bool InitStep = true;
24567 while (ExtraReductions.size() > 1) {
24568 SmallVector<std::pair<Instruction *, Value *>> NewReds =
24569 FinalGen(ExtraReductions, InitStep);
24570 ExtraReductions.swap(NewReds);
24571 InitStep = false;
24572 }
24573 VectorizedTree = ExtraReductions.front().second;
24574
24575 ReductionRoot->replaceAllUsesWith(VectorizedTree);
24576
24577 // The original scalar reduction is expected to have no remaining
24578 // uses outside the reduction tree itself. Assert that we got this
24579 // correct, replace internal uses with poison, and mark for eventual
24580 // deletion.
24581#ifndef NDEBUG
24582 SmallPtrSet<Value *, 4> IgnoreSet;
24583 for (ArrayRef<Value *> RdxOps : ReductionOps)
24584 IgnoreSet.insert_range(RdxOps);
24585#endif
24586 for (ArrayRef<Value *> RdxOps : ReductionOps) {
24587 for (Value *Ignore : RdxOps) {
24588 if (!Ignore)
24589 continue;
24590#ifndef NDEBUG
24591 for (auto *U : Ignore->users()) {
24592 assert(IgnoreSet.count(U) &&
24593 "All users must be either in the reduction ops list.");
24594 }
24595#endif
24596 if (!Ignore->use_empty()) {
24597 Value *P = PoisonValue::get(Ignore->getType());
24598 Ignore->replaceAllUsesWith(P);
24599 }
24600 }
24601 V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
24602 }
24603 return VectorizedTree;
24604 }
24605
24606private:
24607 /// Creates the reduction from the given \p Vec vector value with the given
24608 /// scale \p Scale and signedness \p IsSigned.
24609 Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
24610 Value *Vec, unsigned Scale, bool IsSigned,
24611 Type *DestTy) {
24612 Value *Rdx;
24613 if (auto *VecTy = dyn_cast<FixedVectorType>(DestTy)) {
24614 unsigned DestTyNumElements = getNumElements(VecTy);
24615 unsigned VF = getNumElements(Vec->getType()) / DestTyNumElements;
24616 Rdx = PoisonValue::get(
24617 getWidenedType(Vec->getType()->getScalarType(), DestTyNumElements));
24618 for (unsigned I : seq<unsigned>(DestTyNumElements)) {
24619 // Do reduction for each lane.
24620 // e.g., do reduce add for
24621 // VL[0] = <4 x Ty> <a, b, c, d>
24622 // VL[1] = <4 x Ty> <e, f, g, h>
24623 // Lane[0] = <2 x Ty> <a, e>
24624 // Lane[1] = <2 x Ty> <b, f>
24625 // Lane[2] = <2 x Ty> <c, g>
24626 // Lane[3] = <2 x Ty> <d, h>
24627 // result[0] = reduce add Lane[0]
24628 // result[1] = reduce add Lane[1]
24629 // result[2] = reduce add Lane[2]
24630 // result[3] = reduce add Lane[3]
24631 SmallVector<int, 16> Mask = createStrideMask(I, DestTyNumElements, VF);
24632 Value *Lane = Builder.CreateShuffleVector(Vec, Mask);
24633 Rdx = Builder.CreateInsertElement(
24634 Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
24635 }
24636 } else {
24637 Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
24638 }
24639 if (Rdx->getType() != DestTy)
24640 Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
24641 // Improved analysis for add/fadd/xor reductions with same scale
24642 // factor for all operands of reductions. We can emit scalar ops for
24643 // them instead.
24644 if (Scale > 1)
24645 Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
24646 return Rdx;
24647 }
24648
24649 /// Calculate the cost of a reduction.
24650 InstructionCost getReductionCost(TargetTransformInfo *TTI,
24651 ArrayRef<Value *> ReducedVals,
24652 bool IsCmpSelMinMax, FastMathFlags FMF,
24653 const BoUpSLP &R, DominatorTree &DT,
24654 const DataLayout &DL,
24655 const TargetLibraryInfo &TLI) {
24656 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
24657 Type *ScalarTy = ReducedVals.front()->getType();
24658 unsigned ReduxWidth = ReducedVals.size();
24659 FixedVectorType *VectorTy = R.getReductionType();
24660 InstructionCost VectorCost = 0, ScalarCost;
24661 // If all of the reduced values are constant, the vector cost is 0, since
24663 // the reduction value can be calculated at compile time.
24663 bool AllConsts = allConstant(ReducedVals);
24664 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
24665 InstructionCost Cost;
24666 // Scalar cost is repeated for N-1 elements.
24667 int Cnt = ReducedVals.size();
24668 for (Value *RdxVal : ReducedVals) {
24669 if (Cnt == 1)
24670 break;
24671 --Cnt;
24672 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
24673 Cost += GenCostFn();
24674 continue;
24675 }
24676 InstructionCost ScalarCost = 0;
24677 for (User *U : RdxVal->users()) {
24678 auto *RdxOp = cast<Instruction>(U);
24679 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
24680 if (RdxKind == RecurKind::FAdd) {
24681 InstructionCost FMACost = canConvertToFMA(
24682 RdxOp, getSameOpcode(RdxOp, TLI), DT, DL, *TTI, TLI);
24683 if (FMACost.isValid()) {
24684 LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n");
24685 if (auto *I = dyn_cast<Instruction>(RdxVal)) {
24686 // Also, exclude scalar fmul cost.
24687 InstructionCost FMulCost =
24688 TTI->getInstructionCost(I, CostKind);
24689 LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n");
24690 FMACost -= FMulCost;
24691 }
24692 ScalarCost += FMACost;
24693 continue;
24694 }
24695 }
24696 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
24697 continue;
24698 }
24699 ScalarCost = InstructionCost::getInvalid();
24700 break;
24701 }
24702 if (ScalarCost.isValid())
24703 Cost += ScalarCost;
24704 else
24705 Cost += GenCostFn();
24706 }
24707 return Cost;
24708 };
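// Illustrative sketch: for N reduced values a scalar reduction performs
// N - 1 scalar operations, so the helper above sums the cost of the users
// of all but the last value (with extra handling when a value feeds an
// FMA-convertible fadd).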
24709 // Require the reduction cost if:
24710 // 1. This type is not a full register type and there is no other vector
24711 // with the same type in the storage (first vector with a small type).
24712 // 2. The storage does not have any vector with full vector use (first
24713 // vector with full register use).
24714 bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
24715 switch (RdxKind) {
24716 case RecurKind::Add:
24717 case RecurKind::Mul:
24718 case RecurKind::Or:
24719 case RecurKind::And:
24720 case RecurKind::Xor:
24721 case RecurKind::FAdd:
24722 case RecurKind::FMul: {
24723 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
24724 if (!AllConsts) {
24725 if (DoesRequireReductionOp) {
24726 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
24727 assert(SLPReVec && "FixedVectorType is not expected.");
24728 unsigned ScalarTyNumElements = VecTy->getNumElements();
24729 for (unsigned I : seq<unsigned>(ReducedVals.size())) {
24730 VectorCost += TTI->getShuffleCost(
24731 TTI::SK_PermuteSingleSrc,
24732 getWidenedType(ScalarTy->getScalarType(),
24733 ReducedVals.size()),
24734 VectorTy,
24735 createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
24736 VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy,
24737 FMF, CostKind);
24738 }
24739 VectorCost += TTI->getScalarizationOverhead(
24740 VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
24741 /*Extract*/ false, TTI::TCK_RecipThroughput);
24742 } else {
24743 Type *RedTy = VectorTy->getElementType();
24744 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24745 std::make_pair(RedTy, true));
24746 if (RType == RedTy) {
24747 VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
24748 FMF, CostKind);
24749 } else {
24750 VectorCost = TTI->getExtendedReductionCost(
24751 RdxOpcode, !IsSigned, RedTy,
24752 getWidenedType(RType, ReduxWidth), FMF, CostKind);
24753 }
24754 }
24755 } else {
24756 Type *RedTy = VectorTy->getElementType();
24757 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24758 std::make_pair(RedTy, true));
24759 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
24760 InstructionCost FMACost = InstructionCost::getInvalid();
24761 if (RdxKind == RecurKind::FAdd) {
24762 // Check if the reduction operands can be converted to FMA.
24763 SmallVector<Value *> Ops;
24764 FastMathFlags FMF;
24765 FMF.set();
24766 for (Value *RdxVal : ReducedVals) {
24767 if (!RdxVal->hasOneUse()) {
24768 Ops.clear();
24769 break;
24770 }
24771 if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
24772 FMF &= FPCI->getFastMathFlags();
24773 Ops.push_back(RdxVal->user_back());
24774 }
24775 if (!Ops.empty()) {
24776 FMACost = canConvertToFMA(Ops, getSameOpcode(Ops, TLI), DT, DL,
24777 *TTI, TLI);
24778 if (FMACost.isValid()) {
24779 // Calculate actual FMAD cost.
24780 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
24781 {RVecTy, RVecTy, RVecTy}, FMF);
24782 FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);
24783
24784 LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n");
24785 // Also, exclude vector fmul cost.
24786 InstructionCost FMulCost = TTI->getArithmeticInstrCost(
24787 Instruction::FMul, RVecTy, CostKind);
24788 LLVM_DEBUG(dbgs()
24789 << "Minus vector FMul cost: " << FMulCost << "\n");
24790 FMACost -= FMulCost;
24791 }
24792 }
24793 }
24794 if (FMACost.isValid())
24795 VectorCost += FMACost;
24796 else
24797 VectorCost +=
24798 TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
24799 if (RType != RedTy) {
24800 unsigned Opcode = Instruction::Trunc;
24801 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
24802 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
24803 VectorCost += TTI->getCastInstrCost(
24804 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
24805 }
24806 }
24807 }
24808 ScalarCost = EvaluateScalarCost([&]() {
24809 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
24810 });
24811 break;
24812 }
24813 case RecurKind::FMax:
24814 case RecurKind::FMin:
24815 case RecurKind::FMaximum:
24816 case RecurKind::FMinimum:
24817 case RecurKind::SMax:
24818 case RecurKind::SMin:
24819 case RecurKind::UMax:
24820 case RecurKind::UMin: {
24821 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
24822 if (!AllConsts) {
24823 if (DoesRequireReductionOp) {
24824 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
24825 } else {
24826 // Check if the previous reduction already exists and account it as
24827 // series of operations + single reduction.
24828 Type *RedTy = VectorTy->getElementType();
24829 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24830 std::make_pair(RedTy, true));
24831 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
24832 IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
24833 VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
24834 if (RType != RedTy) {
24835 unsigned Opcode = Instruction::Trunc;
24836 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
24837 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
24838 VectorCost += TTI->getCastInstrCost(
24839 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
24840 }
24841 }
24842 }
24843 ScalarCost = EvaluateScalarCost([&]() {
24844 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
24845 return TTI->getIntrinsicInstrCost(ICA, CostKind);
24846 });
24847 break;
24848 }
24849 default:
24850 llvm_unreachable("Expected arithmetic or min/max reduction operation");
24851 }
24852
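// The returned value is VectorCost - ScalarCost, so a negative result means
// the vector form is expected to be cheaper; e.g. a vector cost of 3 against
// a scalar cost of 7 yields -4, which passes the cost-threshold check in the
// caller.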
24853 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
24854 << " for reduction of " << shortBundleName(ReducedVals)
24855 << " (It is a splitting reduction)\n");
24856 return VectorCost - ScalarCost;
24857 }
24858
24859 /// Splits the values, stored in VectorValuesAndScales, into registers/free
24860 /// sub-registers, combines them with the given reduction operation as a
24861 /// vector operation and then performs a single (small enough) reduction.
24862 Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
24863 Type *DestTy) {
24864 Value *ReducedSubTree = nullptr;
24865 // Creates reduction and combines with the previous reduction.
24866 auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
24867 Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
24868 if (ReducedSubTree)
24869 ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
24870 "op.rdx", ReductionOps);
24871 else
24872 ReducedSubTree = Rdx;
24873 };
24874 if (VectorValuesAndScales.size() == 1) {
24875 const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
24876 CreateSingleOp(Vec, Scale, IsSigned);
24877 return ReducedSubTree;
24878 }
24879 // Scales Vec using the given Cnt scale factor and then performs a vector
24880 // combine with the previous value of VecRes.
24881 Value *VecRes = nullptr;
24882 bool VecResSignedness = false;
24883 auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
24884 Type *ScalarTy = Vec->getType()->getScalarType();
24885 // Scale Vec using given Cnt scale factor.
24886 if (Cnt > 1) {
24887 ElementCount EC = cast<VectorType>(Vec->getType())->getElementCount();
24888 switch (RdxKind) {
24889 case RecurKind::Add: {
24890 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
24891 unsigned VF = getNumElements(Vec->getType());
24892 LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << "of " << Vec
24893 << ". (HorRdx)\n");
24894 SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
24895 for (unsigned I : seq<unsigned>(Cnt))
24896 std::iota(std::next(Mask.begin(), VF * I),
24897 std::next(Mask.begin(), VF * (I + 1)), 0);
24898 ++NumVectorInstructions;
24899 Vec = Builder.CreateShuffleVector(Vec, Mask);
24900 break;
24901 }
24902 // res = mul vv, n
24903 if (ScalarTy != DestTy->getScalarType())
24904 Vec = Builder.CreateIntCast(
24905 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
24906 IsSigned);
24907 Value *Scale = ConstantVector::getSplat(
24908 EC, ConstantInt::get(DestTy->getScalarType(), Cnt));
24909 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
24910 << ". (HorRdx)\n");
24911 ++NumVectorInstructions;
24912 Vec = Builder.CreateMul(Vec, Scale);
24913 break;
24914 }
24915 case RecurKind::Xor: {
24916 // res = n % 2 ? 0 : vv
24917 LLVM_DEBUG(dbgs()
24918 << "SLP: Xor " << Cnt << "of " << Vec << ". (HorRdx)\n");
24919 if (Cnt % 2 == 0)
24920 Vec = Constant::getNullValue(Vec->getType());
24921 break;
24922 }
24923 case RecurKind::FAdd: {
24924 // res = fmul v, n
24925 Value *Scale =
24926 ConstantVector::getSplat(EC, ConstantFP::get(ScalarTy, Cnt));
24927 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
24928 << ". (HorRdx)\n");
24929 ++NumVectorInstructions;
24930 Vec = Builder.CreateFMul(Vec, Scale);
24931 break;
24932 }
24933 case RecurKind::And:
24934 case RecurKind::Or:
24935 case RecurKind::SMax:
24936 case RecurKind::SMin:
24937 case RecurKind::UMax:
24938 case RecurKind::UMin:
24939 case RecurKind::FMax:
24940 case RecurKind::FMin:
24941 case RecurKind::FMaximum:
24942 case RecurKind::FMinimum:
24943 // res = vv
24944 break;
24945 case RecurKind::Sub:
24946 case RecurKind::AddChainWithSubs:
24947 case RecurKind::Mul:
24948 case RecurKind::FMul:
24949 case RecurKind::FMulAdd:
24950 case RecurKind::AnyOf:
24951 case RecurKind::FindFirstIVSMin:
24952 case RecurKind::FindFirstIVUMin:
24953 case RecurKind::FindLastIVSMax:
24954 case RecurKind::FindLastIVUMax:
24955 case RecurKind::FMaxNum:
24956 case RecurKind::FMinNum:
24957 case RecurKind::FMaximumNum:
24958 case RecurKind::FMinimumNum:
24959 case RecurKind::None:
24960 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
24961 }
24962 }
24963 // Combine Vec with the previous VecOp.
24964 if (!VecRes) {
24965 VecRes = Vec;
24966 VecResSignedness = IsSigned;
24967 } else {
24968 ++NumVectorInstructions;
24969 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
24970 VecRes->getType()->getScalarType() == Builder.getInt1Ty()) {
24971 // Handle ctpop.
24972 unsigned VecResVF = getNumElements(VecRes->getType());
24973 unsigned VecVF = getNumElements(Vec->getType());
24974 SmallVector<int> Mask(VecResVF + VecVF, PoisonMaskElem);
24975 std::iota(Mask.begin(), Mask.end(), 0);
24976 // Ensure that VecRes is always larger than Vec
24977 if (VecResVF < VecVF) {
24978 std::swap(VecRes, Vec);
24979 std::swap(VecResVF, VecVF);
24980 }
24981 if (VecResVF != VecVF) {
24982 SmallVector<int> ResizeMask(VecResVF, PoisonMaskElem);
24983 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
24984 Vec = Builder.CreateShuffleVector(Vec, ResizeMask);
24985 }
24986 VecRes = Builder.CreateShuffleVector(VecRes, Vec, Mask, "rdx.op");
24987 return;
24988 }
24989 if (VecRes->getType()->getScalarType() != DestTy->getScalarType())
24990 VecRes = Builder.CreateIntCast(
24991 VecRes, getWidenedType(DestTy, getNumElements(VecRes->getType())),
24992 VecResSignedness);
24993 if (ScalarTy != DestTy->getScalarType())
24994 Vec = Builder.CreateIntCast(
24995 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
24996 IsSigned);
24997 unsigned VecResVF = getNumElements(VecRes->getType());
24998 unsigned VecVF = getNumElements(Vec->getType());
24999 // Ensure that VecRes is always larger than Vec
25000 if (VecResVF < VecVF) {
25001 std::swap(VecRes, Vec);
25002 std::swap(VecResVF, VecVF);
25003 }
25004 // extract + op + insert
25005 Value *Op = VecRes;
25006 if (VecResVF != VecVF)
25007 Op = createExtractVector(Builder, VecRes, VecVF, /*Index=*/0);
25008 Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
25009 if (VecResVF != VecVF)
25010 Op = createInsertVector(Builder, VecRes, Op, /*Index=*/0);
25011 VecRes = Op;
25012 }
25013 };
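// For illustration, with two per-register vectors %v0 and %v1 of an add
// reduction, the code below first combines them into a single vector, e.g.
// %rdx.op = add <4 x i32> %v0, %v1
// and only then emits one final call to the vector.reduce.add intrinsic on
// the combined value.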
25014 for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
25015 CreateVecOp(Vec, Scale, IsSigned);
25016 CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);
25017
25018 return ReducedSubTree;
25019 }
25020
25021 /// Emit a horizontal reduction of the vectorized value.
25022 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
25023 const TargetTransformInfo *TTI, Type *DestTy) {
25024 assert(VectorizedValue && "Need to have a vectorized tree node");
25025 assert(RdxKind != RecurKind::FMulAdd &&
25026 "A call to the llvm.fmuladd intrinsic is not handled yet");
25027
25028 auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
25029 if (FTy->getScalarType() == Builder.getInt1Ty() &&
25030 RdxKind == RecurKind::Add &&
25031 DestTy->getScalarType() != FTy->getScalarType()) {
25032 // Convert vector_reduce_add(ZExt(<n x i1>)) to
25033 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
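// For example (illustrative), reducing <8 x i1> into an i32 sum becomes
// roughly:
// %int = bitcast <8 x i1> %v to i8
// %cnt = call i8 @llvm.ctpop.i8(i8 %int)
// %res = zext i8 %cnt to i32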
25034 Value *V = Builder.CreateBitCast(
25035 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
25036 ++NumVectorInstructions;
25037 return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
25038 }
25039 ++NumVectorInstructions;
25040 return createSimpleReduction(Builder, VectorizedValue, RdxKind);
25041 }
25042
25043 /// Emits optimized code for unique scalar value reused \p Cnt times.
25044 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
25045 unsigned Cnt) {
25046 assert(IsSupportedHorRdxIdentityOp &&
25047 "The optimization of matched scalar identity horizontal reductions "
25048 "must be supported.");
25049 if (Cnt == 1)
25050 return VectorizedValue;
25051 switch (RdxKind) {
25052 case RecurKind::Add: {
25053 // res = mul vv, n
25054 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
25055 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
25056 << VectorizedValue << ". (HorRdx)\n");
25057 return Builder.CreateMul(VectorizedValue, Scale);
25058 }
25059 case RecurKind::Xor: {
25060 // res = n % 2 ? 0 : vv
25061 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
25062 << ". (HorRdx)\n");
25063 if (Cnt % 2 == 0)
25064 return Constant::getNullValue(VectorizedValue->getType());
25065 return VectorizedValue;
25066 }
25067 case RecurKind::FAdd: {
25068 // res = fmul v, n
25069 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
25070 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
25071 << VectorizedValue << ". (HorRdx)\n");
25072 return Builder.CreateFMul(VectorizedValue, Scale);
25073 }
25074 case RecurKind::And:
25075 case RecurKind::Or:
25076 case RecurKind::SMax:
25077 case RecurKind::SMin:
25078 case RecurKind::UMax:
25079 case RecurKind::UMin:
25080 case RecurKind::FMax:
25081 case RecurKind::FMin:
25082 case RecurKind::FMaximum:
25083 case RecurKind::FMinimum:
25084 // res = vv
25085 return VectorizedValue;
25086 case RecurKind::Sub:
25087 case RecurKind::AddChainWithSubs:
25088 case RecurKind::Mul:
25089 case RecurKind::FMul:
25090 case RecurKind::FMulAdd:
25091 case RecurKind::AnyOf:
25092 case RecurKind::FindFirstIVSMin:
25093 case RecurKind::FindFirstIVUMin:
25094 case RecurKind::FindLastIVSMax:
25095 case RecurKind::FindLastIVUMax:
25096 case RecurKind::FMaxNum:
25097 case RecurKind::FMinNum:
25098 case RecurKind::FMaximumNum:
25099 case RecurKind::FMinimumNum:
25100 case RecurKind::None:
25101 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
25102 }
25103 return nullptr;
25104 }
25105
25106 /// Emits actual operation for the scalar identity values, found during
25107 /// horizontal reduction analysis.
25108 Value *
25109 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
25110 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
25111 const DenseMap<Value *, Value *> &TrackedToOrig) {
25112 assert(IsSupportedHorRdxIdentityOp &&
25113 "The optimization of matched scalar identity horizontal reductions "
25114 "must be supported.");
25115 ArrayRef<Value *> VL = R.getRootNodeScalars();
25116 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
25117 if (VTy->getElementType() != VL.front()->getType()) {
25118 VectorizedValue = Builder.CreateIntCast(
25119 VectorizedValue,
25120 getWidenedType(VL.front()->getType(), VTy->getNumElements()),
25121 R.isSignedMinBitwidthRootNode());
25122 }
25123 switch (RdxKind) {
25124 case RecurKind::Add: {
25125 // root = mul prev_root, <1, 1, n, 1>
25126 SmallVector<Constant *> Vals;
25127 for (Value *V : VL) {
25128 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25129 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
25130 }
25131 auto *Scale = ConstantVector::get(Vals);
25132 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
25133 << VectorizedValue << ". (HorRdx)\n");
25134 return Builder.CreateMul(VectorizedValue, Scale);
25135 }
25136 case RecurKind::And:
25137 case RecurKind::Or:
25138 // No need for multiple or/and(s).
25139 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
25140 << ". (HorRdx)\n");
25141 return VectorizedValue;
25142 case RecurKind::SMax:
25143 case RecurKind::SMin:
25144 case RecurKind::UMax:
25145 case RecurKind::UMin:
25146 case RecurKind::FMax:
25147 case RecurKind::FMin:
25148 case RecurKind::FMaximum:
25149 case RecurKind::FMinimum:
25150 // No need for multiple min/max(s) of the same value.
25151 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
25152 << ". (HorRdx)\n");
25153 return VectorizedValue;
25154 case RecurKind::Xor: {
25155 // Replace values with an even number of repeats with 0, since
25156 // x xor x = 0.
25157 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
25158 // 7>, if the 4th and 6th elements have an even number of repeats.
25159 SmallVector<int> Mask(
25160 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
25161 PoisonMaskElem);
25162 std::iota(Mask.begin(), Mask.end(), 0);
25163 bool NeedShuffle = false;
25164 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
25165 Value *V = VL[I];
25166 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25167 if (Cnt % 2 == 0) {
25168 Mask[I] = VF;
25169 NeedShuffle = true;
25170 }
25171 }
25172 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
25173 : Mask) dbgs()
25174 << I << " ";
25175 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
25176 if (NeedShuffle)
25177 VectorizedValue = Builder.CreateShuffleVector(
25178 VectorizedValue,
25179 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
25180 return VectorizedValue;
25181 }
25182 case RecurKind::FAdd: {
25183 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
25184 SmallVector<Constant *> Vals;
25185 for (Value *V : VL) {
25186 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25187 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
25188 }
25189 auto *Scale = ConstantVector::get(Vals);
25190 return Builder.CreateFMul(VectorizedValue, Scale);
25191 }
25192 case RecurKind::Sub:
25193 case RecurKind::AddChainWithSubs:
25194 case RecurKind::Mul:
25195 case RecurKind::FMul:
25196 case RecurKind::FMulAdd:
25197 case RecurKind::AnyOf:
25198 case RecurKind::FindFirstIVSMin:
25199 case RecurKind::FindFirstIVUMin:
25200 case RecurKind::FindLastIVSMax:
25201 case RecurKind::FindLastIVUMax:
25202 case RecurKind::FMaxNum:
25203 case RecurKind::FMinNum:
25204 case RecurKind::FMaximumNum:
25205 case RecurKind::FMinimumNum:
25206 case RecurKind::None:
25207 llvm_unreachable("Unexpected reduction kind for reused scalars.");
25208 }
25209 return nullptr;
25210 }
25211};
25212} // end anonymous namespace
25213
25214/// Gets recurrence kind from the specified value.
25215static RecurKind getRdxKind(Value *V) {
25216  return HorizontalReduction::getRdxKind(V);
25217}
25218static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
25219 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
25220 return cast<FixedVectorType>(IE->getType())->getNumElements();
25221
25222 unsigned AggregateSize = 1;
25223 auto *IV = cast<InsertValueInst>(InsertInst);
25224 Type *CurrentType = IV->getType();
25225 do {
25226 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
25227 for (auto *Elt : ST->elements())
25228 if (Elt != ST->getElementType(0)) // check homogeneity
25229 return std::nullopt;
25230 AggregateSize *= ST->getNumElements();
25231 CurrentType = ST->getElementType(0);
25232 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
25233 AggregateSize *= AT->getNumElements();
25234 CurrentType = AT->getElementType();
25235 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
25236 AggregateSize *= VT->getNumElements();
25237 return AggregateSize;
25238 } else if (CurrentType->isSingleValueType()) {
25239 return AggregateSize;
25240 } else {
25241 return std::nullopt;
25242 }
25243 } while (true);
25244}
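// Illustrative example: for an aggregate of type {<2 x float>, <2 x float>}
// the walk above multiplies 2 (struct elements) by 2 (vector lanes) and
// returns an aggregate size of 4, while a non-homogeneous struct such as
// {float, i32} yields std::nullopt.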
25245
25246static void findBuildAggregateRec(Instruction *LastInsertInst,
25247                                  TargetTransformInfo *TTI,
25248                                  SmallVectorImpl<Value *> &BuildVectorOpds,
25249 SmallVectorImpl<Value *> &InsertElts,
25250 unsigned OperandOffset, const BoUpSLP &R) {
25251 do {
25252 Value *InsertedOperand = LastInsertInst->getOperand(1);
25253 std::optional<unsigned> OperandIndex =
25254 getElementIndex(LastInsertInst, OperandOffset);
25255 if (!OperandIndex || R.isDeleted(LastInsertInst))
25256 return;
25257 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
25258      findBuildAggregateRec(cast<Instruction>(InsertedOperand), TTI,
25259                            BuildVectorOpds, InsertElts, *OperandIndex, R);
25260
25261 } else {
25262 BuildVectorOpds[*OperandIndex] = InsertedOperand;
25263 InsertElts[*OperandIndex] = LastInsertInst;
25264 }
25265 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
25266 } while (LastInsertInst != nullptr &&
25267           isa<InsertElementInst, InsertValueInst>(LastInsertInst) &&
25268           LastInsertInst->hasOneUse());
25269}
25270
25271/// Recognize construction of vectors like
25272/// %ra = insertelement <4 x float> poison, float %s0, i32 0
25273/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
25274/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
25275/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
25276/// starting from the last insertelement or insertvalue instruction.
25277///
25278/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
25279/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
25280/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
25281///
25282/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
25283///
25284/// \return true if it matches.
25285static bool findBuildAggregate(Instruction *LastInsertInst,
25286                               TargetTransformInfo *TTI,
25287                               SmallVectorImpl<Value *> &BuildVectorOpds,
25288 SmallVectorImpl<Value *> &InsertElts,
25289 const BoUpSLP &R) {
25290
25291 assert((isa<InsertElementInst>(LastInsertInst) ||
25292 isa<InsertValueInst>(LastInsertInst)) &&
25293 "Expected insertelement or insertvalue instruction!");
25294
25295 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
25296 "Expected empty result vectors!");
25297
25298 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
25299 if (!AggregateSize)
25300 return false;
25301 BuildVectorOpds.resize(*AggregateSize);
25302 InsertElts.resize(*AggregateSize);
25303
25304 findBuildAggregateRec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0, R);
25305 llvm::erase(BuildVectorOpds, nullptr);
25306 llvm::erase(InsertElts, nullptr);
25307 if (BuildVectorOpds.size() >= 2)
25308 return true;
25309
25310 return false;
25311}
25312
25313/// Try and get a reduction instruction from a phi node.
25314///
25315/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
25316/// if they come from either \p ParentBB or a containing loop latch.
25317///
25318/// \returns A candidate reduction value if possible, or \code nullptr \endcode
25319/// if not possible.
25320static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
25321                                      BasicBlock *ParentBB, LoopInfo *LI) {
25322 // There are situations where the reduction value is not dominated by the
25323 // reduction phi. Vectorizing such cases has been reported to cause
25324 // miscompiles. See PR25787.
25325 auto DominatedReduxValue = [&](Value *R) {
25326 return isa<Instruction>(R) &&
25327 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
25328 };
25329
25330 Instruction *Rdx = nullptr;
25331
25332 // Return the incoming value if it comes from the same BB as the phi node.
25333 if (P->getIncomingBlock(0) == ParentBB) {
25334 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
25335 } else if (P->getIncomingBlock(1) == ParentBB) {
25336 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
25337 }
25338
25339 if (Rdx && DominatedReduxValue(Rdx))
25340 return Rdx;
25341
25342 // Otherwise, check whether we have a loop latch to look at.
25343 Loop *BBL = LI->getLoopFor(ParentBB);
25344 if (!BBL)
25345 return nullptr;
25346 BasicBlock *BBLatch = BBL->getLoopLatch();
25347 if (!BBLatch)
25348 return nullptr;
25349
25350 // There is a loop latch, return the incoming value if it comes from
25351 // that. This reduction pattern occasionally turns up.
25352 if (P->getIncomingBlock(0) == BBLatch) {
25353 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
25354 } else if (P->getIncomingBlock(1) == BBLatch) {
25355 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
25356 }
25357
25358 if (Rdx && DominatedReduxValue(Rdx))
25359 return Rdx;
25360
25361 return nullptr;
25362}
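// Illustrative example (hypothetical IR): for a loop-carried reduction such as
//   loop:
//     %sum = phi float [ 0.0, %entry ], [ %sum.next, %loop ]
//     %sum.next = fadd fast float %sum, %x
// the incoming value from the latch (%sum.next) is returned as the candidate
// reduction instruction, since its block is dominated by the phi's block.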
25363
25364static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
25365 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
25366 return true;
25367 if (match(I, m_FMaxNum(m_Value(V0), m_Value(V1))))
25368 return true;
25369 if (match(I, m_FMinNum(m_Value(V0), m_Value(V1))))
25370 return true;
25371 if (match(I, m_FMaximum(m_Value(V0), m_Value(V1))))
25372 return true;
25373 if (match(I, m_FMinimum(m_Value(V0), m_Value(V1))))
25374 return true;
25375  if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
25376    return true;
25377  if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
25378    return true;
25379  if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
25380    return true;
25381  if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
25382    return true;
25383 return false;
25384}
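// Illustrative examples of instructions accepted by matchRdxBop, binding V0
// and V1 to their operands (hypothetical IR):
//   %op = fadd fast float %a, %b
//   %op = call float @llvm.maxnum.f32(float %a, float %b)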
25385
25386/// We could have an initial reduction that is not an add.
25387/// r *= v1 + v2 + v3 + v4
25388/// In such a case start looking for a tree rooted in the first '+'.
25389/// \Returns the new root if found, which may be nullptr if not an instruction.
25390static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
25391                                                 Instruction *Root) {
25392 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
25393 isa<IntrinsicInst>(Root)) &&
25394 "Expected binop, select, or intrinsic for reduction matching");
25395 Value *LHS =
25396 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
25397 Value *RHS =
25398 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
25399 if (LHS == Phi)
25400 return dyn_cast<Instruction>(RHS);
25401 if (RHS == Phi)
25402 return dyn_cast<Instruction>(LHS);
25403 return nullptr;
25404}
25405
25406/// \p Returns the first operand of \p I that does not match \p Phi. If
25407/// operand is not an instruction it returns nullptr.
25408static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
25409  Value *Op0 = nullptr;
25410 Value *Op1 = nullptr;
25411 if (!matchRdxBop(I, Op0, Op1))
25412 return nullptr;
25413 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
25414}
25415
25416/// \Returns true if \p I is a candidate instruction for reduction vectorization.
25417static bool isReductionCandidate(Instruction *I) {
25418  bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
25419 Value *B0 = nullptr, *B1 = nullptr;
25420 bool IsBinop = matchRdxBop(I, B0, B1);
25421 return IsBinop || IsSelect;
25422}
25423
25424bool SLPVectorizerPass::vectorizeHorReduction(
25425 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
25426 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
25427 if (!ShouldVectorizeHor)
25428 return false;
25429 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
25430
25431 if (Root->getParent() != BB || isa<PHINode>(Root))
25432 return false;
25433
25434 // If we can find a secondary reduction root, use that instead.
25435 auto SelectRoot = [&]() {
25436 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
25437 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
25438 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
25439 return NewRoot;
25440 return Root;
25441 };
25442
25443  // Start the analysis from the Root instruction. If a horizontal reduction is
25444 // found, try to vectorize it. If it is not a horizontal reduction or
25445 // vectorization is not possible or not effective, and currently analyzed
25446 // instruction is a binary operation, try to vectorize the operands, using
25447 // pre-order DFS traversal order. If the operands were not vectorized, repeat
25448 // the same procedure considering each operand as a possible root of the
25449 // horizontal reduction.
25450 // Interrupt the process if the Root instruction itself was vectorized or all
25451  // sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
25452  // If a horizontal reduction was not matched or vectorized, we collect
25453 // instructions for possible later attempts for vectorization.
25454 std::queue<std::pair<Instruction *, unsigned>> Stack;
25455 Stack.emplace(SelectRoot(), 0);
25456 SmallPtrSet<Value *, 8> VisitedInstrs;
25457 bool Res = false;
25458 auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * {
25459 if (R.isAnalyzedReductionRoot(Inst))
25460 return nullptr;
25461 if (!isReductionCandidate(Inst))
25462 return nullptr;
25463 HorizontalReduction HorRdx;
25464 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
25465 return nullptr;
25466 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
25467 };
25468 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
25469 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
25470 FutureSeed = getNonPhiOperand(Root, P);
25471 if (!FutureSeed)
25472 return false;
25473 }
25474 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
25475 // analysis is done separately.
25476    if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
25477      PostponedInsts.push_back(FutureSeed);
25478 return true;
25479 };
25480
25481 while (!Stack.empty()) {
25482 Instruction *Inst;
25483 unsigned Level;
25484 std::tie(Inst, Level) = Stack.front();
25485 Stack.pop();
25486 // Do not try to analyze instruction that has already been vectorized.
25487 // This may happen when we vectorize instruction operands on a previous
25488 // iteration while stack was populated before that happened.
25489 if (R.isDeleted(Inst))
25490 continue;
25491 if (Value *VectorizedV = TryToReduce(Inst)) {
25492 Res = true;
25493 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
25494 // Try to find another reduction.
25495 Stack.emplace(I, Level);
25496 continue;
25497 }
25498 if (R.isDeleted(Inst))
25499 continue;
25500 } else {
25501 // We could not vectorize `Inst` so try to use it as a future seed.
25502 if (!TryAppendToPostponedInsts(Inst)) {
25503 assert(Stack.empty() && "Expected empty stack");
25504 break;
25505 }
25506 }
25507
25508 // Try to vectorize operands.
25509 // Continue analysis for the instruction from the same basic block only to
25510 // save compile time.
25511 if (++Level < RecursionMaxDepth)
25512 for (auto *Op : Inst->operand_values())
25513 if (VisitedInstrs.insert(Op).second)
25514 if (auto *I = dyn_cast<Instruction>(Op))
25515 // Do not try to vectorize CmpInst operands, this is done
25516 // separately.
25517            if (!isa<PHINode>(I) && !R.isAnalyzedReductionVals(I) &&
25518                !R.isDeleted(I) && I->getParent() == BB)
25519 Stack.emplace(I, Level);
25520 }
25521 return Res;
25522}
25523
25524bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
25525 if (!I)
25526 return false;
25527
25528 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
25529 return false;
25530 // Skip potential FMA candidates.
25531 if ((I->getOpcode() == Instruction::FAdd ||
25532 I->getOpcode() == Instruction::FSub) &&
25533 canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI)
25534 .isValid())
25535 return false;
25536
25537 Value *P = I->getParent();
25538
25539 // Vectorize in current basic block only.
25540 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
25541 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
25542 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
25543 R.isDeleted(Op0) || R.isDeleted(Op1))
25544 return false;
25545
25546 // First collect all possible candidates
25547  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
25548  Candidates.emplace_back(Op0, Op1);
25549
25550 auto *A = dyn_cast<BinaryOperator>(Op0);
25551 auto *B = dyn_cast<BinaryOperator>(Op1);
25552 // Try to skip B.
25553 if (A && B && B->hasOneUse()) {
25554 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
25555 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
25556 if (B0 && B0->getParent() == P && !R.isDeleted(B0))
25557 Candidates.emplace_back(A, B0);
25558 if (B1 && B1->getParent() == P && !R.isDeleted(B1))
25559 Candidates.emplace_back(A, B1);
25560 }
25561 // Try to skip A.
25562 if (B && A && A->hasOneUse()) {
25563 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
25564 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
25565 if (A0 && A0->getParent() == P && !R.isDeleted(A0))
25566 Candidates.emplace_back(A0, B);
25567 if (A1 && A1->getParent() == P && !R.isDeleted(A1))
25568 Candidates.emplace_back(A1, B);
25569 }
25570
25571 auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
25572                                             ArrayRef<Value *> Ops) {
25573    if (!isReductionCandidate(Inst))
25574 return false;
25575 Type *Ty = Inst->getType();
25576 if (!isValidElementType(Ty) || Ty->isPointerTy())
25577 return false;
25578 HorizontalReduction HorRdx(Inst, Ops);
25579 if (!HorRdx.matchReductionForOperands())
25580 return false;
25581 // Check the cost of operations.
25582 VectorType *VecTy = getWidenedType(Ty, Ops.size());
25583    constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
25584    InstructionCost ScalarCost =
25585 TTI.getScalarizationOverhead(
25586 VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false,
25587 /*Extract=*/true, CostKind) +
25588 TTI.getInstructionCost(Inst, CostKind);
25589 InstructionCost RedCost;
25590 switch (::getRdxKind(Inst)) {
25591 case RecurKind::Add:
25592 case RecurKind::Mul:
25593 case RecurKind::Or:
25594 case RecurKind::And:
25595 case RecurKind::Xor:
25596 case RecurKind::FAdd:
25597 case RecurKind::FMul: {
25598 FastMathFlags FMF;
25599 if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
25600 FMF = FPCI->getFastMathFlags();
25601 RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
25602 CostKind);
25603 break;
25604 }
25605 default:
25606 return false;
25607 }
25608 if (RedCost >= ScalarCost)
25609 return false;
25610
25611 return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
25612 };
25613 if (Candidates.size() == 1)
25614 return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
25615
25616 // We have multiple options. Try to pick the single best.
25617 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
25618 if (!BestCandidate)
25619 return false;
25620 return (*BestCandidate == 0 &&
25621 TryToReduce(I, {Candidates[*BestCandidate].first,
25622 Candidates[*BestCandidate].second})) ||
25623 tryToVectorizeList({Candidates[*BestCandidate].first,
25624 Candidates[*BestCandidate].second},
25625 R);
25626}
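// Illustrative example (hypothetical IR): for
//   %s = sub i32 %a, %b
//   %m = mul i32 %c, %d
//   %r = add i32 %s, %m
// the initial candidate pair is (%s, %m); if %m has a single use and its
// operands are themselves binary operators in the same block, pairs that
// "skip" %m are also collected before picking the best root pair.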
25627
25628bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
25629 BasicBlock *BB, BoUpSLP &R) {
25630 SmallVector<WeakTrackingVH> PostponedInsts;
25631 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
25632 Res |= tryToVectorize(PostponedInsts, R);
25633 return Res;
25634}
25635
25636bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
25637 BoUpSLP &R) {
25638 bool Res = false;
25639 for (Value *V : Insts)
25640 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
25641 Res |= tryToVectorize(Inst, R);
25642 return Res;
25643}
25644
25645bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
25646 BasicBlock *BB, BoUpSLP &R,
25647 bool MaxVFOnly) {
25648 if (!R.canMapToVector(IVI->getType()))
25649 return false;
25650
25651 SmallVector<Value *, 16> BuildVectorOpds;
25652 SmallVector<Value *, 16> BuildVectorInsts;
25653 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
25654 return false;
25655
25656 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
25657 R.getORE()->emit([&]() {
25658 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
25659 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
25660 "trying reduction first.";
25661 });
25662 return false;
25663 }
25664 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
25665  // Aggregate value is unlikely to be processed in a vector register.
25666 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
25667}
25668
25669bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
25670 BasicBlock *BB, BoUpSLP &R,
25671 bool MaxVFOnly) {
25672 SmallVector<Value *, 16> BuildVectorInsts;
25673 SmallVector<Value *, 16> BuildVectorOpds;
25674 SmallVector<int> Mask;
25675 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
25676      (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
25677       isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
25678 return false;
25679
25680 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
25681 R.getORE()->emit([&]() {
25682 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
25683 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
25684 "trying reduction first.";
25685 });
25686 return false;
25687 }
25688 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
25689 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
25690}
25691
25692template <typename T>
25693static bool tryToVectorizeSequence(
25694    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
25695 function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
25696 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
25697 bool MaxVFOnly, BoUpSLP &R) {
25698 bool Changed = false;
25699 // Sort by type, parent, operands.
25700 stable_sort(Incoming, Comparator);
25701
25702  // Try to vectorize elements based on their type.
25703 SmallVector<T *> Candidates;
25704  SmallVector<T *> VL;
25705  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
25706 VL.clear()) {
25707 // Look for the next elements with the same type, parent and operand
25708 // kinds.
25709 auto *I = dyn_cast<Instruction>(*IncIt);
25710 if (!I || R.isDeleted(I)) {
25711 ++IncIt;
25712 continue;
25713 }
25714 auto *SameTypeIt = IncIt;
25715 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
25716 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
25717 AreCompatible(VL, *SameTypeIt))) {
25718 auto *I = dyn_cast<Instruction>(*SameTypeIt);
25719 ++SameTypeIt;
25720 if (I && !R.isDeleted(I))
25721 VL.push_back(cast<T>(I));
25722 }
25723
25724 // Try to vectorize them.
25725 unsigned NumElts = VL.size();
25726 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
25727 << NumElts << ")\n");
25728 // The vectorization is a 3-state attempt:
25729 // 1. Try to vectorize instructions with the same/alternate opcodes with the
25730 // size of maximal register at first.
25731    // 2. Try to vectorize remaining instructions with the same type, if
25732    // possible. This may produce better vectorization results than trying to
25733    // vectorize only instructions with the same/alternate opcodes.
25734 // 3. Final attempt to try to vectorize all instructions with the
25735 // same/alternate ops only, this may result in some extra final
25736 // vectorization.
25737 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
25738      // Success, start over because instructions might have been changed.
25739 Changed = true;
25740 VL.swap(Candidates);
25741 Candidates.clear();
25742 for (T *V : VL) {
25743 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
25744 Candidates.push_back(V);
25745 }
25746 } else {
25747 /// \Returns the minimum number of elements that we will attempt to
25748 /// vectorize.
25749 auto GetMinNumElements = [&R](Value *V) {
25750 unsigned EltSize = R.getVectorElementSize(V);
25751 return std::max(2U, R.getMaxVecRegSize() / EltSize);
25752 };
25753 if (NumElts < GetMinNumElements(*IncIt) &&
25754 (Candidates.empty() ||
25755 Candidates.front()->getType() == (*IncIt)->getType())) {
25756 for (T *V : VL) {
25757 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
25758 Candidates.push_back(V);
25759 }
25760 }
25761 }
25762 // Final attempt to vectorize instructions with the same types.
25763 if (Candidates.size() > 1 &&
25764 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
25765 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
25766        // Success, start over because instructions might have been changed.
25767 Changed = true;
25768 } else if (MaxVFOnly) {
25769 // Try to vectorize using small vectors.
25770        SmallVector<T *> VL;
25771        for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
25772 VL.clear()) {
25773 auto *I = dyn_cast<Instruction>(*It);
25774 if (!I || R.isDeleted(I)) {
25775 ++It;
25776 continue;
25777 }
25778 auto *SameTypeIt = It;
25779 while (SameTypeIt != End &&
25780 (!isa<Instruction>(*SameTypeIt) ||
25781 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
25782 AreCompatible(*SameTypeIt, *It))) {
25783 auto *I = dyn_cast<Instruction>(*SameTypeIt);
25784 ++SameTypeIt;
25785 if (I && !R.isDeleted(I))
25786 VL.push_back(cast<T>(I));
25787 }
25788 unsigned NumElts = VL.size();
25789 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
25790 /*MaxVFOnly=*/false))
25791 Changed = true;
25792 It = SameTypeIt;
25793 }
25794 }
25795 Candidates.clear();
25796 }
25797
25798 // Start over at the next instruction of a different type (or the end).
25799 IncIt = SameTypeIt;
25800 }
25801 return Changed;
25802}
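// Illustrative usage (mirrors the calls later in this file): the PHI, compare
// and store chains are all funneled through this helper, e.g.
//   tryToVectorizeSequence<StoreInst>(ReversedStores, StoreSorter,
//                                     AreCompatibleStores, TryToVectorizeFn,
//                                     /*MaxVFOnly=*/false, R);
// where TryToVectorizeFn stands in for the caller-provided vectorization
// callback (a hypothetical name used only for this sketch).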
25803
25804/// Compare two cmp instructions. If IsCompatibility is true, function returns
25805/// true if 2 cmps have same/swapped predicates and compatible corresponding
25806/// operands. If IsCompatibility is false, function implements strict weak
25807/// ordering relation between two cmp instructions, returning true if the first
25808/// instruction is "less" than the second, i.e. its predicate is less than the
25809/// predicate of the second or the operands IDs are less than the operands IDs
25810/// of the second cmp instruction.
25811template <bool IsCompatibility>
25812static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
25813 const DominatorTree &DT) {
25814 assert(isValidElementType(V->getType()) &&
25815 isValidElementType(V2->getType()) &&
25816 "Expected valid element types only.");
25817 if (V == V2)
25818 return IsCompatibility;
25819 auto *CI1 = cast<CmpInst>(V);
25820 auto *CI2 = cast<CmpInst>(V2);
25821 if (CI1->getOperand(0)->getType()->getTypeID() <
25822 CI2->getOperand(0)->getType()->getTypeID())
25823 return !IsCompatibility;
25824 if (CI1->getOperand(0)->getType()->getTypeID() >
25825 CI2->getOperand(0)->getType()->getTypeID())
25826 return false;
25827 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
25828      CI2->getOperand(0)->getType()->getScalarSizeInBits())
25829    return !IsCompatibility;
25830  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
25831      CI2->getOperand(0)->getType()->getScalarSizeInBits())
25832    return false;
25833 CmpInst::Predicate Pred1 = CI1->getPredicate();
25834 CmpInst::Predicate Pred2 = CI2->getPredicate();
25835  CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
25836  CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
25837  CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
25838 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
25839 if (BasePred1 < BasePred2)
25840 return !IsCompatibility;
25841 if (BasePred1 > BasePred2)
25842 return false;
25843 // Compare operands.
25844 bool CI1Preds = Pred1 == BasePred1;
25845 bool CI2Preds = Pred2 == BasePred1;
25846 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
25847 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
25848 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
25849 if (Op1 == Op2)
25850 continue;
25851 if (Op1->getValueID() < Op2->getValueID())
25852 return !IsCompatibility;
25853 if (Op1->getValueID() > Op2->getValueID())
25854 return false;
25855 if (auto *I1 = dyn_cast<Instruction>(Op1))
25856 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
25857 if (IsCompatibility) {
25858 if (I1->getParent() != I2->getParent())
25859 return false;
25860 } else {
25861 // Try to compare nodes with same parent.
25862 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
25863 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
25864 if (!NodeI1)
25865 return NodeI2 != nullptr;
25866 if (!NodeI2)
25867 return false;
25868 assert((NodeI1 == NodeI2) ==
25869 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
25870 "Different nodes should have different DFS numbers");
25871 if (NodeI1 != NodeI2)
25872 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
25873 }
25874 InstructionsState S = getSameOpcode({I1, I2}, TLI);
25875 if (S && (IsCompatibility || !S.isAltShuffle()))
25876 continue;
25877 if (IsCompatibility)
25878 return false;
25879 if (I1->getOpcode() != I2->getOpcode())
25880 return I1->getOpcode() < I2->getOpcode();
25881 }
25882 }
25883 return IsCompatibility;
25884}
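// Illustrative example (hypothetical IR): with IsCompatibility == true the pair
//   %c1 = icmp slt i32 %a, %b
//   %c2 = icmp sgt i32 %b, %a
// is treated as compatible, because sgt is the swapped form of slt and the
// correspondingly swapped operands match.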
25885
25886template <typename ItT>
25887bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
25888 BasicBlock *BB, BoUpSLP &R) {
25889 bool Changed = false;
25890 // Try to find reductions first.
25891 for (CmpInst *I : CmpInsts) {
25892 if (R.isDeleted(I))
25893 continue;
25894 for (Value *Op : I->operands())
25895 if (auto *RootOp = dyn_cast<Instruction>(Op)) {
25896 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
25897 if (R.isDeleted(I))
25898 break;
25899 }
25900 }
25901 // Try to vectorize operands as vector bundles.
25902 for (CmpInst *I : CmpInsts) {
25903 if (R.isDeleted(I))
25904 continue;
25905 Changed |= tryToVectorize(I, R);
25906 }
25907 // Try to vectorize list of compares.
25908 // Sort by type, compare predicate, etc.
25909 auto CompareSorter = [&](Value *V, Value *V2) {
25910 if (V == V2)
25911 return false;
25912 return compareCmp<false>(V, V2, *TLI, *DT);
25913 };
25914
25915 auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V1) {
25916 if (VL.empty() || VL.back() == V1)
25917 return true;
25918 return compareCmp<true>(V1, VL.back(), *TLI, *DT);
25919 };
25920
25921  SmallVector<Value *> Vals;
25922  for (Instruction *V : CmpInsts)
25923 if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
25924 Vals.push_back(V);
25925 if (Vals.size() <= 1)
25926 return Changed;
25927  Changed |= tryToVectorizeSequence<Value>(
25928      Vals, CompareSorter, AreCompatibleCompares,
25929 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
25930 // Exclude possible reductions from other blocks.
25931 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
25932 return any_of(V->users(), [V](User *U) {
25933 auto *Select = dyn_cast<SelectInst>(U);
25934 return Select &&
25935 Select->getParent() != cast<Instruction>(V)->getParent();
25936 });
25937 });
25938 if (ArePossiblyReducedInOtherBlock)
25939 return false;
25940 return tryToVectorizeList(Candidates, R, MaxVFOnly);
25941 },
25942 /*MaxVFOnly=*/true, R);
25943 return Changed;
25944}
25945
25946bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
25947 BasicBlock *BB, BoUpSLP &R) {
25948  assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
25949         "This function only accepts Insert instructions");
25950 bool OpsChanged = false;
25951 SmallVector<WeakTrackingVH> PostponedInsts;
25952 for (auto *I : reverse(Instructions)) {
25953 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
25954 if (R.isDeleted(I) || isa<CmpInst>(I))
25955 continue;
25956 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
25957 OpsChanged |=
25958 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
25959 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
25960 OpsChanged |=
25961 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
25962 }
25963 // pass2 - try to vectorize reductions only
25964 if (R.isDeleted(I))
25965 continue;
25966 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
25967 if (R.isDeleted(I) || isa<CmpInst>(I))
25968 continue;
25969 // pass3 - try to match and vectorize a buildvector sequence.
25970 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
25971 OpsChanged |=
25972 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
25973 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
25974 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
25975 /*MaxVFOnly=*/false);
25976 }
25977 }
25978 // Now try to vectorize postponed instructions.
25979 OpsChanged |= tryToVectorize(PostponedInsts, R);
25980
25981 Instructions.clear();
25982 return OpsChanged;
25983}
25984
25985bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
25986 bool Changed = false;
25987 SmallVector<Value *, 4> Incoming;
25988 SmallPtrSet<Value *, 16> VisitedInstrs;
25989  // Maps phi nodes to the non-phi nodes found in the use tree for each phi
25990  // node. This makes it easier to identify the chains that can be
25991  // vectorized in the best way.
25992 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
25993 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
25994    assert(isValidElementType(V1->getType()) &&
25995           isValidElementType(V2->getType()) &&
25996 "Expected vectorizable types only.");
25997 if (V1 == V2)
25998 return false;
25999 // It is fine to compare type IDs here, since we expect only vectorizable
26000    // types, like ints, floats and pointers; we don't care about other types.
26001 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
26002 return true;
26003 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
26004 return false;
26005 if (V1->getType()->getScalarSizeInBits() <
26006 V2->getType()->getScalarSizeInBits())
26007 return true;
26008 if (V1->getType()->getScalarSizeInBits() >
26009 V2->getType()->getScalarSizeInBits())
26010 return false;
26011 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
26012 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
26013 if (Opcodes1.size() < Opcodes2.size())
26014 return true;
26015 if (Opcodes1.size() > Opcodes2.size())
26016 return false;
26017 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
26018 {
26019 // Instructions come first.
26020 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
26021 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
26022 if (I1 && I2) {
26023 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
26024 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
26025 if (!NodeI1)
26026 return NodeI2 != nullptr;
26027 if (!NodeI2)
26028 return false;
26029 assert((NodeI1 == NodeI2) ==
26030 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26031 "Different nodes should have different DFS numbers");
26032 if (NodeI1 != NodeI2)
26033 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26034 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
26035 if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
26036 const auto *E1 = dyn_cast<ExtractElementInst>(I1);
26037 const auto *E2 = dyn_cast<ExtractElementInst>(I2);
26038 if (!E1 || !E2)
26039 continue;
26040
26041 // Sort on ExtractElementInsts primarily by vector operands. Prefer
26042 // program order of the vector operands.
26043 const auto *V1 = dyn_cast<Instruction>(E1->getVectorOperand());
26044 const auto *V2 = dyn_cast<Instruction>(E2->getVectorOperand());
26045 if (V1 != V2) {
26046 if (V1 && !V2)
26047 return true;
26048 if (!V1 && V2)
26049 return false;
26050              DomTreeNodeBase<BasicBlock> *NodeI1 =
26051                  DT->getNode(V1->getParent());
26052              DomTreeNodeBase<BasicBlock> *NodeI2 =
26053                  DT->getNode(V2->getParent());
26054 if (!NodeI1)
26055 return NodeI2 != nullptr;
26056 if (!NodeI2)
26057 return false;
26058 assert((NodeI1 == NodeI2) ==
26059 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26060 "Different nodes should have different DFS numbers");
26061 if (NodeI1 != NodeI2)
26062 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26063 return V1->comesBefore(V2);
26064 }
26065 // If we have the same vector operand, try to sort by constant
26066 // index.
26067 std::optional<unsigned> Id1 = getExtractIndex(E1);
26068 std::optional<unsigned> Id2 = getExtractIndex(E2);
26069 // Bring constants to the top
26070 if (Id1 && !Id2)
26071 return true;
26072 if (!Id1 && Id2)
26073 return false;
26074 // First elements come first.
26075 if (Id1 && Id2)
26076 return *Id1 < *Id2;
26077
26078 continue;
26079 }
26080 if (I1->getOpcode() == I2->getOpcode())
26081 continue;
26082 return I1->getOpcode() < I2->getOpcode();
26083 }
26084 if (I1)
26085 return true;
26086 if (I2)
26087 return false;
26088 }
26089 {
26090 // Non-undef constants come next.
26091 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
26092 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
26093 if (C1 && C2)
26094 continue;
26095 if (C1)
26096 return true;
26097 if (C2)
26098 return false;
26099 }
26100 bool U1 = isa<UndefValue>(Opcodes1[I]);
26101 bool U2 = isa<UndefValue>(Opcodes2[I]);
26102 {
26103 // Non-constant non-instructions come next.
26104 if (!U1 && !U2) {
26105 auto ValID1 = Opcodes1[I]->getValueID();
26106 auto ValID2 = Opcodes2[I]->getValueID();
26107 if (ValID1 == ValID2)
26108 continue;
26109 if (ValID1 < ValID2)
26110 return true;
26111 if (ValID1 > ValID2)
26112 return false;
26113 }
26114 if (!U1)
26115 return true;
26116 if (!U2)
26117 return false;
26118 }
26119 // Undefs come last.
26120 assert(U1 && U2 && "The only thing left should be undef & undef.");
26121 }
26122 return false;
26123 };
26124 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL,
26125 Value *V1) {
26126 if (VL.empty() || V1 == VL.back())
26127 return true;
26128 Value *V2 = VL.back();
26129 if (V1->getType() != V2->getType())
26130 return false;
26131 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
26132 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
26133 if (Opcodes1.size() != Opcodes2.size())
26134 return false;
26135 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
26136 // Undefs are compatible with any other value.
26137 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
26138 continue;
26139 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
26140 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
26141 if (R.isDeleted(I1) || R.isDeleted(I2))
26142 return false;
26143 if (I1->getParent() != I2->getParent())
26144 return false;
26145 if (getSameOpcode({I1, I2}, *TLI))
26146 continue;
26147 return false;
26148 }
26149 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
26150 continue;
26151 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
26152 return false;
26153 }
26154 return true;
26155 };
26156
26157 bool HaveVectorizedPhiNodes = false;
26158 do {
26159 // Collect the incoming values from the PHIs.
26160 Incoming.clear();
26161 for (Instruction &I : *BB) {
26162 auto *P = dyn_cast<PHINode>(&I);
26163 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
26164 break;
26165
26166 // No need to analyze deleted, vectorized and non-vectorizable
26167 // instructions.
26168 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
26169 isValidElementType(P->getType()))
26170 Incoming.push_back(P);
26171 }
26172
26173 if (Incoming.size() <= 1)
26174 break;
26175
26176 // Find the corresponding non-phi nodes for better matching when trying to
26177 // build the tree.
26178 for (Value *V : Incoming) {
26179 SmallVectorImpl<Value *> &Opcodes =
26180 PHIToOpcodes.try_emplace(V).first->getSecond();
26181 if (!Opcodes.empty())
26182 continue;
26183 SmallVector<Value *, 4> Nodes(1, V);
26184 SmallPtrSet<Value *, 4> Visited;
26185 while (!Nodes.empty()) {
26186 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
26187 if (!Visited.insert(PHI).second)
26188 continue;
26189 for (Value *V : PHI->incoming_values()) {
26190 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
26191 Nodes.push_back(PHI1);
26192 continue;
26193 }
26194 Opcodes.emplace_back(V);
26195 }
26196 }
26197 }
26198
26199 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
26200 Incoming, PHICompare, AreCompatiblePHIs,
26201 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
26202 return tryToVectorizeList(Candidates, R, MaxVFOnly);
26203 },
26204 /*MaxVFOnly=*/true, R);
26205 Changed |= HaveVectorizedPhiNodes;
26206 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
26207 auto *PHI = dyn_cast<PHINode>(P.first);
26208 return !PHI || R.isDeleted(PHI);
26209 }))
26210 PHIToOpcodes.clear();
26211 VisitedInstrs.insert_range(Incoming);
26212 } while (HaveVectorizedPhiNodes);
26213
26214 VisitedInstrs.clear();
26215
26216 InstSetVector PostProcessInserts;
26217 SmallSetVector<CmpInst *, 8> PostProcessCmps;
26218 // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
26219 // also vectorizes `PostProcessCmps`.
26220 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
26221 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
26222 if (VectorizeCmps) {
26223 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
26224 PostProcessCmps.clear();
26225 }
26226 PostProcessInserts.clear();
26227 return Changed;
26228 };
26229 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
26230 auto IsInPostProcessInstrs = [&](Instruction *I) {
26231 if (auto *Cmp = dyn_cast<CmpInst>(I))
26232 return PostProcessCmps.contains(Cmp);
26233    return isa<InsertElementInst, InsertValueInst>(I) &&
26234           PostProcessInserts.contains(I);
26235 };
26236  // Returns true if `I` is an instruction without users, like a terminator, a
26237  // function call with an ignored return value, or a store. Ignore unused
26238  // instructions (based on instruction type, except for CallInst and InvokeInst).
26239 auto HasNoUsers = [](Instruction *I) {
26240 return I->use_empty() &&
26241 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
26242 };
26243 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
26244 // Skip instructions with scalable type. The num of elements is unknown at
26245 // compile-time for scalable type.
26246 if (isa<ScalableVectorType>(It->getType()))
26247 continue;
26248
26249    // Skip instructions marked for deletion.
26250 if (R.isDeleted(&*It))
26251 continue;
26252 // We may go through BB multiple times so skip the one we have checked.
26253 if (!VisitedInstrs.insert(&*It).second) {
26254 if (HasNoUsers(&*It) &&
26255 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
26256 // We would like to start over since some instructions are deleted
26257 // and the iterator may become invalid value.
26258 Changed = true;
26259 It = BB->begin();
26260 E = BB->end();
26261 }
26262 continue;
26263 }
26264
26265 // Try to vectorize reductions that use PHINodes.
26266 if (PHINode *P = dyn_cast<PHINode>(It)) {
26267 // Check that the PHI is a reduction PHI.
26268 if (P->getNumIncomingValues() == 2) {
26269 // Try to match and vectorize a horizontal reduction.
26270 Instruction *Root = getReductionInstr(DT, P, BB, LI);
26271 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
26272 Changed = true;
26273 It = BB->begin();
26274 E = BB->end();
26275 continue;
26276 }
26277 }
26278 // Try to vectorize the incoming values of the PHI, to catch reductions
26279 // that feed into PHIs.
26280 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
26281 // Skip if the incoming block is the current BB for now. Also, bypass
26282 // unreachable IR for efficiency and to avoid crashing.
26283 // TODO: Collect the skipped incoming values and try to vectorize them
26284 // after processing BB.
26285 if (BB == P->getIncomingBlock(I) ||
26286 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
26287 continue;
26288
26289 // Postponed instructions should not be vectorized here, delay their
26290 // vectorization.
26291 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
26292 PI && !IsInPostProcessInstrs(PI)) {
26293 bool Res =
26294 vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
26295 Changed |= Res;
26296 if (Res && R.isDeleted(P)) {
26297 It = BB->begin();
26298 E = BB->end();
26299 break;
26300 }
26301 }
26302 }
26303 continue;
26304 }
26305
26306 if (HasNoUsers(&*It)) {
26307 bool OpsChanged = false;
26308 auto *SI = dyn_cast<StoreInst>(It);
26309 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
26310 if (SI) {
26311 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
26312 // Try to vectorize chain in store, if this is the only store to the
26313 // address in the block.
26314        // TODO: This is just a temporary solution to save compile time. Need
26315 // to investigate if we can safely turn on slp-vectorize-hor-store
26316 // instead to allow lookup for reduction chains in all non-vectorized
26317 // stores (need to check side effects and compile time).
26318 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
26319 SI->getValueOperand()->hasOneUse();
26320 }
26321 if (TryToVectorizeRoot) {
26322 for (auto *V : It->operand_values()) {
26323 // Postponed instructions should not be vectorized here, delay their
26324 // vectorization.
26325 if (auto *VI = dyn_cast<Instruction>(V);
26326 VI && !IsInPostProcessInstrs(VI))
26327 // Try to match and vectorize a horizontal reduction.
26328 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
26329 }
26330 }
26331 // Start vectorization of post-process list of instructions from the
26332 // top-tree instructions to try to vectorize as many instructions as
26333 // possible.
26334 OpsChanged |=
26335 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
26336 if (OpsChanged) {
26337 // We would like to start over since some instructions are deleted
26338 // and the iterator may become invalid value.
26339 Changed = true;
26340 It = BB->begin();
26341 E = BB->end();
26342 continue;
26343 }
26344 }
26345
26346    if (isa<InsertElementInst, InsertValueInst>(It))
26347      PostProcessInserts.insert(&*It);
26348 else if (isa<CmpInst>(It))
26349 PostProcessCmps.insert(cast<CmpInst>(&*It));
26350 }
26351
26352 return Changed;
26353}
26354
26355bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
26356 auto Changed = false;
26357 for (auto &Entry : GEPs) {
26358 // If the getelementptr list has fewer than two elements, there's nothing
26359 // to do.
26360 if (Entry.second.size() < 2)
26361 continue;
26362
26363 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
26364 << Entry.second.size() << ".\n");
26365
26366 // Process the GEP list in chunks suitable for the target's supported
26367 // vector size. If a vector register can't hold 1 element, we are done. We
26368 // are trying to vectorize the index computations, so the maximum number of
26369 // elements is based on the size of the index expression, rather than the
26370 // size of the GEP itself (the target's pointer size).
26371 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
26372 return !R.isDeleted(GEP);
26373 });
26374 if (It == Entry.second.end())
26375 continue;
26376 unsigned MaxVecRegSize = R.getMaxVecRegSize();
26377 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
26378 if (MaxVecRegSize < EltSize)
26379 continue;
26380
26381 unsigned MaxElts = MaxVecRegSize / EltSize;
26382 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
26383 auto Len = std::min<unsigned>(BE - BI, MaxElts);
26384 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
26385
26386      // Initialize a set of candidate getelementptrs. Note that we use a
26387 // SetVector here to preserve program order. If the index computations
26388 // are vectorizable and begin with loads, we want to minimize the chance
26389 // of having to reorder them later.
26390 SetVector<Value *> Candidates(llvm::from_range, GEPList);
26391
26392 // Some of the candidates may have already been vectorized after we
26393      // initially collected them or their index is optimized to a constant value.
26394 // If so, they are marked as deleted, so remove them from the set of
26395 // candidates.
26396 Candidates.remove_if([&R](Value *I) {
26397 return R.isDeleted(cast<Instruction>(I)) ||
26398 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
26399 });
26400
26401 // Remove from the set of candidates all pairs of getelementptrs with
26402 // constant differences. Such getelementptrs are likely not good
26403 // candidates for vectorization in a bottom-up phase since one can be
26404 // computed from the other. We also ensure all candidate getelementptr
26405 // indices are unique.
26406 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
26407 auto *GEPI = GEPList[I];
26408 if (!Candidates.count(GEPI))
26409 continue;
26410 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
26411 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
26412 auto *GEPJ = GEPList[J];
26413 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
26414 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
26415 Candidates.remove(GEPI);
26416 Candidates.remove(GEPJ);
26417 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
26418 Candidates.remove(GEPJ);
26419 }
26420 }
26421 }
26422
26423 // We break out of the above computation as soon as we know there are
26424 // fewer than two candidates remaining.
26425 if (Candidates.size() < 2)
26426 continue;
26427
26428 // Add the single, non-constant index of each candidate to the bundle. We
26429 // ensured the indices met these constraints when we originally collected
26430 // the getelementptrs.
26431 SmallVector<Value *, 16> Bundle(Candidates.size());
26432 auto BundleIndex = 0u;
26433 for (auto *V : Candidates) {
26434 auto *GEP = cast<GetElementPtrInst>(V);
26435 auto *GEPIdx = GEP->idx_begin()->get();
26436 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
26437 Bundle[BundleIndex++] = GEPIdx;
26438 }
26439
26440 // Try and vectorize the indices. We are currently only interested in
26441 // gather-like cases of the form:
26442 //
26443 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
26444 //
26445 // where the loads of "a", the loads of "b", and the subtractions can be
26446 // performed in parallel. It's likely that detecting this pattern in a
26447 // bottom-up phase will be simpler and less costly than building a
26448 // full-blown top-down phase beginning at the consecutive loads.
26449 Changed |= tryToVectorizeList(Bundle, R);
26450 }
26451 }
26452 return Changed;
26453}
26454
26455bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
26456 bool Changed = false;
26457  // Sort by type, base pointers and value operand. Value operands must be
26458  // compatible (have the same opcode, same parent); otherwise it is
26459 // definitely not profitable to try to vectorize them.
26460 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
26461 if (V->getValueOperand()->getType()->getTypeID() <
26462 V2->getValueOperand()->getType()->getTypeID())
26463 return true;
26464 if (V->getValueOperand()->getType()->getTypeID() >
26465 V2->getValueOperand()->getType()->getTypeID())
26466 return false;
26467 if (V->getPointerOperandType()->getTypeID() <
26468 V2->getPointerOperandType()->getTypeID())
26469 return true;
26470 if (V->getPointerOperandType()->getTypeID() >
26471 V2->getPointerOperandType()->getTypeID())
26472 return false;
26473 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
26474 V2->getValueOperand()->getType()->getScalarSizeInBits())
26475 return true;
26476 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
26477 V2->getValueOperand()->getType()->getScalarSizeInBits())
26478 return false;
26479 // UndefValues are compatible with all other values.
26480 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
26481 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
26482 DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
26483 DT->getNode(I1->getParent());
26484 DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
26485 DT->getNode(I2->getParent());
26486 assert(NodeI1 && "Should only process reachable instructions");
26487 assert(NodeI2 && "Should only process reachable instructions");
26488 assert((NodeI1 == NodeI2) ==
26489 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26490 "Different nodes should have different DFS numbers");
26491 if (NodeI1 != NodeI2)
26492 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26493 return I1->getOpcode() < I2->getOpcode();
26494 }
26495 return V->getValueOperand()->getValueID() <
26496 V2->getValueOperand()->getValueID();
26497 };
26498
26499 bool SameParent = true;
26500 auto AreCompatibleStores = [&](ArrayRef<StoreInst *> VL, StoreInst *V1) {
26501 if (VL.empty()) {
26502 SameParent = true;
26503 return true;
26504 }
26505 StoreInst *V2 = VL.back();
26506 if (V1 == V2)
26507 return true;
26508 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
26509 return false;
26510 if (V1->getPointerOperandType() != V2->getPointerOperandType())
26511 return false;
26512 // Undefs are compatible with any other value.
26513    if (isa<UndefValue>(V1->getValueOperand()) ||
26514        isa<UndefValue>(V2->getValueOperand()))
26515      return true;
26516    if (isa<Constant>(V1->getValueOperand()) &&
26517        isa<Constant>(V2->getValueOperand()))
26518      return true;
26519 // Check if the operands of the stores can be vectorized. They can be
26520 // vectorized, if they have compatible operands or have operands, which can
26521 // be vectorized as copyables.
26522 auto *I1 = dyn_cast<Instruction>(V1->getValueOperand());
26523 auto *I2 = dyn_cast<Instruction>(V2->getValueOperand());
26524 if (I1 || I2) {
26525 // Accept only tail-following non-compatible values for now.
26526 // TODO: investigate if it is possible to vectorize incompatible values,
26527 // if the copyables are first in the list.
26528 if (I1 && !I2)
26529 return false;
26530 SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
26531 SmallVector<Value *> NewVL(VL.size() + 1);
26532 for (auto [SI, V] : zip(VL, NewVL))
26533 V = SI->getValueOperand();
26534 NewVL.back() = V1->getValueOperand();
26535 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
26536 InstructionsState S = Analysis.buildInstructionsState(
26537 NewVL, R, VectorizeCopyableElements, /*WithProfitabilityCheck=*/true,
26538 /*SkipSameCodeCheck=*/!SameParent);
26539 if (S)
26540 return true;
26541 if (!SameParent)
26542 return false;
26543 }
26544 return V1->getValueOperand()->getValueID() ==
26545 V2->getValueOperand()->getValueID();
26546 };
26547
26548 // Attempt to sort and vectorize each of the store-groups.
26549 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
26550 for (auto &Pair : Stores) {
26551 if (Pair.second.size() < 2)
26552 continue;
26553
26554 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
26555 << Pair.second.size() << ".\n");
26556
26557 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
26558 continue;
26559
26560    // Reverse stores to do bottom-to-top analysis. This is important if the
26561    // same addresses are stored to several times; in this case we need to
26562    // follow the store order (reversed to meet the memory dependencies).
26563 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
26564 Pair.second.rend());
26565    Changed |= tryToVectorizeSequence<StoreInst>(
26566        ReversedStores, StoreSorter, AreCompatibleStores,
26567 [&](ArrayRef<StoreInst *> Candidates, bool) {
26568 return vectorizeStores(Candidates, R, Attempted);
26569 },
26570 /*MaxVFOnly=*/false, R);
26571 }
26572 return Changed;
26573}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefInfo InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
basic Basic Alias true
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:638
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
Early If Converter
static bool runImpl(Function &F, const TargetLowering &TLI, AssumptionCache *AC)
Definition ExpandFp.cpp:992
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Hexagon Common GEP
#define _
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
#define T
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(PassOpts->AAPipeline)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
static std::optional< OperandInfo > getOperandInfo(const MachineOperand &MO)
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static cl::opt< bool > SplitAlternateInstructions("slp-split-alternate-instructions", cl::init(true), cl::Hidden, cl::desc("Improve the code quality by splitting alternate instructions"))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static bool isMaskedLoadCompress(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, const DominatorTree &DT, const TargetLibraryInfo &TLI, const function_ref< bool(Value *)> AreAllUsersVectorized, bool &IsMasked, unsigned &InterleaveFactor, SmallVectorImpl< int > &CompressMask, VectorType *&LoadVecTy)
Checks if the VL can be transformed to a (masked) load + compress or (masked) interleaved load.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< bool > VectorizeCopyableElements("slp-copyable-elements", cl::init(true), cl::Hidden, cl::desc("Try to replace values with the idempotent instructions for " "better vectorization."))
Enables vectorization of copyable elements.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool isCommutative(Instruction *I, Value *ValWithUses)
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert fadd/fsub sequence to FMAD.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
#define SV_NAME
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instruction is followed by the IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static const SCEV * calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static cl::opt< bool > ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden, cl::desc("Generate strided loads even if they are not " "profitable. Used for testing only."))
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is a main operation for the given MainOp and AltOp instruction...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
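As an illustration, a hedged sketch of such a splat check (hypothetical name; the in-tree predicate may additionally tolerate undef elements, which is not modeled here):
```cpp
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Value.h"

// Sketch: a bundle is a splat when every element is literally the same Value.
static bool isSplatSketch(llvm::ArrayRef<llvm::Value *> VL) {
  return !VL.empty() &&
         llvm::all_of(VL, [&](llvm::Value *V) { return V == VL.front(); });
}
```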
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
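One plausible reading of the two sizing helpers getPartNumElems and getNumElems (listed above), written as standalone C++; the exact in-tree rounding policy is an assumption of this sketch:
```cpp
#include <algorithm>

// Hypothetical sketch: split Size elements into NumParts registers, rounding
// each part up to a power of two, then compute how many elements remain in
// part number Part (Part is assumed to be in range).
static unsigned partNumElemsSketch(unsigned Size, unsigned NumParts) {
  unsigned PerPart = (Size + NumParts - 1) / NumParts; // ceiling division
  unsigned Pow2 = 1;
  while (Pow2 < PerPart)
    Pow2 *= 2;
  return std::min(Size, Pow2);
}

static unsigned numElemsSketch(unsigned Size, unsigned PartNumElems,
                               unsigned Part) {
  return std::min(PartNumElems, Size - Part * PartNumElems);
}
```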
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
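The "quadratic mean deviation" wording corresponds to a root-mean-square test; a simplified standalone sketch over a flat list of sizes (the real helper works on (size, count) pairs):
```cpp
#include <cmath>
#include <vector>

// Sketch: the RMS deviation of the observed tree sizes must stay below 90%
// of their mean for the sizes to be considered consistent.
static bool checkTreeSizesSketch(const std::vector<double> &Sizes) {
  if (Sizes.empty())
    return false;
  double Mean = 0.0;
  for (double S : Sizes)
    Mean += S;
  Mean /= Sizes.size();
  double Dev = 0.0;
  for (double S : Sizes)
    Dev += (S - Mean) * (S - Mean);
  Dev = std::sqrt(Dev / Sizes.size());
  return Dev < 0.9 * Mean;
}
```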
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds compress-like mask for shuffles for the given PointerOps, ordered with Order.
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of vectorized intrinsic (if possible) and vectorized function (if possible) call...
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
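Roughly, the predicate accepts types that can legally live in a vector register; a hedged sketch (the in-tree version applies further restrictions, for example on exotic floating-point types, that are not modeled here):
```cpp
#include "llvm/IR/Type.h"

// Sketch: integers, floating-point values and pointers are the usual
// candidates for SLP element types.
static bool isValidElementTypeSketch(llvm::Type *Ty) {
  return Ty->isIntegerTy() || Ty->isFloatingPointTy() || Ty->isPointerTy();
}
```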
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType,...
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find a subvector of loads and builds a new vector of only loads if it can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(ArrayRef< T * >, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and if not, packs them, building ReuseShuffleI...
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
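A sketch of the common-alignment computation, assuming the bundle consists of loads or stores and using getAlign() together with llvm::commonAlignment (the sketch name and template shape are illustrative):
```cpp
#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Alignment.h"

// Sketch: the common alignment of a bundle of memory instructions is the
// smallest alignment seen across all of them.
template <typename MemInstTy>
static llvm::Align
commonBundleAlignmentSketch(llvm::ArrayRef<llvm::Value *> VL) {
  llvm::Align Common = llvm::cast<MemInstTy>(VL.front())->getAlign();
  for (llvm::Value *V : VL.drop_front())
    Common =
        llvm::commonAlignment(Common, llvm::cast<MemInstTy>(V)->getAlign());
  return Common;
}
```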
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
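A sketch of shuffle-mask composition under the usual convention that -1 marks a poison lane; the real addMask also supports an ExtendingManyInputs mode, and the exact composition direction is an assumption of this sketch:
```cpp
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"

// Sketch: lane I of the result reads lane SubMask[I] of the vector already
// described by Mask; -1 lanes stay poison.
static void addMaskSketch(llvm::SmallVectorImpl<int> &Mask,
                          llvm::ArrayRef<int> SubMask) {
  if (Mask.empty()) {
    Mask.assign(SubMask.begin(), SubMask.end());
    return;
  }
  llvm::SmallVector<int> NewMask(SubMask.size(), -1);
  for (unsigned I = 0, E = SubMask.size(); I < E; ++I)
    if (SubMask[I] != -1)
      NewMask[I] = Mask[SubMask[I]];
  Mask.swap(NewMask);
}
```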
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
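One way such a fixup can be done (sketch only; assumes the already-assigned entries are unique):
```cpp
#include "llvm/ADT/ArrayRef.h"
#include <vector>

// Sketch: entries equal to Order.size() act as "unassigned"; hand them the
// still-unused indices in increasing order so the result becomes a proper
// permutation of [0, Order.size()).
static void fixupOrderingIndicesSketch(llvm::MutableArrayRef<unsigned> Order) {
  const unsigned Sz = Order.size();
  std::vector<bool> Used(Sz, false);
  for (unsigned Idx : Order)
    if (Idx < Sz)
      Used[Idx] = true;
  unsigned NextFree = 0;
  for (unsigned &Idx : Order)
    if (Idx == Sz) {
      while (Used[NextFree])
        ++NextFree;
      Idx = NextFree;
      Used[NextFree] = true;
    }
}
```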
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates subvector extract using Generator or using default shuffle.
static cl::opt< bool > DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden, cl::desc("Disable tree reordering even if it is " "profitable. Used for testing only."))
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType,...
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
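Sketch of the reverse-order test (hypothetical name; the in-tree check may also tolerate unset entries):
```cpp
#include "llvm/ADT/ArrayRef.h"

// Sketch: an order is "reverse" when position I selects element Size-1-I.
static bool isReverseOrderSketch(llvm::ArrayRef<unsigned> Order) {
  const unsigned Sz = Order.size();
  for (unsigned I = 0; I < Sz; ++I)
    if (Order[I] != Sz - 1 - I)
      return false;
  return true;
}
```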
This file contains some templates that are useful if you are working with the STL at all.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
This pass exposes codegen information to IR-level passes.
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1406
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1330
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:371
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1666
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1111
void clearAllBits()
Set every bit to 0.
Definition APInt.h:1396
void negate()
Negate this APInt in place.
Definition APInt.h:1468
unsigned logBase2() const
Definition APInt.h:1761
void setAllBits()
Set every bit to 1.
Definition APInt.h:1319
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition APInt.h:1367
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:440
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:200
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:389
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:239
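The APInt helpers listed above are typically used in this file to build demanded-element masks; a tiny standalone usage example:
```cpp
#include "llvm/ADT/APInt.h"
#include <cassert>

int main() {
  // Build a demanded-elements mask for an 8-lane vector, then drop lane 3.
  llvm::APInt Demanded = llvm::APInt::getAllOnes(8);
  Demanded.clearBit(3);
  assert(!Demanded.isAllOnes());
  // Re-demand lane 3; a single-lane mask is a power of two.
  Demanded.setBit(3);
  assert(Demanded.isAllOnes());
  llvm::APInt OneLane = llvm::APInt::getOneBitSet(8, 3);
  assert(OneLane.isPowerOf2());
  (void)OneLane;
  return 0;
}
```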
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition ArrayRef.h:183
const T & back() const
back - Get the last element.
Definition ArrayRef.h:156
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition ArrayRef.h:224
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:200
const T & front() const
front - Get the first element.
Definition ArrayRef.h:150
iterator end() const
Definition ArrayRef.h:136
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
iterator begin() const
Definition ArrayRef.h:135
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:191
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
Definition ArrayRef.h:162
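A small standalone usage example of the ArrayRef slicing helpers listed above:
```cpp
#include "llvm/ADT/ArrayRef.h"
#include <cassert>

int main() {
  int Data[] = {10, 20, 30, 40, 50};
  llvm::ArrayRef<int> A(Data);
  assert(A.front() == 10 && A.back() == 50);
  assert(A.take_front(2).equals({10, 20}));
  assert(A.drop_front(3).size() == 2);
  assert(A.slice(1, 3).equals({20, 30, 40}));
  (void)A;
  return 0;
}
```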
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:472
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:459
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
reverse_iterator rend()
Definition BasicBlock.h:477
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
size_t size() const
Definition BasicBlock.h:480
InstListType::const_reverse_iterator const_reverse_iterator
Definition BasicBlock.h:173
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition BasicBlock.h:707
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition InstrTypes.h:448
This class is the base class for the comparison instructions.
Definition InstrTypes.h:666
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:984
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_SLT
signed less than
Definition InstrTypes.h:707
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:708
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:702
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:701
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:705
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:703
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:706
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:704
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition InstrTypes.h:829
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:791
Predicate getPredicate() const
Return the predicate for this instruction.
Definition InstrTypes.h:767
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:163
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:154
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
static bool shouldExecute(unsigned CounterName)
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getUnknown()
Definition DebugLoc.h:162
An analysis that produces DemandedBits for a function.
LLVM_ABI APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:167
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:237
bool erase(const KeyT &Val)
Definition DenseMap.h:311
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition DenseMap.h:163
iterator end()
Definition DenseMap.h:81
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition DenseMap.h:213
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:158
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:222
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
Base class for the actual dominator tree node.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition Dominators.h:284
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:165
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:310
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
bool allowReassoc() const
Flag queries.
Definition FMF.h:64
bool allowContract() const
Definition FMF.h:69
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
ArrayRef< Type * > params() const
Type * getReturnType() const
bool empty() const
Definition Function.h:857
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
For the node iterator we just need to turn the TreeEntry iterator into a TreeEntry* iterator so that ...
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2571
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition IRBuilder.h:547
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:575
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2637
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:345
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition IRBuilder.h:247
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2204
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2593
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1708
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2277
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2439
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1651
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1437
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
bool isSimple() const
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
iterator end()
Definition MapVector.h:67
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition MapVector.h:48
iterator find(const KeyT &Key)
Definition MapVector.h:149
bool empty() const
Definition MapVector.h:77
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:111
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:103
size_type size() const
Definition MapVector.h:56
std::pair< KeyT, ValueT > & front()
Definition MapVector.h:79
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:303
T & front() const
front - Get the first element.
Definition ArrayRef.h:354
iterator end() const
Definition ArrayRef.h:348
iterator begin() const
Definition ArrayRef.h:347
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition Pass.h:99
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:59
ArrayRef< value_type > getArrayRef() const
Definition SetVector.h:90
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:102
const value_type & front() const
Return the first element of the SetVector.
Definition SetVector.h:131
void insert_range(Range &&R)
Definition SetVector.h:175
Vector takeVector()
Clear the SetVector and return the underlying vector.
Definition SetVector.h:93
void clear()
Completely clear the SetVector.
Definition SetVector.h:266
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:99
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:150
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition SetVector.h:251
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
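A standalone usage example of the mask classifiers listed above, on concrete 4-element masks:
```cpp
#include "llvm/IR/Instructions.h"
#include <cassert>

int main() {
  int Identity[] = {0, 1, 2, 3};
  int Reverse[] = {3, 2, 1, 0};
  int Splat[] = {0, 0, 0, 0};
  assert(llvm::ShuffleVectorInst::isIdentityMask(Identity, 4));
  assert(llvm::ShuffleVectorInst::isReverseMask(Reverse, 4));
  assert(llvm::ShuffleVectorInst::isZeroEltSplatMask(Splat, 4));
  (void)Identity; (void)Reverse; (void)Splat;
  return 0;
}
```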
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
iterator end() const
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:338
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:175
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition SmallSet.h:228
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
size_type size() const
Definition SmallSet.h:170
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
OperandValueKind
Additional information about an operand's possible values.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
Definition Type.cpp:181
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition Type.h:246
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition Type.h:296
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition Type.h:270
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
TypeID getTypeID() const
Return the type id for the type.
Definition Type.h:136
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:21
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Definition User.h:119
op_iterator op_begin()
Definition User.h:284
Value * getOperand(unsigned i) const
Definition User.h:232
unsigned getNumOperands() const
Definition User.h:254
iterator_range< value_op_iterator > operand_values()
Definition User.h:316
The Vector Function Database.
Definition VectorUtils.h:33
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:74
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
User * user_back()
Definition Value.h:412
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition Value.h:543
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition Value.cpp:158
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1099
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition Value.cpp:265
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
iterator find(const_arg_type_t< ValueT > V)
Definition DenseSet.h:167
void insert_range(Range &&R)
Definition DenseSet.h:228
size_type size() const
Definition DenseSet.h:87
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:166
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition Hashing.h:76
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
SmallVector< StoreInst *, 8 > StoreList
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
SmallVector< Instruction *, 16 > InstrList
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns its signedn...
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, StridedPtrInfo &SPtrInfo, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
bool isStridedLoad(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align Alignment, const int64_t Diff, Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order?
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns reduction type after minbitwidth analysis.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
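A hedged sketch of how the BoUpSLP entry points above are typically driven, loosely following this pass's own vectorization attempts; the exact call ordering and profitability handling differ slightly between the real call sites, and R, VL and SLPCostThreshold are assumed from the surrounding file.
// Sketch only: R is a BoUpSLP built from this pass's analyses and VL holds
// the candidate scalars (e.g. the operands of a chain of consecutive stores).
static bool trySLPSketch(BoUpSLP &R, ArrayRef<Value *> VL) {
  R.buildTree(VL);                            // build the vectorizable tree
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;                             // too small to be worth costing
  R.reorderTopToBottom();                     // apply profitable reorderings
  R.reorderBottomToTop();
  R.transformNodes();                         // target-specific node forms
  R.buildExternalUses();                      // record required extracts
  R.computeMinimumValueSizes();               // min-bitwidth analysis
  InstructionCost Cost = R.getTreeCost();     // vector-vs-scalar cost delta
  if (Cost >= -SLPCostThreshold)
    return false;                             // not profitable enough
  R.vectorizeTree();                          // emit the vector code
  return true;
}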
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return index into Candidates for a pair which have highest score...
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
CallInst * Call
Changed
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
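A hedged composition example for the matchers above; the helper name and the matched shape are purely illustrative.
#include "llvm/ADT/APInt.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
// Matches V against (add (load %p), C) where C is a constant integer or a
// splatted constant vector, binding the pointer and the APInt on success.
static bool matchesLoadPlusConstant(llvm::Value *V) {
  using namespace llvm::PatternMatch;
  llvm::Value *Ptr = nullptr;
  const llvm::APInt *C = nullptr;
  return match(V, m_Add(m_Load(m_Value(Ptr)), m_APInt(C)));
}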
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
bool empty() const
Definition BasicBlock.h:101
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
LLVM_ABI iterator begin() const
LLVM_ABI Instruction & front() const
A private "module" namespace for types and utilities used by this pass.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
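A minimal sketch of emitting a horizontal reduction with the helper above; Builder and Vec are assumed to come from the caller, and the choice of RecurKind::Add is arbitrary.
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
// For an integer vector this lowers to an llvm.vector.reduce.add intrinsic.
static llvm::Value *emitAddReduction(llvm::IRBuilderBase &Builder,
                                     llvm::Value *Vec) {
  return llvm::createSimpleReduction(Builder, Vec, llvm::RecurKind::Add);
}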
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:831
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void stable_sort(R &&Range)
Definition STLExtras.h:2038
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1731
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1698
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1657
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:533
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
InstructionCost Cost
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition ScopeExit.h:59
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2452
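A hedged sketch of the range helpers referenced in this block (all_of, enumerate, zip); the containers are stand-ins.
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
static void rangeHelperExamples() {
  llvm::SmallVector<int, 4> A{1, 2, 3};
  llvm::SmallVector<int, 4> B{4, 5, 6};
  // all_of: predicate over a whole range without explicit begin/end.
  bool AllPositive = llvm::all_of(A, [](int X) { return X > 0; });
  // enumerate: pairs each element with its index.
  for (const auto &En : llvm::enumerate(A))
    (void)En.index(), (void)En.value();
  // zip: iterate two ranges in lockstep.
  for (auto [X, Y] : llvm::zip(A, B))
    (void)X, (void)Y;
  (void)AllPositive;
}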
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:644
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition Utils.cpp:1725
constexpr from_range_t from_range
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:733
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A is a subset of B.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2211
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
auto cast_or_null(const Y &Val)
Definition Casting.h:715
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition MathExtras.h:557
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:677
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicit...
Definition STLExtras.h:1961
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:314
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:396
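Small worked examples for the rounding helpers nearby (bit_ceil, bit_floor, PowerOf2Ceil); the inputs are arbitrary.
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
static void roundingExamples() {
  assert(llvm::bit_ceil(5u) == 8u);    // smallest power of two >= 5
  assert(llvm::bit_floor(5u) == 4u);   // largest power of two <= 5
  assert(llvm::PowerOf2Ceil(5) == 8);  // 64-bit variant of the same rounding
}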
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
DomTreeNodeBase< BasicBlock > DomTreeNode
Definition Dominators.h:95
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:754
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2108
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:1948
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1712
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:402
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1624
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
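Illustrative results for the shuffle-mask builders above; the numbers are worked examples, not values taken from this pass.
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
static void maskExamples() {
  // Stride mask, Start=0, Stride=2, VF=4: {0, 2, 4, 6}.
  llvm::SmallVector<int, 16> Strided = llvm::createStrideMask(0, 2, 4);
  // Replicated mask, ReplicationFactor=2, VF=3: {0, 0, 1, 1, 2, 2}.
  llvm::SmallVector<int, 16> Replicated = llvm::createReplicatedMask(2, 3);
  (void)Strided;
  (void)Replicated;
}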
auto find_if_not(R &&Range, UnaryPredicate P)
Definition STLExtras.h:1743
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition Loads.cpp:431
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
Definition Casting.h:670
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
bool isPointerTy(const Type *T)
Definition SPIRVUtils.h:336
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1719
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
Definition STLExtras.h:1399
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
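A hedged sketch of using getPointersDiff to test whether two loads touch adjacent elements, in the spirit of the consecutive-load checks in this pass; the helper name and the strictness choice are assumptions.
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include <optional>
// Returns true when L1 accesses the element immediately after L0.
static bool areConsecutiveLoads(llvm::LoadInst *L0, llvm::LoadInst *L1,
                                const llvm::DataLayout &DL,
                                llvm::ScalarEvolution &SE) {
  std::optional<int64_t> Diff = llvm::getPointersDiff(
      L0->getType(), L0->getPointerOperand(),
      L1->getType(), L1->getPointerOperand(), DL, SE,
      /*StrictCheck=*/true);
  // The distance is reported in units of the element type's store size.
  return Diff && *Diff == 1;
}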
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition Local.cpp:421
bool isModOrRefSet(const ModRefInfo MRI)
Definition ModRef.h:43
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
Definition STLExtras.h:1900
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
Definition ModRef.h:32
@ LLVM_MARK_AS_BITMASK_ENUM
Definition ModRef.h:37
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405
@ Other
Any other memory.
Definition ModRef.h:68
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
TargetTransformInfo TTI
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ None
Not a recurrence.
@ Xor
Bitwise or logical XOR of integers.
@ FMul
Product of floats.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1934
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2010
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1815
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition STLExtras.h:1409
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1941
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns number of parts, the type VecTy will be split at the codegen phase.
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:560
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1877
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
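A small sketch of the hashing helpers above; the combined fields are arbitrary.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Hashing.h"
// Order-sensitive combination of a scalar and a range into one hash_code.
static llvm::hash_code hashValues(int A, llvm::ArrayRef<int> Rest) {
  return llvm::hash_combine(A, llvm::hash_combine_range(Rest.begin(), Rest.end()));
}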
template class LLVM_TEMPLATE_ABI DomTreeNodeBase< BasicBlock >
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2068
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:299
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the give value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:831
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits(bool simple=false)
DenseMapInfo< BoUpSLP::TreeEntry * > FirstInfo
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given valu...
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
BoUpSLP::TreeEntry::VecTreeTy ContainerTy
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
ScalarEvolution * SE
TargetTransformInfo * TTI
AssumptionCache * AC
TargetLibraryInfo * TLI
const DataLayout * DL
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Definition MapVector.h:257
Describe known properties for a set of pointers.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition STLExtras.h:1427
Function object to check whether the second component of a container supported by std::get (like std:...
Definition STLExtras.h:1436
This structure holds any data we need about the edges being traversed during buildTreeRec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const