LLVM 22.0.0git
RISCVTargetTransformInfo.cpp
1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
18#include "llvm/IR/IntrinsicsRISCV.h"
20#include <cmath>
21#include <optional>
22using namespace llvm;
23using namespace llvm::PatternMatch;
24
25#define DEBUG_TYPE "riscvtti"
26
28 "riscv-v-register-bit-width-lmul",
30 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
31 "by autovectorized code. Fractional LMULs are not supported."),
33
35 "riscv-v-slp-max-vf",
37 "Overrides result used for getMaximumVF query which is used "
38 "exclusively by SLP vectorizer."),
40
42 RVVMinTripCount("riscv-v-min-trip-count",
43 cl::desc("Set the lower bound of a trip count to decide on "
44 "vectorization while tail-folding."),
46
48RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
50 // Check if the type is valid for all CostKind
51 if (!VT.isVector())
53 size_t NumInstr = OpCodes.size();
55 return NumInstr;
56 InstructionCost LMULCost = TLI->getLMULCost(VT);
58 return LMULCost * NumInstr;
59 InstructionCost Cost = 0;
60 for (auto Op : OpCodes) {
61 switch (Op) {
62 case RISCV::VRGATHER_VI:
63 Cost += TLI->getVRGatherVICost(VT);
64 break;
65 case RISCV::VRGATHER_VV:
66 Cost += TLI->getVRGatherVVCost(VT);
67 break;
68 case RISCV::VSLIDEUP_VI:
69 case RISCV::VSLIDEDOWN_VI:
70 Cost += TLI->getVSlideVICost(VT);
71 break;
72 case RISCV::VSLIDEUP_VX:
73 case RISCV::VSLIDEDOWN_VX:
74 Cost += TLI->getVSlideVXCost(VT);
75 break;
76 case RISCV::VREDMAX_VS:
77 case RISCV::VREDMIN_VS:
78 case RISCV::VREDMAXU_VS:
79 case RISCV::VREDMINU_VS:
80 case RISCV::VREDSUM_VS:
81 case RISCV::VREDAND_VS:
82 case RISCV::VREDOR_VS:
83 case RISCV::VREDXOR_VS:
84 case RISCV::VFREDMAX_VS:
85 case RISCV::VFREDMIN_VS:
86 case RISCV::VFREDUSUM_VS: {
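// The cost of these unordered reductions scales with the depth of the
// reduction tree, i.e. log2(VL). Illustrative example (hypothetical
// numbers): for nxv4i32 with a tuning vscale of 2, VL is estimated as
// 4 * 2 = 8, adding Log2_32_Ceil(8) = 3 to the cost.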
87 unsigned VL = VT.getVectorMinNumElements();
88 if (!VT.isFixedLengthVector())
89 VL *= *getVScaleForTuning();
90 Cost += Log2_32_Ceil(VL);
91 break;
92 }
93 case RISCV::VFREDOSUM_VS: {
94 unsigned VL = VT.getVectorMinNumElements();
95 if (!VT.isFixedLengthVector())
96 VL *= *getVScaleForTuning();
97 Cost += VL;
98 break;
99 }
100 case RISCV::VMV_X_S:
101 case RISCV::VMV_S_X:
102 case RISCV::VFMV_F_S:
103 case RISCV::VFMV_S_F:
104 case RISCV::VMOR_MM:
105 case RISCV::VMXOR_MM:
106 case RISCV::VMAND_MM:
107 case RISCV::VMANDN_MM:
108 case RISCV::VMNAND_MM:
109 case RISCV::VCPOP_M:
110 case RISCV::VFIRST_M:
111 Cost += 1;
112 break;
113 default:
114 Cost += LMULCost;
115 }
116 }
117 return Cost;
118}
119
121 const RISCVSubtarget *ST,
122 const APInt &Imm, Type *Ty,
124 bool FreeZeroes) {
125 assert(Ty->isIntegerTy() &&
126 "getIntImmCost can only estimate cost of materialising integers");
127
128 // We have a Zero register, so 0 is always free.
129 if (Imm == 0)
130 return TTI::TCC_Free;
131
132 // Otherwise, we check how many instructions it will take to materialise.
133 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST,
134 /*CompressionCost=*/false, FreeZeroes);
135}
136
140 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false);
141}
142
143// Look for patterns of shift followed by AND that can be turned into a pair of
144// shifts. We won't need to materialize an immediate for the AND so these can
145// be considered free.
146static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
147 uint64_t Mask = Imm.getZExtValue();
148 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
149 if (!BO || !BO->hasOneUse())
150 return false;
151
152 if (BO->getOpcode() != Instruction::Shl)
153 return false;
154
155 if (!isa<ConstantInt>(BO->getOperand(1)))
156 return false;
157
158 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
159 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
160 // is a mask shifted by c2 bits with c3 leading zeros.
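// Illustrative example (hypothetical values): for (and (shl x, 4), 0x0ff0),
// 0x0ff0 is a shifted mask with 4 trailing zeros, so ShAmt == Trailing and
// the AND immediate never needs to be materialized.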
161 if (isShiftedMask_64(Mask)) {
162 unsigned Trailing = llvm::countr_zero(Mask);
163 if (ShAmt == Trailing)
164 return true;
165 }
166
167 return false;
168}
169
171 const APInt &Imm, Type *Ty,
173 Instruction *Inst) const {
174 assert(Ty->isIntegerTy() &&
175 "getIntImmCost can only estimate cost of materialising integers");
176
177 // We have a Zero register, so 0 is always free.
178 if (Imm == 0)
179 return TTI::TCC_Free;
180
181 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
182 // commutative, in others the immediate comes from a specific argument index.
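// (A 12-bit immediate is sign-extended, i.e. the range [-2048, 2047] used by
// instructions such as addi/andi/ori.)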
183 bool Takes12BitImm = false;
184 unsigned ImmArgIdx = ~0U;
185
186 switch (Opcode) {
187 case Instruction::GetElementPtr:
188 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
189 // split up large offsets in GEP into better parts than ConstantHoisting
190 // can.
191 return TTI::TCC_Free;
192 case Instruction::Store: {
193 // Use the materialization cost regardless of whether it's the address or
194 // the value that is constant, except when the store is misaligned and
195 // misaligned accesses are not legal (experience shows constant hoisting
196 // can sometimes be harmful in such cases).
197 if (Idx == 1 || !Inst)
198 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
199 /*FreeZeroes=*/true);
200
201 StoreInst *ST = cast<StoreInst>(Inst);
202 if (!getTLI()->allowsMemoryAccessForAlignment(
203 Ty->getContext(), DL, getTLI()->getValueType(DL, Ty),
204 ST->getPointerAddressSpace(), ST->getAlign()))
205 return TTI::TCC_Free;
206
207 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
208 /*FreeZeroes=*/true);
209 }
210 case Instruction::Load:
211 // If the address is a constant, use the materialization cost.
212 return getIntImmCost(Imm, Ty, CostKind);
213 case Instruction::And:
214 // zext.h
215 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
216 return TTI::TCC_Free;
217 // zext.w
218 if (Imm == UINT64_C(0xffffffff) &&
219 ((ST->hasStdExtZba() && ST->isRV64()) || ST->isRV32()))
220 return TTI::TCC_Free;
221 // bclri
222 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
223 return TTI::TCC_Free;
224 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
225 canUseShiftPair(Inst, Imm))
226 return TTI::TCC_Free;
227 Takes12BitImm = true;
228 break;
229 case Instruction::Add:
230 Takes12BitImm = true;
231 break;
232 case Instruction::Or:
233 case Instruction::Xor:
234 // bseti/binvi
235 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
236 return TTI::TCC_Free;
237 Takes12BitImm = true;
238 break;
239 case Instruction::Mul:
240 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
241 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
242 return TTI::TCC_Free;
243 // One more or less than a power of 2 can use SLLI+ADD/SUB.
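// Illustrative examples: x * 9 -> (x << 3) + x, and x * 7 -> (x << 3) - x.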
244 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
245 return TTI::TCC_Free;
246 // FIXME: There is no MULI instruction.
247 Takes12BitImm = true;
248 break;
249 case Instruction::Sub:
250 case Instruction::Shl:
251 case Instruction::LShr:
252 case Instruction::AShr:
253 Takes12BitImm = true;
254 ImmArgIdx = 1;
255 break;
256 default:
257 break;
258 }
259
260 if (Takes12BitImm) {
261 // Check immediate is the correct argument...
262 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
263 // ... and fits into the 12-bit immediate.
264 if (Imm.getSignificantBits() <= 64 &&
265 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
266 return TTI::TCC_Free;
267 }
268 }
269
270 // Otherwise, use the full materialisation cost.
271 return getIntImmCost(Imm, Ty, CostKind);
272 }
273
274 // By default, prevent hoisting.
275 return TTI::TCC_Free;
276}
277
280 const APInt &Imm, Type *Ty,
282 // Prevent hoisting in unknown cases.
283 return TTI::TCC_Free;
284}
285
287 return ST->hasVInstructions();
288}
289
291RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) const {
292 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
293 return ST->hasCPOPLike() ? TTI::PSK_FastHardware : TTI::PSK_Software;
294}
295
297 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
299 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
301
302 // zve32x is broken for partial_reduce_umla, but let's make sure we
303 // don't generate them.
304 if (!ST->hasStdExtZvqdotq() || ST->getELen() < 64 ||
305 Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul ||
306 InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(8) ||
307 !AccumType->isIntegerTy(32) || !VF.isKnownMultipleOf(4))
309
310 Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4));
311 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
312 // Note: Assuming all vqdot* variants are equal cost
313 return LT.first *
314 getRISCVInstructionCost(RISCV::VQDOT_VV, LT.second, CostKind);
315}
316
318 // Currently, the ExpandReductions pass can't expand scalable-vector
319 // reductions, but we still request expansion as RVV doesn't support certain
320 // reductions and the SelectionDAG can't legalize them either.
321 switch (II->getIntrinsicID()) {
322 default:
323 return false;
324 // These reductions have no equivalent in RVV
325 case Intrinsic::vector_reduce_mul:
326 case Intrinsic::vector_reduce_fmul:
327 return true;
328 }
329}
330
331std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
332 if (ST->hasVInstructions())
333 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
334 return BaseT::getMaxVScale();
335}
336
337std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
338 if (ST->hasVInstructions())
339 if (unsigned MinVLen = ST->getRealMinVLen();
340 MinVLen >= RISCV::RVVBitsPerBlock)
341 return MinVLen / RISCV::RVVBitsPerBlock;
343}
344
347 unsigned LMUL =
348 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
349 switch (K) {
351 return TypeSize::getFixed(ST->getXLen());
353 return TypeSize::getFixed(
354 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
357 (ST->hasVInstructions() &&
358 ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
360 : 0);
361 }
362
363 llvm_unreachable("Unsupported register kind");
364}
365
367RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty,
369 // Add a cost of address generation + the cost of the load. The address
370 // is expected to be a PC relative offset to a constant pool entry
371 // using auipc/addi.
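// Illustrative sequence (hypothetical label and load):
//   auipc a0, %pcrel_hi(.LCPI0_0)
//   addi  a0, a0, %pcrel_lo(<auipc label>)
//   <load of Ty from (a0)>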
372 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
373 /*AddressSpace=*/0, CostKind);
374}
375
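/// Return true if \p Mask repeatedly concatenates the leading subvector of
/// the source, setting \p SubVectorSize to that subvector's length. For
/// example (illustrative), <0,1,0,1,0,1,0,1> yields SubVectorSize == 2.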
376static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) {
377 unsigned Size = Mask.size();
378 if (!isPowerOf2_32(Size))
379 return false;
380 for (unsigned I = 0; I != Size; ++I) {
381 if (static_cast<unsigned>(Mask[I]) == I)
382 continue;
383 if (Mask[I] != 0)
384 return false;
385 if (Size % I != 0)
386 return false;
387 for (unsigned J = I + 1; J != Size; ++J)
388 // Check the pattern is repeated.
389 if (static_cast<unsigned>(Mask[J]) != J % I)
390 return false;
391 SubVectorSize = I;
392 return true;
393 }
394 // That means Mask is <0, 1, 2, 3>. This is not a concatenation.
395 return false;
396}
397
399 LLVMContext &C) {
400 assert((DataVT.getScalarSizeInBits() != 8 ||
401 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
402 MVT IndexVT = DataVT.changeTypeToInteger();
403 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
404 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
405 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
406}
407
408/// Attempt to approximate the cost of a shuffle which will require splitting
409/// during legalization. Note that processShuffleMasks is not an exact proxy
410/// for the algorithm used in LegalizeVectorTypes, but hopefully it's a
411/// reasonably close upper bound.
413 MVT LegalVT, VectorType *Tp,
414 ArrayRef<int> Mask,
416 assert(LegalVT.isFixedLengthVector() && !Mask.empty() &&
417 "Expected fixed vector type and non-empty mask");
418 unsigned LegalNumElts = LegalVT.getVectorNumElements();
419 // Number of destination vectors after legalization:
420 unsigned NumOfDests = divideCeil(Mask.size(), LegalNumElts);
421 // We are going to permute multiple sources and the result will be in
422 // multiple destinations. Providing an accurate cost only for splits where
423 // the element type remains the same.
424 if (NumOfDests <= 1 ||
426 Tp->getElementType()->getPrimitiveSizeInBits() ||
427 LegalNumElts >= Tp->getElementCount().getFixedValue())
429
430 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
431 unsigned LegalVTSize = LegalVT.getStoreSize();
432 // Number of source vectors after legalization:
433 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
434
435 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(), LegalNumElts);
436
437 unsigned NormalizedVF = LegalNumElts * std::max(NumOfSrcs, NumOfDests);
438 unsigned NumOfSrcRegs = NormalizedVF / LegalNumElts;
439 unsigned NumOfDestRegs = NormalizedVF / LegalNumElts;
440 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
441 assert(NormalizedVF >= Mask.size() &&
442 "Normalized mask expected to be not shorter than original mask.");
443 copy(Mask, NormalizedMask.begin());
444 InstructionCost Cost = 0;
445 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
447 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
448 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
449 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
450 return;
451 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
452 .second)
453 return;
454 Cost += TTI.getShuffleCost(
456 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
457 SingleOpTy, RegMask, CostKind, 0, nullptr);
458 },
459 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
460 Cost += TTI.getShuffleCost(
462 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
463 SingleOpTy, RegMask, CostKind, 0, nullptr);
464 });
465 return Cost;
466}
467
468/// Try to perform better estimation of the permutation.
469/// 1. Split the source/destination vectors into real registers.
470/// 2. Do the mask analysis to identify which real registers are
471/// permuted. If more than 1 source registers are used for the
472/// destination register building, the cost for this destination register
473/// is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
474/// source register is used, build mask and calculate the cost as a cost
475/// of PermuteSingleSrc.
476/// Also, for the single register permute we try to identify if the
477/// destination register is just a copy of the source register or the
478/// copy of the previous destination register (the cost is
479/// TTI::TCC_Basic). If the source register is just reused, the cost for
480/// this operation is 0.
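/// For example (illustrative, assuming VLEN=128): a shuffle of <8 x i64>
/// spans four real registers of two elements each; a destination register
/// built from two different source registers is charged one PermuteTwoSrc,
/// one built from a single source register is charged one PermuteSingleSrc,
/// and a plain copy of a source register is free.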
481static InstructionCost
483 std::optional<unsigned> VLen, VectorType *Tp,
485 assert(LegalVT.isFixedLengthVector());
486 if (!VLen || Mask.empty())
488 MVT ElemVT = LegalVT.getVectorElementType();
489 unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
490 LegalVT = TTI.getTypeLegalizationCost(
491 FixedVectorType::get(Tp->getElementType(), ElemsPerVReg))
492 .second;
493 // Number of destination vectors after legalization:
494 InstructionCost NumOfDests =
495 divideCeil(Mask.size(), LegalVT.getVectorNumElements());
496 if (NumOfDests <= 1 ||
498 Tp->getElementType()->getPrimitiveSizeInBits() ||
499 LegalVT.getVectorNumElements() >= Tp->getElementCount().getFixedValue())
501
502 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
503 unsigned LegalVTSize = LegalVT.getStoreSize();
504 // Number of source vectors after legalization:
505 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
506
507 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(),
508 LegalVT.getVectorNumElements());
509
510 unsigned E = NumOfDests.getValue();
511 unsigned NormalizedVF =
512 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
513 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
514 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
515 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
516 assert(NormalizedVF >= Mask.size() &&
517 "Normalized mask expected to be not shorter than original mask.");
518 copy(Mask, NormalizedMask.begin());
519 InstructionCost Cost = 0;
520 int NumShuffles = 0;
521 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
523 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
524 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
525 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
526 return;
527 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
528 .second)
529 return;
530 ++NumShuffles;
531 Cost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
532 SingleOpTy, RegMask, CostKind, 0, nullptr);
533 },
534 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
535 Cost += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
536 SingleOpTy, RegMask, CostKind, 0, nullptr);
537 NumShuffles += 2;
538 });
539 // Note: check that we do not emit too many shuffles here to prevent code
540 // size explosion.
541 // TODO: investigate whether this can be improved by extra analysis of the
542 // masks to check if the code is more profitable.
543 if ((NumOfDestRegs > 2 && NumShuffles <= static_cast<int>(NumOfDestRegs)) ||
544 (NumOfDestRegs <= 2 && NumShuffles < 4))
545 return Cost;
547}
548
549InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
550 ArrayRef<int> Mask,
552 // Avoid missing masks and length-changing shuffles
553 if (Mask.size() <= 2 || Mask.size() != Tp->getNumElements())
555
556 int NumElts = Tp->getNumElements();
557 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
558 // Avoid scalarization cases
559 if (!LT.second.isFixedLengthVector())
561
562 // Requires moving elements between parts, which requires additional
563 // unmodeled instructions.
564 if (LT.first != 1)
566
567 auto GetSlideOpcode = [&](int SlideAmt) {
568 assert(SlideAmt != 0);
569 bool IsVI = isUInt<5>(std::abs(SlideAmt));
570 if (SlideAmt < 0)
571 return IsVI ? RISCV::VSLIDEDOWN_VI : RISCV::VSLIDEDOWN_VX;
572 return IsVI ? RISCV::VSLIDEUP_VI : RISCV::VSLIDEUP_VX;
573 };
574
575 std::array<std::pair<int, int>, 2> SrcInfo;
576 if (!isMaskedSlidePair(Mask, NumElts, SrcInfo))
578
579 if (SrcInfo[1].second == 0)
580 std::swap(SrcInfo[0], SrcInfo[1]);
581
582 InstructionCost FirstSlideCost = 0;
583 if (SrcInfo[0].second != 0) {
584 unsigned Opcode = GetSlideOpcode(SrcInfo[0].second);
585 FirstSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
586 }
587
588 if (SrcInfo[1].first == -1)
589 return FirstSlideCost;
590
591 InstructionCost SecondSlideCost = 0;
592 if (SrcInfo[1].second != 0) {
593 unsigned Opcode = GetSlideOpcode(SrcInfo[1].second);
594 SecondSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
595 } else {
596 SecondSlideCost =
597 getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
598 }
599
600 auto EC = Tp->getElementCount();
601 VectorType *MaskTy =
603 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
604 return FirstSlideCost + SecondSlideCost + MaskCost;
605}
606
609 VectorType *SrcTy, ArrayRef<int> Mask,
610 TTI::TargetCostKind CostKind, int Index,
612 const Instruction *CxtI) const {
613 assert((Mask.empty() || DstTy->isScalableTy() ||
614 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
615 "Expected the Mask to match the return size if given");
616 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
617 "Expected the same scalar types");
618
619 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
620 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
621
622 // First, handle cases where having a fixed length vector enables us to
623 // give a more accurate cost than falling back to generic scalable codegen.
624 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
625 if (auto *FVTp = dyn_cast<FixedVectorType>(SrcTy);
626 FVTp && ST->hasVInstructions() && LT.second.isFixedLengthVector()) {
628 *this, LT.second, ST->getRealVLen(),
629 Kind == TTI::SK_InsertSubvector ? DstTy : SrcTy, Mask, CostKind);
630 if (VRegSplittingCost.isValid())
631 return VRegSplittingCost;
632 switch (Kind) {
633 default:
634 break;
636 if (Mask.size() >= 2) {
637 MVT EltTp = LT.second.getVectorElementType();
638 // If the size of the element is < ELEN then shuffles of interleaves and
639 // deinterleaves of 2 vectors can be lowered into the following
640 // sequences
641 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
642 // Example sequence:
643 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
644 // vwaddu.vv v10, v8, v9
645 // li a0, -1 (ignored)
646 // vwmaccu.vx v10, a0, v9
647 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
648 return 2 * LT.first * TLI->getLMULCost(LT.second);
649
650 if (Mask[0] == 0 || Mask[0] == 1) {
651 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
652 // Example sequence:
653 // vnsrl.wi v10, v8, 0
654 if (equal(DeinterleaveMask, Mask))
655 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
656 LT.second, CostKind);
657 }
658 }
659 int SubVectorSize;
660 if (LT.second.getScalarSizeInBits() != 1 &&
661 isRepeatedConcatMask(Mask, SubVectorSize)) {
663 unsigned NumSlides = Log2_32(Mask.size() / SubVectorSize);
664 // The cost of extraction from a subvector is 0 if the index is 0.
665 for (unsigned I = 0; I != NumSlides; ++I) {
666 unsigned InsertIndex = SubVectorSize * (1 << I);
667 FixedVectorType *SubTp =
668 FixedVectorType::get(SrcTy->getElementType(), InsertIndex);
669 FixedVectorType *DestTp =
671 std::pair<InstructionCost, MVT> DestLT =
673 // Add the cost of whole vector register move because the
674 // destination vector register group for vslideup cannot overlap the
675 // source.
676 Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
677 Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, DestTp, {},
678 CostKind, InsertIndex, SubTp);
679 }
680 return Cost;
681 }
682 }
683
684 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
685 SlideCost.isValid())
686 return SlideCost;
687
688 // vrgather + cost of generating the mask constant.
689 // We model this for an unknown mask with a single vrgather.
690 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
691 LT.second.getVectorNumElements() <= 256)) {
692 VectorType *IdxTy =
693 getVRGatherIndexType(LT.second, *ST, SrcTy->getContext());
694 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
695 return IndexCost +
696 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
697 }
698 break;
699 }
702
703 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
704 SlideCost.isValid())
705 return SlideCost;
706
707 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
708 // register for the second vrgather. We model this for an unknown
709 // (shuffle) mask.
710 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
711 LT.second.getVectorNumElements() <= 256)) {
712 auto &C = SrcTy->getContext();
713 auto EC = SrcTy->getElementCount();
714 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
716 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
717 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
718 return 2 * IndexCost +
719 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
720 LT.second, CostKind) +
721 MaskCost;
722 }
723 break;
724 }
725 }
726
727 auto shouldSplit = [](TTI::ShuffleKind Kind) {
728 switch (Kind) {
729 default:
730 return false;
734 return true;
735 }
736 };
737
738 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
739 shouldSplit(Kind)) {
740 InstructionCost SplitCost =
741 costShuffleViaSplitting(*this, LT.second, FVTp, Mask, CostKind);
742 if (SplitCost.isValid())
743 return SplitCost;
744 }
745 }
746
747 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
748 switch (Kind) {
749 default:
750 // Fallthrough to generic handling.
751 // TODO: Most of these cases will return getInvalid in generic code, and
752 // must be implemented here.
753 break;
755 // Extract at zero is always a subregister extract
756 if (Index == 0)
757 return TTI::TCC_Free;
758
759 // If we're extracting a subvector of at most m1 size at a sub-register
760 // boundary - which unfortunately we need exact vlen to identify - this is
761 // a subregister extract at worst and thus won't require a vslidedown.
762 // TODO: Extend for aligned m2, m4 subvector extracts
763 // TODO: Extend for misaligned (but contained) extracts
764 // TODO: Extend for scalable subvector types
765 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
766 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
767 if (std::optional<unsigned> VLen = ST->getRealVLen();
768 VLen && SubLT.second.getScalarSizeInBits() * Index % *VLen == 0 &&
769 SubLT.second.getSizeInBits() <= *VLen)
770 return TTI::TCC_Free;
771 }
772
773 // Example sequence:
774 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
775 // vslidedown.vi v8, v9, 2
776 return LT.first *
777 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
779 // Example sequence:
780 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
781 // vslideup.vi v8, v9, 2
782 LT = getTypeLegalizationCost(DstTy);
783 return LT.first *
784 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
785 case TTI::SK_Select: {
786 // Example sequence:
787 // li a0, 90
788 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
789 // vmv.s.x v0, a0
790 // vmerge.vvm v8, v9, v8, v0
791 // We use 2 for the cost of the mask materialization as this is the true
792 // cost for small masks and most shuffles are small. At worst, this cost
793 // should be a very small constant for the constant pool load. As such,
794 // we may bias towards large selects slightly more than truly warranted.
795 return LT.first *
796 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
797 LT.second, CostKind));
798 }
799 case TTI::SK_Broadcast: {
800 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
801 Instruction::InsertElement);
802 if (LT.second.getScalarSizeInBits() == 1) {
803 if (HasScalar) {
804 // Example sequence:
805 // andi a0, a0, 1
806 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
807 // vmv.v.x v8, a0
808 // vmsne.vi v0, v8, 0
809 return LT.first *
810 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
811 LT.second, CostKind));
812 }
813 // Example sequence:
814 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
815 // vmv.v.i v8, 0
816 // vmerge.vim v8, v8, 1, v0
817 // vmv.x.s a0, v8
818 // andi a0, a0, 1
819 // vmv.v.x v8, a0
820 // vmsne.vi v0, v8, 0
821
822 return LT.first *
823 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
824 RISCV::VMV_X_S, RISCV::VMV_V_X,
825 RISCV::VMSNE_VI},
826 LT.second, CostKind));
827 }
828
829 if (HasScalar) {
830 // Example sequence:
831 // vmv.v.x v8, a0
832 return LT.first *
833 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
834 }
835
836 // Example sequence:
837 // vrgather.vi v9, v8, 0
838 return LT.first *
839 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
840 }
841 case TTI::SK_Splice: {
842 // vslidedown+vslideup.
843 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
844 // of similar code, but I think we expand through memory.
845 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
846 if (Index >= 0 && Index < 32)
847 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
848 else if (Index < 0 && Index > -32)
849 Opcodes[1] = RISCV::VSLIDEUP_VI;
850 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
851 }
852 case TTI::SK_Reverse: {
853
854 if (!LT.second.isVector())
856
857 // TODO: Cases to improve here:
858 // * Illegal vector types
859 // * i64 on RV32
860 if (SrcTy->getElementType()->isIntegerTy(1)) {
861 VectorType *WideTy =
862 VectorType::get(IntegerType::get(SrcTy->getContext(), 8),
863 cast<VectorType>(SrcTy)->getElementCount());
864 return getCastInstrCost(Instruction::ZExt, WideTy, SrcTy,
866 getShuffleCost(TTI::SK_Reverse, WideTy, WideTy, {}, CostKind, 0,
867 nullptr) +
868 getCastInstrCost(Instruction::Trunc, SrcTy, WideTy,
870 }
871
872 MVT ContainerVT = LT.second;
873 if (LT.second.isFixedLengthVector())
874 ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
875 MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT);
876 if (ContainerVT.bitsLE(M1VT)) {
877 // Example sequence:
878 // csrr a0, vlenb
879 // srli a0, a0, 3
880 // addi a0, a0, -1
881 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
882 // vid.v v9
883 // vrsub.vx v10, v9, a0
884 // vrgather.vv v9, v8, v10
885 InstructionCost LenCost = 3;
886 if (LT.second.isFixedLengthVector())
887 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
888 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
889 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
890 if (LT.second.isFixedLengthVector() &&
891 isInt<5>(LT.second.getVectorNumElements() - 1))
892 Opcodes[1] = RISCV::VRSUB_VI;
893 InstructionCost GatherCost =
894 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
895 return LT.first * (LenCost + GatherCost);
896 }
897
898 // At high LMUL, we split into a series of M1 reverses (see
899 // lowerVECTOR_REVERSE) and then do a single slide at the end to eliminate
900 // the resulting gap at the bottom (for fixed vectors only). The important
901 // bit is that the cost scales linearly, not quadratically with LMUL.
902 unsigned M1Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX};
903 InstructionCost FixedCost =
904 getRISCVInstructionCost(M1Opcodes, M1VT, CostKind) + 3;
905 unsigned Ratio =
907 InstructionCost GatherCost =
908 getRISCVInstructionCost({RISCV::VRGATHER_VV}, M1VT, CostKind) * Ratio;
909 InstructionCost SlideCost = !LT.second.isFixedLengthVector() ? 0 :
910 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX}, LT.second, CostKind);
911 return FixedCost + LT.first * (GatherCost + SlideCost);
912 }
913 }
914 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
915 SubTp);
916}
917
918static unsigned isM1OrSmaller(MVT VT) {
920 return (LMUL == RISCVVType::VLMUL::LMUL_F8 ||
924}
925
927 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
928 TTI::TargetCostKind CostKind, bool ForPoisonSrc,
929 ArrayRef<Value *> VL) const {
932
933 // A build_vector (which is m1 sized or smaller) can be done in no
934 // worse than one vslide1down.vx per element in the type. We could
935 // in theory do an explode_vector in the inverse manner, but our
936 // lowering today does not have a first class node for this pattern.
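// Illustrative example: building a <4 x i32> vector from four scalars is
// bounded by four vslide1down.vx instructions.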
938 Ty, DemandedElts, Insert, Extract, CostKind);
939 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
940 if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) {
941 if (Ty->getScalarSizeInBits() == 1) {
942 auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8));
943 // Note: Implicit scalar anyextend is assumed to be free since the i1
944 // must be stored in a GPR.
945 return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract,
946 CostKind) +
947 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
949 }
950
951 assert(LT.second.isFixedLengthVector());
952 MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
953 if (isM1OrSmaller(ContainerVT)) {
954 InstructionCost BV =
955 cast<FixedVectorType>(Ty)->getNumElements() *
956 getRISCVInstructionCost(RISCV::VSLIDE1DOWN_VX, LT.second, CostKind);
957 if (BV < Cost)
958 Cost = BV;
959 }
960 }
961 return Cost;
962}
963
965RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
966 unsigned AddressSpace,
968 if (!isLegalMaskedLoadStore(Src, Alignment) ||
970 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
971 CostKind);
972
973 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
974}
975
977 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
978 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
979 bool UseMaskForCond, bool UseMaskForGaps) const {
980
981 // The interleaved memory access pass will lower (de)interleave ops combined
982 // with an adjacent memory operation to vlseg/vsseg intrinsics. vlseg/vsseg
983 // only support masking per-iteration (i.e. condition), not per-segment (i.e.
984 // gap).
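// Illustrative example (assumed lowering): a factor-2 interleaved load of
// i32 elements becomes a single vlseg2e32.v, and the matching interleaved
// store becomes a vsseg2e32.v.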
985 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
986 auto *VTy = cast<VectorType>(VecTy);
987 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
988 // Need to make sure type hasn't been scalarized
989 if (LT.second.isVector()) {
990 auto *SubVecTy =
991 VectorType::get(VTy->getElementType(),
992 VTy->getElementCount().divideCoefficientBy(Factor));
993 if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
994 TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
995 AddressSpace, DL)) {
996
997 // Some processors optimize segment loads/stores as one wide memory op +
998 // Factor * LMUL shuffle ops.
999 if (ST->hasOptimizedSegmentLoadStore(Factor)) {
1001 getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
1002 MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
1003 Cost += Factor * TLI->getLMULCost(SubVecVT);
1004 return LT.first * Cost;
1005 }
1006
1007 // Otherwise, the cost is proportional to the number of elements (VL *
1008 // Factor ops).
1009 InstructionCost MemOpCost =
1010 getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0,
1011 CostKind, {TTI::OK_AnyValue, TTI::OP_None});
1012 unsigned NumLoads = getEstimatedVLFor(VTy);
1013 return NumLoads * MemOpCost;
1014 }
1015 }
1016 }
1017
1018 // TODO: Return the cost of interleaved accesses for scalable vectors when
1019 // unable to convert to segment access instructions.
1020 if (isa<ScalableVectorType>(VecTy))
1022
1023 auto *FVTy = cast<FixedVectorType>(VecTy);
1024 InstructionCost MemCost =
1025 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
1026 unsigned VF = FVTy->getNumElements() / Factor;
1027
1028 // An interleaved load will look like this for Factor=3:
1029 // %wide.vec = load <12 x i32>, ptr %3, align 4
1030 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1031 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1032 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1033 if (Opcode == Instruction::Load) {
1034 InstructionCost Cost = MemCost;
1035 for (unsigned Index : Indices) {
1036 FixedVectorType *VecTy =
1037 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
1038 auto Mask = createStrideMask(Index, Factor, VF);
1039 Mask.resize(VF * Factor, -1);
1040 InstructionCost ShuffleCost =
1042 Mask, CostKind, 0, nullptr, {});
1043 Cost += ShuffleCost;
1044 }
1045 return Cost;
1046 }
1047
1048 // TODO: Model for NF > 2
1049 // We'll need to enhance getShuffleCost to model shuffles that are just
1050 // inserts and extracts into subvectors, since they won't have the full cost
1051 // of a vrgather.
1052 // An interleaved store for 3 vectors of 4 lanes will look like
1053 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
1054 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
1055 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
1056 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
1057 // store <12 x i32> %interleaved.vec, ptr %10, align 4
1058 if (Factor != 2)
1059 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1060 Alignment, AddressSpace, CostKind,
1061 UseMaskForCond, UseMaskForGaps);
1062
1063 assert(Opcode == Instruction::Store && "Opcode must be a store");
1064 // For an interleaving store of 2 vectors, we perform one large interleaving
1065 // shuffle that goes into the wide store
1066 auto Mask = createInterleaveMask(VF, Factor);
1067 InstructionCost ShuffleCost =
1069 CostKind, 0, nullptr, {});
1070 return MemCost + ShuffleCost;
1071}
1072
1074 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1075 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
1077 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1078 Alignment, CostKind, I);
1079
1080 if ((Opcode == Instruction::Load &&
1081 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
1082 (Opcode == Instruction::Store &&
1083 !isLegalMaskedScatter(DataTy, Align(Alignment))))
1084 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1085 Alignment, CostKind, I);
1086
1087 // Cost is proportional to the number of memory operations implied. For
1088 // scalable vectors, we use an estimate on that number since we don't
1089 // know exactly what VL will be.
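// Illustrative example (assumed numbers): a gather from <vscale x 4 x i32>
// with a tuning vscale of 2 gets an estimated VL of 8 and is costed as
// roughly 8 element-sized memory operations.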
1090 auto &VTy = *cast<VectorType>(DataTy);
1091 InstructionCost MemOpCost =
1092 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1093 {TTI::OK_AnyValue, TTI::OP_None}, I);
1094 unsigned NumLoads = getEstimatedVLFor(&VTy);
1095 return NumLoads * MemOpCost;
1096}
1097
1099 unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment,
1100 TTI::TargetCostKind CostKind, const Instruction *I) const {
1101 bool IsLegal = (Opcode == Instruction::Store &&
1102 isLegalMaskedCompressStore(DataTy, Alignment)) ||
1103 (Opcode == Instruction::Load &&
1104 isLegalMaskedExpandLoad(DataTy, Alignment));
1105 if (!IsLegal || CostKind != TTI::TCK_RecipThroughput)
1106 return BaseT::getExpandCompressMemoryOpCost(Opcode, DataTy, VariableMask,
1107 Alignment, CostKind, I);
1108 // Example compressstore sequence:
1109 // vsetivli zero, 8, e32, m2, ta, ma (ignored)
1110 // vcompress.vm v10, v8, v0
1111 // vcpop.m a1, v0
1112 // vsetvli zero, a1, e32, m2, ta, ma
1113 // vse32.v v10, (a0)
1114 // Example expandload sequence:
1115 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
1116 // vcpop.m a1, v0
1117 // vsetvli zero, a1, e32, m2, ta, ma
1118 // vle32.v v10, (a0)
1119 // vsetivli zero, 8, e32, m2, ta, ma
1120 // viota.m v12, v0
1121 // vrgather.vv v8, v10, v12, v0.t
1122 auto MemOpCost =
1123 getMemoryOpCost(Opcode, DataTy, Alignment, /*AddressSpace*/ 0, CostKind);
1124 auto LT = getTypeLegalizationCost(DataTy);
1125 SmallVector<unsigned, 4> Opcodes{RISCV::VSETVLI};
1126 if (VariableMask)
1127 Opcodes.push_back(RISCV::VCPOP_M);
1128 if (Opcode == Instruction::Store)
1129 Opcodes.append({RISCV::VCOMPRESS_VM});
1130 else
1131 Opcodes.append({RISCV::VSETIVLI, RISCV::VIOTA_M, RISCV::VRGATHER_VV});
1132 return MemOpCost +
1133 LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1134}
1135
1137 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1138 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
1139 if (((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1140 !isLegalStridedLoadStore(DataTy, Alignment)) ||
1141 (Opcode != Instruction::Load && Opcode != Instruction::Store))
1142 return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask,
1143 Alignment, CostKind, I);
1144
1146 return TTI::TCC_Basic;
1147
1148 // Cost is proportional to the number of memory operations implied. For
1149 // scalable vectors, we use an estimate on that number since we don't
1150 // know exactly what VL will be.
1151 auto &VTy = *cast<VectorType>(DataTy);
1152 InstructionCost MemOpCost =
1153 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1154 {TTI::OK_AnyValue, TTI::OP_None}, I);
1155 unsigned NumLoads = getEstimatedVLFor(&VTy);
1156 return NumLoads * MemOpCost;
1157}
1158
1161 // FIXME: This is a property of the default vector convention, not
1162 // all possible calling conventions. Fixing that will require
1163 // some TTI API and SLP rework.
1166 for (auto *Ty : Tys) {
1167 if (!Ty->isVectorTy())
1168 continue;
1169 Align A = DL.getPrefTypeAlign(Ty);
1170 Cost += getMemoryOpCost(Instruction::Store, Ty, A, 0, CostKind) +
1171 getMemoryOpCost(Instruction::Load, Ty, A, 0, CostKind);
1172 }
1173 return Cost;
1174}
1175
1176// Currently, these represent both throughput and codesize costs
1177// for the respective intrinsics. The costs in this table are simply
1178// instruction counts with the following adjustments made:
1179// * One vsetvli is considered free.
1181 {Intrinsic::floor, MVT::f32, 9},
1182 {Intrinsic::floor, MVT::f64, 9},
1183 {Intrinsic::ceil, MVT::f32, 9},
1184 {Intrinsic::ceil, MVT::f64, 9},
1185 {Intrinsic::trunc, MVT::f32, 7},
1186 {Intrinsic::trunc, MVT::f64, 7},
1187 {Intrinsic::round, MVT::f32, 9},
1188 {Intrinsic::round, MVT::f64, 9},
1189 {Intrinsic::roundeven, MVT::f32, 9},
1190 {Intrinsic::roundeven, MVT::f64, 9},
1191 {Intrinsic::rint, MVT::f32, 7},
1192 {Intrinsic::rint, MVT::f64, 7},
1193 {Intrinsic::nearbyint, MVT::f32, 9},
1194 {Intrinsic::nearbyint, MVT::f64, 9},
1195 {Intrinsic::bswap, MVT::i16, 3},
1196 {Intrinsic::bswap, MVT::i32, 12},
1197 {Intrinsic::bswap, MVT::i64, 31},
1198 {Intrinsic::vp_bswap, MVT::i16, 3},
1199 {Intrinsic::vp_bswap, MVT::i32, 12},
1200 {Intrinsic::vp_bswap, MVT::i64, 31},
1201 {Intrinsic::vp_fshl, MVT::i8, 7},
1202 {Intrinsic::vp_fshl, MVT::i16, 7},
1203 {Intrinsic::vp_fshl, MVT::i32, 7},
1204 {Intrinsic::vp_fshl, MVT::i64, 7},
1205 {Intrinsic::vp_fshr, MVT::i8, 7},
1206 {Intrinsic::vp_fshr, MVT::i16, 7},
1207 {Intrinsic::vp_fshr, MVT::i32, 7},
1208 {Intrinsic::vp_fshr, MVT::i64, 7},
1209 {Intrinsic::bitreverse, MVT::i8, 17},
1210 {Intrinsic::bitreverse, MVT::i16, 24},
1211 {Intrinsic::bitreverse, MVT::i32, 33},
1212 {Intrinsic::bitreverse, MVT::i64, 52},
1213 {Intrinsic::vp_bitreverse, MVT::i8, 17},
1214 {Intrinsic::vp_bitreverse, MVT::i16, 24},
1215 {Intrinsic::vp_bitreverse, MVT::i32, 33},
1216 {Intrinsic::vp_bitreverse, MVT::i64, 52},
1217 {Intrinsic::ctpop, MVT::i8, 12},
1218 {Intrinsic::ctpop, MVT::i16, 19},
1219 {Intrinsic::ctpop, MVT::i32, 20},
1220 {Intrinsic::ctpop, MVT::i64, 21},
1221 {Intrinsic::ctlz, MVT::i8, 19},
1222 {Intrinsic::ctlz, MVT::i16, 28},
1223 {Intrinsic::ctlz, MVT::i32, 31},
1224 {Intrinsic::ctlz, MVT::i64, 35},
1225 {Intrinsic::cttz, MVT::i8, 16},
1226 {Intrinsic::cttz, MVT::i16, 23},
1227 {Intrinsic::cttz, MVT::i32, 24},
1228 {Intrinsic::cttz, MVT::i64, 25},
1229 {Intrinsic::vp_ctpop, MVT::i8, 12},
1230 {Intrinsic::vp_ctpop, MVT::i16, 19},
1231 {Intrinsic::vp_ctpop, MVT::i32, 20},
1232 {Intrinsic::vp_ctpop, MVT::i64, 21},
1233 {Intrinsic::vp_ctlz, MVT::i8, 19},
1234 {Intrinsic::vp_ctlz, MVT::i16, 28},
1235 {Intrinsic::vp_ctlz, MVT::i32, 31},
1236 {Intrinsic::vp_ctlz, MVT::i64, 35},
1237 {Intrinsic::vp_cttz, MVT::i8, 16},
1238 {Intrinsic::vp_cttz, MVT::i16, 23},
1239 {Intrinsic::vp_cttz, MVT::i32, 24},
1240 {Intrinsic::vp_cttz, MVT::i64, 25},
1241};
1242
1246 auto *RetTy = ICA.getReturnType();
1247 switch (ICA.getID()) {
1248 case Intrinsic::lrint:
1249 case Intrinsic::llrint:
1250 case Intrinsic::lround:
1251 case Intrinsic::llround: {
1252 auto LT = getTypeLegalizationCost(RetTy);
1253 Type *SrcTy = ICA.getArgTypes().front();
1254 auto SrcLT = getTypeLegalizationCost(SrcTy);
1255 if (ST->hasVInstructions() && LT.second.isVector()) {
1257 unsigned SrcEltSz = DL.getTypeSizeInBits(SrcTy->getScalarType());
1258 unsigned DstEltSz = DL.getTypeSizeInBits(RetTy->getScalarType());
1259 if (LT.second.getVectorElementType() == MVT::bf16) {
1260 if (!ST->hasVInstructionsBF16Minimal())
1262 if (DstEltSz == 32)
1263 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFCVT_X_F_V};
1264 else
1265 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVT_X_F_V};
1266 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1267 !ST->hasVInstructionsF16()) {
1268 if (!ST->hasVInstructionsF16Minimal())
1270 if (DstEltSz == 32)
1271 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFCVT_X_F_V};
1272 else
1273 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_X_F_V};
1274
1275 } else if (SrcEltSz > DstEltSz) {
1276 Ops = {RISCV::VFNCVT_X_F_W};
1277 } else if (SrcEltSz < DstEltSz) {
1278 Ops = {RISCV::VFWCVT_X_F_V};
1279 } else {
1280 Ops = {RISCV::VFCVT_X_F_V};
1281 }
1282
1283 // We need to use the source LMUL in the case of a narrowing op, and the
1284 // destination LMUL otherwise.
1285 if (SrcEltSz > DstEltSz)
1286 return SrcLT.first *
1287 getRISCVInstructionCost(Ops, SrcLT.second, CostKind);
1288 return LT.first * getRISCVInstructionCost(Ops, LT.second, CostKind);
1289 }
1290 break;
1291 }
1292 case Intrinsic::ceil:
1293 case Intrinsic::floor:
1294 case Intrinsic::trunc:
1295 case Intrinsic::rint:
1296 case Intrinsic::round:
1297 case Intrinsic::roundeven: {
1298 // These all use the same code.
1299 auto LT = getTypeLegalizationCost(RetTy);
1300 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
1301 return LT.first * 8;
1302 break;
1303 }
1304 case Intrinsic::umin:
1305 case Intrinsic::umax:
1306 case Intrinsic::smin:
1307 case Intrinsic::smax: {
1308 auto LT = getTypeLegalizationCost(RetTy);
1309 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
1310 return LT.first;
1311
1312 if (ST->hasVInstructions() && LT.second.isVector()) {
1313 unsigned Op;
1314 switch (ICA.getID()) {
1315 case Intrinsic::umin:
1316 Op = RISCV::VMINU_VV;
1317 break;
1318 case Intrinsic::umax:
1319 Op = RISCV::VMAXU_VV;
1320 break;
1321 case Intrinsic::smin:
1322 Op = RISCV::VMIN_VV;
1323 break;
1324 case Intrinsic::smax:
1325 Op = RISCV::VMAX_VV;
1326 break;
1327 }
1328 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1329 }
1330 break;
1331 }
1332 case Intrinsic::sadd_sat:
1333 case Intrinsic::ssub_sat:
1334 case Intrinsic::uadd_sat:
1335 case Intrinsic::usub_sat: {
1336 auto LT = getTypeLegalizationCost(RetTy);
1337 if (ST->hasVInstructions() && LT.second.isVector()) {
1338 unsigned Op;
1339 switch (ICA.getID()) {
1340 case Intrinsic::sadd_sat:
1341 Op = RISCV::VSADD_VV;
1342 break;
1343 case Intrinsic::ssub_sat:
1344 Op = RISCV::VSSUBU_VV;
1345 break;
1346 case Intrinsic::uadd_sat:
1347 Op = RISCV::VSADDU_VV;
1348 break;
1349 case Intrinsic::usub_sat:
1350 Op = RISCV::VSSUBU_VV;
1351 break;
1352 }
1353 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1354 }
1355 break;
1356 }
1357 case Intrinsic::fma:
1358 case Intrinsic::fmuladd: {
1359 // TODO: handle promotion with f16/bf16 with zvfhmin/zvfbfmin
1360 auto LT = getTypeLegalizationCost(RetTy);
1361 if (ST->hasVInstructions() && LT.second.isVector())
1362 return LT.first *
1363 getRISCVInstructionCost(RISCV::VFMADD_VV, LT.second, CostKind);
1364 break;
1365 }
1366 case Intrinsic::fabs: {
1367 auto LT = getTypeLegalizationCost(RetTy);
1368 if (ST->hasVInstructions() && LT.second.isVector()) {
1369 // lui a0, 8
1370 // addi a0, a0, -1
1371 // vsetvli a1, zero, e16, m1, ta, ma
1372 // vand.vx v8, v8, a0
1373 // f16 with zvfhmin and bf16 with zvfbfmin
1374 if (LT.second.getVectorElementType() == MVT::bf16 ||
1375 (LT.second.getVectorElementType() == MVT::f16 &&
1376 !ST->hasVInstructionsF16()))
1377 return LT.first * getRISCVInstructionCost(RISCV::VAND_VX, LT.second,
1378 CostKind) +
1379 2;
1380 else
1381 return LT.first *
1382 getRISCVInstructionCost(RISCV::VFSGNJX_VV, LT.second, CostKind);
1383 }
1384 break;
1385 }
1386 case Intrinsic::sqrt: {
1387 auto LT = getTypeLegalizationCost(RetTy);
1388 if (ST->hasVInstructions() && LT.second.isVector()) {
1391 MVT ConvType = LT.second;
1392 MVT FsqrtType = LT.second;
1393 // f16 with zvfhmin and bf16 with zvfbfmin and the type of nxv32[b]f16
1394 // will be split.
1395 if (LT.second.getVectorElementType() == MVT::bf16) {
1396 if (LT.second == MVT::nxv32bf16) {
1397 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVTBF16_F_F_V,
1398 RISCV::VFNCVTBF16_F_F_W, RISCV::VFNCVTBF16_F_F_W};
1399 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1400 ConvType = MVT::nxv16f16;
1401 FsqrtType = MVT::nxv16f32;
1402 } else {
1403 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFNCVTBF16_F_F_W};
1404 FsqrtOp = {RISCV::VFSQRT_V};
1405 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1406 }
1407 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1408 !ST->hasVInstructionsF16()) {
1409 if (LT.second == MVT::nxv32f16) {
1410 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V,
1411 RISCV::VFNCVT_F_F_W, RISCV::VFNCVT_F_F_W};
1412 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1413 ConvType = MVT::nxv16f16;
1414 FsqrtType = MVT::nxv16f32;
1415 } else {
1416 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFNCVT_F_F_W};
1417 FsqrtOp = {RISCV::VFSQRT_V};
1418 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1419 }
1420 } else {
1421 FsqrtOp = {RISCV::VFSQRT_V};
1422 }
1423
1424 return LT.first * (getRISCVInstructionCost(FsqrtOp, FsqrtType, CostKind) +
1425 getRISCVInstructionCost(ConvOp, ConvType, CostKind));
1426 }
1427 break;
1428 }
1429 case Intrinsic::cttz:
1430 case Intrinsic::ctlz:
1431 case Intrinsic::ctpop: {
1432 auto LT = getTypeLegalizationCost(RetTy);
1433 if (ST->hasStdExtZvbb() && LT.second.isVector()) {
1434 unsigned Op;
1435 switch (ICA.getID()) {
1436 case Intrinsic::cttz:
1437 Op = RISCV::VCTZ_V;
1438 break;
1439 case Intrinsic::ctlz:
1440 Op = RISCV::VCLZ_V;
1441 break;
1442 case Intrinsic::ctpop:
1443 Op = RISCV::VCPOP_V;
1444 break;
1445 }
1446 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1447 }
1448 break;
1449 }
1450 case Intrinsic::abs: {
1451 auto LT = getTypeLegalizationCost(RetTy);
1452 if (ST->hasVInstructions() && LT.second.isVector()) {
1453 // vrsub.vi v10, v8, 0
1454 // vmax.vv v8, v8, v10
1455 return LT.first *
1456 getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV},
1457 LT.second, CostKind);
1458 }
1459 break;
1460 }
1461 case Intrinsic::get_active_lane_mask: {
1462 if (ST->hasVInstructions()) {
1463 Type *ExpRetTy = VectorType::get(
1464 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1465 auto LT = getTypeLegalizationCost(ExpRetTy);
1466
1467 // vid.v v8 // considered hoisted
1468 // vsaddu.vx v8, v8, a0
1469 // vmsltu.vx v0, v8, a1
1470 return LT.first *
1471 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
1472 LT.second, CostKind);
1473 }
1474 break;
1475 }
1476 // TODO: add more intrinsics
1477 case Intrinsic::stepvector: {
1478 auto LT = getTypeLegalizationCost(RetTy);
1479 // Legalisation of illegal types involves an `index' instruction plus
1480 // (LT.first - 1) vector adds.
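// Illustrative example: a stepvector whose type legalizes into two
// registers (LT.first == 2) costs one vid.v plus one vadd.vx.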
1481 if (ST->hasVInstructions())
1482 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
1483 (LT.first - 1) *
1484 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
1485 return 1 + (LT.first - 1);
1486 }
1487 case Intrinsic::experimental_cttz_elts: {
1488 Type *ArgTy = ICA.getArgTypes()[0];
1489 EVT ArgType = TLI->getValueType(DL, ArgTy, true);
1490 if (getTLI()->shouldExpandCttzElements(ArgType))
1491 break;
1492 InstructionCost Cost = getRISCVInstructionCost(
1493 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
1494
1495 // If zero_is_poison is false, then we will generate additional
1496 // cmp + select instructions to convert -1 to EVL.
1497 Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
1498 if (ICA.getArgs().size() > 1 &&
1499 cast<ConstantInt>(ICA.getArgs()[1])->isZero())
1500 Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
1502 getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
1504
1505 return Cost;
1506 }
1507 case Intrinsic::experimental_vp_splat: {
1508 auto LT = getTypeLegalizationCost(RetTy);
1509 // TODO: Lower i1 experimental_vp_splat
1510 if (!ST->hasVInstructions() || LT.second.getScalarType() == MVT::i1)
1512 return LT.first * getRISCVInstructionCost(LT.second.isFloatingPoint()
1513 ? RISCV::VFMV_V_F
1514 : RISCV::VMV_V_X,
1515 LT.second, CostKind);
1516 }
1517 case Intrinsic::experimental_vp_splice: {
1518 // To support type-based query from vectorizer, set the index to 0.
1519 // Note that the index only changes the cost from vslide.vx to vslide.vi,
1520 // and in current implementations they have the same cost.
1522 cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind,
1524 }
1525 case Intrinsic::fptoui_sat:
1526 case Intrinsic::fptosi_sat: {
1528 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1529 Type *SrcTy = ICA.getArgTypes()[0];
1530
1531 auto SrcLT = getTypeLegalizationCost(SrcTy);
1532 auto DstLT = getTypeLegalizationCost(RetTy);
1533 if (!SrcTy->isVectorTy())
1534 break;
1535
1536 if (!SrcLT.first.isValid() || !DstLT.first.isValid())
1538
1539 Cost +=
1540 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
1541 RetTy, SrcTy, TTI::CastContextHint::None, CostKind);
1542
1543 // Handle NaN.
1544 // vmfne v0, v8, v8 # If v8[i] is NaN set v0[i] to 1.
1545 // vmerge.vim v8, v8, 0, v0 # Convert NaN to 0.
1546 Type *CondTy = RetTy->getWithNewBitWidth(1);
1547 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, SrcTy, CondTy,
1549 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1551 return Cost;
1552 }
1553 }
1554
1555 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1556 if (auto LT = getTypeLegalizationCost(RetTy);
1557 LT.second.isVector()) {
1558 MVT EltTy = LT.second.getVectorElementType();
1559 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1560 ICA.getID(), EltTy))
1561 return LT.first * Entry->Cost;
1562 }
1563 }
1564
1566}
1567
1570 const SCEV *Ptr,
1572 // Address computations for vector indexed load/store likely require an offset
1573 // and/or scaling.
1574 if (ST->hasVInstructions() && PtrTy->isVectorTy())
1575 return getArithmeticInstrCost(Instruction::Add, PtrTy, CostKind);
1576
1577 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
1578}
1579
1581 Type *Src,
1584 const Instruction *I) const {
1585 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
1586 if (!IsVectorType)
1587 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1588
1589 // FIXME: Need to compute legalizing cost for illegal types. The current
1590 // code handles only legal types and those which can be trivially
1591 // promoted to legal.
1592 if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() ||
1593 Dst->getScalarSizeInBits() > ST->getELen())
1594 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1595
1596 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1597 assert(ISD && "Invalid opcode");
1598 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1599 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1600
1601 // Handle i1 source and dest cases *before* calling logic in BasicTTI.
1602 // The shared implementation doesn't model vector widening during legalization
1603 // and instead assumes scalarization. In order to scalarize an <N x i1>
1604 // vector, we need to extend/trunc to/from i8. If we don't special case
1605 // this, we can get an infinite recursion cycle.
1606 switch (ISD) {
1607 default:
1608 break;
1609 case ISD::SIGN_EXTEND:
1610 case ISD::ZERO_EXTEND:
1611 if (Src->getScalarSizeInBits() == 1) {
1612 // We do not use vsext/vzext to extend from a mask vector.
1613 // Instead we use the following instructions to extend from a mask vector:
1614 // vmv.v.i v8, 0
1615 // vmerge.vim v8, v8, -1, v0 (repeated per split)
1616 return getRISCVInstructionCost(RISCV::VMV_V_I, DstLT.second, CostKind) +
1617 DstLT.first * getRISCVInstructionCost(RISCV::VMERGE_VIM,
1618 DstLT.second, CostKind) +
1619 DstLT.first - 1;
1620 }
1621 break;
1622 case ISD::TRUNCATE:
1623 if (Dst->getScalarSizeInBits() == 1) {
1624 // We do not use several vncvt instructions to truncate to a mask vector,
1625 // so we cannot use PowDiff to calculate the cost.
1626 // Instead we use the following instructions to truncate to a mask vector:
1627 // vand.vi v8, v8, 1
1628 // vmsne.vi v0, v8, 0
1629 return SrcLT.first *
1630 getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1631 SrcLT.second, CostKind) +
1632 SrcLT.first - 1;
1633 }
1634 break;
1635 };
1636
1637 // Our actual lowering for the case where a wider legal type is available
1638 // uses promotion to the wider type. This is reflected in the result of
1639 // getTypeLegalizationCost, but BasicTTI assumes the widened cases are
1640 // scalarized if the legalized Src and Dst are not equal sized.
1641 const DataLayout &DL = this->getDataLayout();
1642 if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
1643 !SrcLT.first.isValid() || !DstLT.first.isValid() ||
1644 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src),
1645 SrcLT.second.getSizeInBits()) ||
1646 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst),
1647 DstLT.second.getSizeInBits()))
1648 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1649
1650 // The split cost is handled by the base getCastInstrCost
1651 assert((SrcLT.first == 1) && (DstLT.first == 1) && "Illegal type");
1652
1653 int PowDiff = (int)Log2_32(DstLT.second.getScalarSizeInBits()) -
1654 (int)Log2_32(SrcLT.second.getScalarSizeInBits());
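// PowDiff is the log2 ratio of the element sizes; e.g. an i8 -> i32 extend
// has PowDiff == 2 and is costed as a single vsext.vf4/vzext.vf4 below.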
1655 switch (ISD) {
1656 case ISD::SIGN_EXTEND:
1657 case ISD::ZERO_EXTEND: {
1658 if ((PowDiff < 1) || (PowDiff > 3))
1659 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1660 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1661 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1662 unsigned Op =
1663 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1664 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1665 }
1666 case ISD::TRUNCATE:
1667 case ISD::FP_EXTEND:
1668 case ISD::FP_ROUND: {
1669 // Counts of narrow/widen instructions.
1670 unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits();
1671 unsigned DstEltSize = DstLT.second.getScalarSizeInBits();
1672
1673 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1674 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1675 : RISCV::VFNCVT_F_F_W;
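// The loop below walks the element size towards the destination one power of
// two at a time; e.g. truncating i64 elements to i8 is modeled as three
// vnsrl.wi steps (64 -> 32 -> 16 -> 8).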
1677 for (; SrcEltSize != DstEltSize;) {
1678 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1679 ? MVT::getIntegerVT(DstEltSize)
1680 : MVT::getFloatingPointVT(DstEltSize);
1681 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1682 DstEltSize =
1683 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1684 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1685 }
1686 return Cost;
1687 }
1688 case ISD::FP_TO_SINT:
1689 case ISD::FP_TO_UINT: {
1690 unsigned IsSigned = ISD == ISD::FP_TO_SINT;
1691 unsigned FCVT = IsSigned ? RISCV::VFCVT_RTZ_X_F_V : RISCV::VFCVT_RTZ_XU_F_V;
1692 unsigned FWCVT =
1693 IsSigned ? RISCV::VFWCVT_RTZ_X_F_V : RISCV::VFWCVT_RTZ_XU_F_V;
1694 unsigned FNCVT =
1695 IsSigned ? RISCV::VFNCVT_RTZ_X_F_W : RISCV::VFNCVT_RTZ_XU_F_W;
1696 unsigned SrcEltSize = Src->getScalarSizeInBits();
1697 unsigned DstEltSize = Dst->getScalarSizeInBits();
1699 if ((SrcEltSize == 16) &&
1700 (!ST->hasVInstructionsF16() || ((DstEltSize / 2) > SrcEltSize))) {
1701 // If the target only supports zvfhmin, or this is an fp16-to-i64 conversion,
1702 // pre-widen to f32 and then convert the f32 to integer.
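// For example, an fp16 -> i64 conversion is costed as one vfwcvt.f.f.v
// (f16 to f32) per split of the f32 type, plus a recursively costed
// f32 -> i64 conversion.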
1703 VectorType *VecF32Ty =
1704 VectorType::get(Type::getFloatTy(Dst->getContext()),
1705 cast<VectorType>(Dst)->getElementCount());
1706 std::pair<InstructionCost, MVT> VecF32LT =
1707 getTypeLegalizationCost(VecF32Ty);
1708 Cost +=
1709 VecF32LT.first * getRISCVInstructionCost(RISCV::VFWCVT_F_F_V,
1710 VecF32LT.second, CostKind);
1711 Cost += getCastInstrCost(Opcode, Dst, VecF32Ty, CCH, CostKind, I);
1712 return Cost;
1713 }
1714 if (DstEltSize == SrcEltSize)
1715 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1716 else if (DstEltSize > SrcEltSize)
1717 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1718 else { // (SrcEltSize > DstEltSize)
1719 // First do a narrowing conversion to an integer half the size, then
1720 // truncate if needed.
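// E.g. f64 -> i8 is modeled as a vfncvt.rtz.x.f.w to i32 followed by a
// recursively costed i32 -> i8 truncate.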
1721 MVT ElementVT = MVT::getIntegerVT(SrcEltSize / 2);
1722 MVT VecVT = DstLT.second.changeVectorElementType(ElementVT);
1723 Cost += getRISCVInstructionCost(FNCVT, VecVT, CostKind);
1724 if ((SrcEltSize / 2) > DstEltSize) {
1725 Type *VecTy = EVT(VecVT).getTypeForEVT(Dst->getContext());
1726 Cost +=
1727 getCastInstrCost(Instruction::Trunc, Dst, VecTy, CCH, CostKind, I);
1728 }
1729 }
1730 return Cost;
1731 }
1732 case ISD::SINT_TO_FP:
1733 case ISD::UINT_TO_FP: {
1734 unsigned IsSigned = ISD == ISD::SINT_TO_FP;
1735 unsigned FCVT = IsSigned ? RISCV::VFCVT_F_X_V : RISCV::VFCVT_F_XU_V;
1736 unsigned FWCVT = IsSigned ? RISCV::VFWCVT_F_X_V : RISCV::VFWCVT_F_XU_V;
1737 unsigned FNCVT = IsSigned ? RISCV::VFNCVT_F_X_W : RISCV::VFNCVT_F_XU_W;
1738 unsigned SrcEltSize = Src->getScalarSizeInBits();
1739 unsigned DstEltSize = Dst->getScalarSizeInBits();
1740
1742 if ((DstEltSize == 16) &&
1743 (!ST->hasVInstructionsF16() || ((SrcEltSize / 2) > DstEltSize))) {
1744 // If the target only supports zvfhmin, or this is an i64-to-fp16 conversion,
1745 // the value is first converted to f32 and then narrowed to f16.
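// For example, an i64 -> fp16 conversion is costed as a recursively costed
// i64 -> f32 conversion plus one vfncvt.f.f.w (f32 to f16) per split.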
1746 VectorType *VecF32Ty =
1747 VectorType::get(Type::getFloatTy(Dst->getContext()),
1748 cast<VectorType>(Dst)->getElementCount());
1749 std::pair<InstructionCost, MVT> VecF32LT =
1750 getTypeLegalizationCost(VecF32Ty);
1751 Cost += getCastInstrCost(Opcode, VecF32Ty, Src, CCH, CostKind, I);
1752 Cost += VecF32LT.first * getRISCVInstructionCost(RISCV::VFNCVT_F_F_W,
1753 DstLT.second, CostKind);
1754 return Cost;
1755 }
1756
1757 if (DstEltSize == SrcEltSize)
1758 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1759 else if (DstEltSize > SrcEltSize) {
1760 if ((DstEltSize / 2) > SrcEltSize) {
1761 VectorType *VecTy =
1762 VectorType::get(IntegerType::get(Dst->getContext(), DstEltSize / 2),
1763 cast<VectorType>(Dst)->getElementCount());
1764 unsigned Op = IsSigned ? Instruction::SExt : Instruction::ZExt;
1765 Cost += getCastInstrCost(Op, VecTy, Src, CCH, CostKind, I);
1766 }
1767 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1768 } else
1769 Cost += getRISCVInstructionCost(FNCVT, DstLT.second, CostKind);
1770 return Cost;
1771 }
1772 }
1773 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1774}
1775
1776unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) const {
1777 if (isa<ScalableVectorType>(Ty)) {
1778 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1779 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1780 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1781 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1782 }
1783 return cast<FixedVectorType>(Ty)->getNumElements();
1784}
1785
1788 FastMathFlags FMF,
1790 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1791 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1792
1793 // Skip if scalar size of Ty is bigger than ELEN.
1794 if (Ty->getScalarSizeInBits() > ST->getELen())
1795 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1796
1797 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1798 if (Ty->getElementType()->isIntegerTy(1)) {
1799 // SelectionDAGBuilder performs the following transforms:
1800 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
1801 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
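// This holds because for i1 elements umax/smin is true iff any element is
// true, i.e. an OR reduction, while smax/umin is true only if all elements
// are true, i.e. an AND reduction.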
1802 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
1803 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
1804 else
1805 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
1806 }
1807
1808 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
1810 InstructionCost ExtraCost = 0;
1811 switch (IID) {
1812 case Intrinsic::maximum:
1813 if (FMF.noNaNs()) {
1814 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1815 } else {
1816 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
1817 RISCV::VFMV_F_S};
1818 // Cost of Canonical Nan + branch
1819 // lui a0, 523264
1820 // fmv.w.x fa0, a0
1821 Type *DstTy = Ty->getScalarType();
1822 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
1823 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1824 ExtraCost = 1 +
1825 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1827 getCFInstrCost(Instruction::Br, CostKind);
1828 }
1829 break;
1830
1831 case Intrinsic::minimum:
1832 if (FMF.noNaNs()) {
1833 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1834 } else {
1835 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
1836 RISCV::VFMV_F_S};
1837 // Cost of Canonical Nan + branch
1838 // lui a0, 523264
1839 // fmv.w.x fa0, a0
1840 Type *DstTy = Ty->getScalarType();
1841 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
1842 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1843 ExtraCost = 1 +
1844 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1846 getCFInstrCost(Instruction::Br, CostKind);
1847 }
1848 break;
1849 }
1850 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1851 }
1852
1853 // An IR reduction is composed of one rvv reduction instruction and a vmv.
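// E.g. vector_reduce_smax on an i32 vector is costed below as a vredmax.vs
// followed by a vmv.x.s, plus one vmax.vv per extra register-group split.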
1854 unsigned SplitOp;
1856 switch (IID) {
1857 default:
1858 llvm_unreachable("Unsupported intrinsic");
1859 case Intrinsic::smax:
1860 SplitOp = RISCV::VMAX_VV;
1861 Opcodes = {RISCV::VREDMAX_VS, RISCV::VMV_X_S};
1862 break;
1863 case Intrinsic::smin:
1864 SplitOp = RISCV::VMIN_VV;
1865 Opcodes = {RISCV::VREDMIN_VS, RISCV::VMV_X_S};
1866 break;
1867 case Intrinsic::umax:
1868 SplitOp = RISCV::VMAXU_VV;
1869 Opcodes = {RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1870 break;
1871 case Intrinsic::umin:
1872 SplitOp = RISCV::VMINU_VV;
1873 Opcodes = {RISCV::VREDMINU_VS, RISCV::VMV_X_S};
1874 break;
1875 case Intrinsic::maxnum:
1876 SplitOp = RISCV::VFMAX_VV;
1877 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1878 break;
1879 case Intrinsic::minnum:
1880 SplitOp = RISCV::VFMIN_VV;
1881 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1882 break;
1883 }
1884 // Add a cost for data larger than LMUL8
1885 InstructionCost SplitCost =
1886 (LT.first > 1) ? (LT.first - 1) *
1887 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1888 : 0;
1889 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1890}
1891
1894 std::optional<FastMathFlags> FMF,
1896 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1897 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1898
1899 // Skip if scalar size of Ty is bigger than ELEN.
1900 if (Ty->getScalarSizeInBits() > ST->getELen())
1901 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1902
1903 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1904 assert(ISD && "Invalid opcode");
1905
1906 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1907 ISD != ISD::FADD)
1908 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1909
1910 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1911 Type *ElementTy = Ty->getElementType();
1912 if (ElementTy->isIntegerTy(1)) {
1913 // Example sequences:
1914 // vfirst.m a0, v0
1915 // seqz a0, a0
1916 if (LT.second == MVT::v1i1)
1917 return getRISCVInstructionCost(RISCV::VFIRST_M, LT.second, CostKind) +
1918 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1920
1921 if (ISD == ISD::AND) {
1922 // Example sequences:
1923 // vmand.mm v8, v9, v8 ; needed every time type is split
1924 // vmnot.m v8, v0 ; alias for vmnand
1925 // vcpop.m a0, v8
1926 // seqz a0, a0
1927
1928 // See the discussion: https://github.com/llvm/llvm-project/pull/119160
1929 // For LMUL <= 8, there is no splitting,
1930 // the sequences are vmnot, vcpop and seqz.
1931 // When LMUL > 8 and split = 1,
1932 // the sequences are vmnand, vcpop and seqz.
1933 // When LMUL > 8 and split > 1,
1934 // the sequences are (LT.first-2) * vmand, vmnand, vcpop and seqz.
1935 return ((LT.first > 2) ? (LT.first - 2) : 0) *
1936 getRISCVInstructionCost(RISCV::VMAND_MM, LT.second, CostKind) +
1937 getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second, CostKind) +
1938 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
1939 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1941 } else if (ISD == ISD::XOR || ISD == ISD::ADD) {
1942 // Example sequences:
1943 // vsetvli a0, zero, e8, mf8, ta, ma
1944 // vmxor.mm v8, v0, v8 ; needed every time type is split
1945 // vcpop.m a0, v8
1946 // andi a0, a0, 1
1947 return (LT.first - 1) *
1948 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
1949 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + 1;
1950 } else {
1951 assert(ISD == ISD::OR);
1952 // Example sequences:
1953 // vsetvli a0, zero, e8, mf8, ta, ma
1954 // vmor.mm v8, v9, v8 ; needed every time type is split
1955 // vcpop.m a0, v0
1956 // snez a0, a0
1957 return (LT.first - 1) *
1958 getRISCVInstructionCost(RISCV::VMOR_MM, LT.second, CostKind) +
1959 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
1960 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1962 }
1963 }
1964
1965 // An IR reduction of or/and is composed of one vmv and one rvv reduction
1966 // instruction; the other reductions are composed of two vmv and one rvv
1967 // reduction instruction.
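// E.g. vector_reduce_add on an i32 vector is costed below as vmv.s.x +
// vredsum.vs + vmv.x.s, while vector_reduce_or needs only vredor.vs +
// vmv.x.s, plus one vadd.vv/vor.vv per extra register-group split.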
1968 unsigned SplitOp;
1970 switch (ISD) {
1971 case ISD::ADD:
1972 SplitOp = RISCV::VADD_VV;
1973 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
1974 break;
1975 case ISD::OR:
1976 SplitOp = RISCV::VOR_VV;
1977 Opcodes = {RISCV::VREDOR_VS, RISCV::VMV_X_S};
1978 break;
1979 case ISD::XOR:
1980 SplitOp = RISCV::VXOR_VV;
1981 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
1982 break;
1983 case ISD::AND:
1984 SplitOp = RISCV::VAND_VV;
1985 Opcodes = {RISCV::VREDAND_VS, RISCV::VMV_X_S};
1986 break;
1987 case ISD::FADD:
1988 // We can't promote f16/bf16 fadd reductions.
1989 if ((LT.second.getScalarType() == MVT::f16 && !ST->hasVInstructionsF16()) ||
1990 LT.second.getScalarType() == MVT::bf16)
1991 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1993 Opcodes.push_back(RISCV::VFMV_S_F);
1994 for (unsigned i = 0; i < LT.first.getValue(); i++)
1995 Opcodes.push_back(RISCV::VFREDOSUM_VS);
1996 Opcodes.push_back(RISCV::VFMV_F_S);
1997 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1998 }
1999 SplitOp = RISCV::VFADD_VV;
2000 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
2001 break;
2002 }
2003 // Add a cost for data larger than LMUL8
2004 InstructionCost SplitCost =
2005 (LT.first > 1) ? (LT.first - 1) *
2006 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2007 : 0;
2008 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2009}
2010
2012 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
2013 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
2014 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2015 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2016 FMF, CostKind);
2017
2018 // Skip if scalar size of ResTy is bigger than ELEN.
2019 if (ResTy->getScalarSizeInBits() > ST->getELen())
2020 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2021 FMF, CostKind);
2022
2023 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
2024 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2025 FMF, CostKind);
2026
2027 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2028
2029 if (IsUnsigned && Opcode == Instruction::Add &&
2030 LT.second.isFixedLengthVector() && LT.second.getScalarType() == MVT::i1) {
2031 // Represent vector_reduce_add(ZExt(<n x i1>)) as
2032 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
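// E.g. vector_reduce_add(zext <16 x i1> %m to <16 x i32>) is costed as a
// single vcpop.m on the mask (times the number of mask splits, if any).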
2033 return LT.first *
2034 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind);
2035 }
2036
2037 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
2038 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2039 FMF, CostKind);
2040
2041 return (LT.first - 1) +
2042 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2043}
2044
2048 assert(OpInfo.isConstant() && "non constant operand?");
2049 if (!isa<VectorType>(Ty))
2050 // FIXME: We need to account for immediate materialization here, but doing
2051 // a decent job requires more knowledge about the immediate than we
2052 // currently have here.
2053 return 0;
2054
2055 if (OpInfo.isUniform())
2056 // vmv.v.i, vmv.v.x, or vfmv.v.f
2057 // We ignore the cost of the scalar constant materialization to be consistent
2058 // with how we treat scalar constants themselves just above.
2059 return 1;
2060
2061 return getConstantPoolLoadCost(Ty, CostKind);
2062}
2063
2065 Align Alignment,
2066 unsigned AddressSpace,
2068 TTI::OperandValueInfo OpInfo,
2069 const Instruction *I) const {
2070 EVT VT = TLI->getValueType(DL, Src, true);
2071 // Type legalization can't handle structs
2072 if (VT == MVT::Other)
2073 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2074 CostKind, OpInfo, I);
2075
2077 if (Opcode == Instruction::Store && OpInfo.isConstant())
2078 Cost += getStoreImmCost(Src, OpInfo, CostKind);
2079
2080 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
2081
2082 InstructionCost BaseCost = [&]() {
2083 InstructionCost Cost = LT.first;
2085 return Cost;
2086
2087 // Our actual lowering for the case where a wider legal type is available
2088 // uses a VL predicated load on the wider type. This is reflected in
2089 // the result of getTypeLegalizationCost, but BasicTTI assumes the
2090 // widened cases are scalarized.
2091 const DataLayout &DL = this->getDataLayout();
2092 if (Src->isVectorTy() && LT.second.isVector() &&
2093 TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
2094 LT.second.getSizeInBits()))
2095 return Cost;
2096
2097 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2098 CostKind, OpInfo, I);
2099 }();
2100
2101 // Assume memory op costs scale with the number of vector registers
2102 // possibly accessed by the instruction. Note that BasicTTI already
2103 // handles the LT.first term for us.
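// E.g. a load of <vscale x 4 x i64> legalizes to an LMUL=4 type, so the base
// cost above is scaled by roughly 4 for every cost kind except code size.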
2104 if (LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
2105 BaseCost *= TLI->getLMULCost(LT.second);
2106 return Cost + BaseCost;
2107}
2108
2110 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
2112 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
2114 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2115 Op1Info, Op2Info, I);
2116
2117 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2118 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2119 Op1Info, Op2Info, I);
2120
2121 // Skip if scalar size of ValTy is bigger than ELEN.
2122 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
2123 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2124 Op1Info, Op2Info, I);
2125
2126 auto GetConstantMatCost =
2127 [&](TTI::OperandValueInfo OpInfo) -> InstructionCost {
2128 if (OpInfo.isUniform())
2129 // We return 0 because we currently ignore the cost of materializing scalar
2130 // constants in GPRs.
2131 return 0;
2132
2133 return getConstantPoolLoadCost(ValTy, CostKind);
2134 };
2135
2136 InstructionCost ConstantMatCost;
2137 if (Op1Info.isConstant())
2138 ConstantMatCost += GetConstantMatCost(Op1Info);
2139 if (Op2Info.isConstant())
2140 ConstantMatCost += GetConstantMatCost(Op2Info);
2141
2142 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2143 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
2144 if (CondTy->isVectorTy()) {
2145 if (ValTy->getScalarSizeInBits() == 1) {
2146 // vmandn.mm v8, v8, v9
2147 // vmand.mm v9, v0, v9
2148 // vmor.mm v0, v9, v8
2149 return ConstantMatCost +
2150 LT.first *
2151 getRISCVInstructionCost(
2152 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2153 LT.second, CostKind);
2154 }
2155 // vselect and max/min are supported natively.
2156 return ConstantMatCost +
2157 LT.first * getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second,
2158 CostKind);
2159 }
2160
2161 if (ValTy->getScalarSizeInBits() == 1) {
2162 // vmv.v.x v9, a0
2163 // vmsne.vi v9, v9, 0
2164 // vmandn.mm v8, v8, v9
2165 // vmand.mm v9, v0, v9
2166 // vmor.mm v0, v9, v8
2167 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
2168 return ConstantMatCost +
2169 LT.first *
2170 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
2171 InterimVT, CostKind) +
2172 LT.first * getRISCVInstructionCost(
2173 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2174 LT.second, CostKind);
2175 }
2176
2177 // vmv.v.x v10, a0
2178 // vmsne.vi v0, v10, 0
2179 // vmerge.vvm v8, v9, v8, v0
2180 return ConstantMatCost +
2181 LT.first * getRISCVInstructionCost(
2182 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
2183 LT.second, CostKind);
2184 }
2185
2186 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
2187 CmpInst::isIntPredicate(VecPred)) {
2188 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
2189 // provided they incur the same cost across all implementations
2190 return ConstantMatCost + LT.first * getRISCVInstructionCost(RISCV::VMSLT_VV,
2191 LT.second,
2192 CostKind);
2193 }
2194
2195 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
2196 CmpInst::isFPPredicate(VecPred)) {
2197
2198 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
2199 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
2200 return ConstantMatCost +
2201 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
2202
2203 // If we do not support the input floating point vector type, use the base
2204 // one which will calculate as:
2205 // ScalarizeCost + Num * Cost for fixed vector,
2206 // InvalidCost for scalable vector.
2207 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
2208 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
2209 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
2210 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2211 Op1Info, Op2Info, I);
2212
2213 // Assuming vector fp compare and mask instructions are all the same cost
2214 // until a need arises to differentiate them.
2215 switch (VecPred) {
2216 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
2217 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
2218 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
2219 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
2220 return ConstantMatCost +
2221 LT.first * getRISCVInstructionCost(
2222 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
2223 LT.second, CostKind);
2224
2225 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
2226 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
2227 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
2228 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
2229 return ConstantMatCost +
2230 LT.first *
2231 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
2232 LT.second, CostKind);
2233
2234 case CmpInst::FCMP_OEQ: // vmfeq.vv
2235 case CmpInst::FCMP_OGT: // vmflt.vv
2236 case CmpInst::FCMP_OGE: // vmfle.vv
2237 case CmpInst::FCMP_OLT: // vmflt.vv
2238 case CmpInst::FCMP_OLE: // vmfle.vv
2239 case CmpInst::FCMP_UNE: // vmfne.vv
2240 return ConstantMatCost +
2241 LT.first *
2242 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
2243 default:
2244 break;
2245 }
2246 }
2247
2248 // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
2249 // instructions will lower to SELECT_CC and lower to PseudoCCMOVGPR which will
2250 // generate a conditional branch + mv. The cost of scalar (icmp + select) will
2251 // be (0 + select instr cost).
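// A minimal illustrative pattern (names are placeholders):
//   %c = icmp eq i64 %a, %b
//   %r = select i1 %c, i64 %x, i64 %y
// Here every user of %c is such a select, so the icmp itself is treated as
// free and the whole cost is attributed to the select.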
2252 if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
2253 ValTy->isIntegerTy() && !I->user_empty()) {
2254 if (all_of(I->users(), [&](const User *U) {
2255 return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
2256 U->getType()->isIntegerTy() &&
2257 !isa<ConstantData>(U->getOperand(1)) &&
2258 !isa<ConstantData>(U->getOperand(2));
2259 }))
2260 return 0;
2261 }
2262
2263 // TODO: Add cost for scalar type.
2264
2265 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2266 Op1Info, Op2Info, I);
2267}
2268
2271 const Instruction *I) const {
2273 return Opcode == Instruction::PHI ? 0 : 1;
2274 // Branches are assumed to be predicted.
2275 return 0;
2276}
2277
2280 unsigned Index,
2281 const Value *Op0,
2282 const Value *Op1) const {
2283 assert(Val->isVectorTy() && "This must be a vector type");
2284
2285 if (Opcode != Instruction::ExtractElement &&
2286 Opcode != Instruction::InsertElement)
2287 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
2288
2289 // Legalize the type.
2290 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2291
2292 // This type is legalized to a scalar type.
2293 if (!LT.second.isVector()) {
2294 auto *FixedVecTy = cast<FixedVectorType>(Val);
2295 // If Index is a known constant, cost is zero.
2296 if (Index != -1U)
2297 return 0;
2298 // Extract/InsertElement with non-constant index is very costly when
2299 // scalarized; estimate cost of loads/stores sequence via the stack:
2300 // ExtractElement cost: store vector to stack, load scalar;
2301 // InsertElement cost: store vector to stack, store scalar, load vector.
2302 Type *ElemTy = FixedVecTy->getElementType();
2303 auto NumElems = FixedVecTy->getNumElements();
2304 auto Align = DL.getPrefTypeAlign(ElemTy);
2305 InstructionCost LoadCost =
2306 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
2307 InstructionCost StoreCost =
2308 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
2309 return Opcode == Instruction::ExtractElement
2310 ? StoreCost * NumElems + LoadCost
2311 : (StoreCost + LoadCost) * NumElems + StoreCost;
2312 }
2313
2314 // For unsupported scalable vector.
2315 if (LT.second.isScalableVector() && !LT.first.isValid())
2316 return LT.first;
2317
2318 // Mask vector extract/insert is expanded via e8.
2319 if (Val->getScalarSizeInBits() == 1) {
2320 VectorType *WideTy =
2322 cast<VectorType>(Val)->getElementCount());
2323 if (Opcode == Instruction::ExtractElement) {
2324 InstructionCost ExtendCost
2325 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2327 InstructionCost ExtractCost
2328 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2329 return ExtendCost + ExtractCost;
2330 }
2331 InstructionCost ExtendCost
2332 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2334 InstructionCost InsertCost
2335 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2336 InstructionCost TruncCost
2337 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
2339 return ExtendCost + InsertCost + TruncCost;
2340 }
2341
2342
2343 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
2344 // and vslideup + vmv.s.x to insert element to vector.
2345 unsigned BaseCost = 1;
2346 // For insertelement we must also add 1 to the index (an addi) to form the input of vslideup.
2347 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
2348
2349 if (Index != -1U) {
2350 // The type may be split. For fixed-width vectors we can normalize the
2351 // index to the new type.
2352 if (LT.second.isFixedLengthVector()) {
2353 unsigned Width = LT.second.getVectorNumElements();
2354 Index = Index % Width;
2355 }
2356
2357 // If exact VLEN is known, we will insert/extract into the appropriate
2358 // subvector with no additional subvector insert/extract cost.
2359 if (auto VLEN = ST->getRealVLen()) {
2360 unsigned EltSize = LT.second.getScalarSizeInBits();
2361 unsigned M1Max = *VLEN / EltSize;
2362 Index = Index % M1Max;
2363 }
2364
2365 if (Index == 0)
2366 // We can extract/insert the first element without vslidedown/vslideup.
2367 SlideCost = 0;
2368 else if (ST->hasVendorXRivosVisni() && isUInt<5>(Index) &&
2369 Val->getScalarType()->isIntegerTy())
2370 SlideCost = 0; // With ri.vinsert/ri.vextract there is no slide needed
2371 else if (Opcode == Instruction::InsertElement)
2372 SlideCost = 1; // With a constant index, we do not need to use addi.
2373 }
2374
2375 // When the vector needs to be split into multiple register groups and the
2376 // index exceeds a single vector register group, we need to insert/extract
2377 // the element via the stack.
2378 if (LT.first > 1 &&
2379 ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
2380 LT.second.isScalableVector()))) {
2381 Type *ScalarType = Val->getScalarType();
2382 Align VecAlign = DL.getPrefTypeAlign(Val);
2383 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
2384 // Extra addi for unknown index.
2385 InstructionCost IdxCost = Index == -1U ? 1 : 0;
2386
2387 // Store all split vectors into stack and load the target element.
2388 if (Opcode == Instruction::ExtractElement)
2389 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2390 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
2391 CostKind) +
2392 IdxCost;
2393
2394 // Store all split vectors into stack and store the target element and load
2395 // vectors back.
2396 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2397 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind) +
2398 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
2399 CostKind) +
2400 IdxCost;
2401 }
2402
2403 // Extracting an i64 element on a target with XLEN=32 needs more instructions.
2404 if (Val->getScalarType()->isIntegerTy() &&
2405 ST->getXLen() < Val->getScalarSizeInBits()) {
2406 // For extractelement, we need the following instructions:
2407 // vsetivli zero, 1, e64, m1, ta, mu (not count)
2408 // vslidedown.vx v8, v8, a0
2409 // vmv.x.s a0, v8
2410 // li a1, 32
2411 // vsrl.vx v8, v8, a1
2412 // vmv.x.s a1, v8
2413
2414 // For insertelement, we need the following instructions:
2415 // vsetivli zero, 2, e32, m4, ta, mu (not count)
2416 // vmv.v.i v12, 0
2417 // vslide1up.vx v16, v12, a1
2418 // vslide1up.vx v12, v16, a0
2419 // addi a0, a2, 1
2420 // vsetvli zero, a0, e64, m4, tu, mu (not count)
2421 // vslideup.vx v8, v12, a2
2422
2423 // TODO: should we count these special vsetvlis?
2424 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
2425 }
2426 return BaseCost + SlideCost;
2427}
2428
2432 unsigned Index) const {
2433 if (isa<FixedVectorType>(Val))
2435 Index);
2436
2437 // TODO: This code replicates what LoopVectorize.cpp used to do when asking
2438 // for the cost of extracting the last lane of a scalable vector. It probably
2439 // needs a more accurate cost.
2440 ElementCount EC = cast<VectorType>(Val)->getElementCount();
2441 assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
2442 return getVectorInstrCost(Opcode, Val, CostKind,
2443 EC.getKnownMinValue() - 1 - Index, nullptr,
2444 nullptr);
2445}
2446
2448 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2450 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
2451
2452 // TODO: Handle more cost kinds.
2454 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2455 Args, CxtI);
2456
2457 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2458 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2459 Args, CxtI);
2460
2461 // Skip if scalar size of Ty is bigger than ELEN.
2462 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
2463 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2464 Args, CxtI);
2465
2466 // Legalize the type.
2467 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2468
2469 // TODO: Handle scalar type.
2470 if (!LT.second.isVector())
2471 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2472 Args, CxtI);
2473
2474 // f16 with zvfhmin and bf16 will be promoted to f32.
2475 // FIXME: nxv32[b]f16 will be custom lowered and split.
2476 unsigned ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
2477 InstructionCost CastCost = 0;
2478 if ((LT.second.getVectorElementType() == MVT::f16 ||
2479 LT.second.getVectorElementType() == MVT::bf16) &&
2480 TLI->getOperationAction(ISDOpcode, LT.second) ==
2482 MVT PromotedVT = TLI->getTypeToPromoteTo(ISDOpcode, LT.second);
2483 Type *PromotedTy = EVT(PromotedVT).getTypeForEVT(Ty->getContext());
2484 Type *LegalTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2485 // Add cost of extending arguments
2486 CastCost += LT.first * Args.size() *
2487 getCastInstrCost(Instruction::FPExt, PromotedTy, LegalTy,
2489 // Add cost of truncating result
2490 CastCost +=
2491 LT.first * getCastInstrCost(Instruction::FPTrunc, LegalTy, PromotedTy,
2493 // Compute cost of op in promoted type
2494 LT.second = PromotedVT;
2495 }
2496
2497 auto getConstantMatCost =
2498 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
2499 if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand))
2500 // Two sub-cases:
2501 // * Has a 5 bit immediate operand which can be splatted.
2502 // * Has a larger immediate which must be materialized in scalar register
2503 // We return 0 for both as we currently ignore the cost of materializing
2504 // scalar constants in GPRs.
2505 return 0;
2506
2507 return getConstantPoolLoadCost(Ty, CostKind);
2508 };
2509
2510 // Add the cost of materializing any constant vectors required.
2511 InstructionCost ConstantMatCost = 0;
2512 if (Op1Info.isConstant())
2513 ConstantMatCost += getConstantMatCost(0, Op1Info);
2514 if (Op2Info.isConstant())
2515 ConstantMatCost += getConstantMatCost(1, Op2Info);
2516
2517 unsigned Op;
2518 switch (ISDOpcode) {
2519 case ISD::ADD:
2520 case ISD::SUB:
2521 Op = RISCV::VADD_VV;
2522 break;
2523 case ISD::SHL:
2524 case ISD::SRL:
2525 case ISD::SRA:
2526 Op = RISCV::VSLL_VV;
2527 break;
2528 case ISD::AND:
2529 case ISD::OR:
2530 case ISD::XOR:
2531 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
2532 break;
2533 case ISD::MUL:
2534 case ISD::MULHS:
2535 case ISD::MULHU:
2536 Op = RISCV::VMUL_VV;
2537 break;
2538 case ISD::SDIV:
2539 case ISD::UDIV:
2540 Op = RISCV::VDIV_VV;
2541 break;
2542 case ISD::SREM:
2543 case ISD::UREM:
2544 Op = RISCV::VREM_VV;
2545 break;
2546 case ISD::FADD:
2547 case ISD::FSUB:
2548 Op = RISCV::VFADD_VV;
2549 break;
2550 case ISD::FMUL:
2551 Op = RISCV::VFMUL_VV;
2552 break;
2553 case ISD::FDIV:
2554 Op = RISCV::VFDIV_VV;
2555 break;
2556 case ISD::FNEG:
2557 Op = RISCV::VFSGNJN_VV;
2558 break;
2559 default:
2560 // Assuming all other instructions have the same cost until a need arises to
2561 // differentiate them.
2562 return CastCost + ConstantMatCost +
2563 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2564 Args, CxtI);
2565 }
2566
2567 InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
2568 // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
2569 // ops are twice as expensive as integer ops. Do the same for vectors so
2570 // scalar floating point ops aren't cheaper than their vector equivalents.
2571 if (Ty->isFPOrFPVectorTy())
2572 InstrCost *= 2;
2573 return CastCost + ConstantMatCost + LT.first * InstrCost;
2574}
2575
2576// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
2578 ArrayRef<const Value *> Ptrs, const Value *Base,
2579 const TTI::PointersChainInfo &Info, Type *AccessTy,
2582 // In the basic model we only take GEP instructions into account (although an
2583 // alloca, a plain value, constants and/or constant expressions, PHIs,
2584 // bitcasts, or anything else allowed to be used as a pointer may also appear
2585 // here). Typically, if Base is not a GEP instruction and all the pointers are
2586 // relative to the same base address, the rest are either GEP instructions,
2587 // PHIs, bitcasts or constants. When they share the same base, we simply cost
2588 // each non-Base GEP as an ADD operation if any of its indices is
2589 // non-constant.
2590 // If there are no known dependencies between the pointers, the cost is the
2591 // sum of the costs of the individual GEP instructions.
2592 for (auto [I, V] : enumerate(Ptrs)) {
2593 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
2594 if (!GEP)
2595 continue;
2596 if (Info.isSameBase() && V != Base) {
2597 if (GEP->hasAllConstantIndices())
2598 continue;
2599 // If the chain is unit-stride and BaseReg + stride*i is a legal
2600 // addressing mode, then presume the base GEP is sitting around in a
2601 // register somewhere and check if we can fold the offset relative to
2602 // it.
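// E.g. for an i32 unit-stride chain the I-th pointer is at offset 4*I from
// the base; as long as that offset fits the 12-bit immediate of a scalar
// load/store, the GEP is assumed to fold into the memory access and is free.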
2603 unsigned Stride = DL.getTypeStoreSize(AccessTy);
2604 if (Info.isUnitStride() &&
2605 isLegalAddressingMode(AccessTy,
2606 /* BaseGV */ nullptr,
2607 /* BaseOffset */ Stride * I,
2608 /* HasBaseReg */ true,
2609 /* Scale */ 0,
2610 GEP->getType()->getPointerAddressSpace()))
2611 continue;
2612 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
2613 {TTI::OK_AnyValue, TTI::OP_None},
2614 {TTI::OK_AnyValue, TTI::OP_None}, {});
2615 } else {
2616 SmallVector<const Value *> Indices(GEP->indices());
2617 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
2618 Indices, AccessTy, CostKind);
2619 }
2620 }
2621 return Cost;
2622}
2623
2626 OptimizationRemarkEmitter *ORE) const {
2627 // TODO: More tuning on benchmarks and metrics, with changes as needed,
2628 // would apply to all of the settings below to improve performance.
2629
2630
2631 if (ST->enableDefaultUnroll())
2632 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2633
2634 // Enable Upper bound unrolling universally, not dependent upon the conditions
2635 // below.
2636 UP.UpperBound = true;
2637
2638 // Disable loop unrolling for Oz and Os.
2639 UP.OptSizeThreshold = 0;
2641 if (L->getHeader()->getParent()->hasOptSize())
2642 return;
2643
2644 SmallVector<BasicBlock *, 4> ExitingBlocks;
2645 L->getExitingBlocks(ExitingBlocks);
2646 LLVM_DEBUG(dbgs() << "Loop has:\n"
2647 << "Blocks: " << L->getNumBlocks() << "\n"
2648 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2649
2650 // Only allow another exit other than the latch. This acts as an early exit
2651 // as it mirrors the profitability calculation of the runtime unroller.
2652 if (ExitingBlocks.size() > 2)
2653 return;
2654
2655 // Limit the CFG of the loop body for targets with a branch predictor.
2656 // Allowing 4 blocks permits if-then-else diamonds in the body.
2657 if (L->getNumBlocks() > 4)
2658 return;
2659
2660 // Scan the loop: don't unroll loops with calls as this could prevent
2661 // inlining. Don't unroll auto-vectorized loops either, though do allow
2662 // unrolling of the scalar remainder.
2663 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
2665 for (auto *BB : L->getBlocks()) {
2666 for (auto &I : *BB) {
2667 // Both auto-vectorized loops and the scalar remainder have the
2668 // isvectorized attribute, so differentiate between them by the presence
2669 // of vector instructions.
2670 if (IsVectorized && I.getType()->isVectorTy())
2671 return;
2672
2673 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2674 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2675 if (!isLoweredToCall(F))
2676 continue;
2677 }
2678 return;
2679 }
2680
2681 SmallVector<const Value *> Operands(I.operand_values());
2684 }
2685 }
2686
2687 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2688
2689 UP.Partial = true;
2690 UP.Runtime = true;
2691 UP.UnrollRemainder = true;
2692 UP.UnrollAndJam = true;
2693
2694 // Forcing unrolling of small loops can be very useful because of the
2695 // branch-taken cost of the backedge.
2696 if (Cost < 12)
2697 UP.Force = true;
2698}
2699
2704
2706 MemIntrinsicInfo &Info) const {
2707 const DataLayout &DL = getDataLayout();
2708 Intrinsic::ID IID = Inst->getIntrinsicID();
2709 LLVMContext &C = Inst->getContext();
2710 bool HasMask = false;
2711 switch (IID) {
2712 case Intrinsic::riscv_vle_mask:
2713 case Intrinsic::riscv_vse_mask:
2714 HasMask = true;
2715 [[fallthrough]];
2716 case Intrinsic::riscv_vle:
2717 case Intrinsic::riscv_vse: {
2718 // Intrinsic interface:
2719 // riscv_vle(merge, ptr, vl)
2720 // riscv_vle_mask(merge, ptr, mask, vl, policy)
2721 // riscv_vse(val, ptr, vl)
2722 // riscv_vse_mask(val, ptr, mask, vl, policy)
2723 bool IsWrite = Inst->getType()->isVoidTy();
2724 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
2725 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
2726 unsigned VLIndex = RVVIInfo->VLOperand;
2727 unsigned PtrOperandNo = VLIndex - 1 - HasMask;
2728 MaybeAlign Alignment =
2729 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
2730 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
2731 Value *Mask = ConstantInt::getTrue(MaskType);
2732 if (HasMask)
2733 Mask = Inst->getArgOperand(VLIndex - 1);
2734 Value *EVL = Inst->getArgOperand(VLIndex);
2735 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
2736 Alignment, Mask, EVL);
2737 return true;
2738 }
2739 case Intrinsic::riscv_vlse_mask:
2740 case Intrinsic::riscv_vsse_mask:
2741 HasMask = true;
2742 [[fallthrough]];
2743 case Intrinsic::riscv_vlse:
2744 case Intrinsic::riscv_vsse: {
2745 // Intrinsic interface:
2746 // riscv_vlse(merge, ptr, stride, vl)
2747 // riscv_vlse_mask(merge, ptr, stride, mask, vl, policy)
2748 // riscv_vsse(val, ptr, stride, vl)
2749 // riscv_vsse_mask(val, ptr, stride, mask, vl, policy)
2750 bool IsWrite = Inst->getType()->isVoidTy();
2751 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
2752 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
2753 unsigned VLIndex = RVVIInfo->VLOperand;
2754 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
2755 MaybeAlign Alignment =
2756 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
2757
2758 Value *Stride = Inst->getArgOperand(PtrOperandNo + 1);
2759 // Use the pointer alignment as the element alignment if the stride is a
2760 // multiple of the pointer alignment. Otherwise, the element alignment
2761 // should be the greatest common divisor of pointer alignment and stride.
2762 // For simplicity, just treat the elements as unaligned in that case.
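// E.g. a pointer aligned to 4 with a constant stride of 8 keeps element
// alignment 4, while a stride of 6 (or a non-constant stride) is
// conservatively treated as alignment 1.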
2763 unsigned PointerAlign = Alignment.valueOrOne().value();
2764 if (!isa<ConstantInt>(Stride) ||
2765 cast<ConstantInt>(Stride)->getZExtValue() % PointerAlign != 0)
2766 Alignment = Align(1);
2767
2768 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
2769 Value *Mask = ConstantInt::getTrue(MaskType);
2770 if (HasMask)
2771 Mask = Inst->getArgOperand(VLIndex - 1);
2772 Value *EVL = Inst->getArgOperand(VLIndex);
2773 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
2774 Alignment, Mask, EVL, Stride);
2775 return true;
2776 }
2777 }
2778 return false;
2779}
2780
2782 if (Ty->isVectorTy()) {
2783 // f16 with only zvfhmin and bf16 will be promoted to f32
2784 Type *EltTy = cast<VectorType>(Ty)->getElementType();
2785 if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
2786 EltTy->isBFloatTy())
2787 Ty = VectorType::get(Type::getFloatTy(Ty->getContext()),
2788 cast<VectorType>(Ty));
2789
2790 TypeSize Size = DL.getTypeSizeInBits(Ty);
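// E.g. a scalable nxv8i32 has a known minimum size of 256 bits, i.e. 4
// register blocks (LMUL=4), while fixed-length types are divided by the
// minimum VLEN instead.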
2791 if (Size.isScalable() && ST->hasVInstructions())
2792 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
2793
2794 if (ST->useRVVForFixedLengthVectors())
2795 return divideCeil(Size, ST->getRealMinVLen());
2796 }
2797
2798 return BaseT::getRegUsageForType(Ty);
2799}
2800
2801unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2802 if (SLPMaxVF.getNumOccurrences())
2803 return SLPMaxVF;
2804
2805 // Return how many elements can fit in getRegisterBitwidth. This is the
2806 // same routine as used in LoopVectorizer. We should probably be
2807 // accounting for whether we actually have instructions with the right
2808 // lane type, but we don't have enough information to do that without
2809 // some additional plumbing which hasn't been justified yet.
2810 TypeSize RegWidth =
2812 // If no vector registers, or absurd element widths, disable
2813 // vectorization by returning 1.
2814 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
2815}
2816
2820
2822 return ST->enableUnalignedVectorMem();
2823}
2824
2827 ScalarEvolution *SE) const {
2828 if (ST->hasVendorXCVmem() && !ST->is64Bit())
2829 return TTI::AMK_PostIndexed;
2830
2832}
2833
2835 const TargetTransformInfo::LSRCost &C2) const {
2836 // RISC-V specific here are "instruction number 1st priority".
2837 // If we need to emit adds inside the loop to add up base registers, then
2838 // we need at least one extra temporary register.
2839 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
2840 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
2841 return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
2842 C1.NumIVMuls, C1.NumBaseAdds,
2843 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
2844 std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
2845 C2.NumIVMuls, C2.NumBaseAdds,
2846 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
2847}
2848
2850 Align Alignment) const {
2851 auto *VTy = dyn_cast<VectorType>(DataTy);
2852 if (!VTy || VTy->isScalableTy())
2853 return false;
2854
2855 if (!isLegalMaskedLoadStore(DataTy, Alignment))
2856 return false;
2857
2858 // FIXME: If it is an i8 vector and the element count exceeds 256, we should
2859 // scalarize these types with LMUL >= maximum fixed-length LMUL.
2860 if (VTy->getElementType()->isIntegerTy(8))
2861 if (VTy->getElementCount().getFixedValue() > 256)
2862 return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() <
2863 ST->getMaxLMULForFixedLengthVectors();
2864 return true;
2865}
2866
2868 Align Alignment) const {
2869 auto *VTy = dyn_cast<VectorType>(DataTy);
2870 if (!VTy || VTy->isScalableTy())
2871 return false;
2872
2873 if (!isLegalMaskedLoadStore(DataTy, Alignment))
2874 return false;
2875 return true;
2876}
2877
2878/// See if \p I should be considered for address type promotion. We check if \p
2879 /// I is a sext with the right type that is used in memory accesses. If it is used in a
2880/// "complex" getelementptr, we allow it to be promoted without finding other
2881/// sext instructions that sign extended the same initial value. A getelementptr
2882/// is considered as "complex" if it has more than 2 operands.
2884 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
2885 bool Considerable = false;
2886 AllowPromotionWithoutCommonHeader = false;
2887 if (!isa<SExtInst>(&I))
2888 return false;
2889 Type *ConsideredSExtType =
2890 Type::getInt64Ty(I.getParent()->getParent()->getContext());
2891 if (I.getType() != ConsideredSExtType)
2892 return false;
2893 // See if the sext is the one with the right type and used in at least one
2894 // GetElementPtrInst.
2895 for (const User *U : I.users()) {
2896 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
2897 Considerable = true;
2898 // A getelementptr is considered as "complex" if it has more than 2
2899 // operands. We will promote a SExt used in such complex GEP as we
2900 // expect some computation to be merged if they are done on 64 bits.
2901 if (GEPInst->getNumOperands() > 2) {
2902 AllowPromotionWithoutCommonHeader = true;
2903 break;
2904 }
2905 }
2906 }
2907 return Considerable;
2908}
2909
2910bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const {
2911 switch (Opcode) {
2912 case Instruction::Add:
2913 case Instruction::Sub:
2914 case Instruction::Mul:
2915 case Instruction::And:
2916 case Instruction::Or:
2917 case Instruction::Xor:
2918 case Instruction::FAdd:
2919 case Instruction::FSub:
2920 case Instruction::FMul:
2921 case Instruction::FDiv:
2922 case Instruction::ICmp:
2923 case Instruction::FCmp:
2924 return true;
2925 case Instruction::Shl:
2926 case Instruction::LShr:
2927 case Instruction::AShr:
2928 case Instruction::UDiv:
2929 case Instruction::SDiv:
2930 case Instruction::URem:
2931 case Instruction::SRem:
2932 case Instruction::Select:
2933 return Operand == 1;
2934 default:
2935 return false;
2936 }
2937}
2938
2940 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
2941 return false;
2942
2943 if (canSplatOperand(I->getOpcode(), Operand))
2944 return true;
2945
2946 auto *II = dyn_cast<IntrinsicInst>(I);
2947 if (!II)
2948 return false;
2949
2950 switch (II->getIntrinsicID()) {
2951 case Intrinsic::fma:
2952 case Intrinsic::vp_fma:
2953 case Intrinsic::fmuladd:
2954 case Intrinsic::vp_fmuladd:
2955 return Operand == 0 || Operand == 1;
2956 case Intrinsic::vp_shl:
2957 case Intrinsic::vp_lshr:
2958 case Intrinsic::vp_ashr:
2959 case Intrinsic::vp_udiv:
2960 case Intrinsic::vp_sdiv:
2961 case Intrinsic::vp_urem:
2962 case Intrinsic::vp_srem:
2963 case Intrinsic::ssub_sat:
2964 case Intrinsic::vp_ssub_sat:
2965 case Intrinsic::usub_sat:
2966 case Intrinsic::vp_usub_sat:
2967 case Intrinsic::vp_select:
2968 return Operand == 1;
2969 // These intrinsics are commutative.
2970 case Intrinsic::vp_add:
2971 case Intrinsic::vp_mul:
2972 case Intrinsic::vp_and:
2973 case Intrinsic::vp_or:
2974 case Intrinsic::vp_xor:
2975 case Intrinsic::vp_fadd:
2976 case Intrinsic::vp_fmul:
2977 case Intrinsic::vp_icmp:
2978 case Intrinsic::vp_fcmp:
2979 case Intrinsic::smin:
2980 case Intrinsic::vp_smin:
2981 case Intrinsic::umin:
2982 case Intrinsic::vp_umin:
2983 case Intrinsic::smax:
2984 case Intrinsic::vp_smax:
2985 case Intrinsic::umax:
2986 case Intrinsic::vp_umax:
2987 case Intrinsic::sadd_sat:
2988 case Intrinsic::vp_sadd_sat:
2989 case Intrinsic::uadd_sat:
2990 case Intrinsic::vp_uadd_sat:
2991 // These intrinsics have 'vr' versions.
2992 case Intrinsic::vp_sub:
2993 case Intrinsic::vp_fsub:
2994 case Intrinsic::vp_fdiv:
2995 return Operand == 0 || Operand == 1;
2996 default:
2997 return false;
2998 }
2999}
3000
3001/// Check if sinking \p I's operands to I's basic block is profitable, because
3002/// the operands can be folded into a target instruction, e.g.
3003/// splats of scalars can fold into vector instructions.
3006 using namespace llvm::PatternMatch;
3007
3008 if (I->isBitwiseLogicOp()) {
3009 if (!I->getType()->isVectorTy()) {
3010 if (ST->hasStdExtZbb() || ST->hasStdExtZbkb()) {
3011 for (auto &Op : I->operands()) {
3012 // (and/or/xor X, (not Y)) -> (andn/orn/xnor X, Y)
3013 if (match(Op.get(), m_Not(m_Value()))) {
3014 Ops.push_back(&Op);
3015 return true;
3016 }
3017 }
3018 }
3019 } else if (I->getOpcode() == Instruction::And && ST->hasStdExtZvkb()) {
3020 for (auto &Op : I->operands()) {
3021 // (and X, (not Y)) -> (vandn.vv X, Y)
3022 if (match(Op.get(), m_Not(m_Value()))) {
3023 Ops.push_back(&Op);
3024 return true;
3025 }
3026 // (and X, (splat (not Y))) -> (vandn.vx X, Y)
3028 m_ZeroInt()),
3029 m_Value(), m_ZeroMask()))) {
3030 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
3031 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
3032 Ops.push_back(&Not);
3033 Ops.push_back(&InsertElt);
3034 Ops.push_back(&Op);
3035 return true;
3036 }
3037 }
3038 }
3039 }
3040
3041 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3042 return false;
3043
3044 // Don't sink splat operands if the target prefers not to. Some targets require
3045 // S2V transfer buffers, and we can run out of them copying the same value
3046 // repeatedly.
3047 // FIXME: It could still be worth doing if it would improve vector register
3048 // pressure and prevent a vector spill.
3049 if (!ST->sinkSplatOperands())
3050 return false;
3051
3052 for (auto OpIdx : enumerate(I->operands())) {
3053 if (!canSplatOperand(I, OpIdx.index()))
3054 continue;
3055
3056 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
3057 // Make sure we are not already sinking this operand
3058 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
3059 continue;
3060
3061 // We are looking for a splat/vp.splat that can be sunk.
3063 m_Value(), m_Value(), m_Value()));
3064 if (!IsVPSplat &&
3066 m_Undef(), m_ZeroMask())))
3067 continue;
3068
3069 // Don't sink i1 splats.
3070 if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1))
3071 continue;
3072
3073 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
3074 // and vector registers
3075 for (Use &U : Op->uses()) {
3076 Instruction *Insn = cast<Instruction>(U.getUser());
3077 if (!canSplatOperand(Insn, U.getOperandNo()))
3078 return false;
3079 }
3080
3081 // Sink any fpexts since they might be used in a widening fp pattern.
3082 if (IsVPSplat) {
3083 if (isa<FPExtInst>(Op->getOperand(0)))
3084 Ops.push_back(&Op->getOperandUse(0));
3085 } else {
3086 Use *InsertEltUse = &Op->getOperandUse(0);
3087 auto *InsertElt = cast<InsertElementInst>(InsertEltUse);
3088 if (isa<FPExtInst>(InsertElt->getOperand(1)))
3089 Ops.push_back(&InsertElt->getOperandUse(1));
3090 Ops.push_back(InsertEltUse);
3091 }
3092 Ops.push_back(&OpIdx.value());
3093 }
3094 return true;
3095}
3096
3098RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3100 // TODO: Enable expansion when unaligned access is not supported after we fix
3101 // issues in ExpandMemcmp.
3102 if (!ST->enableUnalignedScalarMem())
3103 return Options;
3104
3105 if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp)
3106 return Options;
3107
3108 Options.AllowOverlappingLoads = true;
3109 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3110 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3111 if (ST->is64Bit()) {
3112 Options.LoadSizes = {8, 4, 2, 1};
3113 Options.AllowedTailExpansions = {3, 5, 6};
3114 } else {
3115 Options.LoadSizes = {4, 2, 1};
3116 Options.AllowedTailExpansions = {3};
3117 }
3118
3119 if (IsZeroCmp && ST->hasVInstructions()) {
3120 unsigned VLenB = ST->getRealMinVLen() / 8;
3121 // The minimum size should be `XLen / 8 + 1`, and the maximum size should be
3122 // `VLenB * MaxLMUL` so that it fits in a single register group.
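// E.g. on RV64 with VLEN=128 and the default maximum fixed-length LMUL of 8,
// this enables vector expansion for memcmp sizes of 9 through 128 bytes.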
3123 unsigned MinSize = ST->getXLen() / 8 + 1;
3124 unsigned MaxSize = VLenB * ST->getMaxLMULForFixedLengthVectors();
3125 for (unsigned Size = MinSize; Size <= MaxSize; Size++)
3126 Options.LoadSizes.insert(Options.LoadSizes.begin(), Size);
3127 }
3128 return Options;
3129}
std::optional< unsigned > getMaxVScale() const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
bool isLegalAddImmediate(int64_t imm) const override
std::optional< unsigned > getVScaleForTuning() const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *, const SCEV *, TTI::TargetCostKind) const override
unsigned getRegUsageForType(Type *Ty) const override
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind) const override
Value * getArgOperand(unsigned i) const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:681
@ FCMP_TRUE
1 1 1 1 Always true (always folded)
Definition InstrTypes.h:695
@ ICMP_SLT
signed less than
Definition InstrTypes.h:707
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:684
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:693
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:682
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:683
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition InstrTypes.h:692
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:686
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:689
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition InstrTypes.h:690
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:685
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:687
@ ICMP_NE
not equal
Definition InstrTypes.h:700
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:694
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:691
@ FCMP_FALSE
0 0 0 0 Always false (always folded)
Definition InstrTypes.h:680
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:688
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:772
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:778
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
bool noNaNs() const
Definition FMF.h:65
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * getDoubleElementsVectorType(FixedVectorType *VTy)
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
MVT getVectorElementType() const
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) const override
bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) const
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
unsigned getMinTripCountTailFoldingThreshold() const override
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const override
InstructionCost getAddressComputationCost(Type *PTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
InstructionCost getStoreImmCost(Type *VecTy, TTI::OperandValueInfo OpInfo, TTI::TargetCostKind CostKind) const
Return the cost of materializing an immediate for a value operand of a store instruction.
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
bool hasActiveVectorLength() const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getExpandCompressMemoryOpCost(unsigned Opcode, Type *Src, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind) const override
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
bool canSplatOperand(Instruction *I, int Operand) const
Return true if the (vector) instruction I will be lowered to an instruction with a scalar splat opera...
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const override
unsigned getRegUsageForType(Type *Ty) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
Estimate the overhead of scalarizing an instruction.
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const override
bool isLegalMaskedCompressStore(Type *DataTy, Align Alignment) const override
bool preferAlternateOpcodeVectorization() const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
std::optional< unsigned > getMaxVScale() const override
bool shouldExpandReduction(const IntrinsicInst *II) const override
std::optional< unsigned > getVScaleForTuning() const override
bool isLegalMaskedGather(Type *DataType, Align Alignment) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpdInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const override
TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
static MVT getM1VT(MVT VT)
Given a vector (either fixed or scalable), return the scalable vector corresponding to a vector regis...
InstructionCost getVRGatherVVCost(MVT VT) const
Return the cost of a vrgather.vv instruction for the type VT.
InstructionCost getVRGatherVICost(MVT VT) const
Return the cost of a vrgather.vi (or vx) instruction for the type VT.
static unsigned computeVLMAX(unsigned VectorBits, unsigned EltSize, unsigned MinSize)
InstructionCost getLMULCost(MVT VT) const
Return the cost of LMUL for linear operations.
InstructionCost getVSlideVICost(MVT VT) const
Return the cost of a vslidedown.vi or vslideup.vi instruction for the type VT.
InstructionCost getVSlideVXCost(MVT VT) const
Return the cost of a vslidedown.vx or vslideup.vx instruction for the type VT.
static RISCVVType::VLMUL getLMUL(MVT VT)
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:281
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
virtual const DataLayout & getDataLayout() const
virtual TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const
virtual bool isLoweredToCall(const Function *F) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
AddressingModeKind
Which addressing mode Loop Strength Reduction will try to generate.
@ AMK_PostIndexed
Prefer post-indexed addressing mode.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:62
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:285
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:956
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1101
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:194
constexpr bool isKnownMultipleOf(ScalarTy RHS) const
This function tells the caller whether the element count is known at compile time to be a multiple of...
Definition TypeSize.h:181
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:230
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:166
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
bool match(Val *V, const Pattern &P)
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
auto m_Undef()
Match an arbitrary undef constant.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
int getIntMatCost(const APInt &Val, unsigned Size, const MCSubtargetInfo &STI, bool CompressionCost, bool FreeZeroes)
static constexpr unsigned RVVBitsPerBlock
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:355
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2452
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:186
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:282
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1712
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405
TargetTransformInfo TTI
LLVM_ABI bool isMaskedSlidePair(ArrayRef< int > Mask, int NumElts, std::array< std::pair< int, int >, 2 > &SrcInfo)
Does this shuffle mask represent either one slide shuffle or a pair of two slide shuffles,...
LLVM_ABI llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1815
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
LLVM_ABI void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned, bool)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2068
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:299
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:85
Extended Value Type.
Definition ValueTypes.h:35
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:117
Align valueOrOne() const
For convenience, returns a valid alignment or 1 if undefined.
Definition Alignment.h:141
Information about a load/store intrinsic defined by the target.
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is.
Describe known properties for a set of pointers.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).