
LLVM 22.0.0git
NVPTXISelLowering.cpp
1//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that NVPTX uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "NVPTXISelLowering.h"
16#include "NVPTX.h"
17#include "NVPTXISelDAGToDAG.h"
18#include "NVPTXSubtarget.h"
19#include "NVPTXTargetMachine.h"
21#include "NVPTXUtilities.h"
22#include "llvm/ADT/APFloat.h"
23#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/StringRef.h"
38#include "llvm/IR/Argument.h"
39#include "llvm/IR/Attributes.h"
40#include "llvm/IR/Constants.h"
41#include "llvm/IR/DataLayout.h"
44#include "llvm/IR/FPEnv.h"
45#include "llvm/IR/Function.h"
46#include "llvm/IR/GlobalValue.h"
47#include "llvm/IR/IRBuilder.h"
48#include "llvm/IR/Instruction.h"
50#include "llvm/IR/IntrinsicsNVPTX.h"
51#include "llvm/IR/Module.h"
52#include "llvm/IR/Type.h"
53#include "llvm/IR/Value.h"
65#include <algorithm>
66#include <cassert>
67#include <cmath>
68#include <cstdint>
69#include <iterator>
70#include <optional>
71#include <string>
72#include <tuple>
73#include <utility>
74#include <vector>
75
76#define DEBUG_TYPE "nvptx-lower"
77
78using namespace llvm;
79
81 "nvptx-sched4reg",
82 cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
83
85 "nvptx-fma-level", cl::Hidden,
86 cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
87 " 1: do it 2: do it aggressively"),
88 cl::init(2));
89
91 "nvptx-prec-divf32", cl::Hidden,
93 "NVPTX Specific: Override the precision of the lowering for f32 fdiv"),
95 clEnumValN(NVPTX::DivPrecisionLevel::Approx, "0", "Use div.approx"),
96 clEnumValN(NVPTX::DivPrecisionLevel::Full, "1", "Use div.full"),
98 "Use IEEE Compliant F32 div.rnd if available (default)"),
100 "Use IEEE Compliant F32 div.rnd if available, no FTZ")),
102
104 "nvptx-prec-sqrtf32", cl::Hidden,
105 cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
106 cl::init(true));
107
108/// Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it
109/// does NOT use lg2.approx for log2, so this is disabled by default.
111 "nvptx-approx-log2f32",
112 cl::desc("NVPTX Specific: whether to use lg2.approx for log2"),
113 cl::init(false));
114
116 "nvptx-force-min-byval-param-align", cl::Hidden,
117 cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
118 " params of device functions."),
119 cl::init(false));
120
123 const SDNode &N) const {
124 // If nvptx-prec-divf32=N is used on the command-line, always honor it
125 if (UsePrecDivF32.getNumOccurrences() > 0)
126 return UsePrecDivF32;
127
128 const SDNodeFlags Flags = N.getFlags();
129 if (Flags.hasApproximateFuncs())
131
133}
134
136 // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
137 if (UsePrecSqrtF32.getNumOccurrences() > 0)
138 return UsePrecSqrtF32;
139
140 if (N) {
141 const SDNodeFlags Flags = N->getFlags();
142 if (Flags.hasApproximateFuncs())
143 return false;
144 }
145
146 return true;
147}
148
153
154static bool IsPTXVectorType(MVT VT) {
155 switch (VT.SimpleTy) {
156 default:
157 return false;
158 case MVT::v2i1:
159 case MVT::v4i1:
160 case MVT::v2i8:
161 case MVT::v4i8:
162 case MVT::v8i8: // <2 x i8x4>
163 case MVT::v16i8: // <4 x i8x4>
164 case MVT::v2i16:
165 case MVT::v4i16:
166 case MVT::v8i16: // <4 x i16x2>
167 case MVT::v2i32:
168 case MVT::v4i32:
169 case MVT::v2i64:
170 case MVT::v2f16:
171 case MVT::v4f16:
172 case MVT::v8f16: // <4 x f16x2>
173 case MVT::v2bf16:
174 case MVT::v4bf16:
175 case MVT::v8bf16: // <4 x bf16x2>
176 case MVT::v2f32:
177 case MVT::v4f32:
178 case MVT::v2f64:
179 case MVT::v4i64:
180 case MVT::v4f64:
181 case MVT::v8i32:
182 case MVT::v8f32:
183 case MVT::v16f16: // <8 x f16x2>
184 case MVT::v16bf16: // <8 x bf16x2>
185 case MVT::v16i16: // <8 x i16x2>
186 case MVT::v32i8: // <8 x i8x4>
187 return true;
188 }
189}
190
191// When legalizing vector loads/stores, this function is called to do two
192// things:
193// 1. Determine whether the vector is something we want to custom lower;
194// std::nullopt is returned if we do not want to custom lower it.
195// 2. If we do want to handle it, return two parameters:
196// - unsigned int NumElts - The number of elements in the final vector
197// - EVT EltVT - The type of the elements in the final vector
198static std::optional<std::pair<unsigned int, MVT>>
200 unsigned AddressSpace) {
201 const bool CanLowerTo256Bit = STI.has256BitVectorLoadStore(AddressSpace);
202
203 if (CanLowerTo256Bit && VectorEVT.isScalarInteger() &&
204 VectorEVT.getSizeInBits() == 256)
205 return {{4, MVT::i64}};
206
207 if (!VectorEVT.isSimple())
208 return std::nullopt;
209 const MVT VectorVT = VectorEVT.getSimpleVT();
210
211 if (!VectorVT.isVector()) {
212 if (VectorVT == MVT::i128 || VectorVT == MVT::f128)
213 return {{2, MVT::i64}};
214 return std::nullopt;
215 }
216
217 const MVT EltVT = VectorVT.getVectorElementType();
218 const unsigned NumElts = VectorVT.getVectorNumElements();
219
220 // The size of the PTX virtual register that holds a packed type.
221 unsigned PackRegSize;
222
223 // We only handle "native" vector sizes for now, e.g. <4 x double> is not
224 // legal. We can (and should) split that into 2 stores of <2 x double> here
225 // but I'm leaving that as a TODO for now.
226 switch (VectorVT.SimpleTy) {
227 default:
228 return std::nullopt;
229 case MVT::v4i64:
230 case MVT::v4f64:
231 case MVT::v8i32:
232 // This is a "native" vector type iff the address space is global
233 // and the target supports 256-bit loads/stores
234 if (!CanLowerTo256Bit)
235 return std::nullopt;
237 case MVT::v2i8:
238 case MVT::v2i32:
239 case MVT::v2i64:
240 case MVT::v2f64:
241 case MVT::v4i32:
242 // This is a "native" vector type
243 return std::pair(NumElts, EltVT);
244 case MVT::v16f16: // <8 x f16x2>
245 case MVT::v16bf16: // <8 x bf16x2>
246 case MVT::v16i16: // <8 x i16x2>
247 case MVT::v32i8: // <8 x i8x4>
248 // This can be upsized into a "native" vector type iff the address space is
249 // global and the target supports 256-bit loads/stores.
250 if (!CanLowerTo256Bit)
251 return std::nullopt;
253 case MVT::v2i16: // <1 x i16x2>
254 case MVT::v2f16: // <1 x f16x2>
255 case MVT::v2bf16: // <1 x bf16x2>
256 case MVT::v4i8: // <1 x i8x4>
257 case MVT::v4i16: // <2 x i16x2>
258 case MVT::v4f16: // <2 x f16x2>
259 case MVT::v4bf16: // <2 x bf16x2>
260 case MVT::v8i8: // <2 x i8x4>
261 case MVT::v8f16: // <4 x f16x2>
262 case MVT::v8bf16: // <4 x bf16x2>
263 case MVT::v8i16: // <4 x i16x2>
264 case MVT::v16i8: // <4 x i8x4>
265 PackRegSize = 32;
266 break;
267 case MVT::v8f32: // <4 x f32x2>
268 if (!CanLowerTo256Bit)
269 return std::nullopt;
271 case MVT::v2f32: // <1 x f32x2>
272 case MVT::v4f32: // <2 x f32x2>
273 if (!STI.hasF32x2Instructions())
274 return std::pair(NumElts, EltVT);
275 PackRegSize = 64;
276 break;
277 }
278
279 // If we reach here, then we can pack 2 or more elements into a single 32-bit
280 // or 64-bit PTX register and treat the vector as a new vector containing
281 // packed elements.
282
283 // Number of elements to pack in one word.
284 const unsigned NPerReg = PackRegSize / EltVT.getSizeInBits();
285
286 return std::pair(NumElts / NPerReg, MVT::getVectorVT(EltVT, NPerReg));
287}
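// Illustrative example (a sketch, not upstream code; the helper's name line
// is elided above, so assume it is called getVectorLoweringShape for this
// sketch): with the generic address space and no 256-bit load/store support,
// an <8 x half> vector is reshaped into four packed f16x2 elements, one per
// 32-bit PTX register:
//
//   // EVT V8F16 = EVT::getVectorVT(Ctx, MVT::f16, 8);
//   // getVectorLoweringShape(V8F16, STI, ADDRESS_SPACE_GENERIC)
//   //   == std::pair(4u, MVT::v2f16)
//
// whereas <4 x i64> in the same situation returns std::nullopt and is left to
// the generic legalizer to split.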
288
289/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
290/// legal-ish MVTs that compose it. Unlike ComputeValueVTs, this will legalize
291/// the types as required by the calling convention (with special handling for
292/// i8s).
293/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
294/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
295/// LowerCall, and LowerReturn.
296static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
297 LLVMContext &Ctx, CallingConv::ID CallConv,
298 Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
300 uint64_t StartingOffset = 0) {
301 SmallVector<EVT, 16> TempVTs;
302 SmallVector<uint64_t, 16> TempOffsets;
303 ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
304
305 for (const auto [VT, Off] : zip(TempVTs, TempOffsets)) {
306 MVT RegisterVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
307 unsigned NumRegs = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
308
309 // Since we actually can load/store b8, we need to ensure that we'll use
310 // the original sized type for any i8s or i8 vectors.
311 if (VT.getScalarType() == MVT::i8) {
312 if (RegisterVT == MVT::i16)
313 RegisterVT = MVT::i8;
314 else if (RegisterVT == MVT::v2i16)
315 RegisterVT = MVT::v2i8;
316 else
317 assert(RegisterVT == MVT::v4i8 &&
318 "Expected v4i8, v2i16, or i16 for i8 RegisterVT");
319 }
320
321 // TODO: This is horribly incorrect for cases where the vector elements are
322// not a multiple of bytes (e.g. i1) and legal or i8. However, this problem
323 // has existed for as long as NVPTX has and no one has complained, so we'll
324 // leave it for now.
325 for (unsigned I : seq(NumRegs)) {
326 ValueVTs.push_back(RegisterVT);
327 Offsets.push_back(Off + I * RegisterVT.getStoreSize());
328 }
329 }
330}
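// Illustrative example (a sketch, not upstream code): for a parameter of IR
// type { i32, i8 }, ComputeValueVTs produces { i32, i8 } at offsets { 0, 4 };
// the loop above then keeps the i8 as MVT::i8 rather than the promoted
// register type MVT::i16, so a plain b8 load/store is used:
//
//   // ValueVTs == { MVT::i32, MVT::i8 }, Offsets == { 0, 4 }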
331
332// We return an EVT that can hold N VTs
333// If the VT is a vector, the resulting EVT is a flat vector with the same
334// element type as VT's element type.
335static EVT getVectorizedVT(EVT VT, unsigned N, LLVMContext &C) {
336 if (N == 1)
337 return VT;
338
339 return VT.isVector() ? EVT::getVectorVT(C, VT.getScalarType(),
340 VT.getVectorNumElements() * N)
341 : EVT::getVectorVT(C, VT, N);
342}
343
344static SDValue getExtractVectorizedValue(SDValue V, unsigned I, EVT VT,
345 const SDLoc &dl, SelectionDAG &DAG) {
346 if (V.getValueType() == VT) {
347 assert(I == 0 && "Index must be 0 for scalar value");
348 return V;
349 }
350
351 if (!VT.isVector())
352 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, V,
353 DAG.getVectorIdxConstant(I, dl));
354
355 return DAG.getNode(
356 ISD::EXTRACT_SUBVECTOR, dl, VT, V,
358}
359
360template <typename T>
361static inline SDValue getBuildVectorizedValue(unsigned N, const SDLoc &dl,
362 SelectionDAG &DAG, T GetElement) {
363 if (N == 1)
364 return GetElement(0);
365
367 for (const unsigned I : llvm::seq(N)) {
368 SDValue Val = GetElement(I);
369 if (Val.getValueType().isVector())
370 DAG.ExtractVectorElements(Val, Values);
371 else
372 Values.push_back(Val);
373 }
374
375 EVT VT = EVT::getVectorVT(*DAG.getContext(), Values[0].getValueType(),
376 Values.size());
377 return DAG.getBuildVector(VT, dl, Values);
378}
379
380/// PromoteScalarIntegerPTX
381/// Used to make sure the arguments/returns are suitable for passing
382/// and promote them to a larger size if they're not.
383///
384/// Returns the promoted type, or \p VT unchanged if no promotion is needed.
385static EVT promoteScalarIntegerPTX(const EVT VT) {
386 if (VT.isScalarInteger()) {
387 switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
388 default:
390 "Promotion is not suitable for scalars of size larger than 64-bits");
391 case 1:
392 return MVT::i1;
393 case 2:
394 case 4:
395 case 8:
396 return MVT::i8;
397 case 16:
398 return MVT::i16;
399 case 32:
400 return MVT::i32;
401 case 64:
402 return MVT::i64;
403 }
404 }
405 return VT;
406}
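// Illustrative examples (a sketch, not upstream code): promotion rounds a
// scalar integer up to the next width PTX can pass directly, so a 20-bit
// integer (EVT::getIntegerVT(Ctx, 20)) becomes MVT::i32 and a 48-bit integer
// becomes MVT::i64, since PowerOf2Ceil(20) == 32 and PowerOf2Ceil(48) == 64.
// Types that are already i1/i8/i16/i32/i64, and all non-integer types, come
// back unchanged.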
407
408// Check whether we can merge loads/stores of some of the pieces of a
409// flattened function parameter or return value into a single vector
410// load/store.
411//
412// The flattened parameter is represented as a list of EVTs and
413// offsets, and the whole structure is aligned to ParamAlignment. This
414// function determines whether we can load/store pieces of the
415// parameter starting at index Idx using a single vectorized op of
416// size AccessSize. If so, it returns the number of param pieces
417// covered by the vector op. Otherwise, it returns 1.
418template <typename T>
420 unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
421 const SmallVectorImpl<T> &Offsets, Align ParamAlignment) {
422
423 // Can't vectorize if param alignment is not sufficient.
424 if (ParamAlignment < AccessSize)
425 return 1;
426 // Can't vectorize if offset is not aligned.
427 if (Offsets[Idx] & (AccessSize - 1))
428 return 1;
429
430 EVT EltVT = ValueVTs[Idx];
431 unsigned EltSize = EltVT.getStoreSize();
432
433 // Element is too large to vectorize.
434 if (EltSize >= AccessSize)
435 return 1;
436
437 unsigned NumElts = AccessSize / EltSize;
438 // Can't vectorize if AccessSize is not a multiple of EltSize.
439 if (AccessSize != EltSize * NumElts)
440 return 1;
441
442 // We don't have enough elements to vectorize.
443 if (Idx + NumElts > ValueVTs.size())
444 return 1;
445
446 // PTX ISA can only deal with 2- and 4-element vector ops.
447 if (NumElts != 4 && NumElts != 2)
448 return 1;
449
450 for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
451 // Types do not match.
452 if (ValueVTs[j] != EltVT)
453 return 1;
454
455 // Elements are not contiguous.
456 if (Offsets[j] - Offsets[j - 1] != EltSize)
457 return 1;
458 }
459 // OK. We can vectorize ValueVTs[Idx..Idx+NumElts).
460 return NumElts;
461}
462
463// Computes whether and how we can vectorize the loads/stores of a
464// flattened function parameter or return value.
465//
466// The flattened parameter is represented as the list of ValueVTs and
467// Offsets, and is aligned to ParamAlignment bytes. We return a vector
468// whose entries sum to ValueVTs.size(); each entry is the number of
469// consecutive pieces (1, 2, or 4) that should be loaded/stored together
470// as a single scalar or vector access.
471template <typename T>
474 const SmallVectorImpl<T> &Offsets, Align ParamAlignment,
475 bool IsVAArg = false) {
476 // Set vector size to match ValueVTs and mark all elements as
477 // scalars by default.
478
479 if (IsVAArg)
480 return SmallVector<unsigned>(ValueVTs.size(), 1);
481
482 SmallVector<unsigned, 16> VectorInfo;
483
484 const auto GetNumElts = [&](unsigned I) -> unsigned {
485 for (const unsigned AccessSize : {16, 8, 4, 2}) {
486 const unsigned NumElts = canMergeParamLoadStoresStartingAt(
487 I, AccessSize, ValueVTs, Offsets, ParamAlignment);
488 assert((NumElts == 1 || NumElts == 2 || NumElts == 4) &&
489 "Unexpected vectorization size");
490 if (NumElts != 1)
491 return NumElts;
492 }
493 return 1;
494 };
495
496 // Check what we can vectorize using 128/64/32/16-bit accesses.
497 for (unsigned I = 0, E = ValueVTs.size(); I != E;) {
498 const unsigned NumElts = GetNumElts(I);
499 VectorInfo.push_back(NumElts);
500 I += NumElts;
501 }
502 assert(std::accumulate(VectorInfo.begin(), VectorInfo.end(), 0u) ==
503 ValueVTs.size());
504 return VectorInfo;
505}
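// Illustrative example (a sketch, not upstream code): a parameter flattened to
// four f32 pieces at offsets {0, 4, 8, 12} with ParamAlignment == 16 merges
// into a single 16-byte access starting at index 0, so the result is {4} and
// the pieces are loaded/stored as one v4f32 access. With ParamAlignment == 8
// only 8-byte accesses are possible, giving {2, 2}.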
506
507// NVPTXTargetLowering Constructor.
509 const NVPTXSubtarget &STI)
510 : TargetLowering(TM), nvTM(&TM), STI(STI), GlobalUniqueCallSite(0) {
511 // Always lower memset, memcpy, and memmove intrinsics to load/store
512 // instructions, rather than generating calls to memset, memcpy, or
513 // memmove.
517
520
521 // Jump is Expensive. Don't create extra control flow for 'and', 'or'
522 // condition branches.
523 setJumpIsExpensive(true);
524
525 // Wide divides are _very_ slow. Try to reduce the width of the divide if
526 // possible.
527 addBypassSlowDiv(64, 32);
528
529 // By default, use the Source scheduling
530 if (sched4reg)
532 else
534
535 auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
536 LegalizeAction NoF16Action) {
537 bool IsOpSupported = STI.allowFP16Math();
538 switch (Op) {
539 // Several FP16 instructions are available on sm_80 only.
540 case ISD::FMINNUM:
541 case ISD::FMAXNUM:
542 case ISD::FMAXNUM_IEEE:
543 case ISD::FMINNUM_IEEE:
544 case ISD::FMAXIMUM:
545 case ISD::FMINIMUM:
546 case ISD::FMAXIMUMNUM:
547 case ISD::FMINIMUMNUM:
548 IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
549 break;
550 case ISD::FEXP2:
551 IsOpSupported &= STI.getSmVersion() >= 75 && STI.getPTXVersion() >= 70;
552 break;
553 }
554 setOperationAction(Op, VT, IsOpSupported ? Action : NoF16Action);
555 };
556
557 auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
558 LegalizeAction NoBF16Action) {
559 bool IsOpSupported = STI.hasNativeBF16Support(Op);
561 Op, VT, IsOpSupported ? Action : NoBF16Action);
562 };
563
564 auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
565 LegalizeAction NoI16x2Action) {
566 bool IsOpSupported = false;
567 // These instructions are available on sm_90 only.
568 switch (Op) {
569 case ISD::ADD:
570 case ISD::SMAX:
571 case ISD::SMIN:
572 case ISD::UMIN:
573 case ISD::UMAX:
574 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
575 break;
576 }
577 setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);
578 };
579
580 addRegisterClass(MVT::i1, &NVPTX::B1RegClass);
581 addRegisterClass(MVT::i16, &NVPTX::B16RegClass);
582 addRegisterClass(MVT::v2i16, &NVPTX::B32RegClass);
583 addRegisterClass(MVT::v4i8, &NVPTX::B32RegClass);
584 addRegisterClass(MVT::i32, &NVPTX::B32RegClass);
585 addRegisterClass(MVT::i64, &NVPTX::B64RegClass);
586 addRegisterClass(MVT::f32, &NVPTX::B32RegClass);
587 addRegisterClass(MVT::f64, &NVPTX::B64RegClass);
588 addRegisterClass(MVT::f16, &NVPTX::B16RegClass);
589 addRegisterClass(MVT::v2f16, &NVPTX::B32RegClass);
590 addRegisterClass(MVT::bf16, &NVPTX::B16RegClass);
591 addRegisterClass(MVT::v2bf16, &NVPTX::B32RegClass);
592
593 if (STI.hasF32x2Instructions())
594 addRegisterClass(MVT::v2f32, &NVPTX::B64RegClass);
595
596 // Conversion to/from FP16/FP16x2 is always legal.
601
602 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
603 if (STI.getSmVersion() >= 30 && STI.getPTXVersion() > 31)
604 setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
605
606 setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
607 setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
608
609 // Conversion to/from BF16/BF16x2 is always legal.
614
615 setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
616 setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
617 if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)
618 AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);
619
620 // Conversion to/from i16/i16x2 is always legal.
625
630
631 // No support for these operations with v2f32.
634 // Need custom lowering in case the index is dynamic.
635 if (STI.hasF32x2Instructions())
637
638 // Custom conversions to/from v2i8.
639 setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
640
641 // Only logical ops can be done on v4i8 directly, others must be done
642 // elementwise.
659 MVT::v4i8, Expand);
660
661 // Operations not directly supported by NVPTX.
662 for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
663 MVT::v2f32, MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16,
664 MVT::v4i8, MVT::i32, MVT::i64}) {
666 setOperationAction(ISD::BR_CC, VT, Expand);
667 }
668
669 // Not directly supported. TLI would attempt to expand operations like
670 // FMINIMUM(v2f32) using invalid SETCC and VSELECT nodes.
672
673 // Some SIGN_EXTEND_INREG can be done using cvt instruction.
674 // For others we will expand to a SHL/SRA pair.
681
688
691
693 {MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64},
694 Expand);
695
696 if (STI.hasHWROT32()) {
699 Custom);
700 }
701
703
704 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
705 setOperationAction(ISD::BRIND, MVT::Other, Expand);
706
707 // We want to legalize constant-related memmove and memcpy
708 // intrinsics.
710
711 // FP extload/truncstore is not legal in PTX. We need to expand all these.
712 for (auto FloatVTs :
714 for (MVT ValVT : FloatVTs) {
715 for (MVT MemVT : FloatVTs) {
716 setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Expand);
717 setTruncStoreAction(ValVT, MemVT, Expand);
718 }
719 }
720 }
721
722 // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
723 // how they'll be lowered in ISel anyway, and by doing this a little earlier
724 // we allow for more DAG combine opportunities.
725 for (auto IntVTs :
727 for (MVT ValVT : IntVTs)
728 for (MVT MemVT : IntVTs)
729 if (isTypeLegal(ValVT))
730 setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Custom);
731
732 // PTX does not support load / store predicate registers
733 setOperationAction({ISD::LOAD, ISD::STORE}, MVT::i1, Custom);
734 for (MVT VT : MVT::integer_valuetypes()) {
736 Promote);
737 setTruncStoreAction(VT, MVT::i1, Expand);
738 }
739
740 // Disable generation of extload/truncstore for v2i16/v2i8. The generic
741 // expansion for these nodes when they are unaligned is incorrect if the
742 // type is a vector.
743 //
744 // TODO: Fix the generic expansion for these nodes found in
745 // TargetLowering::expandUnalignedLoad/Store.
747 MVT::v2i8, Expand);
748 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
749
750 // Register custom handling for illegal type loads/stores. We'll try to custom
751 // lower almost all illegal types and logic in the lowering will discard cases
752 // we can't handle.
753 setOperationAction({ISD::LOAD, ISD::STORE}, {MVT::i128, MVT::f128}, Custom);
755 if (!isTypeLegal(VT) && VT.getStoreSizeInBits() <= 256)
756 setOperationAction({ISD::STORE, ISD::LOAD}, VT, Custom);
757
758 // Custom legalization for LDU intrinsics.
759 // TODO: The logic to lower these is not very robust and we should rewrite it.
760 // Perhaps LDU should not be represented as an intrinsic at all.
763 if (IsPTXVectorType(VT))
765
769 MVT::i1, Expand);
770
771 // This is legal in NVPTX
776
777 setOperationAction(ISD::DYNAMIC_STACKALLOC, {MVT::i32, MVT::i64}, Custom);
778 setOperationAction({ISD::STACKRESTORE, ISD::STACKSAVE}, MVT::Other, Custom);
779
780 // TRAP can be lowered to PTX trap
781 setOperationAction(ISD::TRAP, MVT::Other, Legal);
782 // DEBUGTRAP can be lowered to PTX brkpt
783 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
784
785 // Support varargs.
786 setOperationAction(ISD::VASTART, MVT::Other, Custom);
787 setOperationAction(ISD::VAARG, MVT::Other, Custom);
788 setOperationAction(ISD::VACOPY, MVT::Other, Expand);
789 setOperationAction(ISD::VAEND, MVT::Other, Expand);
790
792 {MVT::i16, MVT::i32, MVT::i64}, Legal);
793
795 Promote);
798
799 setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
800 setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
801 setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
802 setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
803 setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
804 setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
805 setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);
806
807 setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
808 setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
809 setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
810 setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
811 setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
812 setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
813
814 // Other arithmetic and logic ops are unsupported.
818 MVT::v2i16, Expand);
819
824 if (STI.getPTXVersion() >= 43) {
829 }
830
832 setOperationAction(ISD::CTTZ, MVT::v2i16, Expand);
835
836 // PTX does not directly support SELP of i1, so promote to i32 first
838
839 // PTX cannot multiply two i64s in a single instruction.
842
843 // We have some custom DAG combine patterns for these nodes
846 ISD::FADD, ISD::FMAXNUM, ISD::FMINNUM,
847 ISD::FMAXIMUM, ISD::FMINIMUM, ISD::FMAXIMUMNUM,
848 ISD::FMINIMUMNUM, ISD::MUL, ISD::SHL,
850 ISD::BUILD_VECTOR, ISD::ADDRSPACECAST, ISD::LOAD,
851 ISD::STORE, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND});
852
853 // setcc for f16x2 and bf16x2 needs special handling to prevent
854 // legalizer's attempt to scalarize it due to v2i1 not being legal.
855 if (STI.allowFP16Math() || STI.hasBF16Math())
857
858 // Vector reduction operations. These may be turned into shuffle or tree
859 // reductions depending on what instructions are available for each type.
861 MVT EltVT = VT.getVectorElementType();
862 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
863 setOperationAction({ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMIN,
864 ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM},
865 VT, Custom);
866 }
867 }
868
869 // Promote fp16 arithmetic if fp16 hardware isn't available or the
870 // user passed --nvptx-no-fp16-math. The flag is useful because,
871 // although sm_53+ GPUs have some sort of FP16 support in
872 // hardware, only sm_53 and sm_60 have full implementation. Others
873 // only have a token amount of hardware and are likely to run faster
874 // by using fp32 units instead.
875 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
876 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
877 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
878 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
879 // bf16 must be promoted to f32.
880 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
881 if (getOperationAction(Op, MVT::bf16) == Promote)
882 AddPromotedToType(Op, MVT::bf16, MVT::f32);
883 setOperationAction(Op, MVT::v2f32,
884 STI.hasF32x2Instructions() ? Legal : Expand);
885 }
886
887 // On SM80, we select add/mul/sub as fma to avoid promotion to float
888 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB}) {
889 for (const auto &VT : {MVT::bf16, MVT::v2bf16}) {
890 if (!STI.hasNativeBF16Support(Op) && STI.hasNativeBF16Support(ISD::FMA)) {
892 }
893 }
894 }
895
896 // f16/f16x2 neg was introduced in PTX 60, SM_53.
897 const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
898 STI.getPTXVersion() >= 60 &&
899 STI.allowFP16Math();
900 for (const auto &VT : {MVT::f16, MVT::v2f16})
901 setOperationAction(ISD::FNEG, VT,
902 IsFP16FP16x2NegAvailable ? Legal : Expand);
903
904 setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
905 setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
906 setOperationAction(ISD::FNEG, MVT::v2f32, Expand);
907 // (would be) Library functions.
908
909 // These map to conversion instructions for scalar FP types.
910 for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
911 ISD::FROUNDEVEN, ISD::FTRUNC}) {
912 setOperationAction(Op, MVT::f16, Legal);
913 setOperationAction(Op, MVT::f32, Legal);
914 setOperationAction(Op, MVT::f64, Legal);
915 setOperationAction(Op, MVT::v2f16, Expand);
916 setOperationAction(Op, MVT::v2bf16, Expand);
917 setOperationAction(Op, MVT::v2f32, Expand);
918 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
919 if (getOperationAction(Op, MVT::bf16) == Promote)
920 AddPromotedToType(Op, MVT::bf16, MVT::f32);
921 }
922
923 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71) {
924 setOperationAction(ISD::BF16_TO_FP, MVT::f32, Expand);
925 }
926 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
927 for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) {
928 setOperationAction(ISD::FP_EXTEND, VT, Custom);
930 }
931 }
932
933 // Expand v2f32 = fp_extend
934 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
935 // Expand v2[b]f16 = fp_round v2f32
936 setOperationAction(ISD::FP_ROUND, {MVT::v2bf16, MVT::v2f16}, Expand);
937
938 // sm_80 only has conversions between f32 and bf16. Custom lower all other
939 // bf16 conversions.
940 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
941 for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
944 VT, Custom);
945 }
948 MVT::bf16, Custom);
949 }
950
951 setOperationAction(ISD::FROUND, MVT::f16, Promote);
952 setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
953 setOperationAction(ISD::FROUND, MVT::v2bf16, Expand);
954 setOperationAction(ISD::FROUND, MVT::f32, Custom);
955 setOperationAction(ISD::FROUND, MVT::f64, Custom);
956 setOperationAction(ISD::FROUND, MVT::bf16, Promote);
957 AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32);
958
959 // 'Expand' implements FCOPYSIGN without calling an external library.
966
967 // These map to corresponding instructions for f32/f64. f16 must be
968 // promoted to f32. v2f16 is expanded to f16, which is then promoted
969 // to f32.
970 for (const auto &Op :
971 {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FTANH}) {
972 setOperationAction(Op, MVT::f16, Promote);
973 setOperationAction(Op, MVT::f32, Legal);
974 // only div/rem/sqrt are legal for f64
975 if (Op == ISD::FDIV || Op == ISD::FREM || Op == ISD::FSQRT) {
976 setOperationAction(Op, MVT::f64, Legal);
977 }
978 setOperationAction(Op, {MVT::v2f16, MVT::v2bf16, MVT::v2f32}, Expand);
979 setOperationAction(Op, MVT::bf16, Promote);
980 AddPromotedToType(Op, MVT::bf16, MVT::f32);
981 }
982 setOperationAction(ISD::FREM, {MVT::f32, MVT::f64}, Custom);
983
984 setOperationAction(ISD::FABS, {MVT::f32, MVT::f64}, Legal);
985 setOperationAction(ISD::FABS, MVT::v2f32, Expand);
986 if (STI.getPTXVersion() >= 65) {
987 setFP16OperationAction(ISD::FABS, MVT::f16, Legal, Promote);
988 setFP16OperationAction(ISD::FABS, MVT::v2f16, Legal, Expand);
989 } else {
990 setOperationAction(ISD::FABS, MVT::f16, Promote);
991 setOperationAction(ISD::FABS, MVT::v2f16, Expand);
992 }
993 setBF16OperationAction(ISD::FABS, MVT::v2bf16, Legal, Expand);
994 setBF16OperationAction(ISD::FABS, MVT::bf16, Legal, Promote);
995 if (getOperationAction(ISD::FABS, MVT::bf16) == Promote)
996 AddPromotedToType(ISD::FABS, MVT::bf16, MVT::f32);
997
998 for (const auto &Op :
999 {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM}) {
1000 setOperationAction(Op, MVT::f32, Legal);
1001 setOperationAction(Op, MVT::f64, Legal);
1002 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
1003 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
1004 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
1005 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
1006 if (getOperationAction(Op, MVT::bf16) == Promote)
1007 AddPromotedToType(Op, MVT::bf16, MVT::f32);
1008 setOperationAction(Op, MVT::v2f32, Expand);
1009 }
1010 bool SupportsF32MinMaxNaN =
1011 STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
1012 for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
1013 setOperationAction(Op, MVT::f32, SupportsF32MinMaxNaN ? Legal : Expand);
1014 setFP16OperationAction(Op, MVT::f16, Legal, Expand);
1015 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
1016 setBF16OperationAction(Op, MVT::bf16, Legal, Expand);
1017 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
1018 setOperationAction(Op, MVT::v2f32, Expand);
1019 }
1020
1021 // Custom lowering for inline asm with 128-bit operands
1024
1025 // FEXP2 support:
1026 // - f32
1027 // - f16/f16x2 (sm_70+, PTX 7.0+)
1028 // - bf16/bf16x2 (sm_90+, PTX 7.8+)
1029 // When f16/bf16 types aren't supported, they are promoted/expanded to f32.
1030 setOperationAction(ISD::FEXP2, MVT::f32, Legal);
1031 setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
1032 setFP16OperationAction(ISD::FEXP2, MVT::f16, Legal, Promote);
1033 setFP16OperationAction(ISD::FEXP2, MVT::v2f16, Legal, Expand);
1034 setBF16OperationAction(ISD::FEXP2, MVT::bf16, Legal, Promote);
1035 setBF16OperationAction(ISD::FEXP2, MVT::v2bf16, Legal, Expand);
1036
1037 // FLOG2 supports f32 only
1038 // f16/bf16 types aren't supported, but they are promoted/expanded to f32.
1039 if (UseApproxLog2F32) {
1040 setOperationAction(ISD::FLOG2, MVT::f32, Legal);
1041 setOperationPromotedToType(ISD::FLOG2, MVT::f16, MVT::f32);
1042 setOperationPromotedToType(ISD::FLOG2, MVT::bf16, MVT::f32);
1043 setOperationAction(ISD::FLOG2, {MVT::v2f16, MVT::v2bf16, MVT::v2f32},
1044 Expand);
1045 }
1046
1047 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
1048
1049 setOperationAction(ISD::ATOMIC_LOAD_SUB, {MVT::i32, MVT::i64}, Expand);
1050
1051 // atom.b128 is legal in PTX but since we don't represent i128 as a legal
1052 // type, we need to custom lower it.
1053 setOperationAction({ISD::ATOMIC_CMP_SWAP, ISD::ATOMIC_SWAP}, MVT::i128,
1054 Custom);
1055
1056 // Now deduce the information based on the above-mentioned
1057 // actions.
1058 computeRegisterProperties(STI.getRegisterInfo());
1059
1060 // PTX support for 16-bit CAS is emulated. Only use 32-bit and wider.
1061 setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits());
1062 setMaxAtomicSizeInBitsSupported(STI.hasAtomSwap128() ? 128 : 64);
1064
1065 // Custom lowering for tcgen05.ld vector operands
1067 {MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32,
1068 MVT::v32i32, MVT::v64i32, MVT::v128i32},
1069 Custom);
1070
1071 // Custom lowering for tcgen05.st vector operands
1073 {MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32,
1074 MVT::v32i32, MVT::v64i32, MVT::v128i32},
1075 Custom);
1076
1077 // Enable custom lowering for the following:
1078 // * MVT::i128 - clusterlaunchcontrol
1079 // * MVT::i32 - prmt
1080 // * MVT::Other - internal.addrspace.wrap
1081 setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::i32, MVT::i128, MVT::Other},
1082 Custom);
1083}
1084
1085const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
1086
1087#define MAKE_CASE(V) \
1088 case V: \
1089 return #V;
1090
1091 switch ((NVPTXISD::NodeType)Opcode) {
1093 break;
1094
1147 MAKE_CASE(
1149 MAKE_CASE(
1161 MAKE_CASE(
1163 MAKE_CASE(
1165 }
1166 return nullptr;
1167
1168#undef MAKE_CASE
1169}
1170
1173 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1174 VT.getScalarType() == MVT::i1)
1175 return TypeSplitVector;
1177}
1178
1180 int Enabled, int &ExtraSteps,
1181 bool &UseOneConst,
1182 bool Reciprocal) const {
1185 return SDValue();
1186
1187 if (ExtraSteps == ReciprocalEstimate::Unspecified)
1188 ExtraSteps = 0;
1189
1190 SDLoc DL(Operand);
1191 EVT VT = Operand.getValueType();
1192 bool Ftz = useF32FTZ(DAG.getMachineFunction());
1193
1194 auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1195 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1196 DAG.getConstant(IID, DL, MVT::i32), Operand);
1197 };
1198
1199 // The sqrt and rsqrt refinement processes assume we always start out with an
1200 // approximation of the rsqrt. Therefore, if we're going to do any refinement
1201 // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
1202 // any refinement, we must return a regular sqrt.
1203 if (Reciprocal || ExtraSteps > 0) {
1204 if (VT == MVT::f32)
1205 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1206 : Intrinsic::nvvm_rsqrt_approx_f);
1207 else if (VT == MVT::f64)
1208 return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1209 else
1210 return SDValue();
1211 } else {
1212 if (VT == MVT::f32)
1213 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1214 : Intrinsic::nvvm_sqrt_approx_f);
1215 else {
1216 // There's no sqrt.approx.f64 instruction, so we emit
1217 // reciprocal(rsqrt(x)). This is faster than
1218 // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
1219 // x * rsqrt(x).)
1220 return DAG.getNode(
1222 DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1223 MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1224 }
1225 }
1226}
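// Summary of the cases above (descriptive only): for f32 the estimate is
// sqrt.approx[.ftz].f32 or rsqrt.approx[.ftz].f32 depending on whether a
// reciprocal (or refined) result was requested; for f64 only rsqrt.approx.f64
// exists, so a non-reciprocal f64 estimate is built as
// rcp.approx.ftz.f64(rsqrt.approx.f64(x)).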
1227
1229 const DataLayout &DL, Type *RetTy, const ArgListTy &Args,
1231 std::optional<unsigned> FirstVAArg, const CallBase &CB,
1232 unsigned UniqueCallSite) const {
1233 auto PtrVT = getPointerTy(DL);
1234
1235 std::string Prototype;
1236 raw_string_ostream O(Prototype);
1237 O << "prototype_" << UniqueCallSite << " : .callprototype ";
1238
1239 if (RetTy->isVoidTy()) {
1240 O << "()";
1241 } else {
1242 O << "(";
1243 if (shouldPassAsArray(RetTy)) {
1244 const Align RetAlign = getArgumentAlignment(&CB, RetTy, 0, DL);
1245 O << ".param .align " << RetAlign.value() << " .b8 _["
1246 << DL.getTypeAllocSize(RetTy) << "]";
1247 } else if (RetTy->isFloatingPointTy() || RetTy->isIntegerTy()) {
1248 unsigned size = 0;
1249 if (auto *ITy = dyn_cast<IntegerType>(RetTy)) {
1250 size = ITy->getBitWidth();
1251 } else {
1252 assert(RetTy->isFloatingPointTy() &&
1253 "Floating point type expected here");
1254 size = RetTy->getPrimitiveSizeInBits();
1255 }
1256 // PTX ABI requires all scalar return values to be at least 32
1257 // bits in size. fp16 normally uses .b16 as its storage type in
1258 // PTX, so its size must be adjusted here, too.
1260
1261 O << ".param .b" << size << " _";
1262 } else if (isa<PointerType>(RetTy)) {
1263 O << ".param .b" << PtrVT.getSizeInBits() << " _";
1264 } else {
1265 llvm_unreachable("Unknown return type");
1266 }
1267 O << ") ";
1268 }
1269 O << "_ (";
1270
1271 bool first = true;
1272
1273 const unsigned NumArgs = FirstVAArg.value_or(Args.size());
1274 auto AllOuts = ArrayRef(Outs);
1275 for (const unsigned I : llvm::seq(NumArgs)) {
1276 const auto ArgOuts =
1277 AllOuts.take_while([I](auto O) { return O.OrigArgIndex == I; });
1278 AllOuts = AllOuts.drop_front(ArgOuts.size());
1279
1280 Type *Ty = Args[I].Ty;
1281 if (!first) {
1282 O << ", ";
1283 }
1284 first = false;
1285
1286 if (ArgOuts[0].Flags.isByVal()) {
1287 // Indirect calls need strict ABI alignment so we disable optimizations by
1288 // not providing a function to optimize.
1289 Type *ETy = Args[I].IndirectType;
1290 Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1291 Align ParamByValAlign =
1292 getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);
1293
1294 O << ".param .align " << ParamByValAlign.value() << " .b8 _["
1295 << ArgOuts[0].Flags.getByValSize() << "]";
1296 } else {
1297 if (shouldPassAsArray(Ty)) {
1298 Align ParamAlign =
1299 getArgumentAlignment(&CB, Ty, I + AttributeList::FirstArgIndex, DL);
1300 O << ".param .align " << ParamAlign.value() << " .b8 _["
1301 << DL.getTypeAllocSize(Ty) << "]";
1302 continue;
1303 }
1304 // i8 types in IR will be i16 types in SDAG
1305 assert((getValueType(DL, Ty) == ArgOuts[0].VT ||
1306 (getValueType(DL, Ty) == MVT::i8 && ArgOuts[0].VT == MVT::i16)) &&
1307 "type mismatch between callee prototype and arguments");
1308 // scalar type
1309 unsigned sz = 0;
1310 if (auto *ITy = dyn_cast<IntegerType>(Ty)) {
1311 sz = promoteScalarArgumentSize(ITy->getBitWidth());
1312 } else if (isa<PointerType>(Ty)) {
1313 sz = PtrVT.getSizeInBits();
1314 } else {
1315 sz = Ty->getPrimitiveSizeInBits();
1316 }
1317 O << ".param .b" << sz << " _";
1318 }
1319 }
1320
1321 if (FirstVAArg)
1322 O << (first ? "" : ",") << " .param .align "
1323 << STI.getMaxRequiredAlignment() << " .b8 _[]";
1324 O << ")";
1325 if (shouldEmitPTXNoReturn(&CB, *nvTM))
1326 O << " .noreturn";
1327 O << ";";
1328
1329 return Prototype;
1330}
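// Example of the emitted prototype (illustrative; the exact text depends on
// the callee signature): for an indirect call returning i32 and taking an i32
// plus an 8-byte byval aggregate aligned to 4, the string is roughly
//
//   prototype_0 : .callprototype (.param .b32 _) _ (.param .b32 _,
//                                 .param .align 4 .b8 _[8]);
//
// with " .noreturn" appended when shouldEmitPTXNoReturn() holds.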
1331
1333 const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const {
1334 return getAlign(*F, Idx).value_or(getFunctionParamOptimizedAlign(F, Ty, DL));
1335}
1336
1337Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
1338 unsigned Idx,
1339 const DataLayout &DL) const {
1340 if (!CB) {
1341 // CallSite is null, fall back to the ABI type alignment.
1342 return DL.getABITypeAlign(Ty);
1343 }
1344
1345 const Function *DirectCallee = CB->getCalledFunction();
1346
1347 if (!DirectCallee) {
1348 // We don't have a direct function symbol, but that may be because of
1349 // constant cast instructions in the call.
1350
1351 // With bitcast'd call targets, the instruction will be the call
1352 if (const auto *CI = dyn_cast<CallInst>(CB)) {
1353 // Check if we have call alignment metadata
1354 if (MaybeAlign StackAlign = getAlign(*CI, Idx))
1355 return StackAlign.value();
1356 }
1357 DirectCallee = getMaybeBitcastedCallee(CB);
1358 }
1359
1360 // Check for function alignment information if we found that the
1361 // ultimate target is a Function
1362 if (DirectCallee)
1363 return getFunctionArgumentAlignment(DirectCallee, Ty, Idx, DL);
1364
1365 // Call is indirect, fall back to the ABI type alignment
1366 return DL.getABITypeAlign(Ty);
1367}
1368
1370 const GlobalAddressSDNode *Func) {
1371 if (!Func)
1372 return false;
1373 if (auto *CalleeFunc = dyn_cast<Function>(Func->getGlobal()))
1374 return CB->getFunctionType() != CalleeFunc->getFunctionType();
1375 return false;
1376}
1377
1379 const DataLayout &DL,
1380 const TargetLowering &TL) {
1381 if (Ptr->getOpcode() == ISD::FrameIndex) {
1382 auto Ty = TL.getPointerTy(DL, ADDRESS_SPACE_LOCAL);
1385
1387 }
1388
1389 // Peel off an addrspacecast to generic and load directly from the specific
1390 // address space.
1391 if (Ptr->getOpcode() == ISD::ADDRSPACECAST) {
1392 const auto *ASC = cast<AddrSpaceCastSDNode>(Ptr);
1393 if (ASC->getDestAddressSpace() == ADDRESS_SPACE_GENERIC) {
1394 Ptr = ASC->getOperand(0);
1395 return MachinePointerInfo(ASC->getSrcAddressSpace());
1396 }
1397 }
1398
1399 return MachinePointerInfo();
1400}
1401
1403 if (Flags.isSExt())
1404 return ISD::SIGN_EXTEND;
1405 if (Flags.isZExt())
1406 return ISD::ZERO_EXTEND;
1407 return ISD::ANY_EXTEND;
1408}
1409
1411 ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1412 SDLoc dl) {
1413 const EVT ActualVT = V.getValueType();
1414 assert((ActualVT == ExpectedVT ||
1415 (ExpectedVT.isInteger() && ActualVT.isInteger())) &&
1416 "Non-integer argument type size mismatch");
1417 if (ExpectedVT.bitsGT(ActualVT))
1418 return DAG.getNode(getExtOpcode(Flags), dl, ExpectedVT, V);
1419 if (ExpectedVT.bitsLT(ActualVT))
1420 return DAG.getNode(ISD::TRUNCATE, dl, ExpectedVT, V);
1421
1422 return V;
1423}
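// For illustration (descriptive only): a value narrower than the expected
// store type is widened with ISD::ZERO_EXTEND, ISD::SIGN_EXTEND, or
// ISD::ANY_EXTEND according to its zext/sext flags; a wider value is
// truncated; a value that already matches the expected type is returned
// untouched.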
1424
1426 SmallVectorImpl<SDValue> &InVals) const {
1427
1428 if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))
1430 "Support for variadic functions (unsized array parameter) introduced "
1431 "in PTX ISA version 6.0 and requires target sm_30.");
1432
1433 SelectionDAG &DAG = CLI.DAG;
1434 SDLoc dl = CLI.DL;
1435 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1436 SDValue Callee = CLI.Callee;
1437 ArgListTy &Args = CLI.getArgs();
1438 Type *RetTy = CLI.RetTy;
1439 const CallBase *CB = CLI.CB;
1440 const DataLayout &DL = DAG.getDataLayout();
1441 LLVMContext &Ctx = *DAG.getContext();
1442
1443 const auto GetI32 = [&](const unsigned I) {
1444 return DAG.getConstant(I, dl, MVT::i32);
1445 };
1446
1447 const unsigned UniqueCallSite = GlobalUniqueCallSite++;
1448 const SDValue CallChain = CLI.Chain;
1449 const SDValue StartChain =
1450 DAG.getCALLSEQ_START(CallChain, UniqueCallSite, 0, dl);
1451 SDValue DeclareGlue = StartChain.getValue(1);
1452
1453 SmallVector<SDValue, 16> CallPrereqs{StartChain};
1454
1455 const auto MakeDeclareScalarParam = [&](SDValue Symbol, unsigned Size) {
1456 // PTX ABI requires integral types to be at least 32 bits in size. FP16 is
1457 // loaded/stored using i16, so it's handled here as well.
1458 const unsigned SizeBits = promoteScalarArgumentSize(Size * 8);
1459 SDValue Declare =
1460 DAG.getNode(NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue},
1461 {StartChain, Symbol, GetI32(SizeBits), DeclareGlue});
1462 CallPrereqs.push_back(Declare);
1463 DeclareGlue = Declare.getValue(1);
1464 return Declare;
1465 };
1466
1467 const auto MakeDeclareArrayParam = [&](SDValue Symbol, Align Align,
1468 unsigned Size) {
1469 SDValue Declare = DAG.getNode(
1470 NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue},
1471 {StartChain, Symbol, GetI32(Align.value()), GetI32(Size), DeclareGlue});
1472 CallPrereqs.push_back(Declare);
1473 DeclareGlue = Declare.getValue(1);
1474 return Declare;
1475 };
1476
1477 // Variadic arguments.
1478 //
1479 // Normally, for each argument, we declare a param scalar or a param
1480 // byte array in the .param space, and store the argument value to that
1481 // param scalar or array starting at offset 0.
1482 //
1483 // In the case of the first variadic argument, we declare a vararg byte array
1484 // with size 0. The exact size of this array isn't known at this point, so
1485 // it'll be patched later. All the variadic arguments will be stored to this
1486 // array at a certain offset (which gets tracked by 'VAOffset'). The offset is
1487 // initially set to 0, so it can be used for non-variadic arguments (which use
1488 // 0 offset) to simplify the code.
1489 //
1490 // After all varargs are processed, 'VAOffset' holds the size of the
1491 // vararg byte array.
1492 assert((CLI.IsVarArg || CLI.Args.size() == CLI.NumFixedArgs) &&
1493 "Non-VarArg function with extra arguments");
1494
1495 const unsigned FirstVAArg = CLI.NumFixedArgs; // position of first variadic
1496 unsigned VAOffset = 0; // current offset in the param array
1497
1498 const SDValue VADeclareParam =
1499 CLI.Args.size() > FirstVAArg
1500 ? MakeDeclareArrayParam(getCallParamSymbol(DAG, FirstVAArg, MVT::i32),
1501 Align(STI.getMaxRequiredAlignment()), 0)
1502 : SDValue();
1503
1504 // Args.size() and Outs.size() need not match.
1505 // Outs.size() will be larger
1506 // * if there is an aggregate argument with multiple fields (each field
1507 // showing up separately in Outs)
1508 // * if there is a vector argument with more than typical vector-length
1509 // elements (generally if more than 4) where each vector element is
1510 // individually present in Outs.
1511 // So a different index should be used for indexing into Outs/OutVals.
1512 // See similar issue in LowerFormalArguments.
1513 auto AllOuts = ArrayRef(CLI.Outs);
1514 auto AllOutVals = ArrayRef(CLI.OutVals);
1515 assert(AllOuts.size() == AllOutVals.size() &&
1516 "Outs and OutVals must be the same size");
1517 // Declare the .params or .regs needed to pass values
1518 // to the function.
1519 for (const auto E : llvm::enumerate(Args)) {
1520 const auto ArgI = E.index();
1521 const auto Arg = E.value();
1522 const auto ArgOuts =
1523 AllOuts.take_while([&](auto O) { return O.OrigArgIndex == ArgI; });
1524 const auto ArgOutVals = AllOutVals.take_front(ArgOuts.size());
1525 AllOuts = AllOuts.drop_front(ArgOuts.size());
1526 AllOutVals = AllOutVals.drop_front(ArgOuts.size());
1527
1528 const bool IsVAArg = (ArgI >= FirstVAArg);
1529 const bool IsByVal = Arg.IsByVal;
1530
1531 const SDValue ParamSymbol =
1532 getCallParamSymbol(DAG, IsVAArg ? FirstVAArg : ArgI, MVT::i32);
1533
1534 assert((!IsByVal || Arg.IndirectType) &&
1535 "byval arg must have indirect type");
1536 Type *ETy = (IsByVal ? Arg.IndirectType : Arg.Ty);
1537
1538 const Align ArgAlign = [&]() {
1539 if (IsByVal) {
1540 // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1541 // so we don't need to worry whether it's naturally aligned or not.
1542 // See TargetLowering::LowerCallTo().
1543 const Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1545 InitialAlign, DL);
1546 }
1547 return getArgumentAlignment(CB, Arg.Ty, ArgI + 1, DL);
1548 }();
1549
1550 const unsigned TySize = DL.getTypeAllocSize(ETy);
1551 assert((!IsByVal || TySize == ArgOuts[0].Flags.getByValSize()) &&
1552 "type size mismatch");
1553
1554 const SDValue ArgDeclare = [&]() {
1555 if (IsVAArg)
1556 return VADeclareParam;
1557
1558 if (IsByVal || shouldPassAsArray(Arg.Ty))
1559 return MakeDeclareArrayParam(ParamSymbol, ArgAlign, TySize);
1560
1561 assert(ArgOuts.size() == 1 && "We must pass only one value as non-array");
1562 assert((ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) &&
1563 "Only int and float types are supported as non-array arguments");
1564
1565 return MakeDeclareScalarParam(ParamSymbol, TySize);
1566 }();
1567
1568 if (IsByVal) {
1569 assert(ArgOutVals.size() == 1 && "We must pass only one value as byval");
1570 SDValue SrcPtr = ArgOutVals[0];
1571 const auto PointerInfo = refinePtrAS(SrcPtr, DAG, DL, *this);
1572 const Align BaseSrcAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1573
1574 if (IsVAArg)
1575 VAOffset = alignTo(VAOffset, ArgAlign);
1576
1577 SmallVector<EVT, 4> ValueVTs, MemVTs;
1579 ComputeValueVTs(*this, DL, ETy, ValueVTs, &MemVTs, &Offsets);
1580
1581 unsigned J = 0;
1582 const auto VI = VectorizePTXValueVTs(MemVTs, Offsets, ArgAlign, IsVAArg);
1583 for (const unsigned NumElts : VI) {
1584 EVT LoadVT = getVectorizedVT(MemVTs[J], NumElts, Ctx);
1585 Align SrcAlign = commonAlignment(BaseSrcAlign, Offsets[J]);
1586 SDValue SrcAddr = DAG.getObjectPtrOffset(dl, SrcPtr, Offsets[J]);
1587 SDValue SrcLoad =
1588 DAG.getLoad(LoadVT, dl, CallChain, SrcAddr, PointerInfo, SrcAlign);
1589
1590 TypeSize ParamOffset = Offsets[J].getWithIncrement(VAOffset);
1591 Align ParamAlign = commonAlignment(ArgAlign, ParamOffset);
1592 SDValue ParamAddr =
1593 DAG.getObjectPtrOffset(dl, ParamSymbol, ParamOffset);
1594 SDValue StoreParam =
1595 DAG.getStore(ArgDeclare, dl, SrcLoad, ParamAddr,
1597 CallPrereqs.push_back(StoreParam);
1598
1599 J += NumElts;
1600 }
1601 if (IsVAArg)
1602 VAOffset += TySize;
1603 } else {
1606 ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, Arg.Ty, VTs, Offsets,
1607 VAOffset);
1608 assert(VTs.size() == Offsets.size() && "Size mismatch");
1609 assert(VTs.size() == ArgOuts.size() && "Size mismatch");
1610
1611 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1612 // than 32-bits are sign extended or zero extended, depending on
1613 // whether they are signed or unsigned types. This case applies
1614 // only to scalar parameters and not to aggregate values.
1615 const bool ExtendIntegerParam =
1616 Arg.Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Arg.Ty) < 32;
1617
1618 const auto GetStoredValue = [&](const unsigned I) {
1619 SDValue StVal = ArgOutVals[I];
1621 StVal.getValueType() &&
1622 "OutVal type should always be legal");
1623
1624 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
1625 const EVT StoreVT =
1626 ExtendIntegerParam ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
1627
1628 return correctParamType(StVal, StoreVT, ArgOuts[I].Flags, DAG, dl);
1629 };
1630
1631 unsigned J = 0;
1632 const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
1633 for (const unsigned NumElts : VI) {
1634 const EVT EltVT = promoteScalarIntegerPTX(VTs[J]);
1635
1636 unsigned Offset;
1637 if (IsVAArg) {
1638 // TODO: We may need to support vector types that can be passed
1639 // as scalars in variadic arguments.
1640 assert(NumElts == 1 &&
1641 "Vectorization should be disabled for vaargs.");
1642
1643 // Align each part of the variadic argument to their type.
1644 VAOffset = alignTo(VAOffset, DAG.getEVTAlign(EltVT));
1645 Offset = VAOffset;
1646
1647 const EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
1648 VAOffset += DL.getTypeAllocSize(TheStoreType.getTypeForEVT(Ctx));
1649 } else {
1650 assert(VAOffset == 0 && "VAOffset must be 0 for non-VA args");
1651 Offset = Offsets[J];
1652 }
1653
1654 SDValue Ptr =
1655 DAG.getObjectPtrOffset(dl, ParamSymbol, TypeSize::getFixed(Offset));
1656
1657 const MaybeAlign CurrentAlign = ExtendIntegerParam
1658 ? MaybeAlign(std::nullopt)
1659 : commonAlignment(ArgAlign, Offset);
1660
1661 SDValue Val =
1662 getBuildVectorizedValue(NumElts, dl, DAG, [&](unsigned K) {
1663 return GetStoredValue(J + K);
1664 });
1665
1666 SDValue StoreParam =
1667 DAG.getStore(ArgDeclare, dl, Val, Ptr,
1669 CallPrereqs.push_back(StoreParam);
1670
1671 J += NumElts;
1672 }
1673 }
1674 }
1675
1676 // Handle Result
1677 if (!Ins.empty()) {
1678 const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
1679 const unsigned ResultSize = DL.getTypeAllocSize(RetTy);
1680 if (shouldPassAsArray(RetTy)) {
1681 const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
1682 MakeDeclareArrayParam(RetSymbol, RetAlign, ResultSize);
1683 } else {
1684 MakeDeclareScalarParam(RetSymbol, ResultSize);
1685 }
1686 }
1687
1688 // Set the size of the vararg param byte array if the callee is a variadic
1689 // function and the variadic part is not empty.
1690 if (VADeclareParam) {
1691 SDValue DeclareParamOps[] = {VADeclareParam.getOperand(0),
1692 VADeclareParam.getOperand(1),
1693 VADeclareParam.getOperand(2), GetI32(VAOffset),
1694 VADeclareParam.getOperand(4)};
1695 DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
1696 VADeclareParam->getVTList(), DeclareParamOps);
1697 }
1698
1699 const auto *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1700 // If the type of the callsite does not match that of the function, convert
1701 // the callsite to an indirect call.
1702 const bool ConvertToIndirectCall = shouldConvertToIndirectCall(CB, Func);
1703
1704 // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1705 // between them we must rely on the call site value which is valid for
1706 // indirect calls but is always null for libcalls.
1707 const bool IsIndirectCall = (!Func && CB) || ConvertToIndirectCall;
1708
1709 if (isa<ExternalSymbolSDNode>(Callee)) {
1710 Function* CalleeFunc = nullptr;
1711
1712 // Try to find the callee in the current module.
1713 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1714 assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1715
1716 // Set the "libcall callee" attribute to indicate that the function
1717 // must always have a declaration.
1718 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1719 }
1720
1721 if (IsIndirectCall) {
1722 // This is the indirect function call case: PTX requires a prototype of the
1723 // form
1724 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1725 // to be emitted, and the label has to be used as the last arg of the call
1726 // instruction.
1727 // The prototype is embedded in a string and put as the operand for a
1728 // CallPrototype SDNode which will print out to the value of the string.
1729 const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
1730 std::string Proto =
1731 getPrototype(DL, RetTy, Args, CLI.Outs,
1732 HasVAArgs ? std::optional(FirstVAArg) : std::nullopt, *CB,
1733 UniqueCallSite);
1734 const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
1735 const SDValue PrototypeDeclare = DAG.getNode(
1736 NVPTXISD::CallPrototype, dl, MVT::Other,
1737 {StartChain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32)});
1738 CallPrereqs.push_back(PrototypeDeclare);
1739 }
1740
1741 const unsigned Proto = IsIndirectCall ? UniqueCallSite : 0;
1742 const unsigned NumArgs =
1743 std::min<unsigned>(CLI.NumFixedArgs + 1, Args.size());
1744 /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,
1745 /// NumParams, Callee, Proto)
1746 const SDValue CallToken = DAG.getTokenFactor(dl, CallPrereqs);
1747 const SDValue Call = DAG.getNode(
1748 NVPTXISD::CALL, dl, MVT::Other,
1749 {CallToken, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall),
1750 GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee, GetI32(Proto)});
1751
1752 SmallVector<SDValue, 16> LoadChains{Call};
1753 SmallVector<SDValue, 16> ProxyRegOps;
1754 if (!Ins.empty()) {
1757 ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, RetTy, VTs, Offsets);
1758 assert(VTs.size() == Ins.size() && "Bad value decomposition");
1759
1760 const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
1761 const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
1762
1763 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
1764 // 32-bits are sign extended or zero extended, depending on whether
1765 // they are signed or unsigned types.
1766 const bool ExtendIntegerRetVal =
1767 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
1768
1769 unsigned I = 0;
1770 const auto VI = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
1771 for (const unsigned NumElts : VI) {
1772 const MaybeAlign CurrentAlign =
1773 ExtendIntegerRetVal ? MaybeAlign(std::nullopt)
1774 : commonAlignment(RetAlign, Offsets[I]);
1775
1776 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
1777 const EVT LoadVT =
1778 ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
1779 const EVT VecVT = getVectorizedVT(LoadVT, NumElts, Ctx);
1780 SDValue Ptr =
1781 DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
1782
1783 SDValue R =
1784 DAG.getLoad(VecVT, dl, Call, Ptr,
1786
1787 LoadChains.push_back(R.getValue(1));
1788 for (const unsigned J : llvm::seq(NumElts))
1789 ProxyRegOps.push_back(getExtractVectorizedValue(R, J, LoadVT, dl, DAG));
1790 I += NumElts;
1791 }
1792 }
1793
1794 const SDValue EndToken = DAG.getTokenFactor(dl, LoadChains);
1795 const SDValue CallEnd = DAG.getCALLSEQ_END(EndToken, UniqueCallSite,
1796 UniqueCallSite + 1, SDValue(), dl);
1797
1798 // Append ProxyReg instructions to the chain to make sure that `callseq_end`
1799 // will not get lost. Otherwise, during libcall expansion, the nodes can become
1800 // dangling.
1801 for (const auto [I, Reg] : llvm::enumerate(ProxyRegOps)) {
1802 SDValue Proxy =
1803 DAG.getNode(NVPTXISD::ProxyReg, dl, Reg.getValueType(), {CallEnd, Reg});
1804 SDValue Ret = correctParamType(Proxy, Ins[I].VT, Ins[I].Flags, DAG, dl);
1805 InVals.push_back(Ret);
1806 }
1807
1808 // set IsTailCall to false for now, until we figure out how to express
1809 // tail call optimization in PTX
1810 CLI.IsTailCall = false;
1811 return CallEnd;
1812}
1813
1815 SelectionDAG &DAG) const {
1816
1817 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1818 const Function &Fn = DAG.getMachineFunction().getFunction();
1819
1821 Fn,
1822 "Support for dynamic alloca introduced in PTX ISA version 7.3 and "
1823 "requires target sm_52.",
1824 SDLoc(Op).getDebugLoc()));
1825 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()),
1826 Op.getOperand(0)};
1827 return DAG.getMergeValues(Ops, SDLoc());
1828 }
1829
1830 SDLoc DL(Op.getNode());
1831 SDValue Chain = Op.getOperand(0);
1832 SDValue Size = Op.getOperand(1);
1833 uint64_t Align = Op.getConstantOperandVal(2);
1834
1835 // The alignment on an ISD::DYNAMIC_STACKALLOC node may be 0 to indicate that
1836 // the default stack alignment should be used.
1837 if (Align == 0)
1839
1840 // The size operand of the PTX alloca instruction is 64-bit for m64 and 32-bit for m32.
1841 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1842
1843 SDValue Alloc =
1844 DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL, {LocalVT, MVT::Other},
1845 {Chain, DAG.getZExtOrTrunc(Size, DL, LocalVT),
1846 DAG.getTargetConstant(Align, DL, MVT::i32)});
1847
1848 SDValue ASC = DAG.getAddrSpaceCast(
1850
1851 return DAG.getMergeValues({ASC, SDValue(Alloc.getNode(), 1)}, DL);
1852}
1853
1855 SelectionDAG &DAG) const {
1856 SDLoc DL(Op.getNode());
1857 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1858 const Function &Fn = DAG.getMachineFunction().getFunction();
1859
1861 Fn,
1862 "Support for stackrestore requires PTX ISA version >= 7.3 and target "
1863 ">= sm_52.",
1864 DL.getDebugLoc()));
1865 return Op.getOperand(0);
1866 }
1867
1868 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1869 SDValue Chain = Op.getOperand(0);
1870 SDValue Ptr = Op.getOperand(1);
1873 return DAG.getNode(NVPTXISD::STACKRESTORE, DL, MVT::Other, {Chain, ASC});
1874}
1875
1877 SelectionDAG &DAG) const {
1878 SDLoc DL(Op.getNode());
1879 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1880 const Function &Fn = DAG.getMachineFunction().getFunction();
1881
1883 Fn,
1884 "Support for stacksave requires PTX ISA version >= 7.3 and target >= "
1885 "sm_52.",
1886 DL.getDebugLoc()));
1887 auto Ops = {DAG.getConstant(0, DL, Op.getValueType()), Op.getOperand(0)};
1888 return DAG.getMergeValues(Ops, DL);
1889 }
1890
1891 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1892 SDValue Chain = Op.getOperand(0);
1893 SDValue SS =
1894 DAG.getNode(NVPTXISD::STACKSAVE, DL, {LocalVT, MVT::Other}, Chain);
1895 SDValue ASC = DAG.getAddrSpaceCast(
1896 DL, Op.getValueType(), SS, ADDRESS_SPACE_LOCAL, ADDRESS_SPACE_GENERIC);
1897 return DAG.getMergeValues({ASC, SDValue(SS.getNode(), 1)}, DL);
1898}
1899
1900// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
1901// (see LegalizeDAG.cpp). This is slow and uses local memory.
1902 // We instead use extract/insert/build_vector, just as LegalizeOp() did in LLVM 2.5.
1903SDValue
1904NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
1905 SDNode *Node = Op.getNode();
1906 SDLoc dl(Node);
1908 unsigned NumOperands = Node->getNumOperands();
1909 for (unsigned i = 0; i < NumOperands; ++i) {
1910 SDValue SubOp = Node->getOperand(i);
1911 EVT VVT = SubOp.getNode()->getValueType(0);
1912 EVT EltVT = VVT.getVectorElementType();
1913 unsigned NumSubElem = VVT.getVectorNumElements();
1914 for (unsigned j = 0; j < NumSubElem; ++j) {
1915 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
1916 DAG.getIntPtrConstant(j, dl)));
1917 }
1918 }
1919 return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
1920}
1921
1923 SelectionDAG &DAG,
1924 unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
1925 assert(A.getValueType() == MVT::i32 && B.getValueType() == MVT::i32 &&
1926 Selector.getValueType() == MVT::i32 && "PRMT must have i32 operands");
1927 return DAG.getNode(NVPTXISD::PRMT, DL, MVT::i32,
1928 {A, B, Selector, DAG.getConstant(Mode, DL, MVT::i32)});
1929}
1930
1932 SelectionDAG &DAG,
1933 unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
1934 return getPRMT(A, B, DAG.getConstant(Selector, DL, MVT::i32), DL, DAG, Mode);
1935}
1936
1937/// Reduces the elements using the scalar operations provided. The operations
1938 /// are sorted in descending order by the number of inputs they take. The flags on
1939 /// the original reduction operation will be propagated to each scalar operation.
1940 /// Nearby elements are grouped into a tree reduction, unlike the shuffle reduction
1941/// used in ExpandReductions and SelectionDAG.
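///
/// For illustration (hypothetical opcode names), reducing seven elements with
/// Ops = {(fmax3, 3), (fmax, 2)} proceeds roughly as:
///   level 0: e0..e6 -> fmax3(e0,e1,e2), fmax3(e3,e4,e5), e6 (leftover)
///   level 1: r0, r1, e6 -> fmax3(r0, r1, e6)
/// leaving a single value, which is returned.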
1943 const SmallVector<SDValue> &Elements, EVT EltTy,
1944 ArrayRef<std::pair<unsigned /*NodeType*/, unsigned /*NumInputs*/>> Ops,
1945 const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG) {
1946 // Build the reduction tree at each level, starting with all the elements.
1947 SmallVector<SDValue> Level = Elements;
1948
1949 unsigned OpIdx = 0;
1950 while (Level.size() > 1) {
1951 // Try to reduce this level using the current operator.
1952 const auto [Op, NumInputs] = Ops[OpIdx];
1953
1954 // Build the next level by partially reducing all elements.
1955 SmallVector<SDValue> ReducedLevel;
1956 unsigned I = 0, E = Level.size();
1957 for (; I + NumInputs <= E; I += NumInputs) {
1958 // Reduce elements in groups of [NumInputs], as much as possible.
1959 ReducedLevel.push_back(DAG.getNode(
1960 Op, DL, EltTy, ArrayRef<SDValue>(Level).slice(I, NumInputs), Flags));
1961 }
1962
1963 if (I < E) {
1964 // Handle leftover elements.
1965
1966 if (ReducedLevel.empty()) {
1967 // We didn't reduce anything at this level. We need to pick a smaller
1968 // operator.
1969 ++OpIdx;
1970 assert(OpIdx < Ops.size() && "no smaller operators for reduction");
1971 continue;
1972 }
1973
1974 // We reduced some things but there's still more left, meaning the
1975 // operator's number of inputs doesn't evenly divide this level's size. Move
1976 // these elements to the next level.
1977 for (; I < E; ++I)
1978 ReducedLevel.push_back(Level[I]);
1979 }
1980
1981 // Process the next level.
1982 Level = ReducedLevel;
1983 }
1984
1985 return *Level.begin();
1986}
1987
1988// Get scalar reduction opcode
1989static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode) {
1990 switch (ReductionOpcode) {
1991 case ISD::VECREDUCE_FMAX:
1992 return ISD::FMAXNUM;
1993 case ISD::VECREDUCE_FMIN:
1994 return ISD::FMINNUM;
1995 case ISD::VECREDUCE_FMAXIMUM:
1996 return ISD::FMAXIMUM;
1997 case ISD::VECREDUCE_FMINIMUM:
1998 return ISD::FMINIMUM;
1999 default:
2000 llvm_unreachable("unhandled reduction opcode");
2001 }
2002}
2003
2004/// Get 3-input scalar reduction opcode
2005static std::optional<NVPTXISD::NodeType>
2006getScalar3OpcodeForReduction(unsigned ReductionOpcode) {
2007 switch (ReductionOpcode) {
2008 case ISD::VECREDUCE_FMAX:
2009 return NVPTXISD::FMAXNUM3;
2010 case ISD::VECREDUCE_FMIN:
2011 return NVPTXISD::FMINNUM3;
2012 case ISD::VECREDUCE_FMAXIMUM:
2013 return NVPTXISD::FMAXIMUM3;
2014 case ISD::VECREDUCE_FMINIMUM:
2015 return NVPTXISD::FMINIMUM3;
2016 default:
2017 return std::nullopt;
2018 }
2019}
2020
2021/// Lower reductions to either a sequence of operations or a tree if
2022/// reassociations are allowed. This method will use larger operations like
2023/// max3/min3 when the target supports them.
2024SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
2025 SelectionDAG &DAG) const {
2026 SDLoc DL(Op);
2027 const SDNodeFlags Flags = Op->getFlags();
2028 SDValue Vector = Op.getOperand(0);
2029
2030 const unsigned Opcode = Op->getOpcode();
2031 const EVT EltTy = Vector.getValueType().getVectorElementType();
2032
2033 // Whether we can use 3-input min/max when expanding the reduction.
2034 const bool CanUseMinMax3 =
2035 EltTy == MVT::f32 && STI.getSmVersion() >= 100 &&
2036 STI.getPTXVersion() >= 88 &&
2037 (Opcode == ISD::VECREDUCE_FMAX || Opcode == ISD::VECREDUCE_FMIN ||
2038 Opcode == ISD::VECREDUCE_FMAXIMUM || Opcode == ISD::VECREDUCE_FMINIMUM);
2039
2040 // A list of SDNode opcodes with equivalent semantics, sorted descending by
2041 // number of inputs they take.
2042 SmallVector<std::pair<unsigned /*Op*/, unsigned /*NumIn*/>, 2> ScalarOps;
2043
2044 if (auto Opcode3Elem = getScalar3OpcodeForReduction(Opcode);
2045 CanUseMinMax3 && Opcode3Elem)
2046 ScalarOps.push_back({*Opcode3Elem, 3});
2047 ScalarOps.push_back({getScalarOpcodeForReduction(Opcode), 2});
2048
2050 DAG.ExtractVectorElements(Vector, Elements);
2051
2052 return buildTreeReduction(Elements, EltTy, ScalarOps, DL, Flags, DAG);
2053}
2054
2055SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
2056 // Handle bitcasting from v2i8 without hitting the default promotion
2057 // strategy which goes through stack memory.
2058 EVT FromVT = Op->getOperand(0)->getValueType(0);
2059 if (FromVT != MVT::v2i8) {
2060 return Op;
2061 }
2062
2063 // Pack vector elements into i16 and bitcast to final type
2064 SDLoc DL(Op);
2065 SDValue Vec0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
2066 Op->getOperand(0), DAG.getIntPtrConstant(0, DL));
2067 SDValue Vec1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
2068 Op->getOperand(0), DAG.getIntPtrConstant(1, DL));
2069 SDValue Extend0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec0);
2070 SDValue Extend1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec1);
2071 SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
2072 SDValue AsInt = DAG.getNode(
2073 ISD::OR, DL, MVT::i16,
2074 {Extend0, DAG.getNode(ISD::SHL, DL, MVT::i16, {Extend1, Const8})});
2075 EVT ToVT = Op->getValueType(0);
2076 return DAG.getBitcast(ToVT, AsInt);
2077}
2078
2079// We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it
2080// would get lowered as two constant loads and vector-packing move.
2081// Instead we want just a constant move:
2082// mov.b32 %r2, 0x40003C00
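// For example, a v2f16 constant <1.0, 2.0> packs element 0 (1.0 == 0x3C00) into
// the low 16 bits and element 1 (2.0 == 0x4000) into the high 16 bits, yielding
// the 0x40003C00 immediate shown above.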
2083SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
2084 SelectionDAG &DAG) const {
2085 EVT VT = Op->getValueType(0);
2086 if (!(NVPTX::isPackedVectorTy(VT) && VT.is32BitVector()))
2087 return Op;
2088 SDLoc DL(Op);
2089
2090 if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
2091 return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
2092 isa<ConstantFPSDNode>(Operand);
2093 })) {
2094 if (VT != MVT::v4i8)
2095 return Op;
2096 // Lower a non-const v4i8 vector as a byte-wise constructed i32, which allows
2097 // us to optimize the calculation of its constant parts.
2098 auto GetPRMT = [&](const SDValue Left, const SDValue Right, bool Cast,
2099 uint64_t SelectionValue) -> SDValue {
2100 SDValue L = Left;
2101 SDValue R = Right;
2102 if (Cast) {
2103 L = DAG.getAnyExtOrTrunc(L, DL, MVT::i32);
2104 R = DAG.getAnyExtOrTrunc(R, DL, MVT::i32);
2105 }
2106 return getPRMT(L, R, SelectionValue, DL, DAG);
2107 };
2108 auto PRMT__10 = GetPRMT(Op->getOperand(0), Op->getOperand(1), true, 0x3340);
2109 auto PRMT__32 = GetPRMT(Op->getOperand(2), Op->getOperand(3), true, 0x3340);
2110 auto PRMT3210 = GetPRMT(PRMT__10, PRMT__32, false, 0x5410);
2111 return DAG.getBitcast(VT, PRMT3210);
2112 }
2113
2114 // Get the value of the Nth operand as an APInt(32). Undef values are treated as 0.
2115 auto GetOperand = [](SDValue Op, int N) -> APInt {
2116 const SDValue &Operand = Op->getOperand(N);
2117 EVT VT = Op->getValueType(0);
2118 if (Operand->isUndef())
2119 return APInt(32, 0);
2120 APInt Value;
2121 if (VT == MVT::v2f16 || VT == MVT::v2bf16)
2122 Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
2123 else if (VT == MVT::v2i16 || VT == MVT::v4i8)
2124 Value = Operand->getAsAPIntVal();
2125 else
2126 llvm_unreachable("Unsupported type");
2127 // i8 values are carried around as i16, so we need to zero out the upper bits
2128 // so that they do not get in the way of combining individual byte values.
2129 if (VT == MVT::v4i8)
2130 Value = Value.trunc(8);
2131 return Value.zext(32);
2132 };
2133
2134 // Construct a 32-bit constant by shifting into place smaller values
2135 // (elements of the vector type VT).
2136 // For example, if VT has 2 elements, then N == 2:
2137 // ShiftAmount = 32 / N = 16
2138 // Value |= Op0 (b16) << 0
2139 // Value |= Op1 (b16) << 16
2140 // If N == 4:
2141 // ShiftAmount = 32 / N = 8
2142 // Value |= Op0 (b8) << 0
2143 // Value |= Op1 (b8) << 8
2144 // Value |= Op2 (b8) << 16
2145 // Value |= Op3 (b8) << 24
2146 // ...etc
2147 APInt Value(32, 0);
2148 const unsigned NumElements = VT.getVectorNumElements();
2149 assert(32 % NumElements == 0 && "must evenly divide bit length");
2150 const unsigned ShiftAmount = 32 / NumElements;
2151 for (unsigned ElementNo : seq(NumElements))
2152 Value |= GetOperand(Op, ElementNo).shl(ElementNo * ShiftAmount);
2153 SDValue Const = DAG.getConstant(Value, DL, MVT::i32);
2154 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), Const);
2155}
2156
2157SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
2158 SelectionDAG &DAG) const {
2159 SDValue Index = Op->getOperand(1);
2160 SDValue Vector = Op->getOperand(0);
2161 SDLoc DL(Op);
2162 EVT VectorVT = Vector.getValueType();
2163
2164 if (VectorVT == MVT::v4i8) {
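// A single PRMT suffices here: the selector's low nibble picks byte `Index`
// of the vector (operand A), and the remaining 0x777 nibbles pick byte 7,
// which comes from the constant-zero operand B, so the result is the selected
// byte zero-extended into an i32.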
2165 SDValue Selector = DAG.getNode(ISD::OR, DL, MVT::i32,
2166 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2167 DAG.getConstant(0x7770, DL, MVT::i32));
2168 SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, Vector),
2169 DAG.getConstant(0, DL, MVT::i32), Selector, DL, DAG);
2170 SDValue Ext = DAG.getAnyExtOrTrunc(PRMT, DL, Op->getValueType(0));
2171 SDNodeFlags Flags;
2172 Flags.setNoSignedWrap(Ext.getScalarValueSizeInBits() > 8);
2173 Flags.setNoUnsignedWrap(Ext.getScalarValueSizeInBits() >= 8);
2174 Ext->setFlags(Flags);
2175 return Ext;
2176 }
2177
2178 // Constant index will be matched by tablegen.
2179 if (isa<ConstantSDNode>(Index.getNode()))
2180 return Op;
2181
2182 // Extract individual elements and select one of them.
2183 assert(NVPTX::isPackedVectorTy(VectorVT) &&
2184 VectorVT.getVectorNumElements() == 2 && "Unexpected vector type.");
2185 EVT EltVT = VectorVT.getVectorElementType();
2186
2187 SDLoc dl(Op.getNode());
2188 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2189 DAG.getIntPtrConstant(0, dl));
2190 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2191 DAG.getIntPtrConstant(1, dl));
2192 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
2194}
2195
2196SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
2197 SelectionDAG &DAG) const {
2198 SDValue Vector = Op->getOperand(0);
2199 EVT VectorVT = Vector.getValueType();
2200
2201 if (VectorVT != MVT::v4i8)
2202 return Op;
2203 SDLoc DL(Op);
2204 SDValue Value = Op->getOperand(1);
2205 if (Value->isUndef())
2206 return Vector;
2207
2208 SDValue Index = Op->getOperand(2);
2209
2210 SDValue BFI =
2211 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2212 {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector,
2213 DAG.getNode(ISD::MUL, DL, MVT::i32,
2214 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2215 DAG.getConstant(8, DL, MVT::i32)),
2216 DAG.getConstant(8, DL, MVT::i32)});
2217 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI);
2218}
2219
2220SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
2221 SelectionDAG &DAG) const {
2222 SDValue V1 = Op.getOperand(0);
2223 EVT VectorVT = V1.getValueType();
2224 if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8)
2225 return Op;
2226
2227 // Lower shuffle to PRMT instruction.
2228 const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
2229 SDValue V2 = Op.getOperand(1);
2230 uint32_t Selector = 0;
2231 for (auto I : llvm::enumerate(SVN->getMask())) {
2232 if (I.value() != -1) // -1 is a placeholder for undef.
2233 Selector |= (I.value() << (I.index() * 4));
2234 }
2235
2236 SDLoc DL(Op);
2237 SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, V1),
2238 DAG.getBitcast(MVT::i32, V2), Selector, DL, DAG);
2239 return DAG.getBitcast(Op.getValueType(), PRMT);
2240}
2241/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
2242 /// 1) return two i32 values and take a 2 x i32 value to shift plus a shift
2243 /// amount, or
2244 /// 2) return two i64 values and take a 2 x i64 value to shift plus a shift
2245/// amount.
2246SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
2247 SelectionDAG &DAG) const {
2248 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2249 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
2250
2251 EVT VT = Op.getValueType();
2252 unsigned VTBits = VT.getSizeInBits();
2253 SDLoc dl(Op);
2254 SDValue ShOpLo = Op.getOperand(0);
2255 SDValue ShOpHi = Op.getOperand(1);
2256 SDValue ShAmt = Op.getOperand(2);
2257 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
2258
2259 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2260 // For 32-bit shifts on sm_35 and newer, we can use the funnel shift 'shf' instruction.
2261 // {dHi, dLo} = {aHi, aLo} >> Amt
2262 // dHi = aHi >> Amt
2263 // dLo = shf.r.clamp aLo, aHi, Amt
2264
2265 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2266 SDValue Lo =
2267 DAG.getNode(NVPTXISD::FSHR_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
2268
2269 SDValue Ops[2] = { Lo, Hi };
2270 return DAG.getMergeValues(Ops, dl);
2271 }
2272 else {
2273 // {dHi, dLo} = {aHi, aLo} >> Amt
2274 // - if (Amt>=size) then
2275 // dLo = aHi >> (Amt-size)
2276 // dHi = aHi >> Amt (this is either all 0 or all 1)
2277 // else
2278 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
2279 // dHi = aHi >> Amt
2280
2281 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2282 DAG.getConstant(VTBits, dl, MVT::i32),
2283 ShAmt);
2284 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
2285 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2286 DAG.getConstant(VTBits, dl, MVT::i32));
2287 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
2288 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2289 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2290
2291 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2292 DAG.getConstant(VTBits, dl, MVT::i32),
2293 ISD::SETGE);
2294 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2295 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2296
2297 SDValue Ops[2] = { Lo, Hi };
2298 return DAG.getMergeValues(Ops, dl);
2299 }
2300}
2301
2302/// LowerShiftLeftParts - Lower SHL_PARTS, which
2303 /// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
2304 /// amount, or
2305 /// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
2306/// amount.
2307SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2308 SelectionDAG &DAG) const {
2309 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2310 assert(Op.getOpcode() == ISD::SHL_PARTS);
2311
2312 EVT VT = Op.getValueType();
2313 unsigned VTBits = VT.getSizeInBits();
2314 SDLoc dl(Op);
2315 SDValue ShOpLo = Op.getOperand(0);
2316 SDValue ShOpHi = Op.getOperand(1);
2317 SDValue ShAmt = Op.getOperand(2);
2318
2319 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2320 // For 32-bit shifts on sm_35 and newer, we can use the funnel shift 'shf' instruction.
2321 // {dHi, dLo} = {aHi, aLo} << Amt
2322 // dHi = shf.l.clamp aLo, aHi, Amt
2323 // dLo = aLo << Amt
2324
2325 SDValue Hi =
2326 DAG.getNode(NVPTXISD::FSHL_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
2327 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2328
2329 SDValue Ops[2] = { Lo, Hi };
2330 return DAG.getMergeValues(Ops, dl);
2331 }
2332 else {
2333 // {dHi, dLo} = {aHi, aLo} << Amt
2334 // - if (Amt>=size) then
2335 // dLo = aLo << Amt (all 0)
2336 //      dHi = aLo << (Amt-size)
2337 // else
2338 // dLo = aLo << Amt
2339 // dHi = (aHi << Amt) | (aLo >> (size-Amt))
2340
2341 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2342 DAG.getConstant(VTBits, dl, MVT::i32),
2343 ShAmt);
2344 SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2345 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2346 DAG.getConstant(VTBits, dl, MVT::i32));
2347 SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2348 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2349 SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2350
2351 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2352 DAG.getConstant(VTBits, dl, MVT::i32),
2353 ISD::SETGE);
2354 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2355 SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2356
2357 SDValue Ops[2] = { Lo, Hi };
2358 return DAG.getMergeValues(Ops, dl);
2359 }
2360}
2361
2362/// If the types match, convert the generic copysign to the NVPTXISD version,
2363 /// otherwise bail, ensuring that mismatched cases are properly expanded.
2364SDValue NVPTXTargetLowering::LowerFCOPYSIGN(SDValue Op,
2365 SelectionDAG &DAG) const {
2366 EVT VT = Op.getValueType();
2367 SDLoc DL(Op);
2368
2369 SDValue In1 = Op.getOperand(0);
2370 SDValue In2 = Op.getOperand(1);
2371 EVT SrcVT = In2.getValueType();
2372
2373 if (!SrcVT.bitsEq(VT))
2374 return SDValue();
2375
2376 return DAG.getNode(NVPTXISD::FCOPYSIGN, DL, VT, In1, In2);
2377}
2378
2379SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2380 EVT VT = Op.getValueType();
2381
2382 if (VT == MVT::f32)
2383 return LowerFROUND32(Op, DAG);
2384
2385 if (VT == MVT::f64)
2386 return LowerFROUND64(Op, DAG);
2387
2388 llvm_unreachable("unhandled type");
2389}
2390
2391 // This is the rounding method used in CUDA libdevice, expressed in C-like code:
2392// float roundf(float A)
2393// {
2394// float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2395// RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2396// return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2397// }
2398SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2399 SelectionDAG &DAG) const {
2400 SDLoc SL(Op);
2401 SDValue A = Op.getOperand(0);
2402 EVT VT = Op.getValueType();
2403
2404 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2405
2406 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2407 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2408 const unsigned SignBitMask = 0x80000000;
2409 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2410 DAG.getConstant(SignBitMask, SL, MVT::i32));
2411 const unsigned PointFiveInBits = 0x3F000000;
2412 SDValue PointFiveWithSignRaw =
2413 DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2414 DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2415 SDValue PointFiveWithSign =
2416 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2417 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2418 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2419
2420 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2421 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2422 SDValue IsLarge =
2423 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2424 ISD::SETOGT);
2425 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2426
2427 // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2428 SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
2429 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2430 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2431 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2432}
2433
2434// The implementation of round(double) is similar to that of round(float) in
2435// that they both separate the value range into three regions and use a method
2436// specific to the region to round the values. However, round(double) first
2437// calculates the round of the absolute value and then adds the sign back while
2438// round(float) directly rounds the value with sign.
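// Roughly, in C-like code:
//   double round(double A) {
//     double RoundedA = trunc(fabs(A) + 0.5);
//     RoundedA = fabs(A) < 0.5 ? 0.0 : RoundedA;
//     RoundedA = copysign(RoundedA, A);
//     return fabs(A) > 0x1.0p52 ? A : RoundedA;
//   }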
2439SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2440 SelectionDAG &DAG) const {
2441 SDLoc SL(Op);
2442 SDValue A = Op.getOperand(0);
2443 EVT VT = Op.getValueType();
2444
2445 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2446
2447 // double RoundedA = (double) (int) (abs(A) + 0.5f);
2448 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2449 DAG.getConstantFP(0.5, SL, VT));
2450 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2451
2452 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2453 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2454 SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
2455 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2456 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2457 DAG.getConstantFP(0, SL, VT),
2458 RoundedA);
2459
2460 // Add the sign back to RoundedA.
2461 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2462 DAG.getNode(ISD::FTRUNC, SL, VT, A);
2463
2464 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2465 SDValue IsLarge =
2466 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2467 ISD::SETOGT);
2468 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2469}
2470
2472 EVT VT = N->getValueType(0);
2473 EVT NVT = MVT::f32;
2474 if (VT.isVector()) {
2475 NVT = EVT::getVectorVT(*DAG.getContext(), NVT, VT.getVectorElementCount());
2476 }
2477 SDLoc DL(N);
2478 SDValue Tmp0 = DAG.getFPExtendOrRound(N->getOperand(0), DL, NVT);
2479 SDValue Tmp1 = DAG.getFPExtendOrRound(N->getOperand(1), DL, NVT);
2480 SDValue Res = DAG.getNode(N->getOpcode(), DL, NVT, Tmp0, Tmp1, N->getFlags());
2481 return DAG.getFPExtendOrRound(Res, DL, VT);
2482}
2483
2484SDValue NVPTXTargetLowering::PromoteBinOpIfF32FTZ(SDValue Op,
2485 SelectionDAG &DAG) const {
2486 if (useF32FTZ(DAG.getMachineFunction())) {
2487 return PromoteBinOpToF32(Op.getNode(), DAG);
2488 }
2489 return Op;
2490}
2491
2492SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op,
2493 SelectionDAG &DAG) const {
2494 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2495
2496 if (Op.getValueType() == MVT::bf16) {
2497 SDLoc Loc(Op);
2498 return DAG.getNode(
2499 ISD::FP_ROUND, Loc, MVT::bf16,
2500 DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)),
2501 DAG.getIntPtrConstant(0, Loc, /*isTarget=*/true));
2502 }
2503
2504 // Everything else is considered legal.
2505 return Op;
2506}
2507
2508SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op,
2509 SelectionDAG &DAG) const {
2510 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2511
2512 if (Op.getOperand(0).getValueType() == MVT::bf16) {
2513 SDLoc Loc(Op);
2514 return DAG.getNode(
2515 Op.getOpcode(), Loc, Op.getValueType(),
2516 DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0)));
2517 }
2518
2519 // Everything else is considered legal.
2520 return Op;
2521}
2522
2523SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValue Op,
2524 SelectionDAG &DAG) const {
2525 EVT NarrowVT = Op.getValueType();
2526 SDValue Wide = Op.getOperand(0);
2527 EVT WideVT = Wide.getValueType();
2528 if (NarrowVT.getScalarType() == MVT::bf16) {
2529 const TargetLowering *TLI = STI.getTargetLowering();
2530 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 70) {
2531 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2532 }
2533 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
2534 // This combination was the first to support f32 -> bf16.
2535 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70) {
2536 if (WideVT.getScalarType() == MVT::f32) {
2537 return Op;
2538 }
2539 if (WideVT.getScalarType() == MVT::f64) {
2540 SDLoc Loc(Op);
2541 // Round-inexact-to-odd f64 to f32, then do the final rounding using
2542 // the hardware f32 -> bf16 instruction.
2544 WideVT.isVector() ? WideVT.changeVectorElementType(MVT::f32)
2545 : MVT::f32,
2546 Wide, Loc, DAG);
2547 return DAG.getFPExtendOrRound(rod, Loc, NarrowVT);
2548 }
2549 }
2550 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2551 }
2552 }
2553
2554 // Everything else is considered legal.
2555 return Op;
2556}
2557
2558SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op,
2559 SelectionDAG &DAG) const {
2560 SDValue Narrow = Op.getOperand(0);
2561 EVT NarrowVT = Narrow.getValueType();
2562 EVT WideVT = Op.getValueType();
2563 if (NarrowVT.getScalarType() == MVT::bf16) {
2564 if (WideVT.getScalarType() == MVT::f32 &&
2565 (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71)) {
2566 SDLoc Loc(Op);
2567 return DAG.getNode(ISD::BF16_TO_FP, Loc, WideVT, Narrow);
2568 }
2569 if (WideVT.getScalarType() == MVT::f64 &&
2570 (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {
2571 EVT F32 = NarrowVT.isVector() ? NarrowVT.changeVectorElementType(MVT::f32)
2572 : MVT::f32;
2573 SDLoc Loc(Op);
2574 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) {
2575 Op = DAG.getNode(ISD::FP_EXTEND, Loc, F32, Narrow);
2576 } else {
2577 Op = DAG.getNode(ISD::BF16_TO_FP, Loc, F32, Narrow);
2578 }
2579 return DAG.getNode(ISD::FP_EXTEND, Loc, WideVT, Op);
2580 }
2581 }
2582
2583 // Everything else is considered legal.
2584 return Op;
2585}
2586
2588 SDLoc DL(Op);
2589 if (Op.getValueType() != MVT::v2i16)
2590 return Op;
2591 EVT EltVT = Op.getValueType().getVectorElementType();
2592 SmallVector<SDValue> VecElements;
2593 for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) {
2594 SmallVector<SDValue> ScalarArgs;
2595 llvm::transform(Op->ops(), std::back_inserter(ScalarArgs),
2596 [&](const SDUse &O) {
2597 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
2598 O.get(), DAG.getIntPtrConstant(I, DL));
2599 });
2600 VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs));
2601 }
2602 SDValue V =
2603 DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements);
2604 return V;
2605}
2606
2608 SDNode *N = Op.getNode();
2609 SDLoc DL(N);
2611
2612 // split the vector argument
2613 for (size_t I = 0; I < N->getNumOperands(); I++) {
2614 SDValue Val = N->getOperand(I);
2615 EVT ValVT = Val.getValueType();
2616 if (ValVT.isVector()) {
2617 EVT EltVT = ValVT.getVectorElementType();
2618 for (unsigned J = 0, NElts = ValVT.getVectorNumElements(); J < NElts; J++)
2619 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2620 DAG.getIntPtrConstant(J, DL)));
2621 } else
2622 Ops.push_back(Val);
2623 }
2624
2626 SDValue Tcgen05StNode =
2627 DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, N->getVTList(), Ops,
2628 MemSD->getMemoryVT(), MemSD->getMemOperand());
2629
2630 return Tcgen05StNode;
2631}
2632
2633static unsigned getTcgen05MMADisableOutputLane(unsigned IID) {
2634 switch (IID) {
2635 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
2637 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
2639 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
2641 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:
2643 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:
2645 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:
2647 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:
2649 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:
2651 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:
2653 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:
2655 case Intrinsic::
2656 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:
2658 case Intrinsic::
2659 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:
2661 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:
2663 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:
2665 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:
2667 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:
2669 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:
2671 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:
2673 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:
2675 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:
2677 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:
2679 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:
2681 case Intrinsic::
2682 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift:
2683 return NVPTXISD::
2684 TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
2685 case Intrinsic::
2686 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift:
2687 return NVPTXISD::
2688 TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
2689 };
2690 llvm_unreachable("unhandled tcgen05.mma.disable_output_lane intrinsic");
2691}
2692
2694 SDNode *N = Op.getNode();
2695 SDLoc DL(N);
2696 unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2697
2699 // split the vector argument
2700 for (size_t I = 0; I < N->getNumOperands(); I++) {
2701 if (I == 1)
2702 continue; // skip IID
2703 SDValue Val = N->getOperand(I);
2704 EVT ValVT = Val.getValueType();
2705 if (ValVT.isVector()) {
2706 EVT EltVT = ValVT.getVectorElementType();
2707 for (unsigned J = 0, NElts = ValVT.getVectorNumElements(); J < NElts; J++)
2708 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2709 DAG.getIntPtrConstant(J, DL)));
2710 } else
2711 Ops.push_back(Val);
2712 }
2713
2715 SDValue Tcgen05MMANode = DAG.getMemIntrinsicNode(
2716 getTcgen05MMADisableOutputLane(IID), DL, N->getVTList(), Ops,
2717 MemSD->getMemoryVT(), MemSD->getMemOperand());
2718
2719 return Tcgen05MMANode;
2720}
2721
2723 SDNode *N = Op.getNode();
2724 SDValue Intrin = N->getOperand(1);
2725
2726 // Get the intrinsic ID
2727 unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
2728 switch (IntrinNo) {
2729 default:
2730 break;
2731 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
2732 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
2733 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
2734 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
2735 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
2736 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
2737 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
2738 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
2739 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
2740 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
2741 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
2742 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
2743 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
2744 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
2745 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
2746 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
2747 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
2748 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
2749 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
2750 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
2751 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
2752 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
2753 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
2754 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
2755 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
2756 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
2757 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
2758 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128:
2759 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
2760 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
2761 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
2762 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
2763 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
2764 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
2765 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
2766 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
2767 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
2768 return LowerTcgen05St(Op, DAG);
2769 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
2770 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
2771 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
2772 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:
2773 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:
2774 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:
2775 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:
2776 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:
2777 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:
2778 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:
2779 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:
2780 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:
2781 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:
2782 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:
2783 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:
2784 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:
2785 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:
2786 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:
2787 case Intrinsic::
2788 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:
2789 case Intrinsic::
2790 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:
2791 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:
2792 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:
2793 case Intrinsic::
2794 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift:
2795 case Intrinsic::
2796 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift:
2798 }
2799 return Op;
2800}
2801
2803 SelectionDAG &DAG) {
2804
2805 SDNode *N = Op.getNode();
2806 if (N->getOperand(1).getValueType() != MVT::i128) {
2807 // Return if the operand has already been lowered.
2808 return SDValue();
2809 }
2810
2811 unsigned IID =
2812 cast<ConstantSDNode>(N->getOperand(0).getNode())->getZExtValue();
2813 auto Opcode = [&]() {
2814 switch (IID) {
2815 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled:
2817 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x:
2819 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y:
2821 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z:
2823 default:
2824 llvm_unreachable("unsupported/unhandled intrinsic");
2825 }
2826 }();
2827
2828 SDLoc DL(N);
2829 SDValue TryCancelResponse = N->getOperand(1);
2830 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TryCancelResponse);
2831 SDValue TryCancelResponse0 =
2832 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
2833 DAG.getIntPtrConstant(0, DL));
2834 SDValue TryCancelResponse1 =
2835 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
2836 DAG.getIntPtrConstant(1, DL));
2837
2838 return DAG.getNode(Opcode, DL, N->getVTList(),
2839 {TryCancelResponse0, TryCancelResponse1});
2840}
2841
2843 const unsigned Mode = [&]() {
2844 switch (Op->getConstantOperandVal(0)) {
2845 case Intrinsic::nvvm_prmt:
2847 case Intrinsic::nvvm_prmt_b4e:
2849 case Intrinsic::nvvm_prmt_ecl:
2851 case Intrinsic::nvvm_prmt_ecr:
2853 case Intrinsic::nvvm_prmt_f4e:
2855 case Intrinsic::nvvm_prmt_rc16:
2857 case Intrinsic::nvvm_prmt_rc8:
2859 default:
2860 llvm_unreachable("unsupported/unhandled intrinsic");
2861 }
2862 }();
2863 SDLoc DL(Op);
2864 SDValue A = Op->getOperand(1);
2865 SDValue B = Op.getNumOperands() == 4 ? Op.getOperand(2)
2866 : DAG.getConstant(0, DL, MVT::i32);
2867 SDValue Selector = (Op->op_end() - 1)->get();
2868 return getPRMT(A, B, Selector, DL, DAG, Mode);
2869}
2871 switch (Op->getConstantOperandVal(0)) {
2872 default:
2873 return Op;
2874 case Intrinsic::nvvm_prmt:
2875 case Intrinsic::nvvm_prmt_b4e:
2876 case Intrinsic::nvvm_prmt_ecl:
2877 case Intrinsic::nvvm_prmt_ecr:
2878 case Intrinsic::nvvm_prmt_f4e:
2879 case Intrinsic::nvvm_prmt_rc16:
2880 case Intrinsic::nvvm_prmt_rc8:
2881 return lowerPrmtIntrinsic(Op, DAG);
2882 case Intrinsic::nvvm_internal_addrspace_wrap:
2883 return Op.getOperand(1);
2884 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled:
2885 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x:
2886 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y:
2887 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z:
2889 }
2890}
2891
2892// In PTX 64-bit CTLZ and CTPOP are supported, but they return a 32-bit value.
2893 // Lower these into a node that produces the 32-bit result, then zero-extend
2894 // it back to the original 64-bit type.
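// For example, a 64-bit popcount roughly maps to the following PTX (register
// names are illustrative):
//   popc.b64    %r1, %rd1;    // produces a 32-bit result
//   cvt.u64.u32 %rd2, %r1;    // zero-extend back to 64 bits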
2896 SDValue V = Op->getOperand(0);
2897 assert(V.getValueType() == MVT::i64 &&
2898 "Unexpected CTLZ/CTPOP type to legalize");
2899
2900 SDLoc DL(Op);
2901 SDValue CT = DAG.getNode(Op->getOpcode(), DL, MVT::i32, V);
2902 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT, SDNodeFlags::NonNeg);
2903}
2904
2906 unsigned Opcode, SelectionDAG &DAG) {
2907 assert(A.getValueType() == MVT::i64 && B.getValueType() == MVT::i64);
2908
2909 const auto *AmtConst = dyn_cast<ConstantSDNode>(ShiftAmount);
2910 if (!AmtConst)
2911 return SDValue();
2912 const auto Amt = AmtConst->getZExtValue() & 63;
2913
2914 SDValue UnpackA =
2915 DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, A);
2916 SDValue UnpackB =
2917 DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, B);
2918
2919 // The architecture is little-endian: 0 = low bits, 1 = high bits
2920 SDValue ALo = UnpackA.getValue(0);
2921 SDValue AHi = UnpackA.getValue(1);
2922 SDValue BLo = UnpackB.getValue(0);
2923 SDValue BHi = UnpackB.getValue(1);
2924
2925 // The bitfield consists of { AHi : ALo : BHi : BLo }
2926 //
2927 // * FSHL, Amt < 32 - The window will contain { AHi : ALo : BHi }
2928 // * FSHL, Amt >= 32 - The window will contain { ALo : BHi : BLo }
2929 // * FSHR, Amt < 32 - The window will contain { ALo : BHi : BLo }
2930 // * FSHR, Amt >= 32 - The window will contain { AHi : ALo : BHi }
2931 //
2932 // Note that Amt = 0 and Amt = 32 are special cases where 32-bit funnel shifts
2933 // are not needed at all. Amt = 0 is a no-op producing either A or B depending
2934 // on the direction. Amt = 32 can be implemented by a packing and unpacking
2935 // move to select and arrange the 32bit values. For simplicity, these cases
2936 // are not handled here explicitly and instead we rely on DAGCombiner to
2937 // remove the no-op funnel shifts we insert.
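// As a concrete example: for a 64-bit FSHL with Amt = 40, the selected window
// is { ALo : BHi : BLo } and Amt & 31 == 8, so RHi = fshl(ALo, BHi, 8) and
// RLo = fshl(BHi, BLo, 8).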
2938 auto [High, Mid, Low] = ((Opcode == ISD::FSHL) == (Amt < 32))
2939 ? std::make_tuple(AHi, ALo, BHi)
2940 : std::make_tuple(ALo, BHi, BLo);
2941
2942 SDValue NewAmt = DAG.getConstant(Amt & 31, DL, MVT::i32);
2943 SDValue RHi = DAG.getNode(Opcode, DL, MVT::i32, {High, Mid, NewAmt});
2944 SDValue RLo = DAG.getNode(Opcode, DL, MVT::i32, {Mid, Low, NewAmt});
2945
2946 return DAG.getNode(NVPTXISD::BUILD_VECTOR, DL, MVT::i64, {RLo, RHi});
2947}
2948
2950 return expandFSH64(Op->getOperand(0), Op->getOperand(1), Op->getOperand(2),
2951 SDLoc(Op), Op->getOpcode(), DAG);
2952}
2953
2955 unsigned Opcode = Op->getOpcode() == ISD::ROTL ? ISD::FSHL : ISD::FSHR;
2956 return expandFSH64(Op->getOperand(0), Op->getOperand(0), Op->getOperand(1),
2957 SDLoc(Op), Opcode, DAG);
2958}
2959
2961 // Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
2962 // i.e. "poor man's fmod()". When y is infinite, x is returned. This matches
2963 // the semantics of LLVM's frem.
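// E.g. frem(5.5, 2.0): div = 2.75, ftrunc = 2.0, and 5.5 - 2.0 * 2.0 = 1.5,
// matching fmod(5.5, 2.0).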
2964 SDLoc DL(Op);
2965 SDValue X = Op->getOperand(0);
2966 SDValue Y = Op->getOperand(1);
2967 EVT Ty = Op.getValueType();
2968 SDNodeFlags Flags = Op->getFlags();
2969
2970 SDValue Div = DAG.getNode(ISD::FDIV, DL, Ty, X, Y, Flags);
2971 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, Ty, Div, Flags);
2972 SDValue Mul = DAG.getNode(ISD::FMUL, DL, Ty, Trunc, Y,
2974 SDValue Sub = DAG.getNode(ISD::FSUB, DL, Ty, X, Mul,
2976
2977 if (Flags.hasNoInfs())
2978 return Sub;
2979
2980 // If Y is infinite, return X
2981 SDValue AbsY = DAG.getNode(ISD::FABS, DL, Ty, Y);
2982 SDValue Inf =
2983 DAG.getConstantFP(APFloat::getInf(Ty.getFltSemantics()), DL, Ty);
2984 SDValue IsInf = DAG.getSetCC(DL, MVT::i1, AbsY, Inf, ISD::SETEQ);
2985 return DAG.getSelect(DL, Ty, IsInf, X, Sub);
2986}
2987
2989 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
2990
2991 SDValue Cond = Op->getOperand(0);
2992 SDValue TrueVal = Op->getOperand(1);
2993 SDValue FalseVal = Op->getOperand(2);
2994 SDLoc DL(Op);
2995
2996 // If both operands are truncated, we push the select through the truncates.
2997 if (TrueVal.getOpcode() == ISD::TRUNCATE &&
2998 FalseVal.getOpcode() == ISD::TRUNCATE) {
2999 TrueVal = TrueVal.getOperand(0);
3000 FalseVal = FalseVal.getOperand(0);
3001
3002 EVT VT = TrueVal.getSimpleValueType().bitsLE(FalseVal.getSimpleValueType())
3003 ? TrueVal.getValueType()
3004 : FalseVal.getValueType();
3005 TrueVal = DAG.getAnyExtOrTrunc(TrueVal, DL, VT);
3006 FalseVal = DAG.getAnyExtOrTrunc(FalseVal, DL, VT);
3007 SDValue Select = DAG.getSelect(DL, VT, Cond, TrueVal, FalseVal);
3008 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
3009 }
3010
3011 // Otherwise, expand the select into a series of logical operations. These
3012 // can often be folded into other operations, either by us or by ptxas.
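// I.e. select(Cond, T, F) == (Cond & T) | (~Cond & F) for i1 values.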
3013 TrueVal = DAG.getFreeze(TrueVal);
3014 FalseVal = DAG.getFreeze(FalseVal);
3015 SDValue And1 = DAG.getNode(ISD::AND, DL, MVT::i1, Cond, TrueVal);
3016 SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
3017 SDValue And2 = DAG.getNode(ISD::AND, DL, MVT::i1, NotCond, FalseVal);
3018 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i1, And1, And2);
3019 return Or;
3020}
3021
3022SDValue
3024 switch (Op.getOpcode()) {
3025 case ISD::RETURNADDR:
3026 return SDValue();
3027 case ISD::FRAMEADDR:
3028 return SDValue();
3029 case ISD::ADDRSPACECAST:
3030 return LowerADDRSPACECAST(Op, DAG);
3032 return Op;
3034 return lowerIntrinsicWOChain(Op, DAG);
3036 return LowerIntrinsicVoid(Op, DAG);
3037 case ISD::BUILD_VECTOR:
3038 return LowerBUILD_VECTOR(Op, DAG);
3039 case ISD::BITCAST:
3040 return LowerBITCAST(Op, DAG);
3042 return Op;
3044 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
3046 return LowerINSERT_VECTOR_ELT(Op, DAG);
3048 return LowerVECTOR_SHUFFLE(Op, DAG);
3050 return LowerCONCAT_VECTORS(Op, DAG);
3051 case ISD::VECREDUCE_FMAX:
3052 case ISD::VECREDUCE_FMIN:
3053 case ISD::VECREDUCE_FMAXIMUM:
3054 case ISD::VECREDUCE_FMINIMUM:
3055 return LowerVECREDUCE(Op, DAG);
3056 case ISD::STORE:
3057 return LowerSTORE(Op, DAG);
3058 case ISD::LOAD:
3059 return LowerLOAD(Op, DAG);
3060 case ISD::SHL_PARTS:
3061 return LowerShiftLeftParts(Op, DAG);
3062 case ISD::SRA_PARTS:
3063 case ISD::SRL_PARTS:
3064 return LowerShiftRightParts(Op, DAG);
3065 case ISD::SELECT:
3066 return lowerSELECT(Op, DAG);
3067 case ISD::FROUND:
3068 return LowerFROUND(Op, DAG);
3069 case ISD::FCOPYSIGN:
3070 return LowerFCOPYSIGN(Op, DAG);
3071 case ISD::SINT_TO_FP:
3072 case ISD::UINT_TO_FP:
3073 return LowerINT_TO_FP(Op, DAG);
3074 case ISD::FP_TO_SINT:
3075 case ISD::FP_TO_UINT:
3076 return LowerFP_TO_INT(Op, DAG);
3077 case ISD::FP_ROUND:
3078 return LowerFP_ROUND(Op, DAG);
3079 case ISD::FP_EXTEND:
3080 return LowerFP_EXTEND(Op, DAG);
3081 case ISD::BR_JT:
3082 return LowerBR_JT(Op, DAG);
3083 case ISD::VAARG:
3084 return LowerVAARG(Op, DAG);
3085 case ISD::VASTART:
3086 return LowerVASTART(Op, DAG);
3087 case ISD::FSHL:
3088 case ISD::FSHR:
3089 return lowerFSH(Op, DAG);
3090 case ISD::ROTL:
3091 case ISD::ROTR:
3092 return lowerROT(Op, DAG);
3093 case ISD::ABS:
3094 case ISD::SMIN:
3095 case ISD::SMAX:
3096 case ISD::UMIN:
3097 case ISD::UMAX:
3098 case ISD::ADD:
3099 case ISD::SUB:
3100 case ISD::MUL:
3101 case ISD::SHL:
3102 case ISD::SREM:
3103 case ISD::UREM:
3104 return LowerVectorArith(Op, DAG);
3105 case ISD::DYNAMIC_STACKALLOC:
3106 return LowerDYNAMIC_STACKALLOC(Op, DAG);
3107 case ISD::STACKRESTORE:
3108 return LowerSTACKRESTORE(Op, DAG);
3109 case ISD::STACKSAVE:
3110 return LowerSTACKSAVE(Op, DAG);
3111 case ISD::CopyToReg:
3112 return LowerCopyToReg_128(Op, DAG);
3113 case ISD::FADD:
3114 case ISD::FSUB:
3115 case ISD::FMUL:
3116 // Used only for bf16 on SM80, where we select fma for non-ftz operation
3117 return PromoteBinOpIfF32FTZ(Op, DAG);
3118 case ISD::CTPOP:
3119 case ISD::CTLZ:
3120 return lowerCTLZCTPOP(Op, DAG);
3121 case ISD::FREM:
3122 return lowerFREM(Op, DAG);
3123
3124 default:
3125 llvm_unreachable("Custom lowering not defined for operation");
3126 }
3127}
3128
3129SDValue NVPTXTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
3130 SDLoc DL(Op);
3131 SDValue Chain = Op.getOperand(0);
3132 const auto *JT = cast<JumpTableSDNode>(Op.getOperand(1));
3133 SDValue Index = Op.getOperand(2);
3134
3135 unsigned JId = JT->getIndex();
3137 ArrayRef<MachineBasicBlock *> MBBs = MJTI->getJumpTables()[JId].MBBs;
3138
3139 SDValue IdV = DAG.getConstant(JId, DL, MVT::i32);
3140
3141 // Generate BrxStart node
3142 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
3143 Chain = DAG.getNode(NVPTXISD::BrxStart, DL, VTs, Chain, IdV);
3144
3145 // Generate BrxItem nodes
3146 assert(!MBBs.empty());
3147 for (MachineBasicBlock *MBB : MBBs.drop_back())
3148 Chain = DAG.getNode(NVPTXISD::BrxItem, DL, VTs, Chain.getValue(0),
3149 DAG.getBasicBlock(MBB), Chain.getValue(1));
3150
3151 // Generate BrxEnd nodes
3152 SDValue EndOps[] = {Chain.getValue(0), DAG.getBasicBlock(MBBs.back()), Index,
3153 IdV, Chain.getValue(1)};
3154 SDValue BrxEnd = DAG.getNode(NVPTXISD::BrxEnd, DL, VTs, EndOps);
3155
3156 return BrxEnd;
3157}
3158
3159// This will prevent AsmPrinter from trying to print the jump tables itself.
3163
3164SDValue NVPTXTargetLowering::LowerADDRSPACECAST(SDValue Op,
3165 SelectionDAG &DAG) const {
3167 unsigned SrcAS = N->getSrcAddressSpace();
3168 unsigned DestAS = N->getDestAddressSpace();
3169 if (SrcAS != llvm::ADDRESS_SPACE_GENERIC &&
3170 DestAS != llvm::ADDRESS_SPACE_GENERIC) {
3171 // Shared and SharedCluster can be converted to each other through generic
3172 // space
3173 if ((SrcAS == llvm::ADDRESS_SPACE_SHARED &&
3176 DestAS == llvm::ADDRESS_SPACE_SHARED)) {
3177 SDLoc DL(Op.getNode());
3178 const MVT GenerictVT =
3180 SDValue GenericConversion = DAG.getAddrSpaceCast(
3181 DL, GenerictVT, Op.getOperand(0), SrcAS, ADDRESS_SPACE_GENERIC);
3182 SDValue SharedClusterConversion =
3183 DAG.getAddrSpaceCast(DL, Op.getValueType(), GenericConversion,
3184 ADDRESS_SPACE_GENERIC, DestAS);
3185 return SharedClusterConversion;
3186 }
3187
3188 return DAG.getUNDEF(Op.getValueType());
3189 }
3190
3191 return Op;
3192}
3193
3194// This function is almost a copy of SelectionDAG::expandVAArg().
3195 // The only difference is that this one produces loads from the local address space.
3196SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
3197 const TargetLowering *TLI = STI.getTargetLowering();
3198 SDLoc DL(Op);
3199
3200 SDNode *Node = Op.getNode();
3201 const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
3202 EVT VT = Node->getValueType(0);
3203 auto *Ty = VT.getTypeForEVT(*DAG.getContext());
3204 SDValue Tmp1 = Node->getOperand(0);
3205 SDValue Tmp2 = Node->getOperand(1);
3206 const MaybeAlign MA(Node->getConstantOperandVal(3));
3207
3208 SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL,
3209 Tmp1, Tmp2, MachinePointerInfo(V));
3210 SDValue VAList = VAListLoad;
3211
3212 if (MA && *MA > TLI->getMinStackArgumentAlignment()) {
3213 VAList = DAG.getNode(
3214 ISD::ADD, DL, VAList.getValueType(), VAList,
3215 DAG.getConstant(MA->value() - 1, DL, VAList.getValueType()));
3216
3217 VAList = DAG.getNode(ISD::AND, DL, VAList.getValueType(), VAList,
3218 DAG.getSignedConstant(-(int64_t)MA->value(), DL,
3219 VAList.getValueType()));
3220 }
3221
3222 // Increment the pointer, VAList, to the next vaarg
3223 Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
3225 DL, VAList.getValueType()));
3226
3227 // Store the incremented VAList to the legalized pointer
3228 Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2,
3229 MachinePointerInfo(V));
3230
3231 const Value *SrcV = Constant::getNullValue(
3233
3234 // Load the actual argument out of the pointer VAList
3235 return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV));
3236}
3237
3238SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
3239 const TargetLowering *TLI = STI.getTargetLowering();
3240 SDLoc DL(Op);
3241 EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout());
3242
3243 // Store the address of unsized array <function>_vararg[] in the ap object.
3244 SDValue VAReg = getParamSymbol(DAG, /* vararg */ -1, PtrVT);
3245
3246 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
3247 return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1),
3248 MachinePointerInfo(SV));
3249}
3250
3251/// replaceLoadVector - Convert vector loads into multi-output scalar loads.
3252static std::optional<std::pair<SDValue, SDValue>>
3255 const EVT ResVT = LD->getValueType(0);
3256 const EVT MemVT = LD->getMemoryVT();
3257
3258 // If we're doing sign/zero extension as part of the load, avoid lowering to
3259 // a LoadV node. TODO: consider relaxing this restriction.
3260 if (ResVT != MemVT)
3261 return std::nullopt;
3262
3263 const auto NumEltsAndEltVT =
3264 getVectorLoweringShape(ResVT, STI, LD->getAddressSpace());
3265 if (!NumEltsAndEltVT)
3266 return std::nullopt;
3267 const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
3268
3269 Align Alignment = LD->getAlign();
3270 const auto &TD = DAG.getDataLayout();
3271 Align PrefAlign = TD.getPrefTypeAlign(MemVT.getTypeForEVT(*DAG.getContext()));
3272 if (Alignment < PrefAlign) {
3273 // This load is not sufficiently aligned, so bail out and let this vector
3274 // load be scalarized. Note that we may still be able to emit smaller
3275 // vector loads. For example, if we are loading a <4 x float> with an
3276 // alignment of 8, this check will fail but the legalizer will try again
3277 // with 2 x <2 x float>, which will succeed with an alignment of 8.
3278 return std::nullopt;
3279 }
3280
3281 // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
3282 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
3283 // loaded type to i16 and propagate the "real" type as the memory type.
3284 const MVT LoadEltVT = (EltVT.getSizeInBits() < 16) ? MVT::i16 : EltVT;
3285
3286 unsigned Opcode;
3287 switch (NumElts) {
3288 default:
3289 return std::nullopt;
3290 case 2:
3291 Opcode = NVPTXISD::LoadV2;
3292 break;
3293 case 4:
3294 Opcode = NVPTXISD::LoadV4;
3295 break;
3296 case 8:
3297 Opcode = NVPTXISD::LoadV8;
3298 break;
3299 }
3300 auto ListVTs = SmallVector<EVT, 9>(NumElts, LoadEltVT);
3301 ListVTs.push_back(MVT::Other);
3302 SDVTList LdResVTs = DAG.getVTList(ListVTs);
3303
3304 SDLoc DL(LD);
3305
3306 // Copy regular operands
3307 SmallVector<SDValue, 8> OtherOps(LD->ops());
3308
3309 // The select routine does not have access to the LoadSDNode instance, so
3310 // pass along the extension information
3311 OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
3312
3313 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, MemVT,
3314 LD->getMemOperand());
3315
3316 SmallVector<SDValue> ScalarRes;
3317 if (EltVT.isVector()) {
3319 assert(NumElts * EltVT.getVectorNumElements() ==
3320 ResVT.getVectorNumElements());
3321 // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
3322 // into individual elements.
3323 for (const unsigned I : llvm::seq(NumElts)) {
3324 SDValue SubVector = NewLD.getValue(I);
3325 DAG.ExtractVectorElements(SubVector, ScalarRes);
3326 }
3327 } else {
3328 for (const unsigned I : llvm::seq(NumElts)) {
3329 SDValue Res = NewLD.getValue(I);
3330 if (LoadEltVT != EltVT)
3331 Res = DAG.getNode(ISD::TRUNCATE, DL, EltVT, Res);
3332 ScalarRes.push_back(Res);
3333 }
3334 }
3335
3336 SDValue LoadChain = NewLD.getValue(NumElts);
3337
3338 const MVT BuildVecVT =
3339 MVT::getVectorVT(EltVT.getScalarType(), ScalarRes.size());
3340 SDValue BuildVec = DAG.getBuildVector(BuildVecVT, DL, ScalarRes);
3341 SDValue LoadValue = DAG.getBitcast(ResVT, BuildVec);
3342
3343 return {{LoadValue, LoadChain}};
3344}
3345
3348 const NVPTXSubtarget &STI) {
3349 if (auto Res = replaceLoadVector(N, DAG, STI))
3350 Results.append({Res->first, Res->second});
3351}
3352
3354 const NVPTXSubtarget &STI) {
3355 if (auto Res = replaceLoadVector(N, DAG, STI))
3356 return DAG.getMergeValues({Res->first, Res->second}, SDLoc(N));
3357 return SDValue();
3358}
3359
3360// v = ld i1* addr
3361// =>
3362// v1 = ld i8* addr (-> i16)
3363// v = trunc i16 to i1
3365 SDLoc dl(LD);
3366 assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
3367 assert(LD->getValueType(0) == MVT::i1 && "Custom lowering for i1 load only");
3368 SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(),
3369 LD->getBasePtr(), LD->getPointerInfo(),
3370 MVT::i8, LD->getAlign(),
3371 LD->getMemOperand()->getFlags());
3372 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
3373 // The legalizer (the caller) is expecting two values from the legalized
3374 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
3375 // in LegalizeDAG.cpp which also uses MergeValues.
3376 return DAG.getMergeValues({result, LD->getChain()}, dl);
3377}
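// Illustrative note (not from the original source): an IR load such as
//   %v = load i1, ptr %p
// thus becomes a zero-extending i8 load producing an i16 value (a PTX ld.u8
// into a 16-bit register) followed by a truncate back to i1.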
3378
3379SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
3380 LoadSDNode *LD = cast<LoadSDNode>(Op);
3381
3382 if (Op.getValueType() == MVT::i1)
3383 return lowerLOADi1(LD, DAG);
3384
3385 // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
3386 // how they'll be lowered in ISel anyway, and by doing this a little earlier
3387 // we allow for more DAG combine opportunities.
3388 if (LD->getExtensionType() == ISD::EXTLOAD) {
3389 assert(LD->getValueType(0).isInteger() && LD->getMemoryVT().isInteger() &&
3390 "Unexpected fpext-load");
3391 return DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Op), Op.getValueType(),
3392 LD->getChain(), LD->getBasePtr(), LD->getMemoryVT(),
3393 LD->getMemOperand());
3394 }
3395
3396 llvm_unreachable("Unexpected custom lowering for load");
3397}
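// Illustrative note (not from the original source): an any-extending load
// such as (i32 (extload i8)) is rewritten above as (i32 (zextload i8)); the
// high bits were undefined anyway, so zero-filling them is always valid and
// matches how the load would ultimately be selected.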
3398
3399static SDValue lowerSTOREVector(SDValue Op, SelectionDAG &DAG,
3400 const NVPTXSubtarget &STI) {
3401 MemSDNode *N = cast<MemSDNode>(Op.getNode());
3402 SDValue Val = N->getOperand(1);
3403 SDLoc DL(N);
3404 const EVT ValVT = Val.getValueType();
3405 const EVT MemVT = N->getMemoryVT();
3406
3407 // If we're truncating as part of the store, avoid lowering to a StoreV node.
3408 // TODO: consider relaxing this restriction.
3409 if (ValVT != MemVT)
3410 return SDValue();
3411
3412 const auto NumEltsAndEltVT =
3413 getVectorLoweringShape(ValVT, STI, N->getAddressSpace());
3414 if (!NumEltsAndEltVT)
3415 return SDValue();
3416 const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
3417
3418 const DataLayout &TD = DAG.getDataLayout();
3419
3420 Align Alignment = N->getAlign();
3421 Align PrefAlign = TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
3422 if (Alignment < PrefAlign) {
3423 // This store is not sufficiently aligned, so bail out and let this vector
3424 // store be scalarized. Note that we may still be able to emit smaller
3425 // vector stores. For example, if we are storing a <4 x float> with an
3426 // alignment of 8, this check will fail but the legalizer will try again
3427 // with 2 x <2 x float>, which will succeed with an alignment of 8.
3428 return SDValue();
3429 }
3430
3431 unsigned Opcode;
3432 switch (NumElts) {
3433 default:
3434 return SDValue();
3435 case 2:
3436 Opcode = NVPTXISD::StoreV2;
3437 break;
3438 case 4:
3439 Opcode = NVPTXISD::StoreV4;
3440 break;
3441 case 8:
3442 Opcode = NVPTXISD::StoreV8;
3443 break;
3444 }
3445
3446
3447 SmallVector<SDValue, 8> Ops;
3448 // First is the chain
3449 Ops.push_back(N->getOperand(0));
3450
3451 // Then the split values
3452 if (EltVT.isVector()) {
3454 assert(NumElts * EltVT.getVectorNumElements() ==
3455 ValVT.getVectorNumElements());
3456 // Combine individual elements into v2[i,f,bf]16/v4i8 subvectors to be
3457 // stored as b32s
3458 const unsigned NumEltsPerSubVector = EltVT.getVectorNumElements();
3459 for (const unsigned I : llvm::seq(NumElts)) {
3460 SmallVector<SDValue, 4> SubVectorElts;
3461 DAG.ExtractVectorElements(Val, SubVectorElts, I * NumEltsPerSubVector,
3462 NumEltsPerSubVector);
3463 Ops.push_back(DAG.getBuildVector(EltVT, DL, SubVectorElts));
3464 }
3465 } else {
3466 SDValue V = DAG.getBitcast(MVT::getVectorVT(EltVT, NumElts), Val);
3467 for (const unsigned I : llvm::seq(NumElts)) {
3468 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, V,
3469 DAG.getIntPtrConstant(I, DL));
3470
3471 // Since StoreV2 is a target node, we cannot rely on DAG type
3472 // legalization. Therefore, we must ensure the type is legal. For i1 and
3473 // i8, we set the stored type to i16 and propagate the "real" type as the
3474 // memory type.
3475 if (EltVT.getSizeInBits() < 16)
3476 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
3477 Ops.push_back(ExtVal);
3478 }
3479 }
3480
3481 // Then any remaining arguments
3482 Ops.append(N->op_begin() + 2, N->op_end());
3483
3484 SDValue NewSt =
3485 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
3486 N->getMemoryVT(), N->getMemOperand());
3487
3488 // return DCI.CombineTo(N, NewSt, true);
3489 return NewSt;
3490}
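// Illustrative note (not from the original source): a sufficiently aligned
// store of <4 x float> is emitted as one NVPTXISD::StoreV4 node (a PTX st.v4
// form), while something like <8 x half> is first regrouped into four v2f16
// subvectors that are stored as b32 values, as noted in the code above.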
3491
3492SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
3493 StoreSDNode *Store = cast<StoreSDNode>(Op);
3494 EVT VT = Store->getMemoryVT();
3495
3496 if (VT == MVT::i1)
3497 return LowerSTOREi1(Op, DAG);
3498
3499 // Lower stores of any other vector type, including v2f32, which we want to
3500 // break apart since it is not a widely-supported type.
3501 return lowerSTOREVector(Op, DAG, STI);
3502}
3503
3504// st i1 v, addr
3505// =>
3506// v1 = zext v to i16
3507// st.u8 i16, addr
3508SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
3509 SDNode *Node = Op.getNode();
3510 SDLoc dl(Node);
3511 StoreSDNode *ST = cast<StoreSDNode>(Node);
3512 SDValue Tmp1 = ST->getChain();
3513 SDValue Tmp2 = ST->getBasePtr();
3514 SDValue Tmp3 = ST->getValue();
3515 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
3516 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
3517 SDValue Result =
3518 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
3519 ST->getAlign(), ST->getMemOperand()->getFlags());
3520 return Result;
3521}
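// Illustrative note (not from the original source): "store i1 %v, ptr %p"
// thus becomes a zero-extension of %v to i16 followed by an i8 truncating
// store, i.e. a PTX st.u8 of the low byte.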
3522
3523SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op,
3524 SelectionDAG &DAG) const {
3525 // Change the CopyToReg to take in two 64-bit operands instead of a 128-bit
3526 // operand so that it can pass the legalization.
3527
3528 assert(Op.getOperand(1).getValueType() == MVT::i128 &&
3529 "Custom lowering for 128-bit CopyToReg only");
3530
3531 SDNode *Node = Op.getNode();
3532 SDLoc DL(Node);
3533
3534 SDValue Cast = DAG.getBitcast(MVT::v2i64, Op->getOperand(2));
3535 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
3536 DAG.getIntPtrConstant(0, DL));
3537 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
3538 DAG.getIntPtrConstant(1, DL));
3539
3541 SmallVector<EVT, 3> ResultsType(Node->values());
3542
3543 NewOps[0] = Op->getOperand(0); // Chain
3544 NewOps[1] = Op->getOperand(1); // Dst Reg
3545 NewOps[2] = Lo; // Lower 64-bit
3546 NewOps[3] = Hi; // Higher 64-bit
3547 if (Op.getNumOperands() == 4)
3548 NewOps[4] = Op->getOperand(3); // Glue if exists
3549
3550 return DAG.getNode(ISD::CopyToReg, DL, ResultsType, NewOps);
3551}
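// Illustrative note (not from the original source): a CopyToReg of an i128
// value is rewritten so the destination receives two i64 halves (bitcast to
// v2i64, element 0 as the low half, element 1 as the high half), since PTX
// has no 128-bit general-purpose registers.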
3552
3553unsigned NVPTXTargetLowering::getNumRegisters(
3554 LLVMContext &Context, EVT VT,
3555 std::optional<MVT> RegisterVT = std::nullopt) const {
3556 if (VT == MVT::i128 && RegisterVT == MVT::i128)
3557 return 1;
3558 return TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT);
3559}
3560
3561bool NVPTXTargetLowering::splitValueIntoRegisterParts(
3562 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
3563 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
3564 if (Val.getValueType() == MVT::i128 && NumParts == 1) {
3565 Parts[0] = Val;
3566 return true;
3567 }
3568 return false;
3569}
3570
3571// This creates a target external symbol for a function parameter.
3572// The name of the symbol is composed from its index and the function name.
3573// A negative index corresponds to a special parameter (unsized array) used
3574// for passing variable arguments.
3575SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int I,
3576 EVT T) const {
3577 StringRef SavedStr = nvTM->getStrPool().save(
3578 getParamName(&DAG.getMachineFunction().getFunction(), I));
3579 return DAG.getExternalSymbol(SavedStr.data(), T);
3580}
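// Illustrative note (not from the original source): assuming getParamName
// composes "<function name>_param_<index>", parameter 0 of a kernel whose
// symbol name is "foo" would be referenced through the external symbol
// "foo_param_0", matching the ".param" name in the emitted PTX signature.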
3581
3582SDValue NVPTXTargetLowering::getCallParamSymbol(SelectionDAG &DAG, int I,
3583 EVT T) const {
3584 const StringRef SavedStr = nvTM->getStrPool().save("param" + Twine(I));
3585 return DAG.getExternalSymbol(SavedStr.data(), T);
3586}
3587
3589 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3590 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3591 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3592 const DataLayout &DL = DAG.getDataLayout();
3593 LLVMContext &Ctx = *DAG.getContext();
3594 auto PtrVT = getPointerTy(DAG.getDataLayout());
3595
3596 const Function &F = DAG.getMachineFunction().getFunction();
3597
3598 SDValue Root = DAG.getRoot();
3599 SmallVector<SDValue, 16> OutChains;
3600
3601 // The number of IR arguments (F.args().size()) and Ins.size() need not
3602 // match. Ins.size() will be larger
3603 // * if there is an aggregate argument with multiple fields (each field
3604 // showing up separately in Ins)
3605 // * if there is a vector argument with more than typical vector-length
3606 // elements (generally if more than 4) where each vector element is
3607 // individually present in Ins.
3608 // So a different index should be used for indexing into Ins.
3609 // See similar issue in LowerCall.
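// Illustrative note (not from the original source): an aggregate argument
// of type {i32, float} contributes two entries to Ins, and an <8 x i32>
// argument contributes one entry per element, all sharing the same
// OrigArgIndex; the take_while below groups them back per IR argument.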
3610
3611 auto AllIns = ArrayRef(Ins);
3612 for (const auto &Arg : F.args()) {
3613 const auto ArgIns = AllIns.take_while(
3614 [&](auto I) { return I.OrigArgIndex == Arg.getArgNo(); });
3615 AllIns = AllIns.drop_front(ArgIns.size());
3616
3617 Type *Ty = Arg.getType();
3618
3619 if (ArgIns.empty())
3620 report_fatal_error("Empty parameter types are not supported");
3621
3622 if (Arg.use_empty()) {
3623 // argument is dead
3624 for (const auto &In : ArgIns) {
3625 assert(!In.Used && "Arg.use_empty() is true but Arg is used?");
3626 InVals.push_back(DAG.getUNDEF(In.VT));
3627 }
3628 continue;
3629 }
3630
3631 SDValue ArgSymbol = getParamSymbol(DAG, Arg.getArgNo(), PtrVT);
3632
3633 // In the following cases, assign a node order of "ArgNo + 1"
3634 // to newly created nodes. The SDNodes for params have to
3635 // appear in the same order as their order of appearance
3636 // in the original function. "ArgNo + 1" holds that order.
3637 if (Arg.hasByValAttr()) {
3638 // Param has ByVal attribute
3639 // Return MoveParam(param symbol).
3640 // Ideally, the param symbol can be returned directly,
3641 // but when SDNode builder decides to use it in a CopyToReg(),
3642 // machine instruction fails because TargetExternalSymbol
3643 // (not lowered) is target dependent, and CopyToReg assumes
3644 // the source is lowered.
3645 assert(ArgIns.size() == 1 && "ByVal argument must be a pointer");
3646 const auto &ByvalIn = ArgIns[0];
3647 assert(getValueType(DL, Ty) == ByvalIn.VT &&
3648 "Ins type did not match function type");
3649 assert(ByvalIn.VT == PtrVT && "ByVal argument must be a pointer");
3650
3651 SDValue P;
3652 if (isKernelFunction(F)) {
3653 P = ArgSymbol;
3654 P.getNode()->setIROrder(Arg.getArgNo() + 1);
3655 } else {
3656 P = DAG.getNode(NVPTXISD::MoveParam, dl, ByvalIn.VT, ArgSymbol);
3657 P.getNode()->setIROrder(Arg.getArgNo() + 1);
3658 P = DAG.getAddrSpaceCast(dl, ByvalIn.VT, P, ADDRESS_SPACE_LOCAL,
3659 ADDRESS_SPACE_GENERIC);
3660 }
3661 InVals.push_back(P);
3662 } else {
3663 SmallVector<EVT, 16> VTs;
3664 SmallVector<uint64_t, 16> Offsets;
3665 ComputePTXValueVTs(*this, DL, Ctx, CallConv, Ty, VTs, Offsets);
3666 assert(VTs.size() == ArgIns.size() && "Size mismatch");
3667 assert(VTs.size() == Offsets.size() && "Size mismatch");
3668
3669 const Align ArgAlign = getFunctionArgumentAlignment(
3670 &F, Ty, Arg.getArgNo() + AttributeList::FirstArgIndex, DL);
3671
3672 unsigned I = 0;
3673 const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
3674 for (const unsigned NumElts : VI) {
3675 // i1 is loaded/stored as i8
3676 const EVT LoadVT = VTs[I] == MVT::i1 ? MVT::i8 : VTs[I];
3677 const EVT VecVT = getVectorizedVT(LoadVT, NumElts, Ctx);
3678
3679 SDValue VecAddr = DAG.getObjectPtrOffset(
3680 dl, ArgSymbol, TypeSize::getFixed(Offsets[I]));
3681
3682 const Align PartAlign = commonAlignment(ArgAlign, Offsets[I]);
3683 SDValue P =
3684 DAG.getLoad(VecVT, dl, Root, VecAddr,
3685 MachinePointerInfo(ADDRESS_SPACE_PARAM), PartAlign,
3686 MachineMemOperand::MODereferenceable |
3687 MachineMemOperand::MOInvariant);
3688 P.getNode()->setIROrder(Arg.getArgNo() + 1);
3689 for (const unsigned J : llvm::seq(NumElts)) {
3690 SDValue Elt = getExtractVectorizedValue(P, J, LoadVT, dl, DAG);
3691
3692 Elt = correctParamType(Elt, ArgIns[I + J].VT, ArgIns[I + J].Flags,
3693 DAG, dl);
3694 InVals.push_back(Elt);
3695 }
3696 I += NumElts;
3697 }
3698 }
3699 }
3700
3701 if (!OutChains.empty())
3702 DAG.setRoot(DAG.getTokenFactor(dl, OutChains));
3703
3704 return Chain;
3705}
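// Illustrative note (not from the original source): the loads built above
// read the incoming values from the parameter space through the per-argument
// symbols (e.g. "foo_param_1") and are selected to PTX ld.param
// instructions; the vectorization step groups adjacent pieces so that, for
// instance, two adjacent i32 pieces can be fetched with one ld.param.v2.u32.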
3706
3707SDValue
3709 bool isVarArg,
3711 const SmallVectorImpl<SDValue> &OutVals,
3712 const SDLoc &dl, SelectionDAG &DAG) const {
3713 const Function &F = DAG.getMachineFunction().getFunction();
3714 Type *RetTy = F.getReturnType();
3715
3716 if (RetTy->isVoidTy()) {
3717 assert(OutVals.empty() && Outs.empty() && "Return value expected for void");
3718 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
3719 }
3720
3721 const DataLayout &DL = DAG.getDataLayout();
3722 LLVMContext &Ctx = *DAG.getContext();
3723
3724 const SDValue RetSymbol = DAG.getExternalSymbol("func_retval0", MVT::i32);
3725 const auto RetAlign = getFunctionParamOptimizedAlign(&F, RetTy, DL);
3726
3727 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
3728 // 32 bits are sign extended or zero extended, depending on whether
3729 // they are signed or unsigned types.
3730 const bool ExtendIntegerRetVal =
3731 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
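// Illustrative note (not from the original source): for a function
// returning i16, ExtendIntegerRetVal is true and the value is widened and
// stored as a 32-bit quantity into func_retval0, matching the PTX ABI rule
// quoted above.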
3732
3735 ComputePTXValueVTs(*this, DL, Ctx, CallConv, RetTy, VTs, Offsets);
3736 assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
3737
3738 const auto GetRetVal = [&](unsigned I) -> SDValue {
3739 SDValue RetVal = OutVals[I];
3740 assert(promoteScalarIntegerPTX(RetVal.getValueType()) ==
3741 RetVal.getValueType() &&
3742 "OutVal type should always be legal");
3743
3744 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
3745 const EVT StoreVT =
3746 ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
3747 return correctParamType(RetVal, StoreVT, Outs[I].Flags, DAG, dl);
3748 };
3749
3750 unsigned I = 0;
3751 const auto VI = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
3752 for (const unsigned NumElts : VI) {
3753 const MaybeAlign CurrentAlign = ExtendIntegerRetVal
3754 ? MaybeAlign(std::nullopt)
3755 : commonAlignment(RetAlign, Offsets[I]);
3756
3757 SDValue Val = getBuildVectorizedValue(
3758 NumElts, dl, DAG, [&](unsigned K) { return GetRetVal(I + K); });
3759
3760 SDValue Ptr =
3761 DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
3762
3763 Chain = DAG.getStore(Chain, dl, Val, Ptr,
3764 MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign);
3765
3766 I += NumElts;
3767 }
3768
3769 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
3770}
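// Illustrative note (not from the original source): each group of return
// values becomes a store through the "func_retval0" symbol, which is
// selected to PTX st.param instructions (e.g. st.param.b32 [func_retval0],
// %r1), followed by the NVPTXISD::RET_GLUE terminator built above.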
3771
3772void NVPTXTargetLowering::LowerAsmOperandForConstraint(
3773 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
3774 SelectionDAG &DAG) const {
3775 if (Constraint.size() > 1)
3776 return;
3777 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
3778}
3779
3780// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled
3781// as TgtMemIntrinsic because we need information that is only available
3782// in the "Value" type of the destination pointer, in particular its
3783// address space.
3784
3785bool NVPTXTargetLowering::getTgtMemIntrinsic(
3786 IntrinsicInfo &Info, const CallInst &I,
3787 MachineFunction &MF, unsigned Intrinsic) const {
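// Illustrative note (not from the original source): for every intrinsic that
// touches memory, this hook fills in an IntrinsicInfo record (node opcode,
// memory VT, pointer operand, flags, alignment) so that SelectionDAG can
// attach a MachineMemOperand to the resulting INTRINSIC_W_CHAIN or
// INTRINSIC_VOID node.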
3788 switch (Intrinsic) {
3789 default:
3790 return false;
3791 case Intrinsic::nvvm_match_all_sync_i32p:
3792 case Intrinsic::nvvm_match_all_sync_i64p:
3793 Info.opc = ISD::INTRINSIC_W_CHAIN;
3794 // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
3795 // in order to model data exchange with other threads, but perform no real
3796 // memory accesses.
3797 Info.memVT = MVT::i1;
3798
3799 // Our result depends on both our and other thread's arguments.
3800 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
3801 return true;
3802 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
3803 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
3804 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
3805 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
3806 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
3807 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
3808 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
3809 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
3810 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
3811 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
3812 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
3813 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
3814 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
3815 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
3816 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
3817 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
3818 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
3819 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
3820 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
3821 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
3822 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
3823 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
3824 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
3825 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
3826 Info.opc = ISD::INTRINSIC_W_CHAIN;
3827 Info.memVT = MVT::v8f16;
3828 Info.ptrVal = I.getArgOperand(0);
3829 Info.offset = 0;
3830 Info.flags = MachineMemOperand::MOLoad;
3831 Info.align = Align(16);
3832 return true;
3833 }
3834 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
3835 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
3836 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
3837 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
3838 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
3839 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
3840 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
3841 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
3842 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
3843 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
3844 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
3845 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
3846 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
3847 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
3848 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
3849 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
3850 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
3851 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
3852 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
3853 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
3854 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
3855 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
3856 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
3857 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
3858 Info.opc = ISD::INTRINSIC_W_CHAIN;
3859 Info.memVT = MVT::v2i32;
3860 Info.ptrVal = I.getArgOperand(0);
3861 Info.offset = 0;
3862 Info.flags = MachineMemOperand::MOLoad;
3863 Info.align = Align(8);
3864 return true;
3865 }
3866
3867 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
3868 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
3869 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
3870 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
3871 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
3872 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
3873 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
3874 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
3875 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
3876 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
3877 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
3878 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
3879 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
3880 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
3881 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
3882 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
3883
3884 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
3885 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
3886 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
3887 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
3888 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
3889 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
3890 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
3891 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
3892 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
3893 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
3894 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
3895 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
3896 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
3897 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
3898 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
3899 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
3900 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
3901 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16:
3902 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8:
3903 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8x16_b4x16_p64:
3904 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8x16_b6x16_p32:
3905 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x4_b8x16_b4x16_p64:
3906 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x4_b8x16_b6x16_p32: {
3907 Info.opc = ISD::INTRINSIC_W_CHAIN;
3908 Info.memVT = MVT::v4i32;
3909 Info.ptrVal = I.getArgOperand(0);
3910 Info.offset = 0;
3911 Info.flags = MachineMemOperand::MOLoad;
3912 Info.align = Align(16);
3913 return true;
3914 }
3915
3916 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
3917 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
3918 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
3919 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
3920 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
3921 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
3922 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
3923 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
3924
3925 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
3926 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
3927 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
3928 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
3929 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
3930 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
3931 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
3932 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
3933 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
3934 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
3935 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
3936 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
3937 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
3938 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
3939 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
3940 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
3941 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
3942 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
3943 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
3944 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
3945 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
3946 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16:
3947 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x1_b8x16_b4x16_p64:
3948 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x1_b8x16_b6x16_p32: {
3949 Info.opc = ISD::INTRINSIC_W_CHAIN;
3950 Info.memVT = MVT::i32;
3951 Info.ptrVal = I.getArgOperand(0);
3952 Info.offset = 0;
3953 Info.flags = MachineMemOperand::MOLoad;
3954 Info.align = Align(4);
3955 return true;
3956 }
3957
3958 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
3959 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
3960 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
3961 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
3962 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
3963 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
3964 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
3965 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
3966 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
3967 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
3968 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
3969 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
3970 Info.opc = ISD::INTRINSIC_W_CHAIN;
3971 Info.memVT = MVT::v4f16;
3972 Info.ptrVal = I.getArgOperand(0);
3973 Info.offset = 0;
3974 Info.flags = MachineMemOperand::MOLoad;
3975 Info.align = Align(16);
3976 return true;
3977 }
3978
3979 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
3980 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
3981 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
3982 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
3983 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
3984 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
3985 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
3986 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
3987 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
3988 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
3989 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
3990 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
3991 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
3992 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
3993 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
3994 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
3995 Info.opc = ISD::INTRINSIC_W_CHAIN;
3996 Info.memVT = MVT::v8f32;
3997 Info.ptrVal = I.getArgOperand(0);
3998 Info.offset = 0;
3999 Info.flags = MachineMemOperand::MOLoad;
4000 Info.align = Align(16);
4001 return true;
4002 }
4003
4004 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
4005 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
4006 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
4007 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
4008
4009 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
4010 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
4011 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
4012 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
4013
4014 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
4015 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
4016 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
4017 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
4018 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
4019 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
4020 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
4021 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
4022 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
4023 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
4024 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
4025 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
4026 Info.opc = ISD::INTRINSIC_W_CHAIN;
4027 Info.memVT = MVT::v8i32;
4028 Info.ptrVal = I.getArgOperand(0);
4029 Info.offset = 0;
4030 Info.flags = MachineMemOperand::MOLoad;
4031 Info.align = Align(16);
4032 return true;
4033 }
4034
4035 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
4036 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
4037 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
4038 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
4039 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
4040 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
4041 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
4042 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
4043 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
4044 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16:
4045 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8:
4046 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8x16_b4x16_p64:
4047 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8x16_b6x16_p32:
4048 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x2_b8x16_b4x16_p64:
4049 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x2_b8x16_b6x16_p32: {
4050 Info.opc = ISD::INTRINSIC_W_CHAIN;
4051 Info.memVT = MVT::v2i32;
4052 Info.ptrVal = I.getArgOperand(0);
4053 Info.offset = 0;
4054 Info.flags = MachineMemOperand::MOLoad;
4055 Info.align = Align(8);
4056 return true;
4057 }
4058
4059 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
4060 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
4061 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
4062 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
4063
4064 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
4065 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
4066 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
4067 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
4068 Info.opc = ISD::INTRINSIC_W_CHAIN;
4069 Info.memVT = MVT::f64;
4070 Info.ptrVal = I.getArgOperand(0);
4071 Info.offset = 0;
4072 Info.flags = MachineMemOperand::MOLoad;
4073 Info.align = Align(8);
4074 return true;
4075 }
4076
4077 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
4078 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
4079 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
4080 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
4081 Info.opc = ISD::INTRINSIC_W_CHAIN;
4082 Info.memVT = MVT::v2f64;
4083 Info.ptrVal = I.getArgOperand(0);
4084 Info.offset = 0;
4085 Info.flags = MachineMemOperand::MOLoad;
4086 Info.align = Align(16);
4087 return true;
4088 }
4089
4090 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
4091 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
4092 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
4093 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
4094 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
4095 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
4096 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
4097 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
4098 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
4099 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
4100 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
4101 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
4102 Info.opc = ISD::INTRINSIC_VOID;
4103 Info.memVT = MVT::v4f16;
4104 Info.ptrVal = I.getArgOperand(0);
4105 Info.offset = 0;
4106 Info.flags = MachineMemOperand::MOStore;
4107 Info.align = Align(16);
4108 return true;
4109 }
4110
4111 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
4112 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
4113 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
4114 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
4115 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
4116 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
4117 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
4118 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
4119 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
4120 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
4121 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
4122 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
4123 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
4124 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
4125 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
4126 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
4127 Info.opc = ISD::INTRINSIC_VOID;
4128 Info.memVT = MVT::v8f32;
4129 Info.ptrVal = I.getArgOperand(0);
4130 Info.offset = 0;
4131 Info.flags = MachineMemOperand::MOStore;
4132 Info.align = Align(16);
4133 return true;
4134 }
4135
4136 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
4137 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
4138 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
4139 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
4140 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
4141 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
4142 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
4143 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
4144 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
4145 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
4146 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
4147 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
4148 Info.opc = ISD::INTRINSIC_VOID;
4149 Info.memVT = MVT::v8i32;
4150 Info.ptrVal = I.getArgOperand(0);
4151 Info.offset = 0;
4152 Info.flags = MachineMemOperand::MOStore;
4153 Info.align = Align(16);
4154 return true;
4155 }
4156
4157 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
4158 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
4159 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
4160 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
4161 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
4162 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
4163 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
4164 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride:
4165 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_b16:
4166 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_trans_b16:
4167 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x2_trans_b8: {
4168 Info.opc = ISD::INTRINSIC_VOID;
4169 Info.memVT = MVT::v2i32;
4170 Info.ptrVal = I.getArgOperand(0);
4171 Info.offset = 0;
4172 Info.flags = MachineMemOperand::MOStore;
4173 Info.align = Align(8);
4174 return true;
4175 }
4176
4177 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
4178 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
4179 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
4180 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
4181 Info.opc = ISD::INTRINSIC_VOID;
4182 Info.memVT = MVT::v2f64;
4183 Info.ptrVal = I.getArgOperand(0);
4184 Info.offset = 0;
4185 Info.flags = MachineMemOperand::MOStore;
4186 Info.align = Align(16);
4187 return true;
4188 }
4189
4190 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_b16:
4191 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_trans_b16:
4192 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x1_trans_b8: {
4193 Info.opc = ISD::INTRINSIC_VOID;
4194 Info.memVT = MVT::i32;
4195 Info.ptrVal = I.getArgOperand(0);
4196 Info.offset = 0;
4197 Info.flags = MachineMemOperand::MOStore;
4198 Info.align = Align(4);
4199 return true;
4200 }
4201
4202 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_b16:
4203 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_trans_b16:
4204 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x4_trans_b8: {
4205 Info.opc = ISD::INTRINSIC_VOID;
4206 Info.memVT = MVT::v4i32;
4207 Info.ptrVal = I.getArgOperand(0);
4208 Info.offset = 0;
4209 Info.flags = MachineMemOperand::MOStore;
4210 Info.align = Align(16);
4211 return true;
4212 }
4213
4214 case Intrinsic::nvvm_atomic_add_gen_f_cta:
4215 case Intrinsic::nvvm_atomic_add_gen_f_sys:
4216 case Intrinsic::nvvm_atomic_add_gen_i_cta:
4217 case Intrinsic::nvvm_atomic_add_gen_i_sys:
4218 case Intrinsic::nvvm_atomic_and_gen_i_cta:
4219 case Intrinsic::nvvm_atomic_and_gen_i_sys:
4220 case Intrinsic::nvvm_atomic_cas_gen_i_cta:
4221 case Intrinsic::nvvm_atomic_cas_gen_i_sys:
4222 case Intrinsic::nvvm_atomic_dec_gen_i_cta:
4223 case Intrinsic::nvvm_atomic_dec_gen_i_sys:
4224 case Intrinsic::nvvm_atomic_inc_gen_i_cta:
4225 case Intrinsic::nvvm_atomic_inc_gen_i_sys:
4226 case Intrinsic::nvvm_atomic_max_gen_i_cta:
4227 case Intrinsic::nvvm_atomic_max_gen_i_sys:
4228 case Intrinsic::nvvm_atomic_min_gen_i_cta:
4229 case Intrinsic::nvvm_atomic_min_gen_i_sys:
4230 case Intrinsic::nvvm_atomic_or_gen_i_cta:
4231 case Intrinsic::nvvm_atomic_or_gen_i_sys:
4232 case Intrinsic::nvvm_atomic_exch_gen_i_cta:
4233 case Intrinsic::nvvm_atomic_exch_gen_i_sys:
4234 case Intrinsic::nvvm_atomic_xor_gen_i_cta:
4235 case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
4236 auto &DL = I.getDataLayout();
4237 Info.opc = ISD::INTRINSIC_W_CHAIN;
4238 Info.memVT = getValueType(DL, I.getType());
4239 Info.ptrVal = I.getArgOperand(0);
4240 Info.offset = 0;
4241 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
4242 Info.align.reset();
4243 return true;
4244 }
4245
4246 case Intrinsic::nvvm_prefetch_tensormap: {
4247 auto &DL = I.getDataLayout();
4248 Info.opc = ISD::INTRINSIC_VOID;
4249 Info.memVT = getPointerTy(DL);
4250 Info.ptrVal = I.getArgOperand(0);
4251 Info.offset = 0;
4252 Info.flags =
4254 Info.align.reset();
4255 return true;
4256 }
4257
4258 case Intrinsic::nvvm_ldu_global_i:
4259 case Intrinsic::nvvm_ldu_global_f:
4260 case Intrinsic::nvvm_ldu_global_p: {
4261 Info.opc = ISD::INTRINSIC_W_CHAIN;
4262 Info.memVT = getValueType(I.getDataLayout(), I.getType());
4263 Info.ptrVal = I.getArgOperand(0);
4264 Info.offset = 0;
4265 Info.flags = MachineMemOperand::MOLoad;
4266 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4267
4268 return true;
4269 }
4270 case Intrinsic::nvvm_tex_1d_v4f32_s32:
4271 case Intrinsic::nvvm_tex_1d_v4f32_f32:
4272 case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
4273 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
4274 case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
4275 case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
4276 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
4277 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
4278 case Intrinsic::nvvm_tex_2d_v4f32_s32:
4279 case Intrinsic::nvvm_tex_2d_v4f32_f32:
4280 case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
4281 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
4282 case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
4283 case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
4284 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
4285 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
4286 case Intrinsic::nvvm_tex_3d_v4f32_s32:
4287 case Intrinsic::nvvm_tex_3d_v4f32_f32:
4288 case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
4289 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
4290 case Intrinsic::nvvm_tex_cube_v4f32_f32:
4291 case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
4292 case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
4293 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
4294 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
4295 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
4296 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
4297 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
4298 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
4299 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
4300 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
4301 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
4302 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
4303 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
4304 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
4305 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
4306 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
4307 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
4308 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
4309 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
4310 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
4311 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
4312 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
4313 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
4314 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
4315 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
4316 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
4317 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
4318 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
4319 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
4320 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
4321 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
4322 case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
4323 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
4324 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
4325 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
4326 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
4327 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
4328 Info.opc = ISD::INTRINSIC_W_CHAIN;
4329 Info.memVT = MVT::v4f32;
4330 Info.ptrVal = nullptr;
4331 Info.offset = 0;
4332 Info.flags = MachineMemOperand::MOLoad;
4333 Info.align = Align(16);
4334 return true;
4335
4336 case Intrinsic::nvvm_tex_1d_v4s32_s32:
4337 case Intrinsic::nvvm_tex_1d_v4s32_f32:
4338 case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
4339 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
4340 case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
4341 case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
4342 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
4343 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
4344 case Intrinsic::nvvm_tex_2d_v4s32_s32:
4345 case Intrinsic::nvvm_tex_2d_v4s32_f32:
4346 case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
4347 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
4348 case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
4349 case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
4350 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
4351 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
4352 case Intrinsic::nvvm_tex_3d_v4s32_s32:
4353 case Intrinsic::nvvm_tex_3d_v4s32_f32:
4354 case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
4355 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
4356 case Intrinsic::nvvm_tex_cube_v4s32_f32:
4357 case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
4358 case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
4359 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
4360 case Intrinsic::nvvm_tex_cube_v4u32_f32:
4361 case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
4362 case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
4363 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
4364 case Intrinsic::nvvm_tex_1d_v4u32_s32:
4365 case Intrinsic::nvvm_tex_1d_v4u32_f32:
4366 case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
4367 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
4368 case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
4369 case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
4370 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
4371 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
4372 case Intrinsic::nvvm_tex_2d_v4u32_s32:
4373 case Intrinsic::nvvm_tex_2d_v4u32_f32:
4374 case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
4375 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
4376 case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
4377 case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
4378 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
4379 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
4380 case Intrinsic::nvvm_tex_3d_v4u32_s32:
4381 case Intrinsic::nvvm_tex_3d_v4u32_f32:
4382 case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
4383 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
4384 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
4385 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
4386 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
4387 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
4388 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
4389 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
4390 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
4391 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
4392 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
4393 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
4394 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
4395 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
4396 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
4397 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
4398 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
4399 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
4400 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
4401 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
4402 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
4403 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
4404 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
4405 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
4406 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
4407 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
4408 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
4409 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
4410 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
4411 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
4412 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
4413 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
4414 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
4415 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
4416 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
4417 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
4418 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
4419 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
4420 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
4421 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
4422 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
4423 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
4424 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
4425 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
4426 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
4427 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
4428 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
4429 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
4430 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
4431 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
4432 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
4433 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
4434 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
4435 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
4436 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
4437 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
4438 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
4439 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
4440 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
4441 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
4442 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
4443 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
4444 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
4445 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
4446 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
4447 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
4448 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4449 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4450 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4451 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
4452 Info.opc = ISD::INTRINSIC_W_CHAIN;
4453 Info.memVT = MVT::v4i32;
4454 Info.ptrVal = nullptr;
4455 Info.offset = 0;
4456 Info.flags = MachineMemOperand::MOLoad;
4457 Info.align = Align(16);
4458 return true;
4459
4460 case Intrinsic::nvvm_suld_1d_i8_clamp:
4461 case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4462 case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4463 case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4464 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4465 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4466 case Intrinsic::nvvm_suld_2d_i8_clamp:
4467 case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4468 case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4469 case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4470 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4471 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4472 case Intrinsic::nvvm_suld_3d_i8_clamp:
4473 case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4474 case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4475 case Intrinsic::nvvm_suld_1d_i8_trap:
4476 case Intrinsic::nvvm_suld_1d_v2i8_trap:
4477 case Intrinsic::nvvm_suld_1d_v4i8_trap:
4478 case Intrinsic::nvvm_suld_1d_array_i8_trap:
4479 case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4480 case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4481 case Intrinsic::nvvm_suld_2d_i8_trap:
4482 case Intrinsic::nvvm_suld_2d_v2i8_trap:
4483 case Intrinsic::nvvm_suld_2d_v4i8_trap:
4484 case Intrinsic::nvvm_suld_2d_array_i8_trap:
4485 case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4486 case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4487 case Intrinsic::nvvm_suld_3d_i8_trap:
4488 case Intrinsic::nvvm_suld_3d_v2i8_trap:
4489 case Intrinsic::nvvm_suld_3d_v4i8_trap:
4490 case Intrinsic::nvvm_suld_1d_i8_zero:
4491 case Intrinsic::nvvm_suld_1d_v2i8_zero:
4492 case Intrinsic::nvvm_suld_1d_v4i8_zero:
4493 case Intrinsic::nvvm_suld_1d_array_i8_zero:
4494 case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4495 case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4496 case Intrinsic::nvvm_suld_2d_i8_zero:
4497 case Intrinsic::nvvm_suld_2d_v2i8_zero:
4498 case Intrinsic::nvvm_suld_2d_v4i8_zero:
4499 case Intrinsic::nvvm_suld_2d_array_i8_zero:
4500 case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4501 case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4502 case Intrinsic::nvvm_suld_3d_i8_zero:
4503 case Intrinsic::nvvm_suld_3d_v2i8_zero:
4504 case Intrinsic::nvvm_suld_3d_v4i8_zero:
4505 Info.opc = ISD::INTRINSIC_W_CHAIN;
4506 Info.memVT = MVT::i8;
4507 Info.ptrVal = nullptr;
4508 Info.offset = 0;
4509 Info.flags = MachineMemOperand::MOLoad;
4510 Info.align = Align(16);
4511 return true;
4512
4513 case Intrinsic::nvvm_suld_1d_i16_clamp:
4514 case Intrinsic::nvvm_suld_1d_v2i16_clamp:
4515 case Intrinsic::nvvm_suld_1d_v4i16_clamp:
4516 case Intrinsic::nvvm_suld_1d_array_i16_clamp:
4517 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
4518 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
4519 case Intrinsic::nvvm_suld_2d_i16_clamp:
4520 case Intrinsic::nvvm_suld_2d_v2i16_clamp:
4521 case Intrinsic::nvvm_suld_2d_v4i16_clamp:
4522 case Intrinsic::nvvm_suld_2d_array_i16_clamp:
4523 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
4524 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
4525 case Intrinsic::nvvm_suld_3d_i16_clamp:
4526 case Intrinsic::nvvm_suld_3d_v2i16_clamp:
4527 case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4528 case Intrinsic::nvvm_suld_1d_i16_trap:
4529 case Intrinsic::nvvm_suld_1d_v2i16_trap:
4530 case Intrinsic::nvvm_suld_1d_v4i16_trap:
4531 case Intrinsic::nvvm_suld_1d_array_i16_trap:
4532 case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4533 case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4534 case Intrinsic::nvvm_suld_2d_i16_trap:
4535 case Intrinsic::nvvm_suld_2d_v2i16_trap:
4536 case Intrinsic::nvvm_suld_2d_v4i16_trap:
4537 case Intrinsic::nvvm_suld_2d_array_i16_trap:
4538 case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4539 case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4540 case Intrinsic::nvvm_suld_3d_i16_trap:
4541 case Intrinsic::nvvm_suld_3d_v2i16_trap:
4542 case Intrinsic::nvvm_suld_3d_v4i16_trap:
4543 case Intrinsic::nvvm_suld_1d_i16_zero:
4544 case Intrinsic::nvvm_suld_1d_v2i16_zero:
4545 case Intrinsic::nvvm_suld_1d_v4i16_zero:
4546 case Intrinsic::nvvm_suld_1d_array_i16_zero:
4547 case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4548 case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4549 case Intrinsic::nvvm_suld_2d_i16_zero:
4550 case Intrinsic::nvvm_suld_2d_v2i16_zero:
4551 case Intrinsic::nvvm_suld_2d_v4i16_zero:
4552 case Intrinsic::nvvm_suld_2d_array_i16_zero:
4553 case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4554 case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4555 case Intrinsic::nvvm_suld_3d_i16_zero:
4556 case Intrinsic::nvvm_suld_3d_v2i16_zero:
4557 case Intrinsic::nvvm_suld_3d_v4i16_zero:
4558 Info.opc = ISD::INTRINSIC_W_CHAIN;
4559 Info.memVT = MVT::i16;
4560 Info.ptrVal = nullptr;
4561 Info.offset = 0;
4562 Info.flags = MachineMemOperand::MOLoad;
4563 Info.align = Align(16);
4564 return true;
4565
4566 case Intrinsic::nvvm_suld_1d_i32_clamp:
4567 case Intrinsic::nvvm_suld_1d_v2i32_clamp:
4568 case Intrinsic::nvvm_suld_1d_v4i32_clamp:
4569 case Intrinsic::nvvm_suld_1d_array_i32_clamp:
4570 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
4571 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
4572 case Intrinsic::nvvm_suld_2d_i32_clamp:
4573 case Intrinsic::nvvm_suld_2d_v2i32_clamp:
4574 case Intrinsic::nvvm_suld_2d_v4i32_clamp:
4575 case Intrinsic::nvvm_suld_2d_array_i32_clamp:
4576 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
4577 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
4578 case Intrinsic::nvvm_suld_3d_i32_clamp:
4579 case Intrinsic::nvvm_suld_3d_v2i32_clamp:
4580 case Intrinsic::nvvm_suld_3d_v4i32_clamp:
4581 case Intrinsic::nvvm_suld_1d_i32_trap:
4582 case Intrinsic::nvvm_suld_1d_v2i32_trap:
4583 case Intrinsic::nvvm_suld_1d_v4i32_trap:
4584 case Intrinsic::nvvm_suld_1d_array_i32_trap:
4585 case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
4586 case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
4587 case Intrinsic::nvvm_suld_2d_i32_trap:
4588 case Intrinsic::nvvm_suld_2d_v2i32_trap:
4589 case Intrinsic::nvvm_suld_2d_v4i32_trap:
4590 case Intrinsic::nvvm_suld_2d_array_i32_trap:
4591 case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4592 case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4593 case Intrinsic::nvvm_suld_3d_i32_trap:
4594 case Intrinsic::nvvm_suld_3d_v2i32_trap:
4595 case Intrinsic::nvvm_suld_3d_v4i32_trap:
4596 case Intrinsic::nvvm_suld_1d_i32_zero:
4597 case Intrinsic::nvvm_suld_1d_v2i32_zero:
4598 case Intrinsic::nvvm_suld_1d_v4i32_zero:
4599 case Intrinsic::nvvm_suld_1d_array_i32_zero:
4600 case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4601 case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4602 case Intrinsic::nvvm_suld_2d_i32_zero:
4603 case Intrinsic::nvvm_suld_2d_v2i32_zero:
4604 case Intrinsic::nvvm_suld_2d_v4i32_zero:
4605 case Intrinsic::nvvm_suld_2d_array_i32_zero:
4606 case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4607 case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4608 case Intrinsic::nvvm_suld_3d_i32_zero:
4609 case Intrinsic::nvvm_suld_3d_v2i32_zero:
4610 case Intrinsic::nvvm_suld_3d_v4i32_zero:
4611 Info.opc = ISD::INTRINSIC_W_CHAIN;
4612 Info.memVT = MVT::i32;
4613 Info.ptrVal = nullptr;
4614 Info.offset = 0;
4615 Info.flags = MachineMemOperand::MOLoad;
4616 Info.align = Align(16);
4617 return true;
4618
4619 case Intrinsic::nvvm_suld_1d_i64_clamp:
4620 case Intrinsic::nvvm_suld_1d_v2i64_clamp:
4621 case Intrinsic::nvvm_suld_1d_array_i64_clamp:
4622 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
4623 case Intrinsic::nvvm_suld_2d_i64_clamp:
4624 case Intrinsic::nvvm_suld_2d_v2i64_clamp:
4625 case Intrinsic::nvvm_suld_2d_array_i64_clamp:
4626 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
4627 case Intrinsic::nvvm_suld_3d_i64_clamp:
4628 case Intrinsic::nvvm_suld_3d_v2i64_clamp:
4629 case Intrinsic::nvvm_suld_1d_i64_trap:
4630 case Intrinsic::nvvm_suld_1d_v2i64_trap:
4631 case Intrinsic::nvvm_suld_1d_array_i64_trap:
4632 case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
4633 case Intrinsic::nvvm_suld_2d_i64_trap:
4634 case Intrinsic::nvvm_suld_2d_v2i64_trap:
4635 case Intrinsic::nvvm_suld_2d_array_i64_trap:
4636 case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
4637 case Intrinsic::nvvm_suld_3d_i64_trap:
4638 case Intrinsic::nvvm_suld_3d_v2i64_trap:
4639 case Intrinsic::nvvm_suld_1d_i64_zero:
4640 case Intrinsic::nvvm_suld_1d_v2i64_zero:
4641 case Intrinsic::nvvm_suld_1d_array_i64_zero:
4642 case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
4643 case Intrinsic::nvvm_suld_2d_i64_zero:
4644 case Intrinsic::nvvm_suld_2d_v2i64_zero:
4645 case Intrinsic::nvvm_suld_2d_array_i64_zero:
4646 case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4647 case Intrinsic::nvvm_suld_3d_i64_zero:
4648 case Intrinsic::nvvm_suld_3d_v2i64_zero:
4649 Info.opc = ISD::INTRINSIC_W_CHAIN;
4650 Info.memVT = MVT::i64;
4651 Info.ptrVal = nullptr;
4652 Info.offset = 0;
4653 Info.flags = MachineMemOperand::MOLoad;
4654 Info.align = Align(16);
4655 return true;
4656
4657 case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
4658 case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
4659 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1: {
4660 Info.opc = ISD::INTRINSIC_W_CHAIN;
4661 Info.memVT = MVT::v1i32;
4662 Info.ptrVal = I.getArgOperand(0);
4663 Info.offset = 0;
4664 Info.flags = MachineMemOperand::MOLoad;
4665 Info.align.reset();
4666 return true;
4667 }
4668
4669 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
4670 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
4671 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
4672 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2: {
4673 Info.opc = ISD::INTRINSIC_W_CHAIN;
4674 Info.memVT = MVT::v2i32;
4675 Info.ptrVal = I.getArgOperand(0);
4676 Info.offset = 0;
4677 Info.flags = MachineMemOperand::MOLoad;
4678 Info.align.reset();
4679 return true;
4680 }
4681
4682 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
4683 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
4684 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
4685 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
4686 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4: {
4687 Info.opc = ISD::INTRINSIC_W_CHAIN;
4688 Info.memVT = MVT::v4i32;
4689 Info.ptrVal = I.getArgOperand(0);
4690 Info.offset = 0;
4691 Info.flags = MachineMemOperand::MOLoad;
4692 Info.align.reset();
4693 return true;
4694 }
4695
4696 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
4697 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
4698 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
4699 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
4700 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8: {
4701 Info.opc = ISD::INTRINSIC_W_CHAIN;
4702 Info.memVT = MVT::v8i32;
4703 Info.ptrVal = I.getArgOperand(0);
4704 Info.offset = 0;
4705 Info.flags = MachineMemOperand::MOLoad;
4706 Info.align.reset();
4707 return true;
4708 }
4709
4710 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
4711 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
4712 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
4713 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
4714 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16: {
4715 Info.opc = ISD::INTRINSIC_W_CHAIN;
4716 Info.memVT = MVT::v16i32;
4717 Info.ptrVal = I.getArgOperand(0);
4718 Info.offset = 0;
4719 Info.flags = MachineMemOperand::MOLoad;
4720 Info.align.reset();
4721 return true;
4722 }
4723
4724 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
4725 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
4726 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
4727 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
4728 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32: {
4729 Info.opc = ISD::INTRINSIC_W_CHAIN;
4730 Info.memVT = MVT::v32i32;
4731 Info.ptrVal = I.getArgOperand(0);
4732 Info.offset = 0;
4733 Info.flags = MachineMemOperand::MOLoad;
4734 Info.align.reset();
4735 return true;
4736 }
4737
4738 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
4739 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
4740 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
4741 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
4742 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64: {
4743 Info.opc = ISD::INTRINSIC_W_CHAIN;
4744 Info.memVT = MVT::v64i32;
4745 Info.ptrVal = I.getArgOperand(0);
4746 Info.offset = 0;
4747 Info.flags = MachineMemOperand::MOLoad;
4748 Info.align.reset();
4749 return true;
4750 }
4751
4752 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
4753 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
4754 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
4755 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
4756 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: {
4757 Info.opc = ISD::INTRINSIC_W_CHAIN;
4758 Info.memVT = MVT::v128i32;
4759 Info.ptrVal = I.getArgOperand(0);
4760 Info.offset = 0;
4761 Info.flags = MachineMemOperand::MOLoad;
4762 Info.align.reset();
4763 return true;
4764 }
4765
4766 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
4767 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
4768 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1: {
4769 Info.opc = ISD::INTRINSIC_VOID;
4770 Info.memVT = MVT::i32;
4771 Info.ptrVal = I.getArgOperand(0);
4772 Info.offset = 0;
4773 Info.flags = MachineMemOperand::MOStore;
4774 Info.align.reset();
4775 return true;
4776 }
4777
4778 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
4779 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
4780 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
4781 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2: {
4782 Info.opc = ISD::INTRINSIC_VOID;
4783 Info.memVT = MVT::v2i32;
4784 Info.ptrVal = I.getArgOperand(0);
4785 Info.offset = 0;
4786 Info.flags = MachineMemOperand::MOStore;
4787 Info.align.reset();
4788 return true;
4789 }
4790
4791 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
4792 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
4793 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
4794 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
4795 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4: {
4796 Info.opc = ISD::INTRINSIC_VOID;
4797 Info.memVT = MVT::v4i32;
4798 Info.ptrVal = I.getArgOperand(0);
4799 Info.offset = 0;
4800 Info.flags = MachineMemOperand::MOStore;
4801 Info.align.reset();
4802 return true;
4803 }
4804
4805 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
4806 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
4807 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
4808 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
4809 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8: {
4810 Info.opc = ISD::INTRINSIC_VOID;
4811 Info.memVT = MVT::v8i32;
4812 Info.ptrVal = I.getArgOperand(0);
4813 Info.offset = 0;
4814 Info.flags = MachineMemOperand::MOStore;
4815 Info.align.reset();
4816 return true;
4817 }
4818
4819 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
4820 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
4821 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
4822 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
4823 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16: {
4824 Info.opc = ISD::INTRINSIC_VOID;
4825 Info.memVT = MVT::v16i32;
4826 Info.ptrVal = I.getArgOperand(0);
4827 Info.offset = 0;
4828 Info.flags = MachineMemOperand::MOStore;
4829 Info.align.reset();
4830 return true;
4831 }
4832
4833 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
4834 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
4835 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
4836 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
4837 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32: {
4838 Info.opc = ISD::INTRINSIC_VOID;
4839 Info.memVT = MVT::v32i32;
4840 Info.ptrVal = I.getArgOperand(0);
4841 Info.offset = 0;
4842 Info.flags = MachineMemOperand::MOStore;
4843 Info.align.reset();
4844 return true;
4845 }
4846
4847 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
4848 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
4849 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
4850 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
4851 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64: {
4852 Info.opc = ISD::INTRINSIC_VOID;
4853 Info.memVT = MVT::v64i32;
4854 Info.ptrVal = I.getArgOperand(0);
4855 Info.offset = 0;
4856 Info.flags = MachineMemOperand::MOStore;
4857 Info.align.reset();
4858 return true;
4859 }
4860
4861 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
4862 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
4863 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
4864 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
4865 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: {
4866 Info.opc = ISD::INTRINSIC_VOID;
4867 Info.memVT = MVT::v128i32;
4868 Info.ptrVal = I.getArgOperand(0);
4869 Info.offset = 0;
4870 Info.flags = MachineMemOperand::MOStore;
4871 Info.align.reset();
4872 return true;
4873 }
4874 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
4875 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
4876 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:
4877 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:
4878 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:
4879 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:
4880 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:
4881 case Intrinsic::
4882 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:
4883 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:
4884 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:
4885 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:
4886 case Intrinsic::
4887 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift: {
4888 // We are reading and writing back to TMem
4889 Info.opc = ISD::INTRINSIC_VOID;
4890 Info.memVT = MVT::v4i32;
4891 Info.ptrVal = I.getArgOperand(0);
4892     Info.offset = 0;
4893     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
4894     Info.align = Align(16);
4895 return true;
4896 }
4897
4898 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
4899 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:
4900 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:
4901 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:
4902 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:
4903 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:
4904 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:
4905 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:
4906 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:
4907 case Intrinsic::
4908 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:
4909 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:
4910 case Intrinsic::
4911 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift: {
4912 // We are reading and writing back to TMem
4913 Info.opc = ISD::INTRINSIC_VOID;
4914 Info.memVT = MVT::v8i32;
4915 Info.ptrVal = I.getArgOperand(0);
4916     Info.offset = 0;
4917     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
4918     Info.align = Align(16);
4919 return true;
4920 }
4921 }
4922 return false;
4923}
4924
4925 /// getFunctionParamOptimizedAlign - since function arguments are passed via
4926 /// .param space, we may want to increase their alignment in a way that
4927 /// ensures that we can effectively vectorize their loads & stores. We can
4928 /// increase alignment only if the function has internal or private
4929 /// linkage, since for other linkage types callers may already rely on the
4930 /// default alignment. To allow 128-bit vectorized loads/stores, this
4931 /// function ensures that the alignment is 16 or greater.
4933 const Function *F, Type *ArgTy, const DataLayout &DL) const {
4934 // Capping the alignment to 128 bytes as that is the maximum alignment
4935 // supported by PTX.
4936 const Align ABITypeAlign = std::min(Align(128), DL.getABITypeAlign(ArgTy));
4937
4938 // If a function has linkage different from internal or private, we
4939 // must use default ABI alignment as external users rely on it. Same
4940 // for a function that may be called from a function pointer.
4941 if (!F || !F->hasLocalLinkage() ||
4942 F->hasAddressTaken(/*Users=*/nullptr,
4943 /*IgnoreCallbackUses=*/false,
4944 /*IgnoreAssumeLikeCalls=*/true,
4945 /*IgnoreLLVMUsed=*/true))
4946 return ABITypeAlign;
4947
4948 assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage");
4949 return std::max(Align(16), ABITypeAlign);
4950}
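// As a worked illustration of the policy above (function and type names are
// made up for the example): for a local-linkage device function
//
//   static __device__ float sum4(Float4 v);   // struct of four floats,
//                                             // ABI alignment 4
//
// the parameter alignment is raised to 16, allowing a single vectorized
// access such as
//
//   ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [sum4_param_0];
//
// while an externally visible function keeps the plain ABI alignment, since
// callers compiled elsewhere may rely on it.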
4951
4952/// Helper for computing alignment of a device function byval parameter.
4954 const Function *F, Type *ArgTy, Align InitialAlign,
4955 const DataLayout &DL) const {
4956 Align ArgAlign = InitialAlign;
4957 // Try to increase alignment to enhance vectorization options.
4958 if (F)
4959 ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL));
4960
4961 // Old ptx versions have a bug: when PTX code takes the address of a
4962 // byval parameter with alignment < 4, ptxas generates code to
4963 // spill the argument into memory. Alas, on sm_50+ ptxas generates
4964 // SASS code that fails with a misaligned access. To work around
4965 // the problem, make sure that we align byval parameters to at
4966 // least 4 bytes. This bug appears to be fixed starting with
4967 // ptxas > 9.0.
4968 // TODO: remove this after verifying the bug is not reproducible
4969 // on non-deprecated ptxas versions.
4971 ArgAlign = std::max(ArgAlign, Align(4));
4972
4973 return ArgAlign;
4974}
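// For example (illustrative only): a byval parameter of type
// struct { char a, b; } has ABI alignment 1; with the minimum-alignment
// workaround above in effect it is emitted as
//
//   .param .align 4 .b8 foo_param_0[2]
//
// rather than .align 1, sidestepping the miscompiled spill sequence in old
// ptxas releases.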
4975
4976 // Helper for getting a function parameter name. The name is composed from
4977 // its index and the function name. A negative index corresponds to the
4978 // special parameter (unsized array) used for passing variable arguments.
4980 int Idx) const {
4981 std::string ParamName;
4982 raw_string_ostream ParamStr(ParamName);
4983
4984 ParamStr << getTargetMachine().getSymbol(F)->getName();
4985 if (Idx < 0)
4986 ParamStr << "_vararg";
4987 else
4988 ParamStr << "_param_" << Idx;
4989
4990 return ParamName;
4991}
4992
4993/// isLegalAddressingMode - Return true if the addressing mode represented
4994/// by AM is legal for this target, for a load/store of the specified type.
4995/// Used to guide target specific optimizations, like loop strength reduction
4996/// (LoopStrengthReduce.cpp) and memory optimization for address mode
4997/// (CodeGenPrepare.cpp)
4999 const AddrMode &AM, Type *Ty,
5000 unsigned AS, Instruction *I) const {
5001 // AddrMode - This represents an addressing mode of:
5002 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
5003 //
5004 // The legal address modes are
5005 // - [avar]
5006 // - [areg]
5007 // - [areg+immoff]
5008 // - [immAddr]
5009
5010 // immoff must fit in a signed 32-bit int
5011 if (!APInt(64, AM.BaseOffs).isSignedIntN(32))
5012 return false;
5013
5014 if (AM.BaseGV)
5015 return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
5016
5017 switch (AM.Scale) {
5018 case 0: // "r", "r+i" or "i" is allowed
5019 break;
5020 case 1:
5021 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
5022 return false;
5023 // Otherwise we have r+i.
5024 break;
5025 default:
5026 // No scale > 1 is allowed
5027 return false;
5028 }
5029 return true;
5030}
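// Concretely (values are illustrative), this hook classifies:
//   {BaseReg, BaseOffs=16}          -> legal      [areg+immoff]
//   {BaseGV=@g}                     -> legal      [avar]
//   {BaseGV=@g, BaseOffs=16}        -> not legal  (no [avar+imm] form)
//   {BaseReg, ScaleReg, Scale=1}    -> not legal  (no [reg+reg] form)
//   {BaseReg, BaseOffs=1LL<<32}     -> not legal  (offset must fit in i32)
// so passes like LoopStrengthReduce will not form reg+reg or scaled
// addressing for NVPTX.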
5031
5032//===----------------------------------------------------------------------===//
5033// NVPTX Inline Assembly Support
5034//===----------------------------------------------------------------------===//
5035
5036/// getConstraintType - Given a constraint letter, return the type of
5037/// constraint it is for this target.
5040 if (Constraint.size() == 1) {
5041 switch (Constraint[0]) {
5042 default:
5043 break;
5044 case 'b':
5045 case 'r':
5046 case 'h':
5047 case 'c':
5048 case 'l':
5049 case 'f':
5050 case 'd':
5051 case 'q':
5052 case '0':
5053 case 'N':
5054 return C_RegisterClass;
5055 }
5056 }
5057 return TargetLowering::getConstraintType(Constraint);
5058}
5059
5060std::pair<unsigned, const TargetRegisterClass *>
5062 StringRef Constraint,
5063 MVT VT) const {
5064 if (Constraint.size() == 1) {
5065 switch (Constraint[0]) {
5066 case 'b':
5067 return std::make_pair(0U, &NVPTX::B1RegClass);
5068 case 'c':
5069 case 'h':
5070 return std::make_pair(0U, &NVPTX::B16RegClass);
5071 case 'r':
5072 case 'f':
5073 return std::make_pair(0U, &NVPTX::B32RegClass);
5074 case 'l':
5075 case 'N':
5076 case 'd':
5077 return std::make_pair(0U, &NVPTX::B64RegClass);
5078 case 'q': {
5079 if (STI.getSmVersion() < 70)
5080 report_fatal_error("Inline asm with 128 bit operands is only "
5081 "supported for sm_70 and higher!");
5082 return std::make_pair(0U, &NVPTX::B128RegClass);
5083 }
5084 }
5085 }
5086 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
5087}
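// Usage sketch (CUDA-style inline asm; the snippets are illustrative, not
// taken from this file): the constraint letters above bind operands to the
// register classes returned here, e.g.
//
//   int d;
//   asm("add.s32 %0, %1, %2;" : "=r"(d) : "r"(a), "r"(b));  // 'r' -> B32
//   asm("mov.u64 %0, %clock64;" : "=l"(t));                 // 'l' -> B64
//
// while 'q' (128-bit operands) is rejected with a fatal error when targeting
// anything older than sm_70.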
5088
5089//===----------------------------------------------------------------------===//
5090// NVPTX DAG Combining
5091//===----------------------------------------------------------------------===//
5092
5094 CodeGenOptLevel OptLevel) const {
5095 // Always honor command-line argument
5096 if (FMAContractLevelOpt.getNumOccurrences() > 0)
5097 return FMAContractLevelOpt > 0;
5098
5099 // Do not contract if we're not optimizing the code.
5100 if (OptLevel == CodeGenOptLevel::None)
5101 return false;
5102
5103 // Honor TargetOptions flags that explicitly say fusion is okay.
5105 return true;
5106
5107 return false;
5108}
5109
5110static bool isConstZero(const SDValue &Operand) {
5111 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
5112 return Const && Const->getZExtValue() == 0;
5113}
5114
5115/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
5116/// operands N0 and N1. This is a helper for PerformADDCombine that is
5117/// called with the default operands, and if that fails, with commuted
5118/// operands.
5119static SDValue
5122 EVT VT = N0.getValueType();
5123
5124 // Since integer multiply-add costs the same as integer multiply
5125 // but is more costly than integer add, do the fusion only when
5126 // the mul is only used in the add.
5127 // TODO: this may not be true for later architectures, consider relaxing this
5128 if (!N0.getNode()->hasOneUse())
5129 return SDValue();
5130
5131 // fold (add (select cond, 0, (mul a, b)), c)
5132 // -> (select cond, c, (add (mul a, b), c))
5133 //
5134 if (N0.getOpcode() == ISD::SELECT) {
5135 unsigned ZeroOpNum;
5136 if (isConstZero(N0->getOperand(1)))
5137 ZeroOpNum = 1;
5138 else if (isConstZero(N0->getOperand(2)))
5139 ZeroOpNum = 2;
5140 else
5141 return SDValue();
5142
5143 SDValue M = N0->getOperand((ZeroOpNum == 1) ? 2 : 1);
5144 if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse())
5145 return SDValue();
5146
5147 SDLoc DL(N);
5148 SDValue Mul =
5149 DCI.DAG.getNode(ISD::MUL, DL, VT, M->getOperand(0), M->getOperand(1));
5150 SDValue MAD = DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, N1);
5151 return DCI.DAG.getSelect(SDLoc(N), VT, N0->getOperand(0),
5152 ((ZeroOpNum == 1) ? N1 : MAD),
5153 ((ZeroOpNum == 1) ? MAD : N1));
5154 }
5155
5156 return SDValue();
5157}
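// For instance, on i32 nodes (sketch):
//
//   m: i32 = mul a, b
//   s: i32 = select cond, 0, m
//   r: i32 = add s, c
//
// becomes
//
//   m:   i32 = mul a, b
//   mad: i32 = add m, c
//   r:   i32 = select cond, c, mad
//
// so instruction selection can form a single mad.lo on the taken arm.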
5158
5159static SDValue
5162 CodeGenOptLevel OptLevel) {
5163 EVT VT = N0.getValueType();
5164 if (N0.getOpcode() == ISD::FMUL) {
5165 const auto *TLI = static_cast<const NVPTXTargetLowering *>(
5166 &DCI.DAG.getTargetLoweringInfo());
5167 if (!(TLI->allowFMA(DCI.DAG.getMachineFunction(), OptLevel) ||
5168 (N->getFlags().hasAllowContract() &&
5169 N0->getFlags().hasAllowContract())))
5170 return SDValue();
5171
5172 // For floating point:
5173 // Do the fusion only when the mul has fewer than 5 uses and all
5174 // of them are adds.
5175 // The heuristic is that if a use is not an add, then that use
5176 // cannot be fused into an fma, so the mul is still needed anyway.
5177 // If there are more than 4 uses, even if they are all adds, fusing
5178 // them will increase register pressure.
5179 //
5180 int numUses = 0;
5181 int nonAddCount = 0;
5182 for (const SDNode *User : N0.getNode()->users()) {
5183 numUses++;
5184 if (User->getOpcode() != ISD::FADD)
5185 ++nonAddCount;
5186 if (numUses >= 5)
5187 return SDValue();
5188 }
5189 if (nonAddCount) {
5190 int orderNo = N->getIROrder();
5191 int orderNo2 = N0.getNode()->getIROrder();
5192 // Simple heuristic for estimating potential register
5193 // pressure: the difference in IR order is used
5194 // to measure the distance between def and use; the longer the distance,
5195 // the more likely it is to cause register pressure.
5196 if (orderNo - orderNo2 < 500)
5197 return SDValue();
5198
5199 // Now, check if at least one of the FMUL's operands is live beyond the
5200 // node N, which guarantees that the FMA will not increase register
5201 // pressure at node N.
5202 bool opIsLive = false;
5203 const SDNode *left = N0.getOperand(0).getNode();
5204 const SDNode *right = N0.getOperand(1).getNode();
5205
5206 if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
5207 opIsLive = true;
5208
5209 if (!opIsLive)
5210 for (const SDNode *User : left->users()) {
5211 int orderNo3 = User->getIROrder();
5212 if (orderNo3 > orderNo) {
5213 opIsLive = true;
5214 break;
5215 }
5216 }
5217
5218 if (!opIsLive)
5219 for (const SDNode *User : right->users()) {
5220 int orderNo3 = User->getIROrder();
5221 if (orderNo3 > orderNo) {
5222 opIsLive = true;
5223 break;
5224 }
5225 }
5226
5227 if (!opIsLive)
5228 return SDValue();
5229 }
5230
5231 return DCI.DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(0),
5232 N0.getOperand(1), N1);
5233 }
5234
5235 return SDValue();
5236}
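// A small numeric illustration of the heuristic above (orders are made up):
// if the FMUL was created at IR order 100 and this FADD at order 700, the
// def-use distance is 600 >= 500, so the combine may proceed even though the
// FMUL has a non-FADD user, provided one of the FMUL operands is a constant
// or is still live past this node. The result,
//
//   (fadd (fmul a, b), c) -> (fma a, b, c)
//
// is then emitted as a single fma.rn instruction.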
5237
5238/// Fold unpacking movs into a load by increasing the number of return values.
5239///
5240/// ex:
5241/// L: v2f16,ch = load <p>
5242/// a: f16 = extractelt L:0, 0
5243/// b: f16 = extractelt L:0, 1
5244/// use(a, b)
5245///
5246/// ...is turned into...
5247///
5248/// L: f16,f16,ch = LoadV2 <p>
5249/// use(L:0, L:1)
5250static SDValue
5252 // Don't run this optimization before the legalizer
5253 if (!DCI.isAfterLegalizeDAG())
5254 return SDValue();
5255
5256 EVT ElementVT = N->getValueType(0);
5257 // Avoid non-packed types and v4i8
5258 if (!NVPTX::isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8)
5259 return SDValue();
5260
5261 SmallVector<SDNode *> DeadCopyToRegs;
5262
5263 // Check whether all outputs are either used by an extractelt or are
5264 // glue/chain nodes
5265 if (!all_of(N->uses(), [&](SDUse &U) {
5266 // Skip glue, chain nodes
5267 if (U.getValueType() == MVT::Glue || U.getValueType() == MVT::Other)
5268 return true;
5269 if (U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
5270 if (N->getOpcode() != ISD::LOAD)
5271 return true;
5272 // Since this is an ISD::LOAD, check all extractelts are used. If
5273 // any are not used, we don't want to defeat another optimization that
5274 // will narrow the load.
5275 //
5276 // For example:
5277 //
5278 // L: v2f16,ch = load <p>
5279 // e0: f16 = extractelt L:0, 0
5280 // e1: f16 = extractelt L:0, 1 <-- unused
5281 // store e0
5282 //
5283 // Can be optimized by DAGCombiner to:
5284 //
5285 // L: f16,ch = load <p>
5286 // store L:0
5287 return !U.getUser()->use_empty();
5288 }
5289
5290 // Otherwise, this use prevents us from splitting a value.
5291 return false;
5292 }))
5293 return SDValue();
5294
5295 auto *LD = cast<MemSDNode>(N);
5296 SDLoc DL(LD);
5297
5298   // The new opcode after we double the number of return values
5299 NVPTXISD::NodeType Opcode;
5301 unsigned OldNumOutputs; // non-glue, non-chain outputs
5302 switch (LD->getOpcode()) {
5303 case ISD::LOAD:
5304 OldNumOutputs = 1;
5305 // Any packed type is legal, so the legalizer will not have lowered
5306 // ISD::LOAD -> NVPTXISD::Load (unless it's under-aligned). We have to do it
5307 // here.
5308 Opcode = NVPTXISD::LoadV2;
5309 Operands.push_back(DCI.DAG.getIntPtrConstant(
5310 cast<LoadSDNode>(LD)->getExtensionType(), DL));
5311 break;
5312 case NVPTXISD::LoadV2:
5313 OldNumOutputs = 2;
5314 Opcode = NVPTXISD::LoadV4;
5315 break;
5316 case NVPTXISD::LoadV4:
5317 // V8 is only supported for f32. Don't forget, we're not changing the load
5318 // size here. This is already a 256-bit load.
5319 if (ElementVT != MVT::v2f32)
5320 return SDValue();
5321 OldNumOutputs = 4;
5322 Opcode = NVPTXISD::LoadV8;
5323 break;
5324 case NVPTXISD::LoadV8:
5325 // PTX doesn't support the next doubling of outputs
5326 return SDValue();
5327 }
5328
5329 // the non-glue, non-chain outputs in the new load
5330 const unsigned NewNumOutputs = OldNumOutputs * 2;
5331 SmallVector<EVT> NewVTs(NewNumOutputs, ElementVT.getVectorElementType());
5332 // add remaining chain and glue values
5333 NewVTs.append(LD->value_begin() + OldNumOutputs, LD->value_end());
5334
5335 // Create the new load
5336 SDValue NewLoad = DCI.DAG.getMemIntrinsicNode(
5337 Opcode, DL, DCI.DAG.getVTList(NewVTs), Operands, LD->getMemoryVT(),
5338 LD->getMemOperand());
5339
5340 // Now we use a combination of BUILD_VECTORs and a MERGE_VALUES node to keep
5341 // the outputs the same. These nodes will be optimized away in later
5342 // DAGCombiner iterations.
5344 for (unsigned I : seq(OldNumOutputs))
5345 Results.push_back(DCI.DAG.getBuildVector(
5346 ElementVT, DL, {NewLoad.getValue(I * 2), NewLoad.getValue(I * 2 + 1)}));
5347 // Add remaining chain and glue nodes
5348 for (unsigned I : seq(NewLoad->getNumValues() - NewNumOutputs))
5349 Results.push_back(NewLoad.getValue(NewNumOutputs + I));
5350
5351 return DCI.DAG.getMergeValues(Results, DL);
5352}
5353
5354/// Fold packing movs into a store.
5355///
5356/// ex:
5357/// v1: v2f16 = BUILD_VECTOR a:f16, b:f16
5358/// v2: v2f16 = BUILD_VECTOR c:f16, d:f16
5359/// StoreV2 v1, v2
5360///
5361/// ...is turned into...
5362///
5363/// StoreV4 a, b, c, d
5366 unsigned Front, unsigned Back) {
5367 // We want to run this as late as possible since other optimizations may
5368 // eliminate the BUILD_VECTORs.
5369 if (!DCI.isAfterLegalizeDAG())
5370 return SDValue();
5371
5372 // Get the type of the operands being stored.
5373 EVT ElementVT = N->getOperand(Front).getValueType();
5374
5375 // Avoid non-packed types and v4i8
5376 if (!NVPTX::isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8)
5377 return SDValue();
5378
5379 auto *ST = cast<MemSDNode>(N);
5380
5381 // The new opcode after we double the number of operands.
5382 NVPTXISD::NodeType Opcode;
5383 switch (N->getOpcode()) {
5384 case ISD::STORE:
5385 // Any packed type is legal, so the legalizer will not have lowered
5386 // ISD::STORE -> NVPTXISD::Store (unless it's under-aligned). We have to do
5387 // it here.
5388 Opcode = NVPTXISD::StoreV2;
5389 break;
5390 case NVPTXISD::StoreV2:
5391 Opcode = NVPTXISD::StoreV4;
5392 break;
5393 case NVPTXISD::StoreV4:
5394 // V8 is only supported for f32. Don't forget, we're not changing the store
5395 // size here. This is already a 256-bit store.
5396 if (ElementVT != MVT::v2f32)
5397 return SDValue();
5398 Opcode = NVPTXISD::StoreV8;
5399 break;
5400 case NVPTXISD::StoreV8:
5401 // PTX doesn't support the next doubling of operands
5402 return SDValue();
5403 default:
5404 llvm_unreachable("Unhandled store opcode");
5405 }
5406
5407 // Scan the operands and if they're all BUILD_VECTORs, we'll have gathered
5408 // their elements.
5409 SmallVector<SDValue, 4> Operands(N->ops().take_front(Front));
5410 for (SDValue BV : N->ops().drop_front(Front).drop_back(Back)) {
5411 if (BV.getOpcode() != ISD::BUILD_VECTOR)
5412 return SDValue();
5413
5414 // If the operand has multiple uses, this optimization can increase register
5415 // pressure.
5416 if (!BV.hasOneUse())
5417 return SDValue();
5418
5419 // DAGCombiner visits nodes bottom-up. Check the BUILD_VECTOR operands for
5420 // any signs they may be folded by some other pattern or rule.
5421 for (SDValue Op : BV->ops()) {
5422 // Peek through bitcasts
5423 if (Op.getOpcode() == ISD::BITCAST)
5424 Op = Op.getOperand(0);
5425
5426 // This may be folded into a PRMT.
5427 if (Op.getValueType() == MVT::i16 && Op.getOpcode() == ISD::TRUNCATE &&
5428 Op->getOperand(0).getValueType() == MVT::i32)
5429 return SDValue();
5430
5431 // This may be folded into cvt.bf16x2
5432 if (Op.getOpcode() == ISD::FP_ROUND)
5433 return SDValue();
5434 }
5435 Operands.append({BV.getOperand(0), BV.getOperand(1)});
5436 }
5437 Operands.append(N->op_end() - Back, N->op_end());
5438
5439 // Now we replace the store
5440 return DCI.DAG.getMemIntrinsicNode(Opcode, SDLoc(N), N->getVTList(), Operands,
5441 ST->getMemoryVT(), ST->getMemOperand());
5442}
5443
5445 const NVPTXSubtarget &STI) {
5446
5447 if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::STORE) {
5448 // Here is our chance to custom lower a store with a non-simple type.
5449 // Unfortunately, we can't do this in the legalizer because there is no
5450     // way to setOperationAction for a non-simple type.
5452 if (!ST->getValue().getValueType().isSimple())
5453 return lowerSTOREVector(SDValue(ST, 0), DCI.DAG, STI);
5454 }
5455
5456 return combinePackingMovIntoStore(N, DCI, 1, 2);
5457}
5458
5460 const NVPTXSubtarget &STI) {
5461 if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::LOAD) {
5462 // Here is our chance to custom lower a load with a non-simple type.
5463 // Unfortunately, we can't do this in the legalizer because there is no
5464     // way to setOperationAction for a non-simple type.
5465 if (!N->getValueType(0).isSimple())
5466 return lowerLoadVector(N, DCI.DAG, STI);
5467 }
5468
5469 return combineUnpackingMovIntoLoad(N, DCI);
5470}
5471
5472/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
5473///
5476 CodeGenOptLevel OptLevel) {
5477 if (OptLevel == CodeGenOptLevel::None)
5478 return SDValue();
5479
5480 SDValue N0 = N->getOperand(0);
5481 SDValue N1 = N->getOperand(1);
5482
5483   // Only handle the scalar i32 case.
5484 EVT VT = N0.getValueType();
5485 if (VT.isVector() || VT != MVT::i32)
5486 return SDValue();
5487
5488 // First try with the default operand order.
5489 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI))
5490 return Result;
5491
5492 // If that didn't work, try again with the operands commuted.
5493 return PerformADDCombineWithOperands(N, N1, N0, DCI);
5494}
5495
5496/// PerformFADDCombine - Target-specific dag combine xforms for ISD::FADD.
5497///
5500 CodeGenOptLevel OptLevel) {
5501 SDValue N0 = N->getOperand(0);
5502 SDValue N1 = N->getOperand(1);
5503
5504 EVT VT = N0.getValueType();
5505 if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64))
5506 return SDValue();
5507
5508 // First try with the default operand order.
5509 if (SDValue Result = PerformFADDCombineWithOperands(N, N0, N1, DCI, OptLevel))
5510 return Result;
5511
5512 // If that didn't work, try again with the operands commuted.
5513 return PerformFADDCombineWithOperands(N, N1, N0, DCI, OptLevel);
5514}
5515
5516/// Get 3-input version of a 2-input min/max opcode
5517static NVPTXISD::NodeType getMinMax3Opcode(unsigned MinMax2Opcode) {
5518 switch (MinMax2Opcode) {
5519 case ISD::FMAXNUM:
5520 case ISD::FMAXIMUMNUM:
5521 return NVPTXISD::FMAXNUM3;
5522 case ISD::FMINNUM:
5523 case ISD::FMINIMUMNUM:
5524 return NVPTXISD::FMINNUM3;
5525 case ISD::FMAXIMUM:
5526 return NVPTXISD::FMAXIMUM3;
5527 case ISD::FMINIMUM:
5528 return NVPTXISD::FMINIMUM3;
5529 default:
5530 llvm_unreachable("Invalid 2-input min/max opcode");
5531 }
5532}
5533
5534/// PerformFMinMaxCombine - Combine (fmaxnum (fmaxnum a, b), c) into
5535/// (fmaxnum3 a, b, c). Also covers other llvm min/max intrinsics.
5538 unsigned PTXVersion, unsigned SmVersion) {
5539
5540 // 3-input min/max requires PTX 8.8+ and SM_100+, and only supports f32s
5541 EVT VT = N->getValueType(0);
5542 if (VT != MVT::f32 || PTXVersion < 88 || SmVersion < 100)
5543 return SDValue();
5544
5545 SDValue Op0 = N->getOperand(0);
5546 SDValue Op1 = N->getOperand(1);
5547 unsigned MinMaxOp2 = N->getOpcode();
5548 NVPTXISD::NodeType MinMaxOp3 = getMinMax3Opcode(MinMaxOp2);
5549
5550 if (Op0.getOpcode() == MinMaxOp2 && Op0.hasOneUse()) {
5551 // (maxnum (maxnum a, b), c) -> (maxnum3 a, b, c)
5552 SDValue A = Op0.getOperand(0);
5553 SDValue B = Op0.getOperand(1);
5554 SDValue C = Op1;
5555 return DCI.DAG.getNode(MinMaxOp3, SDLoc(N), VT, A, B, C, N->getFlags());
5556 } else if (Op1.getOpcode() == MinMaxOp2 && Op1.hasOneUse()) {
5557 // (maxnum a, (maxnum b, c)) -> (maxnum3 a, b, c)
5558 SDValue A = Op0;
5559 SDValue B = Op1.getOperand(0);
5560 SDValue C = Op1.getOperand(1);
5561 return DCI.DAG.getNode(MinMaxOp3, SDLoc(N), VT, A, B, C, N->getFlags());
5562 }
5563 return SDValue();
5564}
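// Example of the fold (sketch), assuming sm_100+ and PTX 8.8+:
//
//   t: f32 = fmaxnum a, b        (t has one use)
//   r: f32 = fmaxnum t, c
//
// becomes
//
//   r: f32 = FMAXNUM3 a, b, c
//
// which can be selected to the three-input form of max.f32.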
5565
5568 CodeGenOptLevel OptLevel) {
5569 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
5570
5571 // Don't do anything at less than -O2.
5572 if (OptLevel < CodeGenOptLevel::Default)
5573 return SDValue();
5574
5575 SelectionDAG &DAG = DCI.DAG;
5576 SDLoc DL(N);
5577 EVT VT = N->getValueType(0);
5578 bool IsSigned = N->getOpcode() == ISD::SREM;
5579 unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
5580
5581 const SDValue &Num = N->getOperand(0);
5582 const SDValue &Den = N->getOperand(1);
5583
5584 for (const SDNode *U : Num->users()) {
5585 if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
5586 U->getOperand(1) == Den) {
5587 // Num % Den -> Num - (Num / Den) * Den
5588 return DAG.getNode(ISD::SUB, DL, VT, Num,
5589 DAG.getNode(ISD::MUL, DL, VT,
5590 DAG.getNode(DivOpc, DL, VT, Num, Den),
5591 Den));
5592 }
5593 }
5594 return SDValue();
5595}
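// For example (sketch), when the matching division of the same operands is
// also live:
//
//   q: i32 = udiv num, den
//   r: i32 = urem num, den
//
// the remainder is rewritten as
//
//   r: i32 = sub num, (mul (udiv num, den), den)
//
// and CSE reuses the existing udiv, so only one divide is executed.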
5596
5597// (sign_extend|zero_extend (mul|shl) x, y) -> (mul.wide x, y)
5599 CodeGenOptLevel OptLevel) {
5600 if (OptLevel == CodeGenOptLevel::None)
5601 return SDValue();
5602
5603 SDValue Op = N->getOperand(0);
5604 if (!Op.hasOneUse())
5605 return SDValue();
5606 EVT ToVT = N->getValueType(0);
5607 EVT FromVT = Op.getValueType();
5608 if (!((ToVT == MVT::i32 && FromVT == MVT::i16) ||
5609 (ToVT == MVT::i64 && FromVT == MVT::i32)))
5610 return SDValue();
5611 if (!(Op.getOpcode() == ISD::MUL ||
5612 (Op.getOpcode() == ISD::SHL && isa<ConstantSDNode>(Op.getOperand(1)))))
5613 return SDValue();
5614
5615 SDLoc DL(N);
5616 unsigned ExtOpcode = N->getOpcode();
5617 unsigned Opcode = 0;
5618 if (ExtOpcode == ISD::SIGN_EXTEND && Op->getFlags().hasNoSignedWrap())
5620 else if (ExtOpcode == ISD::ZERO_EXTEND && Op->getFlags().hasNoUnsignedWrap())
5622 else
5623 return SDValue();
5624 SDValue RHS = Op.getOperand(1);
5625 if (Op.getOpcode() == ISD::SHL) {
5626 const auto ShiftAmt = Op.getConstantOperandVal(1);
5627 const auto MulVal = APInt(ToVT.getSizeInBits(), 1) << ShiftAmt;
5628 RHS = DCI.DAG.getConstant(MulVal, DL, ToVT);
5629 }
5630 return DCI.DAG.getNode(Opcode, DL, ToVT, Op.getOperand(0), RHS);
5631}
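// For instance (sketch):
//
//   m: i16 = mul nsw a, b
//   r: i32 = sign_extend m       ->   r: i32 = MUL_WIDE_SIGNED a, b
//                                      (selects to mul.wide.s16)
//
//   s: i32 = shl nuw x, 3
//   r: i64 = zero_extend s       ->   r: i64 = MUL_WIDE_UNSIGNED x, 8
//                                      (selects to mul.wide.u32)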
5632
5638
5639/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
5640/// that can be demoted to \p OptSize bits without loss of information. The
5641/// signedness of the operand, if determinable, is placed in \p S.
5643 unsigned OptSize,
5644 OperandSignedness &S) {
5645 S = Unknown;
5646
5647 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
5648 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
5649 EVT OrigVT = Op.getOperand(0).getValueType();
5650 if (OrigVT.getFixedSizeInBits() <= OptSize) {
5651 S = Signed;
5652 return true;
5653 }
5654 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
5655 EVT OrigVT = Op.getOperand(0).getValueType();
5656 if (OrigVT.getFixedSizeInBits() <= OptSize) {
5657 S = Unsigned;
5658 return true;
5659 }
5660 }
5661
5662 return false;
5663}
5664
5665/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
5666/// be demoted to \p OptSize bits without loss of information. If the operands
5667/// contain a constant, it should appear as the RHS operand. The signedness of
5668/// the operands is placed in \p IsSigned.
5670 unsigned OptSize,
5671 bool &IsSigned) {
5672 OperandSignedness LHSSign;
5673
5674 // The LHS operand must be a demotable op
5675 if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
5676 return false;
5677
5678 // We should have been able to determine the signedness from the LHS
5679 if (LHSSign == Unknown)
5680 return false;
5681
5682 IsSigned = (LHSSign == Signed);
5683
5684 // The RHS can be a demotable op or a constant
5686 const APInt &Val = CI->getAPIntValue();
5687 if (LHSSign == Unsigned) {
5688 return Val.isIntN(OptSize);
5689 } else {
5690 return Val.isSignedIntN(OptSize);
5691 }
5692 } else {
5693 OperandSignedness RHSSign;
5694 if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
5695 return false;
5696
5697 return LHSSign == RHSSign;
5698 }
5699}
5700
5701/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
5702/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
5703/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
5704/// amount.
5707 EVT MulType = N->getValueType(0);
5708 if (MulType != MVT::i32 && MulType != MVT::i64) {
5709 return SDValue();
5710 }
5711
5712 SDLoc DL(N);
5713 unsigned OptSize = MulType.getSizeInBits() >> 1;
5714 SDValue LHS = N->getOperand(0);
5715 SDValue RHS = N->getOperand(1);
5716
5717 // Canonicalize the multiply so the constant (if any) is on the right
5718 if (N->getOpcode() == ISD::MUL) {
5719 if (isa<ConstantSDNode>(LHS)) {
5720 std::swap(LHS, RHS);
5721 }
5722 }
5723
5724 // If we have a SHL, determine the actual multiply amount
5725 if (N->getOpcode() == ISD::SHL) {
5727 if (!ShlRHS) {
5728 return SDValue();
5729 }
5730
5731 APInt ShiftAmt = ShlRHS->getAPIntValue();
5732 unsigned BitWidth = MulType.getSizeInBits();
5733 if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
5734 APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
5735 RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
5736 } else {
5737 return SDValue();
5738 }
5739 }
5740
5741 bool Signed;
5742 // Verify that our operands are demotable
5743 if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
5744 return SDValue();
5745 }
5746
5747 EVT DemotedVT;
5748 if (MulType == MVT::i32) {
5749 DemotedVT = MVT::i16;
5750 } else {
5751 DemotedVT = MVT::i32;
5752 }
5753
5754 // Truncate the operands to the correct size. Note that these are just for
5755 // type consistency and will (likely) be eliminated in later phases.
5756 SDValue TruncLHS =
5757 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
5758 SDValue TruncRHS =
5759 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
5760
5761 unsigned Opc;
5762 if (Signed) {
5764 } else {
5766 }
5767
5768 return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
5769}
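// For example (sketch): a 32-bit multiply whose operands are both
// sign-extended from i16,
//
//   r: i32 = mul (sign_extend a:i16), (sign_extend b:i16)
//
// is demoted to MUL_WIDE_SIGNED over the truncated operands; the truncates
// fold away against the extensions, leaving a single mul.wide.s16. A left
// shift by a suitable constant is handled the same way, by first rewriting
// it as a multiply by a power of two.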
5770
5771static bool isConstOne(const SDValue &Operand) {
5772 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
5773 return Const && Const->getZExtValue() == 1;
5774}
5775
5777 if (Add->getOpcode() != ISD::ADD)
5778 return SDValue();
5779
5780 if (isConstOne(Add->getOperand(0)))
5781 return Add->getOperand(1);
5782
5783 if (isConstOne(Add->getOperand(1)))
5784 return Add->getOperand(0);
5785
5786 return SDValue();
5787}
5788
5791
5793 SDValue Mul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
5794 return DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, X);
5795 }
5796
5797 return SDValue();
5798}
5799
5801 SDLoc DL,
5803 if (Select->getOpcode() != ISD::SELECT)
5804 return SDValue();
5805
5806 SDValue Cond = Select->getOperand(0);
5807
5808 unsigned ConstOpNo;
5809 if (isConstOne(Select->getOperand(1)))
5810 ConstOpNo = 1;
5811 else if (isConstOne(Select->getOperand(2)))
5812 ConstOpNo = 2;
5813 else
5814 return SDValue();
5815
5816 SDValue Y = Select->getOperand((ConstOpNo == 1) ? 2 : 1);
5817
5818 // Do not combine if the resulting sequence is not obviously profitable.
5820 return SDValue();
5821
5822 SDValue NewMul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
5823
5824 return DCI.DAG.getNode(ISD::SELECT, DL, VT, Cond,
5825 (ConstOpNo == 1) ? X : NewMul,
5826 (ConstOpNo == 1) ? NewMul : X);
5827}
5828
5829static SDValue
5832
5833 EVT VT = N0.getValueType();
5834 if (VT.isVector())
5835 return SDValue();
5836
5837 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
5838 return SDValue();
5839
5840 SDLoc DL(N);
5841
5842 // (mul x, (add y, 1)) -> (add (mul x, y), x)
5843 if (SDValue Res = combineMADConstOne(N0, N1, VT, DL, DCI))
5844 return Res;
5845 if (SDValue Res = combineMADConstOne(N1, N0, VT, DL, DCI))
5846 return Res;
5847
5848   // (mul x, (select cond, 1, y)) -> (select cond, x, (mul x, y))
5849 if (SDValue Res = combineMulSelectConstOne(N0, N1, VT, DL, DCI))
5850 return Res;
5851 if (SDValue Res = combineMulSelectConstOne(N1, N0, VT, DL, DCI))
5852 return Res;
5853
5854 return SDValue();
5855}
5856
5857/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
5860 CodeGenOptLevel OptLevel) {
5861 if (OptLevel == CodeGenOptLevel::None)
5862 return SDValue();
5863
5864 if (SDValue Ret = TryMULWIDECombine(N, DCI))
5865 return Ret;
5866
5867 SDValue N0 = N->getOperand(0);
5868 SDValue N1 = N->getOperand(1);
5869 return PerformMULCombineWithOperands(N, N0, N1, DCI);
5870}
5871
5872/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
5875 CodeGenOptLevel OptLevel) {
5876 if (OptLevel > CodeGenOptLevel::None) {
5877 // Try mul.wide combining at OptLevel > 0
5878 if (SDValue Ret = TryMULWIDECombine(N, DCI))
5879 return Ret;
5880 }
5881
5882 return SDValue();
5883}
5884
5887 unsigned int SmVersion) {
5888 EVT CCType = N->getValueType(0);
5889 SDValue A = N->getOperand(0);
5890 SDValue B = N->getOperand(1);
5891
5892 EVT AType = A.getValueType();
5893 if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))
5894 return SDValue();
5895
5896 if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)
5897 return SDValue();
5898
5899 SDLoc DL(N);
5900 // setp.f16x2 returns two scalar predicates, which we need to
5901 // convert back to v2i1. The returned result will be scalarized by
5902 // the legalizer, but the comparison will remain a single vector
5903 // instruction.
5904 SDValue CCNode = DCI.DAG.getNode(
5905 A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2
5907 DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
5908 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
5909 CCNode.getValue(1));
5910}
5911
5914 SDValue Vector = N->getOperand(0);
5915 if (Vector->getOpcode() == ISD::FREEZE)
5916 Vector = Vector->getOperand(0);
5917 SDLoc DL(N);
5918 EVT VectorVT = Vector.getValueType();
5919 if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
5920 IsPTXVectorType(VectorVT.getSimpleVT()))
5921 return SDValue(); // Native vector loads already combine nicely w/
5922 // extract_vector_elt.
5923 // Don't mess with singletons or packed types (v2f32, v2*16, v4i8 and v8i8),
5924 // we already handle them OK.
5925 if (VectorVT.getVectorNumElements() == 1 ||
5926 NVPTX::isPackedVectorTy(VectorVT) || VectorVT == MVT::v8i8)
5927 return SDValue();
5928
5929 // Don't mess with undef values as sra may be simplified to 0, not undef.
5930 if (Vector->isUndef() || ISD::allOperandsUndef(Vector.getNode()))
5931 return SDValue();
5932
5933 uint64_t VectorBits = VectorVT.getSizeInBits();
5934 // We only handle the types we can extract in-register.
5935 if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
5936 return SDValue();
5937
5938 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
5939 // Index == 0 is handled by generic DAG combiner.
5940 if (!Index || Index->getZExtValue() == 0)
5941 return SDValue();
5942
5943 MVT IVT = MVT::getIntegerVT(VectorBits);
5944 EVT EltVT = VectorVT.getVectorElementType();
5945 EVT EltIVT = EltVT.changeTypeToInteger();
5946 uint64_t EltBits = EltVT.getScalarSizeInBits();
5947
5948 SDValue Result = DCI.DAG.getNode(
5949 ISD::TRUNCATE, DL, EltIVT,
5950 DCI.DAG.getNode(
5951 ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
5952 DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));
5953
5954 // If element has non-integer type, bitcast it back to the expected type.
5955 if (EltVT != EltIVT)
5956 Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
5957   // Past the legalizer, we may need to extend i8 -> i16 to match the register type.
5958 if (EltVT != N->getValueType(0))
5959 Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);
5960
5961 return Result;
5962}
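// For example (sketch): extracting element 1 of a v4i16 vector held in a
// 64-bit register,
//
//   e: i16 = extract_vector_elt v:v4i16, 1
//
// becomes
//
//   e: i16 = trunc (sra (bitcast v to i64), 16)
//
// i.e. a shift plus a truncate on the packed integer instead of a round trip
// through memory or a chain of moves.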
5963
5966 SDValue VA = N->getOperand(1);
5967 EVT VectorVT = VA.getValueType();
5968 if (VectorVT != MVT::v4i8)
5969 return SDValue();
5970
5971   // We need to split the vselect into individual per-element operations. Because
5972   // we use BFE/BFI instructions for byte extraction/insertion, we end up with
5973   // 32-bit values, so we may as well do the comparison as i32 to avoid
5974   // conversions to/from i16 normally used for i8 values.
5976 SDLoc DL(N);
5977 SDValue VCond = N->getOperand(0);
5978 SDValue VB = N->getOperand(2);
5979 for (int I = 0; I < 4; ++I) {
5980 SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
5981 DCI.DAG.getConstant(I, DL, MVT::i32));
5982 SDValue EA = DCI.DAG.getAnyExtOrTrunc(
5983 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
5984 DCI.DAG.getConstant(I, DL, MVT::i32)),
5985 DL, MVT::i32);
5986 SDValue EB = DCI.DAG.getAnyExtOrTrunc(
5987 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
5988 DCI.DAG.getConstant(I, DL, MVT::i32)),
5989 DL, MVT::i32);
5990 E.push_back(DCI.DAG.getAnyExtOrTrunc(
5991 DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
5992 }
5993 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
5994}
5995
5996static SDValue
5998 auto VT = N->getValueType(0);
5999 if (!DCI.isAfterLegalizeDAG() ||
6000 // only process v2*16 types
6001 !(NVPTX::isPackedVectorTy(VT) && VT.is32BitVector() &&
6002 VT.getVectorNumElements() == 2))
6003 return SDValue();
6004
6005 auto Op0 = N->getOperand(0);
6006 auto Op1 = N->getOperand(1);
6007
6008 // Start out by assuming we want to take the lower 2 bytes of each i32
6009 // operand.
6010 uint64_t Op0Bytes = 0x10;
6011 uint64_t Op1Bytes = 0x54;
6012
6013 std::pair<SDValue *, uint64_t *> OpData[2] = {{&Op0, &Op0Bytes},
6014 {&Op1, &Op1Bytes}};
6015
6016 // Check that each operand is an i16, truncated from an i32 operand. We'll
6017 // select individual bytes from those original operands. Optionally, fold in a
6018 // shift right of that original operand.
6019 for (auto &[Op, OpBytes] : OpData) {
6020 // Eat up any bitcast
6021 if (Op->getOpcode() == ISD::BITCAST)
6022 *Op = Op->getOperand(0);
6023
6024 if (!(Op->getValueType() == MVT::i16 && Op->getOpcode() == ISD::TRUNCATE &&
6025 Op->getOperand(0).getValueType() == MVT::i32))
6026 return SDValue();
6027
6028 // If the truncate has multiple uses, this optimization can increase
6029 // register pressure
6030 if (!Op->hasOneUse())
6031 return SDValue();
6032
6033 *Op = Op->getOperand(0);
6034
6035 // Optionally, fold in a shift-right of the original operand and let permute
6036 // pick the two higher bytes of the original value directly.
6037 if (Op->getOpcode() == ISD::SRL && isa<ConstantSDNode>(Op->getOperand(1))) {
6038 if (cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue() == 16) {
6039 // Shift the PRMT byte selector to pick upper bytes from each respective
6040 // value, instead of the lower ones: 0x10 -> 0x32, 0x54 -> 0x76
6041 assert((*OpBytes == 0x10 || *OpBytes == 0x54) &&
6042 "PRMT selector values out of range");
6043 *OpBytes += 0x22;
6044 *Op = Op->getOperand(0);
6045 }
6046 }
6047 }
6048
6049 SDLoc DL(N);
6050 auto &DAG = DCI.DAG;
6051
6052 auto PRMT =
6053 getPRMT(DAG.getBitcast(MVT::i32, Op0), DAG.getBitcast(MVT::i32, Op1),
6054 (Op1Bytes << 8) | Op0Bytes, DL, DAG);
6055 return DAG.getBitcast(VT, PRMT);
6056}
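// As a concrete instance (sketch): building a v2i16 from the low halves of
// two i32 values,
//
//   a16: i16 = trunc a:i32
//   b16: i16 = trunc b:i32
//   v:   v2i16 = BUILD_VECTOR a16, b16
//
// becomes a single byte permute with selector 0x5410 (bytes 0,1 of a and
// bytes 4,5, i.e. the low bytes, of b), bitcast back to v2i16:
//
//   v: v2i16 = bitcast (PRMT a, b, 0x5410)
//
// Folding an srl-by-16 on an input bumps that input's selector nibbles by 2
// (0x10 -> 0x32, 0x54 -> 0x76) so its upper bytes are picked instead.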
6057
6060 auto *ASCN1 = cast<AddrSpaceCastSDNode>(N);
6061
6062 if (auto *ASCN2 = dyn_cast<AddrSpaceCastSDNode>(ASCN1->getOperand(0))) {
6063 assert(ASCN2->getDestAddressSpace() == ASCN1->getSrcAddressSpace());
6064
6065 // Fold asc[B -> A](asc[A -> B](x)) -> x
6066 if (ASCN1->getDestAddressSpace() == ASCN2->getSrcAddressSpace())
6067 return ASCN2->getOperand(0);
6068 }
6069
6070 return SDValue();
6071}
6072
6073// Given a constant selector value and a prmt mode, return the selector value
6074// normalized to the generic prmt mode. See the PTX ISA documentation for more
6075// details:
6076// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt
6077static APInt getPRMTSelector(const APInt &Selector, unsigned Mode) {
6078 assert(Selector.getBitWidth() == 32 && "PRMT must have i32 operands");
6079
6081 return Selector;
6082
6083 const unsigned V = Selector.trunc(2).getZExtValue();
6084
6085 const auto GetSelector = [](unsigned S0, unsigned S1, unsigned S2,
6086 unsigned S3) {
6087 return APInt(32, S0 | (S1 << 4) | (S2 << 8) | (S3 << 12));
6088 };
6089
6090 switch (Mode) {
6092 return GetSelector(V, V + 1, V + 2, V + 3);
6094 return GetSelector(V, (V - 1) & 7, (V - 2) & 7, (V - 3) & 7);
6096 return GetSelector(V, V, V, V);
6098 return GetSelector(V, std::max(V, 1U), std::max(V, 2U), 3U);
6100 return GetSelector(0, std::min(V, 1U), std::min(V, 2U), V);
6102 unsigned V1 = (V & 1) << 1;
6103 return GetSelector(V1, V1 + 1, V1, V1 + 1);
6104 }
6105 default:
6106 llvm_unreachable("Invalid PRMT mode");
6107 }
6108}
6109
6110static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode) {
6111 assert(A.getBitWidth() == 32 && B.getBitWidth() == 32 &&
6112 Selector.getBitWidth() == 32 && "PRMT must have i32 operands");
6113 // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
6114 APInt BitField = B.concat(A);
6115 APInt SelectorVal = getPRMTSelector(Selector, Mode);
6116 APInt Result(32, 0);
6117 for (unsigned I : llvm::seq(4U)) {
6118 APInt Sel = SelectorVal.extractBits(4, I * 4);
6119 unsigned Idx = Sel.getLoBits(3).getZExtValue();
6120 unsigned Sign = Sel.getHiBits(1).getZExtValue();
6121 APInt Byte = BitField.extractBits(8, Idx * 8);
6122 if (Sign)
6123 Byte = Byte.ashr(8);
6124 Result.insertBits(Byte, I * 8);
6125 }
6126 return Result;
6127}
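// Worked example in the generic mode (sketch): with
//   A = 0x44332211, B = 0x88776655, Selector = 0x7531
// the combined byte field is {0x88,0x77,0x66,0x55,0x44,0x33,0x22,0x11} and
// each selector nibble picks one byte (the low 3 bits index it; the fourth
// bit would sign-replicate it), so
//   computePRMT(A, B, 0x7531, /*generic mode*/) == 0x88664422
// i.e. the odd-numbered bytes of the 64-bit pair gathered into one register.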
6128
6130 CodeGenOptLevel OptLevel) {
6131 if (OptLevel == CodeGenOptLevel::None)
6132 return SDValue();
6133
6134 // Constant fold PRMT
6135 if (isa<ConstantSDNode>(N->getOperand(0)) &&
6136 isa<ConstantSDNode>(N->getOperand(1)) &&
6137 isa<ConstantSDNode>(N->getOperand(2)))
6138 return DCI.DAG.getConstant(computePRMT(N->getConstantOperandAPInt(0),
6139 N->getConstantOperandAPInt(1),
6140 N->getConstantOperandAPInt(2),
6141 N->getConstantOperandVal(3)),
6142 SDLoc(N), N->getValueType(0));
6143 return SDValue();
6144}
6145
6146// During call lowering we wrap the return values in a ProxyReg node which
6147 // depends on the chain value produced by the completed call. This ensures that
6148// the full call is emitted in cases where libcalls are used to legalize
6149// operations. To improve the functioning of other DAG combines we pull all
6150// operations we can through one of these nodes, ensuring that the ProxyReg
6151// directly wraps a load. That is:
6152//
6153// (ProxyReg (zext (load retval0))) => (zext (ProxyReg (load retval0)))
6154//
6157 switch (R.getOpcode()) {
6158 case ISD::TRUNCATE:
6159 case ISD::ANY_EXTEND:
6160 case ISD::SIGN_EXTEND:
6161 case ISD::ZERO_EXTEND:
6162 case ISD::BITCAST: {
6163 if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
6164 return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), V);
6165 return SDValue();
6166 }
6167 case ISD::SHL:
6168 case ISD::SRL:
6169 case ISD::SRA:
6170 case ISD::OR: {
6171 if (SDValue A = sinkProxyReg(R.getOperand(0), Chain, DCI))
6172 if (SDValue B = sinkProxyReg(R.getOperand(1), Chain, DCI))
6173 return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), A, B);
6174 return SDValue();
6175 }
6176 case ISD::Constant:
6177 return R;
6178 case ISD::LOAD:
6179 case NVPTXISD::LoadV2:
6180 case NVPTXISD::LoadV4: {
6181 return DCI.DAG.getNode(NVPTXISD::ProxyReg, SDLoc(R), R.getValueType(),
6182 {Chain, R});
6183 }
6184 case ISD::BUILD_VECTOR: {
6185 if (DCI.isBeforeLegalize())
6186 return SDValue();
6187
6189 for (auto &Op : R->ops()) {
6190 SDValue V = sinkProxyReg(Op, Chain, DCI);
6191 if (!V)
6192 return SDValue();
6193 Ops.push_back(V);
6194 }
6195 return DCI.DAG.getNode(ISD::BUILD_VECTOR, SDLoc(R), R.getValueType(), Ops);
6196 }
6198 if (DCI.isBeforeLegalize())
6199 return SDValue();
6200
6201 if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
6203 R.getValueType(), V, R.getOperand(1));
6204 return SDValue();
6205 }
6206 default:
6207 return SDValue();
6208 }
6209}
6210
6213
6214 SDValue Chain = N->getOperand(0);
6215 SDValue Reg = N->getOperand(1);
6216
6217 // If the ProxyReg is not wrapping a load, try to pull the operations through
6218 // the ProxyReg.
6219 if (Reg.getOpcode() != ISD::LOAD) {
6220 if (SDValue V = sinkProxyReg(Reg, Chain, DCI))
6221 return V;
6222 }
6223
6224 return SDValue();
6225}
6226
6227SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
6228 DAGCombinerInfo &DCI) const {
6230 switch (N->getOpcode()) {
6231 default:
6232 break;
6233 case ISD::ADD:
6234 return PerformADDCombine(N, DCI, OptLevel);
6235 case ISD::ADDRSPACECAST:
6236 return combineADDRSPACECAST(N, DCI);
6237 case ISD::SIGN_EXTEND:
6238 case ISD::ZERO_EXTEND:
6239 return combineMulWide(N, DCI, OptLevel);
6240 case ISD::BUILD_VECTOR:
6241 return PerformBUILD_VECTORCombine(N, DCI);
6243 return PerformEXTRACTCombine(N, DCI);
6244 case ISD::FADD:
6245 return PerformFADDCombine(N, DCI, OptLevel);
6246 case ISD::FMAXNUM:
6247 case ISD::FMINNUM:
6248 case ISD::FMAXIMUM:
6249 case ISD::FMINIMUM:
6250 case ISD::FMAXIMUMNUM:
6251 case ISD::FMINIMUMNUM:
6252 return PerformFMinMaxCombine(N, DCI, STI.getPTXVersion(),
6253 STI.getSmVersion());
6254 case ISD::LOAD:
6255 case NVPTXISD::LoadV2:
6256 case NVPTXISD::LoadV4:
6257 return combineLOAD(N, DCI, STI);
6258 case ISD::MUL:
6259 return PerformMULCombine(N, DCI, OptLevel);
6260 case NVPTXISD::PRMT:
6261 return combinePRMT(N, DCI, OptLevel);
6262 case NVPTXISD::ProxyReg:
6263 return combineProxyReg(N, DCI);
6264 case ISD::SETCC:
6265 return PerformSETCCCombine(N, DCI, STI.getSmVersion());
6266 case ISD::SHL:
6267 return PerformSHLCombine(N, DCI, OptLevel);
6268 case ISD::SREM:
6269 case ISD::UREM:
6270 return PerformREMCombine(N, DCI, OptLevel);
6271 case ISD::STORE:
6272 case NVPTXISD::StoreV2:
6273 case NVPTXISD::StoreV4:
6274 return combineSTORE(N, DCI, STI);
6275 case ISD::VSELECT:
6276 return PerformVSELECTCombine(N, DCI);
6277 }
6278 return SDValue();
6279}
6280
6283 // Handle bitcasting to v2i8 without hitting the default promotion
6284 // strategy which goes through stack memory.
6285 SDValue Op(Node, 0);
6286 EVT ToVT = Op->getValueType(0);
6287 if (ToVT != MVT::v2i8) {
6288 return;
6289 }
6290
6291 // Bitcast to i16 and unpack elements into a vector
6292 SDLoc DL(Node);
6293 SDValue AsInt = DAG.getBitcast(MVT::i16, Op->getOperand(0));
6294 SDValue Vec0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, AsInt);
6295 SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
6296 SDValue Vec1 =
6297 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
6298 DAG.getNode(ISD::SRL, DL, MVT::i16, {AsInt, Const8}));
6299 Results.push_back(
6300 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i8, {Vec0, Vec1}));
6301}
6302
6303// Lower vector return type of tcgen05.ld intrinsics
6306 bool hasOffset = false) {
6307 SDLoc DL(N);
6308 EVT ResVT = N->getValueType(0);
6309 if (!ResVT.isVector())
6310 return; // already legalized.
6311
6312 const unsigned NumElts = ResVT.getVectorNumElements();
6313
6314 // Create the return type of the instructions
6315 SmallVector<EVT, 5> ListVTs;
6316 for (unsigned i = 0; i < NumElts; ++i)
6317 ListVTs.push_back(MVT::i32);
6318
6319 ListVTs.push_back(N->getValueType(1)); // Chain
6320
6321 SDVTList ResVTs = DAG.getVTList(ListVTs);
6322
6323 SmallVector<SDValue, 8> Ops{N->getOperand(0), N->getOperand(1),
6324 N->getOperand(2)};
6325
6326 if (hasOffset) {
6327 Ops.push_back(N->getOperand(3)); // offset
6328 Ops.push_back(N->getOperand(4)); // Pack flag
6329 } else
6330 Ops.push_back(N->getOperand(3)); // Pack flag
6331
6333 SDValue NewNode =
6335 MemSD->getMemoryVT(), MemSD->getMemOperand());
6336
6337 // split the vector result
6338 SmallVector<SDValue, 4> ScalarRes;
6339 for (unsigned i = 0; i < NumElts; ++i) {
6340 SDValue Res = NewNode.getValue(i);
6341 ScalarRes.push_back(Res);
6342 }
6343
6344 SDValue Chain = NewNode.getValue(NumElts);
6345 SDValue BuildVector = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
6346 Results.push_back(BuildVector); // Build Vector
6347 Results.push_back(Chain); // Chain
6348}
6349
6352 SDValue Chain = N->getOperand(0);
6353 SDValue Intrin = N->getOperand(1);
6354 SDLoc DL(N);
6355
6356 // Get the intrinsic ID
6357 unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
6358 switch (IntrinNo) {
6359 default:
6360 return;
6361 case Intrinsic::nvvm_ldu_global_i:
6362 case Intrinsic::nvvm_ldu_global_f:
6363 case Intrinsic::nvvm_ldu_global_p: {
6364 EVT ResVT = N->getValueType(0);
6365
6366 if (ResVT.isVector()) {
6367 // Vector LDG/LDU
6368
6369 unsigned NumElts = ResVT.getVectorNumElements();
6370 EVT EltVT = ResVT.getVectorElementType();
6371
6372 // Since LDU/LDG are target nodes, we cannot rely on DAG type
6373 // legalization.
6374 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
6375 // loaded type to i16 and propagate the "real" type as the memory type.
6376 bool NeedTrunc = false;
6377 if (EltVT.getSizeInBits() < 16) {
6378 EltVT = MVT::i16;
6379 NeedTrunc = true;
6380 }
6381
6382 unsigned Opcode = 0;
6383 SDVTList LdResVTs;
6384
6385 switch (NumElts) {
6386 default:
6387 return;
6388 case 2:
6389 Opcode = NVPTXISD::LDUV2;
6390 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
6391 break;
6392 case 4: {
6393 Opcode = NVPTXISD::LDUV4;
6394 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
6395 LdResVTs = DAG.getVTList(ListVTs);
6396 break;
6397 }
6398 }
6399
6400 SmallVector<SDValue, 8> OtherOps;
6401
6402 // Copy regular operands
6403
6404 OtherOps.push_back(Chain); // Chain
6405 // Skip operand 1 (intrinsic ID)
6406 // Others
6407 OtherOps.append(N->op_begin() + 2, N->op_end());
6408
6410
6411 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
6412 MemSD->getMemoryVT(),
6413 MemSD->getMemOperand());
6414
6415 SmallVector<SDValue, 4> ScalarRes;
6416
6417 for (unsigned i = 0; i < NumElts; ++i) {
6418 SDValue Res = NewLD.getValue(i);
6419 if (NeedTrunc)
6420 Res =
6421 DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
6422 ScalarRes.push_back(Res);
6423 }
6424
6425 SDValue LoadChain = NewLD.getValue(NumElts);
6426
6427 SDValue BuildVec =
6428 DAG.getBuildVector(ResVT, DL, ScalarRes);
6429
6430 Results.push_back(BuildVec);
6431 Results.push_back(LoadChain);
6432 } else {
6433 // i8 LDG/LDU
6434 assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
6435 "Custom handling of non-i8 ldu/ldg?");
6436
6437 // Just copy all operands as-is
6439
6440 // Force output to i16
6441 SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
6442
6444
6445 // We make sure the memory type is i8, which will be used during isel
6446 // to select the proper instruction.
6447 SDValue NewLD =
6449 MVT::i8, MemSD->getMemOperand());
6450
6451 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
6452 NewLD.getValue(0)));
6453 Results.push_back(NewLD.getValue(1));
6454 }
6455 return;
6456 }
6457
6458 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
6459 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
6460 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
6461 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
6462 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
6463 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
6464 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
6465 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
6466 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
6467 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
6468 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
6469 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
6470 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
6471 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
6472 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
6473 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
6474 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
6475 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
6476 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
6477 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
6478 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
6479 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
6480 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
6481 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
6482 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
6483 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
6484 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
6485 return ReplaceTcgen05Ld(N, DAG, Results);
6486
6487 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
6488 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
6489 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
6490 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
6491 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
6492 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
6493 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
6494 return ReplaceTcgen05Ld(N, DAG, Results, /* Offset */ true);
6495 }
6496}
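// [Editor's note -- illustrative sketch, not part of NVPTXISelLowering.cpp]
// The ldu/ldg path above works around the lack of type legalization for
// target nodes. A hypothetical <4 x i8> result of llvm.nvvm.ldu.global.i is
// re-expressed roughly as follows (node names abbreviated, operands omitted):
//
//   v4i8 (INTRINSIC_W_CHAIN ldu_global_i p)
//     ==> {i16, i16, i16, i16, ch} = LDUV4 p     ; memory VT stays i8
//     ==> each i16 result is TRUNCATEd back to i8
//     ==> BUILD_VECTOR i8, i8, i8, i8  plus the new chain
//
// The widened i16 values exist only in registers; the memory VT recorded on
// the MemIntrinsicNode keeps the real element width for instruction selection.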
6497
6498static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG,
6499 SmallVectorImpl<SDValue> &Results) {
6500 // Change the CopyFromReg to output 2 64-bit results instead of a 128-bit
6501 // result so that it can pass the legalization
6502 SDLoc DL(N);
6503 SDValue Chain = N->getOperand(0);
6504 SDValue Reg = N->getOperand(1);
6505 SDValue Glue = N->getOperand(2);
6506
6507 assert(Reg.getValueType() == MVT::i128 &&
6508 "Custom lowering for CopyFromReg with 128-bit reg only");
6509 SmallVector<EVT, 4> ResultsType = {MVT::i64, MVT::i64, N->getValueType(1),
6510 N->getValueType(2)};
6511 SmallVector<SDValue, 3> NewOps = {Chain, Reg, Glue};
6512
6513 SDValue NewValue = DAG.getNode(ISD::CopyFromReg, DL, ResultsType, NewOps);
6514 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
6515 {NewValue.getValue(0), NewValue.getValue(1)});
6516
6517 Results.push_back(Pair);
6518 Results.push_back(NewValue.getValue(2));
6519 Results.push_back(NewValue.getValue(3));
6520}
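// [Editor's note -- illustrative sketch, not part of NVPTXISelLowering.cpp]
// Sketch of the i128 CopyFromReg rewrite above, with made-up value names:
//
//   before:  {i128 Val, ch, glue} = CopyFromReg Chain, Reg:i128, Glue
//   after:   {i64 Lo, i64 Hi, ch, glue} = CopyFromReg Chain, Reg, Glue
//            i128 Val = BUILD_PAIR Lo, Hi
//
// Only the node's result types change; users of the original i128 value now
// see the BUILD_PAIR, which the legalizer already knows how to handle.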
6521
6522static void replaceProxyReg(SDNode *N, SelectionDAG &DAG,
6523 const TargetLowering &TLI,
6524 SmallVectorImpl<SDValue> &Results) {
6525 SDValue Chain = N->getOperand(0);
6526 SDValue Reg = N->getOperand(1);
6527
6528 MVT VT = TLI.getRegisterType(*DAG.getContext(), Reg.getValueType());
6529
6530 SDValue NewReg = DAG.getAnyExtOrTrunc(Reg, SDLoc(N), VT);
6531 SDValue NewProxy =
6532 DAG.getNode(NVPTXISD::ProxyReg, SDLoc(N), VT, {Chain, NewReg});
6533 SDValue Res = DAG.getAnyExtOrTrunc(NewProxy, SDLoc(N), N->getValueType(0));
6534
6535 Results.push_back(Res);
6536}
6537
6538static void replaceAtomicSwap128(SDNode *N, SelectionDAG &DAG,
6539 const NVPTXSubtarget &STI,
6540 SmallVectorImpl<SDValue> &Results) {
6541 assert(N->getValueType(0) == MVT::i128 &&
6542 "Custom lowering for atomic128 only supports i128");
6543
6544 AtomicSDNode *AN = cast<AtomicSDNode>(N);
6545 SDLoc dl(N);
6546
6547 if (!STI.hasAtomSwap128()) {
6548 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
6549 DAG.getMachineFunction().getFunction(),
6550 "Support for b128 atomics introduced in PTX ISA version 8.3 and "
6551 "requires target sm_90.",
6552 dl.getDebugLoc()));
6553
6554 Results.push_back(DAG.getUNDEF(MVT::i128));
6555 Results.push_back(AN->getOperand(0)); // Chain
6556 return;
6557 }
6558
6559 SmallVector<SDValue, 6> Ops;
6560 Ops.push_back(AN->getOperand(0)); // Chain
6561 Ops.push_back(AN->getOperand(1)); // Ptr
6562 for (const auto &Op : AN->ops().drop_front(2)) {
6563 // Low part
6564 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
6565 DAG.getIntPtrConstant(0, dl)));
6566 // High part
6567 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
6568 DAG.getIntPtrConstant(1, dl)));
6569 }
6570 unsigned Opcode = N->getOpcode() == ISD::ATOMIC_SWAP
6573 SDVTList Tys = DAG.getVTList(MVT::i64, MVT::i64, MVT::Other);
6574 SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, MVT::i128,
6575 AN->getMemOperand());
6576 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i128,
6577 {Result.getValue(0), Result.getValue(1)}));
6578 Results.push_back(Result.getValue(2));
6579}
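// [Editor's note -- illustrative sketch, not part of NVPTXISelLowering.cpp]
// For a supported target, the i128 swap/cmpxchg above is decomposed into i64
// halves and reassembled, roughly:
//
//   operands:  Chain, Ptr, then Lo/Hi of each i128 data operand
//              (EXTRACT_ELEMENT with index 0 for the low half, 1 for the high)
//   results:   {i64 ResLo, i64 ResHi, ch}  ==>  i128 = BUILD_PAIR ResLo, ResHi
//
// On targets without hasAtomSwap128() the function instead emits a
// DiagnosticInfoUnsupported and returns an undef i128 plus the input chain.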
6580
6581void NVPTXTargetLowering::ReplaceNodeResults(
6582 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
6583 switch (N->getOpcode()) {
6584 default:
6585 report_fatal_error("Unhandled custom legalization");
6586 case ISD::BITCAST:
6587 ReplaceBITCAST(N, DAG, Results);
6588 return;
6589 case ISD::LOAD:
6590 replaceLoadVector(N, DAG, Results, STI);
6591 return;
6592 case ISD::INTRINSIC_W_CHAIN:
6593 ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
6594 return;
6595 case ISD::CopyFromReg:
6596 ReplaceCopyFromReg_128(N, DAG, Results);
6597 return;
6598 case NVPTXISD::ProxyReg:
6599 replaceProxyReg(N, DAG, *this, Results);
6600 return;
6601 case ISD::ATOMIC_CMP_SWAP:
6602 case ISD::ATOMIC_SWAP:
6603 replaceAtomicSwap128(N, DAG, STI, Results);
6604 return;
6605 }
6606}
6607
6608NVPTXTargetLowering::AtomicExpansionKind
6609NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
6610 Type *Ty = AI->getValOperand()->getType();
6611
6612 if (AI->isFloatingPointOperation()) {
6613 if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
6614 if (Ty->isHalfTy() && STI.getSmVersion() >= 70 &&
6615 STI.getPTXVersion() >= 63)
6616 return AtomicExpansionKind::None;
6617 if (Ty->isBFloatTy() && STI.getSmVersion() >= 90 &&
6618 STI.getPTXVersion() >= 78)
6619 return AtomicExpansionKind::None;
6620 if (Ty->isFloatTy())
6621 return AtomicExpansionKind::None;
6622 if (Ty->isDoubleTy() && STI.hasAtomAddF64())
6623 return AtomicExpansionKind::None;
6624 }
6625 return AtomicExpansionKind::CmpXChg;
6626 }
6627
6628 assert(Ty->isIntegerTy() && "Ty should be integer at this point");
6629 const unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth();
6630
6631 switch (AI->getOperation()) {
6632 default:
6633 return AtomicExpansionKind::CmpXChg;
6634 case AtomicRMWInst::BinOp::Xchg:
6635 if (BitWidth == 128)
6636 return AtomicExpansionKind::None;
6637 LLVM_FALLTHROUGH;
6638 case AtomicRMWInst::BinOp::And:
6639 case AtomicRMWInst::BinOp::Or:
6640 case AtomicRMWInst::BinOp::Xor:
6641 switch (BitWidth) {
6642 case 8:
6643 case 16:
6644 return AtomicExpansionKind::CmpXChg;
6645 case 32:
6646 return AtomicExpansionKind::None;
6647 case 64:
6648 if (STI.hasAtomBitwise64())
6649 return AtomicExpansionKind::None;
6650 return AtomicExpansionKind::CmpXChg;
6651 case 128:
6652 return AtomicExpansionKind::CmpXChg;
6653 default:
6654 llvm_unreachable("unsupported width encountered");
6655 }
6656 case AtomicRMWInst::BinOp::Add:
6657 case AtomicRMWInst::BinOp::Sub:
6658 case AtomicRMWInst::BinOp::Max:
6659 case AtomicRMWInst::BinOp::Min:
6660 case AtomicRMWInst::BinOp::UMax:
6661 case AtomicRMWInst::BinOp::UMin:
6662 switch (BitWidth) {
6663 case 8:
6664 case 16:
6665 return AtomicExpansionKind::CmpXChg;
6666 case 32:
6667 return AtomicExpansionKind::None;
6668 case 64:
6669 if (STI.hasAtomMinMax64())
6670 return AtomicExpansionKind::None;
6671 return AtomicExpansionKind::CmpXChg;
6672 case 128:
6673 return AtomicExpansionKind::CmpXChg;
6674 default:
6675 llvm_unreachable("unsupported width encountered");
6676 }
6677 case AtomicRMWInst::BinOp::UIncWrap:
6678 case AtomicRMWInst::BinOp::UDecWrap:
6679 switch (BitWidth) {
6680 case 32:
6681 return AtomicExpansionKind::None;
6682 case 8:
6683 case 16:
6684 case 64:
6685 case 128:
6686 return AtomicExpansionKind::CmpXChg;
6687 default:
6688 llvm_unreachable("unsupported width encountered");
6689 }
6690 }
6691
6692 return AtomicExpansionKind::CmpXChg;
6693}
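// [Editor's note -- illustrative sketch, not part of NVPTXISelLowering.cpp;
// assumes the reconstructed case labels above are accurate]
// Rough intent of the table above, shown as IR-level examples:
//
//   atomicrmw add  ptr %p, i32 %v monotonic   -> kept native (atom.add)
//   atomicrmw and  ptr %p, i8  %v monotonic   -> expanded to a cmpxchg loop
//   atomicrmw xchg ptr %p, i128 %v monotonic  -> left as a native node
//                   (lowered via the b128 path; targets without
//                    hasAtomSwap128() get a diagnostic instead)
//
// "Expanded" here means AtomicExpandPass rewrites the RMW into a retry loop
// around a wider, natively supported cmpxchg.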
6694
6695bool NVPTXTargetLowering::shouldInsertFencesForAtomic(
6696 const Instruction *I) const {
6697 auto *CI = dyn_cast<AtomicCmpXchgInst>(I);
6698 // When CAS bitwidth is not supported on the hardware, the CAS is emulated
6699 // using a retry loop that uses a higher-bitwidth monotonic CAS. We enforce
6700 // the memory order using explicit fences around the retry loop.
6701 // The memory order of natively supported CAS operations can be enforced
6702 // by lowering to an atom.cas with the right memory synchronizing effect.
6703 // However, atom.cas only supports relaxed, acquire, release and acq_rel.
6704 // So we also use explicit fences for enforcing memory order for
6705 // seq_cst CAS with natively-supported bitwidths.
6706 return CI &&
6707 (cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() <
6708 STI.getMinCmpXchgSizeInBits() ||
6709 CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent);
6710}
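// [Editor's note -- illustrative sketch, not part of NVPTXISelLowering.cpp;
// the IR below is made up for illustration]
// Concretely, fences are requested for two kinds of cmpxchg:
//
//   cmpxchg ptr %p, i8  %old, i8  %new acquire acquire   ; width-emulated
//   cmpxchg ptr %p, i32 %old, i32 %new seq_cst seq_cst   ; seq_cst, native
//
// A natively supported, non-seq_cst cmpxchg (e.g. a 32-bit acquire CAS)
// needs no extra fences because atom.cas can carry the acquire/release
// semantics itself.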
6711
6712AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit(
6713 const Instruction *I) const {
6714 auto *CI = dyn_cast<AtomicCmpXchgInst>(I);
6715 bool BitwidthSupportedAndIsSeqCst =
6716 CI && CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent &&
6717 cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() >=
6718 STI.getMinCmpXchgSizeInBits();
6719 return BitwidthSupportedAndIsSeqCst ? AtomicOrdering::Acquire
6720 : AtomicOrdering::Monotonic;
6721}
6722
6723Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
6724 Instruction *Inst,
6725 AtomicOrdering Ord) const {
6726 if (!isa<AtomicCmpXchgInst>(Inst))
6727 return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);
6728
6729 // Specialize for cmpxchg
6730 // Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated
6731 SyncScope::ID SSID = cast<AtomicCmpXchgInst>(Inst)->getSyncScopeID();
6732 if (isReleaseOrStronger(Ord))
6733 return Builder.CreateFence(Ord == AtomicOrdering::SequentiallyConsistent
6734 ? Ord
6735 : AtomicOrdering::Release,
6736 SSID);
6737
6738 return nullptr;
6739}
6740
6741Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
6742 Instruction *Inst,
6743 AtomicOrdering Ord) const {
6744 // Specialize for cmpxchg
6745 if (!isa<AtomicCmpXchgInst>(Inst))
6746 return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
6747
6748 auto *CI = cast<AtomicCmpXchgInst>(Inst);
6749 auto CASWidth =
6750 cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth();
6751 SyncScope::ID SSID = CI->getSyncScopeID();
6752 // Do not emit a trailing fence for cmpxchg seq_cst which are not emulated
6753 if (isAcquireOrStronger(Ord) &&
6754 (Ord != AtomicOrdering::SequentiallyConsistent ||
6755 CASWidth < STI.getMinCmpXchgSizeInBits()))
6756 return Builder.CreateFence(AtomicOrdering::Acquire, SSID);
6757
6758 return nullptr;
6759}
6760
6761// Rather than default to SINT when both UINT and SINT are custom, we only
6762// change the opcode when UINT is not legal and SINT is. UINT is preferred when
6763// both are custom since unsigned CVT instructions can lead to slightly better
6764// SASS code with fewer instructions.
6765unsigned NVPTXTargetLowering::getPreferredFPToIntOpcode(unsigned Op, EVT FromVT,
6766 EVT ToVT) const {
6767 if (isOperationLegal(Op, ToVT))
6768 return Op;
6769 switch (Op) {
6770 case ISD::FP_TO_UINT:
6771 if (isOperationLegal(ISD::FP_TO_SINT, ToVT))
6772 return ISD::FP_TO_SINT;
6773 break;
6774 case ISD::STRICT_FP_TO_UINT:
6775 if (isOperationLegal(ISD::STRICT_FP_TO_SINT, ToVT))
6776 return ISD::STRICT_FP_TO_SINT;
6777 break;
6778 case ISD::VP_FP_TO_UINT:
6779 if (isOperationLegal(ISD::VP_FP_TO_SINT, ToVT))
6780 return ISD::VP_FP_TO_SINT;
6781 break;
6782 default:
6783 break;
6784 }
6785 return Op;
6786}
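// [Editor's note -- illustrative sketch, not part of NVPTXISelLowering.cpp]
// Example of the preference above for an f32 -> i32 conversion:
//
//   getPreferredFPToIntOpcode(ISD::FP_TO_UINT, MVT::f32, MVT::i32)
//     == ISD::FP_TO_UINT   // kept when the unsigned form is already legal
//     == ISD::FP_TO_SINT   // only when the signed form alone is legal
//
// The same rule is applied independently to the STRICT_ and VP_ variants in
// the surrounding cases; keeping the unsigned CVT tends to give marginally
// better SASS.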
6787
6788// Pin NVPTXTargetObjectFile's vtables to this file.
6789NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;
6790
6791MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
6792 const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
6793 return getDataSection();
6794}
6795
6796static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known,
6797 const SelectionDAG &DAG, unsigned Depth) {
6798 SDValue A = Op.getOperand(0);
6799 SDValue B = Op.getOperand(1);
6800 ConstantSDNode *Selector = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6801 unsigned Mode = Op.getConstantOperandVal(3);
6802
6803 if (!Selector)
6804 return;
6805
6806 KnownBits AKnown = DAG.computeKnownBits(A, Depth);
6807 KnownBits BKnown = DAG.computeKnownBits(B, Depth);
6808
6809 // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
6810 assert(AKnown.getBitWidth() == 32 && BKnown.getBitWidth() == 32 &&
6811 "PRMT must have i32 operands");
6812 assert(Known.getBitWidth() == 32 && "PRMT must have i32 result");
6813 KnownBits BitField = BKnown.concat(AKnown);
6814
6815 APInt SelectorVal = getPRMTSelector(Selector->getAPIntValue(), Mode);
6816 for (unsigned I : llvm::seq(4)) {
6817 APInt Sel = SelectorVal.extractBits(4, I * 4);
6818 unsigned Idx = Sel.getLoBits(3).getZExtValue();
6819 unsigned Sign = Sel.getHiBits(1).getZExtValue();
6820 KnownBits Byte = BitField.extractBits(8, Idx * 8);
6821 if (Sign)
6822 Byte = KnownBits::ashr(Byte, 8);
6823 Known.insertBits(Byte, I * 8);
6824 }
6825}
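// [Editor's note -- worked example, not part of NVPTXISelLowering.cpp]
// PRMT reads one byte of {b, a} per 4-bit selector nibble; bit 3 of a nibble
// replicates the selected byte's sign bit instead. With concrete constants:
//
//   a = 0x44332211, b = 0x88776655
//     {b, a} bytes 0..7 = 11 22 33 44 55 66 77 88
//   selector 0x3210  ->  0x44332211   (identity on a)
//   selector 0x7654  ->  0x88776655   (identity on b)
//   selector 0x5140  ->  0x66225511   (source bytes 5, 1, 4, 0, high to low)
//
// The known-bits walk above mirrors this byte gather, using an 8-bit ashr to
// model the sign-replication nibbles.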
6826
6827static void computeKnownBitsForLoadV(const SDValue Op, KnownBits &Known) {
6828 const auto *LD = cast<MemSDNode>(Op);
6829
6830 // We can't do anything without knowing the sign bit.
6831 auto ExtType = LD->getConstantOperandVal(LD->getNumOperands() - 1);
6832 if (ExtType == ISD::SEXTLOAD)
6833 return;
6834
6835 // ExtLoading to vector types is weird and may not work well with known bits.
6836 auto DestVT = LD->getValueType(0);
6837 if (DestVT.isVector())
6838 return;
6839
6840 assert(Known.getBitWidth() == DestVT.getSizeInBits());
6841 auto ElementBitWidth = NVPTXDAGToDAGISel::getFromTypeWidthForLoad(LD);
6842 Known.Zero.setHighBits(Known.getBitWidth() - ElementBitWidth);
6843}
6844
6845void NVPTXTargetLowering::computeKnownBitsForTargetNode(
6846 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
6847 const SelectionDAG &DAG, unsigned Depth) const {
6848 Known.resetAll();
6849
6850 switch (Op.getOpcode()) {
6851 case NVPTXISD::PRMT:
6852 computeKnownBitsForPRMT(Op, Known, DAG, Depth);
6853 break;
6854 case NVPTXISD::LoadV2:
6855 case NVPTXISD::LoadV4:
6856 case NVPTXISD::LoadV8:
6857 computeKnownBitsForLoadV(Op, Known);
6858 break;
6859 default:
6860 break;
6861 }
6862}
6863
6864static std::pair<APInt, APInt> getPRMTDemandedBits(const APInt &SelectorVal,
6865 const APInt &DemandedBits) {
6866 APInt DemandedLHS = APInt(32, 0);
6867 APInt DemandedRHS = APInt(32, 0);
6868
6869 for (unsigned I : llvm::seq(4)) {
6870 if (DemandedBits.extractBits(8, I * 8).isZero())
6871 continue;
6872
6873 APInt Sel = SelectorVal.extractBits(4, I * 4);
6874 unsigned Idx = Sel.getLoBits(3).getZExtValue();
6875 unsigned Sign = Sel.getHiBits(1).getZExtValue();
6876
6877 APInt &Src = Idx < 4 ? DemandedLHS : DemandedRHS;
6878 unsigned ByteStart = (Idx % 4) * 8;
6879 if (Sign)
6880 Src.setBit(ByteStart + 7);
6881 else
6882 Src.setBits(ByteStart, ByteStart + 8);
6883 }
6884
6885 return {DemandedLHS, DemandedRHS};
6886}
6887
6888// Replace undef with 0 as this is easier for other optimizations such as
6889// known bits.
6890static SDValue canonicalizePRMTInput(SDValue Op, SelectionDAG &DAG) {
6891 if (!Op)
6892 return SDValue();
6893 if (Op.isUndef())
6894 return DAG.getConstant(0, SDLoc(), MVT::i32);
6895 return Op;
6896}
6897
6898static SDValue simplifyDemandedBitsForPRMT(SDValue PRMT,
6899 const APInt &DemandedBits,
6900 SelectionDAG &DAG,
6901 const TargetLowering &TLI,
6902 unsigned Depth) {
6903 assert(PRMT.getOpcode() == NVPTXISD::PRMT);
6904 SDValue Op0 = PRMT.getOperand(0);
6905 SDValue Op1 = PRMT.getOperand(1);
6906 auto *SelectorConst = dyn_cast<ConstantSDNode>(PRMT.getOperand(2));
6907 if (!SelectorConst)
6908 return SDValue();
6909
6910 unsigned Mode = PRMT.getConstantOperandVal(3);
6911 const APInt Selector = getPRMTSelector(SelectorConst->getAPIntValue(), Mode);
6912
6913 // Try to simplify the PRMT to one of the inputs if the used bytes are all
6914 // from the same input in the correct order.
6915 const unsigned LeadingBytes = DemandedBits.countLeadingZeros() / 8;
6916 const unsigned SelBits = (4 - LeadingBytes) * 4;
6917 if (Selector.getLoBits(SelBits) == APInt(32, 0x3210).getLoBits(SelBits))
6918 return Op0;
6919 if (Selector.getLoBits(SelBits) == APInt(32, 0x7654).getLoBits(SelBits))
6920 return Op1;
6921
6922 auto [DemandedLHS, DemandedRHS] = getPRMTDemandedBits(Selector, DemandedBits);
6923
6924 // Attempt to avoid multi-use ops if we don't need anything from them.
6925 SDValue DemandedOp0 =
6926 TLI.SimplifyMultipleUseDemandedBits(Op0, DemandedLHS, DAG, Depth + 1);
6927 SDValue DemandedOp1 =
6928 TLI.SimplifyMultipleUseDemandedBits(Op1, DemandedRHS, DAG, Depth + 1);
6929
6930 DemandedOp0 = canonicalizePRMTInput(DemandedOp0, DAG);
6931 DemandedOp1 = canonicalizePRMTInput(DemandedOp1, DAG);
6932 if ((DemandedOp0 && DemandedOp0 != Op0) ||
6933 (DemandedOp1 && DemandedOp1 != Op1)) {
6934 Op0 = DemandedOp0 ? DemandedOp0 : Op0;
6935 Op1 = DemandedOp1 ? DemandedOp1 : Op1;
6936 return getPRMT(Op0, Op1, Selector.getZExtValue(), SDLoc(PRMT), DAG);
6937 }
6938
6939 return SDValue();
6940}
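// [Editor's note -- illustrative sketch, not part of NVPTXISelLowering.cpp]
// Example of the "collapse to one input" case above: if only the low 16 bits
// of the PRMT are demanded, LeadingBytes = 2 and SelBits = 8, so a selector
// whose low byte is 0x10 compares equal to 0x3210 on those bits and the node
// folds to Op0; a low byte of 0x54 folds to Op1 in the same way.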
6941
6942bool NVPTXTargetLowering::SimplifyDemandedBitsForTargetNode(
6943 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
6944 KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const {
6945 Known.resetAll();
6946
6947 switch (Op.getOpcode()) {
6948 case NVPTXISD::PRMT:
6949 if (SDValue Result = simplifyDemandedBitsForPRMT(Op, DemandedBits, TLO.DAG,
6950 *this, Depth)) {
6951 TLO.CombineTo(Op, Result);
6952 return true;
6953 }
6954 break;
6955 default:
6956 break;
6957 }
6958
6959 computeKnownBitsForTargetNode(Op, Known, DemandedElts, TLO.DAG, Depth);
6960 return false;
6961}
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
constexpr LLT S1
constexpr LLT F32
AMDGPU Register Bank Select
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition Compiler.h:404
This file contains the declarations for the subclasses of Constant, which represent the different fla...
This file contains the declarations of entities that describe floating point environment and related ...
Module.h This file contains the declarations for the Module class.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
mir Rename Register Operands
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
#define MAKE_CASE(V)
Register Reg
Register const TargetRegisterInfo * TRI
#define T
NVPTX address space definition.
static bool shouldConvertToIndirectCall(const CallBase *CB, const GlobalAddressSDNode *Func)
static SDValue combineADDRSPACECAST(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static cl::opt< bool > sched4reg("nvptx-sched4reg", cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false))
static SDValue PerformEXTRACTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static cl::opt< NVPTX::DivPrecisionLevel > UsePrecDivF32("nvptx-prec-divf32", cl::Hidden, cl::desc("NVPTX Specific: Override the precision of the lowering for f32 fdiv"), cl::values(clEnumValN(NVPTX::DivPrecisionLevel::Approx, "0", "Use div.approx"), clEnumValN(NVPTX::DivPrecisionLevel::Full, "1", "Use div.full"), clEnumValN(NVPTX::DivPrecisionLevel::IEEE754, "2", "Use IEEE Compliant F32 div.rnd if available (default)"), clEnumValN(NVPTX::DivPrecisionLevel::IEEE754_NoFTZ, "3", "Use IEEE Compliant F32 div.rnd if available, no FTZ")), cl::init(NVPTX::DivPrecisionLevel::IEEE754))
static bool isConstOne(const SDValue &Operand)
static cl::opt< unsigned > FMAContractLevelOpt("nvptx-fma-level", cl::Hidden, cl::desc("NVPTX Specific: FMA contraction (0: don't do it" " 1: do it 2: do it aggressively"), cl::init(2))
static bool IsPTXVectorType(MVT VT)
static void ReplaceTcgen05Ld(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results, bool hasOffset=false)
static SDValue lowerLOADi1(LoadSDNode *LD, SelectionDAG &DAG)
static MachinePointerInfo refinePtrAS(SDValue &Ptr, SelectionDAG &DAG, const DataLayout &DL, const TargetLowering &TL)
static SDValue lowerROT(SDValue Op, SelectionDAG &DAG)
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, LLVMContext &Ctx, CallingConv::ID CallConv, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< uint64_t > &Offsets, uint64_t StartingOffset=0)
ComputePTXValueVTs - For the given Type Ty, returns the set of primitive legal-ish MVTs that compose ...
static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static void replaceAtomicSwap128(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI, SmallVectorImpl< SDValue > &Results)
static SDValue lowerSTOREVector(SDValue Op, SelectionDAG &DAG, const NVPTXSubtarget &STI)
static SDValue lowerLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI)
static void replaceProxyReg(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI, SmallVectorImpl< SDValue > &Results)
static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static SDValue lowerCTLZCTPOP(SDValue Op, SelectionDAG &DAG)
static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue combinePackingMovIntoStore(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned Front, unsigned Back)
Fold packing movs into a store.
static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static SDValue getBuildVectorizedValue(unsigned N, const SDLoc &dl, SelectionDAG &DAG, T GetElement)
static SDValue getExtractVectorizedValue(SDValue V, unsigned I, EVT VT, const SDLoc &dl, SelectionDAG &DAG)
static NVPTXISD::NodeType getMinMax3Opcode(unsigned MinMax2Opcode)
Get 3-input version of a 2-input min/max opcode.
static unsigned canMergeParamLoadStoresStartingAt(unsigned Idx, uint32_t AccessSize, const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< T > &Offsets, Align ParamAlignment)
static EVT getVectorizedVT(EVT VT, unsigned N, LLVMContext &C)
static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG)
static SDValue PerformFMinMaxCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned PTXVersion, unsigned SmVersion)
PerformFMinMaxCombine - Combine (fmaxnum (fmaxnum a, b), c) into (fmaxnum3 a, b, c).
static SDValue combineMulWide(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static bool isConstZero(const SDValue &Operand)
static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG)
static SDValue LowerTcgen05MMADisableOutputLane(SDValue Op, SelectionDAG &DAG)
static bool IsMulWideOperandDemotable(SDValue Op, unsigned OptSize, OperandSignedness &S)
IsMulWideOperandDemotable - Checks if the provided DAG node is an operand that can be demoted to OptS...
static unsigned getTcgen05MMADisableOutputLane(unsigned IID)
static std::pair< APInt, APInt > getPRMTDemandedBits(const APInt &SelectorVal, const APInt &DemandedBits)
static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode)
static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode)
static SDValue PerformREMCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI)
static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Fold unpacking movs into a load by increasing the number of return values.
static SDValue LowerClusterLaunchControlQueryCancel(SDValue Op, SelectionDAG &DAG)
static std::optional< NVPTXISD::NodeType > getScalar3OpcodeForReduction(unsigned ReductionOpcode)
Get 3-input scalar reduction opcode.
static std::optional< std::pair< SDValue, SDValue > > replaceLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI)
replaceLoadVector - Convert vector loads into multi-output scalar loads.
static SDValue expandFSH64(SDValue A, SDValue B, SDValue ShiftAmount, SDLoc DL, unsigned Opcode, SelectionDAG &DAG)
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, unsigned OptSize, bool &IsSigned)
AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can be demoted to OptSize bits...
static SDValue TryMULWIDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply of M/2 bits that produces...
static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG)
static SDValue LowerTcgen05St(SDValue Op, SelectionDAG &DAG)
static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static SDValue buildTreeReduction(const SmallVector< SDValue > &Elements, EVT EltTy, ArrayRef< std::pair< unsigned, unsigned > > Ops, const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG)
Reduces the elements using the scalar operations provided.
static SDValue combineProxyReg(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SmallVector< unsigned, 16 > VectorizePTXValueVTs(const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< T > &Offsets, Align ParamAlignment, bool IsVAArg=false)
static SDValue getPRMT(SDValue A, SDValue B, SDValue Selector, SDLoc DL, SelectionDAG &DAG, unsigned Mode=NVPTX::PTXPrmtMode::NONE)
static SDValue matchMADConstOnePattern(SDValue Add)
static SDValue LowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG)
static SDValue correctParamType(SDValue V, EVT ExpectedVT, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, SDLoc dl)
static ISD::NodeType getExtOpcode(const ISD::ArgFlagsTy &Flags)
static cl::opt< bool > UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden, cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), cl::init(true))
static void computeKnownBitsForLoadV(const SDValue Op, KnownBits &Known)
static APInt getPRMTSelector(const APInt &Selector, unsigned Mode)
static EVT promoteScalarIntegerPTX(const EVT VT)
PromoteScalarIntegerPTX Used to make sure the arguments/returns are suitable for passing and promote ...
static SDValue simplifyDemandedBitsForPRMT(SDValue PRMT, const APInt &DemandedBits, SelectionDAG &DAG, const TargetLowering &TLI, unsigned Depth)
static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG)
static SDValue canonicalizePRMTInput(SDValue Op, SelectionDAG &DAG)
static SDValue sinkProxyReg(SDValue R, SDValue Chain, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerFSH(SDValue Op, SelectionDAG &DAG)
static SDValue PromoteBinOpToF32(SDNode *N, SelectionDAG &DAG)
static SDValue PerformSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned int SmVersion)
static std::optional< std::pair< unsigned int, MVT > > getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI, unsigned AddressSpace)
static cl::opt< bool > ForceMinByValParamAlign("nvptx-force-min-byval-param-align", cl::Hidden, cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval" " params of device functions."), cl::init(false))
static cl::opt< bool > UseApproxLog2F32("nvptx-approx-log2f32", cl::desc("NVPTX Specific: whether to use lg2.approx for log2"), cl::init(false))
Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it does NOT use lg2....
static SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG)
static SDValue combineLOAD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const NVPTXSubtarget &STI)
static SDValue combineSTORE(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const NVPTXSubtarget &STI)
static SDValue PerformSHLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
MachineInstr unsigned OpIdx
uint64_t High
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
This file describes how to lower LLVM code to machine code.
Value * RHS
Value * LHS
BinaryOperator * Mul
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1098
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt getLoBits(unsigned numBits) const
Compute an APInt containing numBits lowbits from this APInt.
Definition APInt.cpp:644
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
LLVM_ABI APInt getHiBits(unsigned numBits) const
Compute an APInt containing numBits highbits from this APInt.
Definition APInt.cpp:639
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1330
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition APInt.h:435
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition APInt.h:1130
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:482
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition APInt.h:432
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1237
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
const T & back() const
back - Get the last element.
Definition ArrayRef.h:156
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition ArrayRef.h:206
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:191
an instruction that atomically reads a memory location, combines it with another value,...
@ Add
*p = old + v
@ FAdd
*p = old + v
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ UMax
*p = old >unsigned v ? old : v
@ UDecWrap
Decrement one until a minimum value or zero.
bool isFloatingPointOperation() const
BinOp getOperation() const
This is an SDNode representing atomic operations.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
FunctionType * getFunctionType() const
This class represents a function call, abstracting a target machine's calling convention.
const APInt & getAPIntValue() const
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Diagnostic information for unsupported feature in backend.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:637
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
MCSection * getDataSection() const
Instances of this class represent a uniqued identifier for a section in the current translation unit.
Definition MCSection.h:521
StringRef getName() const
getName - Get the symbol name.
Definition MCSymbol.h:188
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
static auto fp_fixedlen_vector_valuetypes()
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineJumpTableInfo * getJumpTableInfo() const
getJumpTableInfo - Return the jump table info object for the current function.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
const std::vector< MachineJumpTableEntry > & getJumpTables() const
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
EVT getMemoryVT() const
Return the type of the in-memory value.
static unsigned getFromTypeWidthForLoad(const MemSDNode *Mem)
bool hasAtomSwap128() const
bool hasF32x2Instructions() const
bool has256BitVectorLoadStore(unsigned AS) const
AtomicOrdering atomicOperationOrderAfterFenceSplit(const Instruction *I) const override
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
const NVPTXTargetMachine * nvTM
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
NVPTXTargetLowering(const NVPTXTargetMachine &TM, const NVPTXSubtarget &STI)
std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &, const SmallVectorImpl< ISD::OutputArg > &, std::optional< unsigned > FirstVAArg, const CallBase &CB, unsigned UniqueCallSite) const
unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT, EVT ToVT) const override
bool useF32FTZ(const MachineFunction &MF) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
Align getFunctionArgumentAlignment(const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, bool &UseOneConst, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &dl, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const
Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
std::string getParamName(const Function *F, int Idx) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
NVPTX::DivPrecisionLevel getDivF32Level(const MachineFunction &MF, const SDNode &N) const
bool shouldInsertFencesForAtomic(const Instruction *) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
Align getFunctionParamOptimizedAlign(const Function *F, Type *ArgTy, const DataLayout &DL) const
getFunctionParamOptimizedAlign - since function arguments are passed via .param space,...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const override
Return the ValueType of the result of SETCC operations.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Inserts in the IR a target-specific intrinsic specifying a fence.
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Align getFunctionByValParamAlign(const Function *F, Type *ArgTy, Align InitialAlign, const DataLayout &DL) const
Helper for computing alignment of a device function byval parameter.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const
bool usePrecSqrtF32(const SDNode *N=nullptr) const
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
MCSection * SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const override
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
bool hasOneUse() const
Return true if there is exactly one use of this node.
unsigned getIROrder() const
Return the node ordering.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
const SDValue & getOperand(unsigned i) const
unsigned getOpcode() const
SectionKind - This is a simple POD value that classifies the properties of a section.
Definition SectionKind.h:22
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getSymbolFunctionGlobalAddress(SDValue Op, Function **TargetFunction=nullptr)
Return a GlobalAddress of the function from the current module with name matching the given ExternalS...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI Align getEVTAlign(EVT MemoryVT) const
Compute the default alignment value for the given type.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI SDNode * MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef< SDValue > Ops)
This mutates the specified node to have the specified return type, opcode, and operands.
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI SDValue getBasicBlock(MachineBasicBlock *MBB)
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
MachineFunction & getMachineFunction() const
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
ArrayRef< int > getMask() const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class is used to represent ISD::STORE nodes.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition StringRef.h:140
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
Align getMinStackArgumentAlignment() const
Return the minimum stack alignment of an argument.
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
std::vector< ArgListEntry > ArgListTy
virtual Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const
virtual Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const
Inserts in the IR a target-specific intrinsic specifying a fence.
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
TargetLowering(const TargetLowering &)=delete
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
SDValue expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const
Expand round(fp) to fp conversion.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
TargetOptions Options
MCSymbol * getSymbol(const GlobalValue *GV) const
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetFrameLowering * getFrameLowering() const
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
A raw_ostream that writes to an std::string.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt pow(const APInt &X, int64_t N)
Compute X^N for N>=0.
Definition APInt.cpp:3155
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:289
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - Given two values of the same integer type, produce a value twice as wide; roughly the inverse of EXTRACT_ELEMENT.
Definition ISDOpcodes.h:249
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ SSUBO
Overflow-aware nodes for subtraction, with the same RESULT, BOOL form as [SU]ADDO.
Definition ISDOpcodes.h:347
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ SSHLSAT
RESULT = [US]SHLSAT(LHS, RHS) - Perform saturation left shift.
Definition ISDOpcodes.h:379
@ SMULO
Overflow-aware nodes for multiplication, with the same RESULT, BOOL form as [SU]ADDO.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:299
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition ISDOpcodes.h:236
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:333
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
LLVM_ABI bool allOperandsUndef(const SDNode *N)
Return true if the node has at least one operand and all operands of the specified node are ISD::UNDE...
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
@ Bitcast
Perform the operation on a different, but equivalently sized type.
@ TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2
@ TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT
@ TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1
@ CLUSTERLAUNCHCONTROL_QUERY_CANCEL_IS_CANCELED
@ TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2
@ CALL
This node represents a PTX call instruction.
@ TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT
@ TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2
@ TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG2
@ TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2
@ TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2
@ CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_X
@ TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT
@ TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT
@ UNPACK_VECTOR
This node is the inverse of NVPTX::BUILD_VECTOR.
@ CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Y
@ TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT
@ TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT
@ DeclareScalarParam
These nodes represent a parameter declaration.
@ CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Z
@ ATOMIC_CMP_SWAP_B128
These nodes are used to lower atomic instructions with i128 type.
@ BUILD_VECTOR
This node is similar to ISD::BUILD_VECTOR except that the output may be implicitly bitcast to a scala...
@ TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG2
@ TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT
@ TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT
@ TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2
bool isPackedVectorTy(EVT VT)
DivPrecisionLevel
Definition NVPTX.h:251
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
This is an optimization pass for GlobalISel generic memory operations.
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:262
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:831
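A minimal usage sketch of llvm::zip; the container names are illustrative assumptions:
  SmallVector<int, 4> Xs = {1, 2, 3};
  SmallVector<int, 4> Ys = {10, 20, 30};
  int Sum = 0;
  for (auto [X, Y] : llvm::zip(Xs, Ys))
    Sum += X * Y;                        // pairs (1,10), (2,20), (3,30); Sum == 140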
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
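A minimal usage sketch of llvm::all_of; the names are illustrative assumptions:
  SmallVector<int, 4> Sizes = {32, 64, 32};
  bool AllAtLeast32 = llvm::all_of(Sizes, [](int S) { return S >= 32; });   // true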
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1657
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2452
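A minimal usage sketch of llvm::enumerate; the names are illustrative assumptions:
  SmallVector<StringRef, 3> Names = {"a", "b", "c"};
  for (auto [Idx, Name] : llvm::enumerate(Names))
    errs() << Idx << ": " << Name << "\n";   // prints 0: a, 1: b, 2: c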
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:396
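Worked values for PowerOf2Ceil, as a quick illustration:
  PowerOf2Ceil(5);    // == 8, the next power of two at or above 5
  PowerOf2Ceil(8);    // == 8, already a power of two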
bool isReleaseOrStronger(AtomicOrdering AO)
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:1948
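A minimal usage sketch of llvm::transform; the names are illustrative assumptions:
  SmallVector<int, 4> In = {1, 2, 3};
  SmallVector<int, 4> Out;
  llvm::transform(In, std::back_inserter(Out), [](int X) { return X * X; });   // Out == {1, 4, 9}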
unsigned promoteScalarArgumentSize(unsigned size)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
bool shouldPassAsArray(Type *Ty)
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
@ Default
-O2, -Os
Definition CodeGen.h:85
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:155
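Worked values for alignTo, as a quick illustration:
  alignTo(10, Align(8));   // == 16, the next multiple of 8 at or above 10
  alignTo(16, Align(8));   // == 16, already aligned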
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
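A hedged sketch of calling the ComputeValueVTs overload listed above; TLI, DL and Ty are assumed to be an in-scope TargetLowering, DataLayout and aggregate IR type:
  SmallVector<EVT, 4> ValueVTs;
  SmallVector<TypeSize, 4> Offsets;
  ComputeValueVTs(TLI, DL, Ty, ValueVTs, /*MemVTs=*/nullptr, &Offsets);
  // For a {i32, float} struct this typically yields two EVTs at byte offsets 0 and 4.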
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
bool isKernelFunction(const Function &F)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
Function * getMaybeBitcastedCallee(const CallBase *CB)
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:212
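Worked values for commonAlignment, as a quick illustration:
  commonAlignment(Align(16), 8);   // == Align(8): a 16-byte-aligned base plus offset 8 is only 8-byte aligned
  commonAlignment(Align(4), 8);    // == Align(4): here the base alignment is the limiting factor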
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
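A minimal usage sketch of llvm::seq:
  unsigned Total = 0;
  for (unsigned I : llvm::seq(0u, 4u))   // visits 0, 1, 2, 3; End is excluded
    Total += I;                          // Total == 6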
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
#define N
static LLVM_ABI const fltSemantics & IEEEsingle() LLVM_READNONE
Definition APFloat.cpp:266
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:85
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
bool is32BitVector() const
Return true if this is a 32-bit vector type.
Definition ValueTypes.h:197
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
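A minimal sketch of building and querying an EVT; Ctx is an assumed in-scope LLVMContext:
  EVT VT = EVT::getVectorVT(Ctx, MVT::f32, 4);   // v4f32
  // VT.isVector() == true, VT.getVectorNumElements() == 4,
  // VT.getVectorElementType() == MVT::f32, VT.getFixedSizeInBits() == 128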
static LLVM_ABI KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
KnownBits concat(const KnownBits &Lo) const
Concatenate the bits from Lo onto the bottom of *this.
Definition KnownBits.h:233
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
void insertBits(const KnownBits &SubBits, unsigned BitPosition)
Insert the bits from a smaller known bits starting at bitPosition.
Definition KnownBits.h:219
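A minimal sketch combining the KnownBits operations above; KnownBits::makeConstant is assumed to be available from KnownBits.h alongside them:
  KnownBits Hi = KnownBits::makeConstant(APInt(32, 0));   // upper half known to be all zero
  KnownBits Lo(32);                                       // lower half completely unknown
  KnownBits Wide = Hi.concat(Lo);                         // 64 bits total; bits 63..32 known zero
  Wide.insertBits(KnownBits::makeConstant(APInt(8, 0xAB)), /*BitPosition=*/8);   // bits 15..8 now known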
This class contains a discriminated union of information about pointers in memory operands,...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
Type * RetTy
Same as OrigRetTy, or partially legalized for soft float libcalls.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...