#include "llvm/IR/IntrinsicsNVPTX.h"

#define DEBUG_TYPE "nvptx-isel"
#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"
static cl::opt<bool>
    EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden,
                   cl::desc("Enable reciprocal sqrt optimization"));

static cl::opt<bool>
    EnableMADWide("nvptx-mad-wide-opt", cl::init(false), cl::Hidden,
                  cl::desc("Enable MAD wide optimization"));
NVPTX::DivPrecisionLevel
NVPTXDAGToDAGISel::getDivF32Level(const SDNode *N) const {
  return Subtarget->getTargetLowering()->getDivF32Level(*MF, *N);
}

bool NVPTXDAGToDAGISel::usePrecSqrtF32(const SDNode *N) const {
  return Subtarget->getTargetLowering()->usePrecSqrtF32(N);
}

bool NVPTXDAGToDAGISel::useF32FTZ() const {
  return Subtarget->getTargetLowering()->useF32FTZ(*MF);
}

bool NVPTXDAGToDAGISel::allowFMA() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowFMA(*MF, OptLevel);
}

bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }

bool NVPTXDAGToDAGISel::doMADWideOpt() const { return EnableMADWide; }
void NVPTXDAGToDAGISel::Select(SDNode *N) {
  if (N->isMachineOpcode()) {

  switch (N->getOpcode()) {
  case ISD::ATOMIC_LOAD:
  case ISD::ATOMIC_STORE:
  case ISD::ATOMIC_FENCE:
    if (tryEXTRACT_VECTOR_ELEMENT(N))
    SelectSETP_BF16X2(N);
    if (tryLoadVector(N))
    if (tryStoreVector(N))
    if (tryIntrinsicChain(N))
    if (tryIntrinsicVoid(N))
  case ISD::ADDRSPACECAST:
    SelectAddrSpaceCast(N);
    if (N->getOperand(1).getValueType() == MVT::i128) {
      SelectV2I64toI128(N);
    if (N->getOperand(1).getValueType() == MVT::i128) {
      SelectI128toV2I64(N);
    selectAtomicSwap128(N);
    if (tryBF16ArithToFMA(N))
#define TCGEN05_LD_OPCODE(SHAPE, NUM)                                          \
  (enablePack ? NVPTX::TCGEN05_LD_##SHAPE##_##NUM##_PACK                       \
              : NVPTX::TCGEN05_LD_##SHAPE##_##NUM)
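// Illustrative expansion (not part of the original listing): with enablePack
// set, TCGEN05_LD_OPCODE(16x64b, x2) yields NVPTX::TCGEN05_LD_16x64b_x2_PACK,
// otherwise NVPTX::TCGEN05_LD_16x64b_x2. Each case in getTcgen05LdOpcode below
// selects the opcode for its shape/num pair via this macro; the per-case
// return statements are elided in this extract.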
static unsigned getTcgen05LdOpcode(unsigned IID, bool enablePack) {
  switch (IID) {
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
void NVPTXDAGToDAGISel::SelectTcgen05Ld(SDNode *N, bool hasOffset) {
  auto OffsetNode = CurDAG->getTargetConstant(
        {N->getOperand(2), OffsetNode, N->getOperand(0)}));
        {N->getOperand(2), N->getOperand(0)}));
bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(1);
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_p:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x128: {
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: {
    SelectTcgen05Ld(N, true);
    return CmpMode::NotANumber;
  return CurDAG->getTargetConstant(PTXCmpMode, SDLoc(), MVT::i32);
bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1,
      {N->getOperand(0), N->getOperand(1), PTXCmpMode,
       CurDAG->getTargetConstant(useF32FTZ() ? 1 : 0, DL, MVT::i1)});
bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_bf16x2rr, DL, MVT::i1, MVT::i1,
      {N->getOperand(0), N->getOperand(1), PTXCmpMode,
       CurDAG->getTargetConstant(useF32FTZ() ? 1 : 0, DL, MVT::i1)});
bool NVPTXDAGToDAGISel::tryUNPACK_VECTOR(SDNode *N) {
  MVT EltVT = N->getSimpleValueType(0);
      CurDAG->getMachineNode(NVPTX::I64toV2I32, SDLoc(N), EltVT, EltVT, Vector);
bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
  MVT VT = Vector.getSimpleValueType();
    Opcode = NVPTX::I32toV2I16;
    Opcode = NVPTX::I64toV2I32;
  for (auto *U : Vector.getNode()->users()) {
    if (U->getOperand(0) != Vector)
    if (const ConstantSDNode *IdxConst =
      if (IdxConst->getZExtValue() == 0)
      else if (IdxConst->getZExtValue() == 1)
      CurDAG->getMachineNode(Opcode, SDLoc(N), EltVT, EltVT, Vector);
  for (auto *Node : E0)
  for (auto *Node : E1)
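// Sketch of the intent (hedged; the control flow is heavily elided in this
// extract): for a packed value such as v2i16 or v2i32, every
// EXTRACT_VECTOR_ELT user with a constant index of 0 or 1 is collected into
// E0/E1 and then rewired to the two results of a single I32toV2I16 /
// I64toV2I32 machine node, so the packed register is split exactly once.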
static std::optional<NVPTX::AddressSpace> convertAS(unsigned AS) {
  return convertAS(N->getMemOperand()->getAddrSpace())
  auto Ordering = N->getMergedOrdering();
  return Scopes[N->getSyncScopeID()];
struct OperationOrderings {
  NVPTX::Ordering InstructionOrdering, FenceOrdering;
  OperationOrderings(NVPTX::Ordering IO = NVPTX::Ordering::NotAtomic,
                     NVPTX::Ordering FO = NVPTX::Ordering::NotAtomic)
      : InstructionOrdering(IO), FenceOrdering(FO) {}
};

static OperationOrderings
      !HasMemoryOrdering) {
        formatv("PTX does not support \"atomic\" for orderings different than"
                "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order "

  bool AddrGenericOrGlobalOrShared =
  if (!AddrGenericOrGlobalOrShared)
  bool UseRelaxedMMIO =

        formatv("PTX only supports Acquire Ordering on reads: {}",
                N->getOperationName()));
        formatv("PTX only supports Release Ordering on writes: {}",
                N->getOperationName()));
        formatv("NVPTX does not support AcquireRelease Ordering on "
                "yet and PTX does not support it on loads or stores: {}",
                N->getOperationName()));
    else if (N->writeMem())
          formatv("NVPTX does not support SequentiallyConsistent Ordering on "
                  "read-modify-writes yet: {}",
                  N->getOperationName()));
    return OperationOrderings(InstrOrder,
        formatv("NVPTX backend does not support AtomicOrdering \"{}\" yet.",
  auto S = Scopes[N->getSyncScopeID()];
    Subtarget->failIfClustersUnsupported("cluster scope");
    T->failIfClustersUnsupported(".cluster scope fence");
  if (!T->hasSplitAcquireAndReleaseFences() &&
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_sys
                                    : NVPTX::INT_MEMBAR_SYS;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_cta
                                    : NVPTX::INT_MEMBAR_CTA;
      return NVPTX::atomic_thread_fence_acquire_cluster;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_gpu
                                    : NVPTX::INT_MEMBAR_GL;
          formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_sys
                                    : NVPTX::INT_MEMBAR_SYS;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_cta
                                    : NVPTX::INT_MEMBAR_CTA;
      return NVPTX::atomic_thread_fence_release_cluster;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_gpu
                                    : NVPTX::INT_MEMBAR_GL;
          formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_sys
                                    : NVPTX::INT_MEMBAR_SYS;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_cta
                                    : NVPTX::INT_MEMBAR_CTA;
      return NVPTX::atomic_thread_fence_acq_rel_cluster;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu
                                    : NVPTX::INT_MEMBAR_GL;
          formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys
                                    : NVPTX::INT_MEMBAR_SYS;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_cta
                                    : NVPTX::INT_MEMBAR_CTA;
      return NVPTX::atomic_thread_fence_seq_cst_cluster;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_gpu
                                    : NVPTX::INT_MEMBAR_GL;
      formatv("Unsupported \"{}\" ordering and \"{}\" scope for fence.",
              OrderingToString(O), ScopeToString(S)));
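// Illustrative mapping (a sketch, not taken from the listing): an LLVM IR
// "fence acquire" at device scope selects
// NVPTX::atomic_thread_fence_acquire_gpu when the subtarget reports
// hasMemoryOrdering() (roughly sm_70 with a recent enough PTX ISA), and falls
// back to the legacy NVPTX::INT_MEMBAR_GL (membar.gl) otherwise; the other
// ordering/scope combinations follow the same pattern in the switch above.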
std::pair<NVPTX::Ordering, NVPTX::Scope>
NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, SDValue &Chain,
  auto [InstructionOrdering, FenceOrdering] =
  auto Scope = getOperationScope(N, InstructionOrdering);
        formatv("Unexpected fence ordering: \"{}\".",
  return {InstructionOrdering, Scope};
void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
  assert(SrcAddrSpace != DstAddrSpace &&
         "addrspacecast must be between different address spaces");

    if (TM.is64Bit() && TM.getPointerSizeInBits(SrcAddrSpace) == 32) {
      SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u64_u32, DL, MVT::i64,

    switch (SrcAddrSpace) {
      Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global;
      Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared;
             "Shared cluster address space is only supported in 64-bit mode");
      Opc = NVPTX::cvta_shared_cluster_64;
      Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const;
      Opc = TM.is64Bit() ? NVPTX::cvta_local_64 : NVPTX::cvta_local;
      Opc = TM.is64Bit() ? NVPTX::cvta_param_64 : NVPTX::cvta_param;

    if (SrcAddrSpace != 0)

    switch (DstAddrSpace) {
      Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global;
      Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared;
             "Shared cluster address space is only supported in 64-bit mode");
      Opc = NVPTX::cvta_to_shared_cluster_64;
      Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const;
      Opc = TM.is64Bit() ? NVPTX::cvta_to_local_64 : NVPTX::cvta_to_local;
      Opc = TM.is64Bit() ? NVPTX::cvta_to_param_64 : NVPTX::cvta_to_param;

    SDNode *CVTA = CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src);
    if (TM.is64Bit() && TM.getPointerSizeInBits(DstAddrSpace) == 32) {
      CVTA = CurDAG->getMachineNode(NVPTX::CVT_u32_u64, DL, MVT::i32,
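// Illustrative lowering (hedged sketch, not taken from the listing): a 64-bit
// generic-to-shared cast such as
//   %p = addrspacecast ptr %g to ptr addrspace(3)
// goes through the DstAddrSpace switch and picks NVPTX::cvta_to_shared_64
// (the cvta.to.shared.u64 instruction); the reverse, shared-to-generic cast
// goes through the SrcAddrSpace switch and picks NVPTX::cvta_shared_64.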
static std::optional<unsigned>
pickOpcodeForVT(MVT::SimpleValueType VT, std::optional<unsigned> Opcode_i16,
                std::optional<unsigned> Opcode_i32,
                std::optional<unsigned> Opcode_i64) {
  return std::nullopt;
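// Usage sketch (the switch body is elided above): the helper maps a scalar
// value type to the matching opcode candidate, e.g.
// pickOpcodeForVT(MVT::i32, NVPTX::LD_i16, NVPTX::LD_i32, NVPTX::LD_i64)
// yields NVPTX::LD_i32, and it returns std::nullopt when no opcode is
// provided for that type.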
  return V.getOpcode() == ISD::ADD ||
         (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());

    N = N.getOperand(0);
                                        GA->getValueType(0), GA->getOffset(),
                                        GA->getTargetFlags());
                                          ES->getTargetFlags());

  APInt AccumulatedOffset(64u, 0);
      const APInt CI = CN->getAPIntValue().sext(64);
      if (!(CI + AccumulatedOffset).isSignedIntN(32))
      AccumulatedOffset += CI;
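// Note (hedged reading of the fragments above): isAddLike treats
// "or disjoint" the same as add, so an address computed as
//   %a = or disjoint i64 %base, 4
// can still have its constant folded into AccumulatedOffset, as long as the
// running total stays within a signed 32-bit immediate.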
bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
  assert(LD->readMem() && "Expected load");
  if (PlainLoad && PlainLoad->isIndexed())

  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);

  const unsigned FromTypeWidth = LD->getMemoryVT().getSizeInBits();
         FromTypeWidth <= 128 && "Invalid width for load");

                   getI32Imm(Scope, DL),
                   getI32Imm(CodeAddrSpace, DL),
                   getI32Imm(FromType, DL),
                   getI32Imm(FromTypeWidth, DL),

  const std::optional<unsigned> Opcode =
      pickOpcodeForVT(TargetVT, NVPTX::LD_i16, NVPTX::LD_i32, NVPTX::LD_i64);

  SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);

  MachineMemOperand *MemRef = LD->getMemOperand();

  switch (N->getOpcode()) {
bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
  const MVT EltVT = LD->getSimpleValueType(0);

  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);

  const unsigned ExtensionType =
      N->getConstantOperandVal(N->getNumOperands() - 1);
                             : NVPTX::PTXLdStInstCode::Untyped;

                   getI32Imm(Scope, DL),
                   getI32Imm(CodeAddrSpace, DL),
                   getI32Imm(FromType, DL),
                   getI32Imm(FromTypeWidth, DL),

  std::optional<unsigned> Opcode;
  switch (N->getOpcode()) {
                             NVPTX::LDV_i32_v2, NVPTX::LDV_i64_v2);
                             NVPTX::LDV_i32_v4, NVPTX::LDV_i64_v4);
                             NVPTX::LDV_i32_v8, {});

  SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);

  MachineMemOperand *MemRef = LD->getMemOperand();
bool NVPTXDAGToDAGISel::tryLDG(MemSDNode *LD) {
  unsigned ExtensionType;
    ExtensionType = Load->getExtensionType();
    ExtensionType = LD->getConstantOperandVal(LD->getNumOperands() - 1);
                             : NVPTX::PTXLdStInstCode::Untyped;

  assert(!(LD->getSimpleValueType(0).isVector() &&

  std::optional<unsigned> Opcode;
  switch (LD->getOpcode()) {
                             NVPTX::LD_GLOBAL_NC_i32, NVPTX::LD_GLOBAL_NC_i64);
                             NVPTX::LD_GLOBAL_NC_v2i32, NVPTX::LD_GLOBAL_NC_v2i64);
                             NVPTX::LD_GLOBAL_NC_v4i32, NVPTX::LD_GLOBAL_NC_v4i64);
                             NVPTX::LD_GLOBAL_NC_v8i32, {});

  SDNode *NVPTXLDG = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);

  auto ElementBitWidth = TotalWidth / NumElts;
         ElementBitWidth <= 128 && TotalWidth <= 256 &&
         "Invalid width for load");
  return ElementBitWidth;
bool NVPTXDAGToDAGISel::tryLDU(SDNode *N) {
  std::optional<unsigned> Opcode;
  switch (N->getOpcode()) {
                             NVPTX::LDU_GLOBAL_i32, NVPTX::LDU_GLOBAL_i64);
                             NVPTX::LDU_GLOBAL_v2i32, NVPTX::LDU_GLOBAL_v2i64);
                             NVPTX::LDU_GLOBAL_v4i32, {});

  SDNode *NVPTXLDU = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
  assert(ST->writeMem() && "Expected store");
  assert((PlainStore || AtomicStore) && "Expected store");
  if (PlainStore && PlainStore->isIndexed())

  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);

  const unsigned ToTypeWidth = ST->getMemoryVT().getSizeInBits();
         "Invalid width for store");

                   getI32Imm(Ordering, DL),
                   getI32Imm(Scope, DL),
                   getI32Imm(CodeAddrSpace, DL),
                   getI32Imm(ToTypeWidth, DL),

  const std::optional<unsigned> Opcode =
                      NVPTX::ST_i32, NVPTX::ST_i64);

  SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);

  MachineMemOperand *MemRef = ST->getMemOperand();
bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
  const unsigned TotalWidth = ST->getMemoryVT().getSizeInBits();

  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);

  for (auto &V : ST->ops().slice(1, NumElts))
    Ops.push_back(selectPossiblyImm(V));

  const unsigned ToTypeWidth = TotalWidth / NumElts;
         TotalWidth <= 256 && "Invalid width for store");

  Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
              getI32Imm(CodeAddrSpace, DL), getI32Imm(ToTypeWidth, DL), Base,

      ST->getOperand(1).getSimpleValueType().SimpleTy;
  std::optional<unsigned> Opcode;
  switch (ST->getOpcode()) {

  SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);

  MachineMemOperand *MemRef = ST->getMemOperand();
bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
  bool IsSigned = false;

    uint64_t MaskVal = Mask->getZExtValue();
    Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
      Val = LHS.getNode()->getOperand(0);
      Start = LHS.getNode()->getOperand(1);
      int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
      if (NumBits > GoodBits) {

      NumBits = NumZeros + NumOnes - ShiftAmt;
      if (ShiftAmt < NumZeros) {
      Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);

      Val = LHS->getOperand(0);
      if (OuterShiftAmt < InnerShiftAmt) {
      Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,

      Opc = NVPTX::BFE_S32rii;
      Opc = NVPTX::BFE_U32rii;
      Opc = NVPTX::BFE_S64rii;
      Opc = NVPTX::BFE_U64rii;
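// Illustrative pattern (hedged; the matching code is heavily elided above):
// an "and of a shift" such as
//   %x = lshr i32 %v, 4
//   %y = and i32 %x, 255
// extracts 8 bits starting at bit 4 and can be selected as a single
// NVPTX::BFE_U32rii (bfe.u32 %y, %v, 4, 8) instead of a shift plus a mask.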
bool NVPTXDAGToDAGISel::tryBF16ArithToFMA(SDNode *N) {
  EVT VT = SDValue(N, 0).getValueType();
  const NVPTXSubtarget *STI = TM.getSubtargetImpl();

    auto API = APF.bitcastToAPInt();
    API = API.concat(API);
      return SDValue(CurDAG->getMachineNode(NVPTX::MOV_B32_i, DL, VT, Const),
    return SDValue(CurDAG->getMachineNode(NVPTX::MOV_BF16_i, DL, VT, Const), 0);

  switch (N->getOpcode()) {
    Operands = {N0, GetConstant(1.0), N1};
    Operands = {N1, GetConstant(-1.0), N0};
    Operands = {N0, N1, GetConstant(-0.0)};

  int Opcode = IsVec ? NVPTX::FMA_BF16x2rrr : NVPTX::FMA_BF16rrr;
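// Rewrite sketch (hedged, based on the operand lists above): when the target
// lacks native bf16 add/sub/mul support, the node is re-expressed as an FMA:
//   fadd bf16 %a, %b  ->  fma(%a, 1.0, %b)
//   fsub bf16 %a, %b  ->  fma(%b, -1.0, %a)
//   fmul bf16 %a, %b  ->  fma(%a, %b, -0.0)
// using FMA_BF16rrr, or FMA_BF16x2rrr for the packed v2bf16 form.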
  if (V.getOpcode() == ISD::BITCAST)
    V = V.getOperand(0);
    return CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(V),
    return CurDAG->getTargetConstantFP(CN->getValueAPF(), SDLoc(V),
                                                 std::vector<SDValue> &OutOps) {
  switch (ConstraintID) {
    OutOps.push_back(Base);
    OutOps.push_back(Offset);
void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) {
  NewOps[0] = N->getOperand(0);
  if (N->getNumOperands() == 5)
    NewOps[3] = N->getOperand(4);

void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
  SDNode *Mov = CurDAG->getMachineNode(
      NVPTX::I128toV2I64, DL,
bool NVPTXDAGToDAGISel::tryFence(SDNode *N) {
  assert(N->getOpcode() == ISD::ATOMIC_FENCE);
  unsigned int FenceOp =
      Scopes[N->getConstantOperandVal(2)], Subtarget);
  SDNode *FenceNode = CurDAG->getMachineNode(FenceOp, DL, MVT::Other, Chain);

         "NVPTXScopes::operator[]");
  auto S = Scopes.find(ID);
  if (S == Scopes.end()) {
#define CP_ASYNC_BULK_TENSOR_OPCODE(dir, dim, mode, is_s32, suffix)            \
  (is_s32                                                                      \
       ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix   \
       : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)

#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(dim, mode, is_ch, is_s32)      \
  (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH))          \
         : (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, )))

#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32)   \
    if (is_mc && is_ch)                                                        \
      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC_CH);      \
      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _CH);         \
      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC);         \
    return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, );              \

           "GetCpAsyncBulkTensorS2GReductionOpcode.");
           "GetCpAsyncBulkTensorS2GReductionOpcode.");

static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
                                              bool IsMultiCast,
                                              bool IsCacheHint, bool IsIm2Col) {
                                                       IsCacheHint, IsShared32);
                                                       IsCacheHint, IsShared32);
                                                       IsCacheHint, IsShared32);
           "GetCpAsyncBulkTensorG2SOpcode.");
                                                      IsCacheHint, IsShared32);
                                                      IsCacheHint, IsShared32);
                                                      IsCacheHint, IsShared32);
                                                      IsCacheHint, IsShared32);
                                                      IsCacheHint, IsShared32);
           "Invalid Dimension in tile mode for GetCpAsyncBulkTensorG2SOpcode.");
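// How the helpers fit together (hedged summary of the fragments above): the
// dim/mode pair names the instruction family, is_s32 selects the SHARED32
// variant, and the _MC/_CH suffixes add the multicast and/or cache-hint
// operands. For example, CP_ASYNC_BULK_TENSOR_OPCODE(G2S, 3D, TILE,
// /*is_s32=*/false, _CH) resolves to NVPTX::CP_ASYNC_BULK_TENSOR_G2S_3D_TILE_CH
// purely by token pasting.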
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
                                                         bool IsIm2Col) {
  size_t NumOps = N->getNumOperands();
  size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
  bool IsCacheHint = N->getConstantOperandVal(NumOps - 2) == 1;
  bool IsMultiCast = N->getConstantOperandVal(NumOps - 3) == 1;
  size_t NumBaseArgs = NumDims + NumOffsets + 3;
  size_t MultiCastIdx = NumBaseArgs + 2;

  unsigned CTAGroupVal = N->getConstantOperandVal(NumOps - 1);
  if ((CTAGroupVal > 0) && !Subtarget->hasCpAsyncBulkTensorCTAGroupSupport())
        formatv("CpAsyncBulkTensorG2S cta_group::1/2 is not supported on sm_{}",

    Ops.push_back(N->getOperand(MultiCastIdx));
    Ops.push_back(N->getOperand(MultiCastIdx + 1));
  Ops.push_back(getI32Imm(CTAGroupVal, DL));
  Ops.push_back(N->getOperand(0));

      NumDims, IsShared32, IsMultiCast, IsCacheHint, IsIm2Col);
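// Worked operand count (hedged, derived from the arithmetic above, assuming
// NumDims = 3 for the 3D variants): a 3D im2col copy has NumOffsets = 1, so
// NumBaseArgs = 3 + 1 + 3 = 7 and the optional multicast operand sits at
// MultiCastIdx = 9; a 3D tile copy has NumOffsets = 0, giving NumBaseArgs = 6
// and MultiCastIdx = 8.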
void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
                                                            unsigned RedOp,
                                                            bool IsIm2Col) {
  size_t NumOps = N->getNumOperands();
  size_t NumDims = NumOps - 6;
  bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
  size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2);

  Ops.push_back(getI32Imm(RedOp, DL));
  Ops.push_back(N->getOperand(0));

      NumDims, IsShared32, IsCacheHint, IsIm2Col);
#define TCGEN05_ST_OPCODE(SHAPE, NUM)                                          \
  (enableUnpack ? NVPTX::TCGEN05_ST_##SHAPE##_##NUM##_UNPACK                   \
                : NVPTX::TCGEN05_ST_##SHAPE##_##NUM)
static unsigned getTcgen05StOpcode(unsigned IID, bool enableUnpack) {
  switch (IID) {
  case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
void NVPTXDAGToDAGISel::SelectTcgen05St(SDNode *N, bool hasOffset) {
  for (unsigned I = hasOffset ? 4 : 3; I < (N->getNumOperands() - 1); I++)
bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(1);
  auto CastTy = [](TMARedTy Op) { return static_cast<unsigned>(Op); };
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d:
    SelectCpAsyncBulkTensorG2SCommon(N);
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
    SelectCpAsyncBulkTensorG2SCommon(N, true);
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD),
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN),
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX),
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC),
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC),
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND),
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR),
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR),
  case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x32: {
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: {
    SelectTcgen05St(N, true);
void NVPTXDAGToDAGISel::selectAtomicSwap128(SDNode *N) {
  Ops.append(N->op_begin() + 2, N->op_end());
              getI32Imm(getMemOrder(AN), dl),
              getI32Imm(getAtomicScope(AN), dl),
                      ? NVPTX::ATOM_EXCH_B128
                      : NVPTX::ATOM_CAS_B128;
  auto *ATOM = CurDAG->getMachineNode(Opcode, dl, N->getVTList(), Ops);
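// Selection note (hedged reading of the fragments above): the 128-bit atomic
// path appends the value operands, the memory-order and scope immediates, and
// then picks NVPTX::ATOM_EXCH_B128 for a plain exchange or
// NVPTX::ATOM_CAS_B128 for the ATOMIC_CMP_SWAP_B128 form, emitting a single
// machine node that reuses the original node's value list.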