#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"

    cl::desc("Do not align and prefetch loops"),

    "amdgpu-use-divergent-register-indexing", cl::Hidden,
    cl::desc("Use indirect register addressing for divergent indexes"),

    cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "

  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
      return AMDGPU::SGPR0 + Reg;
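// SITargetLowering constructor: the calls below register which SelectionDAG
// operations are Legal, Custom, or Expand for each MVT, keyed off subtarget
// features (16-bit insts, VOP3P, packed FP32, BF16, IEEE min/max, etc.).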
  if (Subtarget->has16BitInsts()) {
    if (Subtarget->useRealTrue16Insts()) {

                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},

                      ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
                      ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
                      ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
                      ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
                      ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,

                     {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);

                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
                     {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
                      MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
                      MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
                     {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
                      MVT::v3i16, MVT::v4i16, MVT::Other},
                     {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);

       {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
        MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
        MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
        MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
        MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
        MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
        MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {

  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {

                     {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
                      MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},

  if (Subtarget->hasPkMovB32()) {

                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},

                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);

                     {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},

  if (Subtarget->hasSMemRealTime() ||

  if (Subtarget->has16BitInsts()) {

  if (Subtarget->hasMadMacF32Insts())

  if (!Subtarget->hasBFI())

  if (!Subtarget->hasBCNT(32))

  if (!Subtarget->hasBCNT(64))

  if (Subtarget->hasFFBH())

  if (Subtarget->hasFFBL())

  if (Subtarget->hasBFE())

  if (Subtarget->hasIntClamp())

  if (Subtarget->hasAddNoCarry())

      {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
      {MVT::f32, MVT::f64}, Custom);

                     {MVT::f32, MVT::f64}, Legal);

  if (Subtarget->haveRoundOpsF64())

  if (Subtarget->has16BitInsts()) {

                        ISD::FSIN, ISD::FROUND},

  if (Subtarget->hasBF16TransInsts())

       {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
        MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
        MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {

                     {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
                      MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
                      MVT::v32f16, MVT::v32bf16},

      {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},

                     {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

                     {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

       {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {

  if (Subtarget->hasVOP3PInsts()) {

                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);

                     {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
                      MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
                      MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},

    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})

    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})

        {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
        {MVT::v2f16, MVT::v4f16}, Custom);

  if (Subtarget->hasBF16PackedInsts()) {
    for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})

  if (Subtarget->hasPackedFP32Ops()) {
                     {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},

  if (Subtarget->has16BitInsts()) {

                     {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
                      MVT::v32f16, MVT::v32bf16},

  if (Subtarget->hasVectorMulU64())
  else if (Subtarget->hasScalarSMulU64())

  if (Subtarget->hasMad64_32())

  if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())

  if (Subtarget->hasIEEEMinimumMaximumInsts()) {
                     {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);

  if (Subtarget->hasMinimum3Maximum3F32())

  if (Subtarget->hasMinimum3Maximum3PKF16()) {
    if (!Subtarget->hasMinimum3Maximum3F16())

  if (Subtarget->hasVOP3PInsts()) {
                     {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

  if (Subtarget->hasIntMinMax64())

                     {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
                      MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,

                     {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
                      MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
                      MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
                      MVT::i16, MVT::bf16, MVT::i8, MVT::i128},

                     {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
                      MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
                      MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},

  if (Subtarget->hasBF16ConversionInsts()) {
  if (Subtarget->hasBF16PackedInsts()) {
  if (Subtarget->hasBF16TransInsts()) {

  if (Subtarget->hasCvtPkF16F32Inst()) {
                     {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},

  if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())

                       ISD::ATOMIC_CMP_SWAP,
                       ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
                       ISD::ATOMIC_LOAD_ADD,
                       ISD::ATOMIC_LOAD_SUB,
                       ISD::ATOMIC_LOAD_AND,
                       ISD::ATOMIC_LOAD_OR,
                       ISD::ATOMIC_LOAD_XOR,
                       ISD::ATOMIC_LOAD_NAND,
                       ISD::ATOMIC_LOAD_MIN,
                       ISD::ATOMIC_LOAD_MAX,
                       ISD::ATOMIC_LOAD_UMIN,
                       ISD::ATOMIC_LOAD_UMAX,
                       ISD::ATOMIC_LOAD_FADD,
                       ISD::ATOMIC_LOAD_FMIN,
                       ISD::ATOMIC_LOAD_FMAX,
                       ISD::ATOMIC_LOAD_UINC_WRAP,
                       ISD::ATOMIC_LOAD_UDEC_WRAP,

  static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
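// These checks appear to implement the mixed-precision fold query: an fpext
// of a 16-bit (f16/bf16) operand can be folded into FMA/FMAD when the
// subtarget has mad_mix / fma_mix instructions.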
                                         EVT DestVT, EVT SrcVT) const {
         ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
            (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
           (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&

                                         LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
         SrcTy.getScalarSizeInBits() == 16 &&
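// Calling-convention type splitting: with 16-bit instructions, 16-bit scalars
// are packed two per 32-bit register; otherwise everything is widened to
// 32-bit registers (i32/f32).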
  if (Subtarget->has16BitInsts()) {
      return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
    return VT.isInteger() ? MVT::i32 : MVT::f32;

  return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;

    if (Size == 16 && Subtarget->has16BitInsts())
      return (NumElts + 1) / 2;

    return NumElts * ((Size + 31) / 32);

    unsigned &NumIntermediates, MVT &RegisterVT) const {
    if (Size == 16 && Subtarget->has16BitInsts()) {
      if (ScalarVT == MVT::bf16) {
        RegisterVT = MVT::i32;
        IntermediateVT = MVT::v2bf16;
        RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
        IntermediateVT = RegisterVT;
      NumIntermediates = (NumElts + 1) / 2;
      return NumIntermediates;

      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

      RegisterVT = MVT::i16;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

      RegisterVT = MVT::i32;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

    RegisterVT = MVT::i32;
    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts * ((Size + 31) / 32);
    return NumIntermediates;

      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
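// Memory-type helpers for image/buffer intrinsics: the in-memory vector type
// is clamped to MaxNumLanes elements (the lanes the intrinsic actually uses).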
                                 unsigned MaxNumLanes) {
  assert(MaxNumLanes != 0);
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());

                                 unsigned MaxNumLanes) {
  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));

    return MVT::amdgpuBufferFatPointer;
      DL.getPointerSizeInBits(AS) == 192)
    return MVT::amdgpuBufferStridedPointer;

       DL.getPointerSizeInBits(AS) == 160) ||
       DL.getPointerSizeInBits(AS) == 192))

  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:

                                          unsigned IntrID) const {
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))

    if (RsrcIntr->IsImage) {
      Info.ptrVal = RsrcArg;

    bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
      if (RsrcIntr->IsImage) {
        unsigned MaxNumLanes = 4;
            std::numeric_limits<unsigned>::max());
      if (RsrcIntr->IsImage) {
    if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
      Info.memVT = MVT::i32;

  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {

  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
        std::numeric_limits<unsigned>::max());

  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {

  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.ptrVal = nullptr;

  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {

  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
    Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
    Info.memVT = MVT::i64;

  case Intrinsic::amdgcn_global_atomic_csub: {

  case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
  case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
        MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
                       ->getElementType(0));

  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_atomic_cond_sub_u32: {

  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128:
  case Intrinsic::amdgcn_cluster_load_b32:
  case Intrinsic::amdgcn_cluster_load_b64:
  case Intrinsic::amdgcn_cluster_load_b128:
  case Intrinsic::amdgcn_ds_load_tr6_b96:
  case Intrinsic::amdgcn_ds_load_tr4_b64:
  case Intrinsic::amdgcn_ds_load_tr8_b64:
  case Intrinsic::amdgcn_ds_load_tr16_b128:
  case Intrinsic::amdgcn_global_load_tr6_b96:
  case Intrinsic::amdgcn_global_load_tr4_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64: {

  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {

  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {

  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    Info.memVT = MVT::i32;
    Info.align = Align(4);
    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)

  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {

  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128: {

  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds: {

  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
    Info.memVT = MVT::i32;
    Info.align = Align(4);

  case Intrinsic::amdgcn_s_prefetch_data:
  case Intrinsic::amdgcn_flat_prefetch:
  case Intrinsic::amdgcn_global_prefetch: {

  case Intrinsic::amdgcn_addrspacecast_nonnull: {
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();
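// getAddrModeArguments: for the AMDGPU memory intrinsics listed below, report
// which operand is the pointer (and its access type), likely so addressing
// mode optimizations can fold offsets into these accesses.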
                                            Type *&AccessTy) const {
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_cluster_load_b128:
  case Intrinsic::amdgcn_cluster_load_b64:
  case Intrinsic::amdgcn_cluster_load_b32:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_load_tr8_b64:
  case Intrinsic::amdgcn_ds_load_tr16_b128:
  case Intrinsic::amdgcn_ds_load_tr4_b64:
  case Intrinsic::amdgcn_ds_load_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_global_atomic_csub:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128:
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_global_load_tr4_b64:
  case Intrinsic::amdgcn_global_load_tr6_b96:
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128:
    Ptr = II->getArgOperand(0);
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds:
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
    Ptr = II->getArgOperand(1);
  AccessTy = II->getType();
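// Addressing-mode legality: FLAT/global addressing allows no scale and only
// immediate offsets that isLegalFLATOffset accepts; MUBUF has its own
// immediate-offset and base-register rules checked separately.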
                                                 unsigned AddrSpace) const {
  if (!Subtarget->hasFlatInstOffsets()) {
  return AM.Scale == 0 &&
         (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
                                  AM.BaseOffs, AddrSpace, FlatVariant));

  if (Subtarget->hasFlatGlobalInsts())
  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
  return isLegalMUBUFAddressingMode(AM);

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
  if (AM.HasBaseReg) {

    return isLegalMUBUFAddressingMode(AM);
    if (!Subtarget->hasScalarSubwordLoads()) {
      if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)

    return Subtarget->enableFlatScratch()
               : isLegalMUBUFAddressingMode(AM);
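// allowsMisalignedMemoryAccesses: per address space (LDS, scratch, buffer /
// global), decide whether an unaligned access of the given size is legal and
// report a relative speed through *IsFast.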
    unsigned Size, unsigned AddrSpace, Align Alignment,

    if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))

    Align RequiredAlignment(
    if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
        Alignment < RequiredAlignment)

      if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))

      RequiredAlignment = Align(4);
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 64
                    : (Alignment < Align(4))         ? 32

      if (!Subtarget->hasDS96AndDS128())
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 96
                    : (Alignment < Align(4))         ? 32

      if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())

      RequiredAlignment = Align(8);
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 128
                    : (Alignment < Align(4))         ? 32

      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;

    return Alignment >= RequiredAlignment ||
           Subtarget->hasUnalignedDSAccessEnabled();

    bool AlignedBy4 = Alignment >= Align(4);
    if (Subtarget->hasUnalignedScratchAccessEnabled()) {
        *IsFast = AlignedBy4 ? Size : 1;

      *IsFast = AlignedBy4;

    return Alignment >= Align(4) ||
           Subtarget->hasUnalignedBufferAccessEnabled();

  if (!Subtarget->hasRelaxedBufferOOBMode() &&

  return Size >= 32 && Alignment >= Align(4);

                                                    unsigned *IsFast) const {
                                            Alignment, Flags, IsFast);

                                          const AttributeList &FuncAttributes) const {
  if (Op.size() >= 16 &&
  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))

                                               unsigned DestAS) const {
      Subtarget->hasGloballyAddressableScratch()) {

                                               unsigned Index) const {
  if (Subtarget->has16BitInsts() && VT == MVT::i16) {
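// Kernel-argument lowering helpers: form a pointer into the kernarg segment,
// load the value (small types are loaded from a dword-aligned offset and
// shifted), and convert it to the expected argument type.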
  auto [InputPtrReg, RC, ArgTy] =
      Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

                                           const SDLoc &SL) const {

                                         const SDLoc &SL) const {
  std::optional<uint32_t> KnownSize =
  if (KnownSize.has_value())

    Val = getFPExtOrFPRound(DAG, Val, SL, VT);

SDValue SITargetLowering::lowerKernargMemParameter(
    int64_t OffsetDiff = Offset - AlignDownOffset;
    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
    ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);

                                             const SDLoc &SL) const {
    return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);

      ExtType, SL, VA.getLocVT(), Chain, FIN,
  SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
  if (ConvertedVal == ArgValue)
    return ConvertedVal;

SDValue SITargetLowering::lowerWorkGroupId(
  if (!Subtarget->hasClusters())
    return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);

  SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
  SDLoc SL(ClusterIdXYZ);
  SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
  SDValue ClusterWorkGroupIdXYZ =
      getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
    return ClusterIdXYZ;

  using namespace AMDGPU::Hwreg;
      DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
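// getPreloadedValue: return a preloaded work-group / cluster ID input, either
// from an argument SGPR, from an architected register via S_GETREG, or as a
// constant when the cluster dimension is statically known to be 1.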
SDValue SITargetLowering::getPreloadedValue(
  const ArgDescriptor *Reg = nullptr;
  const TargetRegisterClass *RC;

  const ArgDescriptor WorkGroupIDX =
  const ArgDescriptor WorkGroupIDZ =
  const ArgDescriptor ClusterWorkGroupIDX =
  const ArgDescriptor ClusterWorkGroupIDY =
  const ArgDescriptor ClusterWorkGroupIDZ =
  const ArgDescriptor ClusterWorkGroupMaxIDX =
  const ArgDescriptor ClusterWorkGroupMaxIDY =
  const ArgDescriptor ClusterWorkGroupMaxIDZ =
  const ArgDescriptor ClusterWorkGroupMaxFlatID =

  auto LoadConstant = [&](unsigned N) {

  if (Subtarget->hasArchitectedSGPRs() &&
      Reg = &WorkGroupIDX;
      RC = &AMDGPU::SReg_32RegClass;
      Reg = &WorkGroupIDY;
      RC = &AMDGPU::SReg_32RegClass;
      Reg = &WorkGroupIDZ;
      RC = &AMDGPU::SReg_32RegClass;
      if (HasFixedDims && ClusterDims.getDims()[0] == 1)
        return LoadConstant(0);
      Reg = &ClusterWorkGroupIDX;
      RC = &AMDGPU::SReg_32RegClass;
      if (HasFixedDims && ClusterDims.getDims()[1] == 1)
        return LoadConstant(0);
      Reg = &ClusterWorkGroupIDY;
      RC = &AMDGPU::SReg_32RegClass;
      if (HasFixedDims && ClusterDims.getDims()[2] == 1)
        return LoadConstant(0);
      Reg = &ClusterWorkGroupIDZ;
      RC = &AMDGPU::SReg_32RegClass;
        return LoadConstant(ClusterDims.getDims()[0] - 1);
      Reg = &ClusterWorkGroupMaxIDX;
      RC = &AMDGPU::SReg_32RegClass;
        return LoadConstant(ClusterDims.getDims()[1] - 1);
      Reg = &ClusterWorkGroupMaxIDY;
      RC = &AMDGPU::SReg_32RegClass;
        return LoadConstant(ClusterDims.getDims()[2] - 1);
      Reg = &ClusterWorkGroupMaxIDZ;
      RC = &AMDGPU::SReg_32RegClass;
      Reg = &ClusterWorkGroupMaxFlatID;
      RC = &AMDGPU::SReg_32RegClass;
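// Shader/kernel argument allocation: track which PS inputs are used, and
// reserve VGPR0-VGPR2 (or the packed TID in VGPR0) for work-item ID X/Y/Z.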
  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
           "vector type argument should have been split");
    bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
             "unexpected vector split in ps argument type");
      Info->markPSInputAllocated(PSInputNum);
        Info->markPSInputEnabled(PSInputNum);

  if (Info.hasWorkItemIDX()) {
        (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;

  if (Info.hasWorkItemIDY()) {
    assert(Info.hasWorkItemIDX());
    if (Subtarget->hasPackedTID()) {
      Info.setWorkItemIDY(
      unsigned Reg = AMDGPU::VGPR1;

  if (Info.hasWorkItemIDZ()) {
    assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
    if (Subtarget->hasPackedTID()) {
      Info.setWorkItemIDZ(
      unsigned Reg = AMDGPU::VGPR2;

  if (RegIdx == ArgVGPRs.size()) {
  unsigned Reg = ArgVGPRs[RegIdx];

                              unsigned NumArgRegs) {
  if (RegIdx == ArgSGPRs.size())
  unsigned Reg = ArgSGPRs[RegIdx];

  const unsigned Mask = 0x3ff;
  if (Info.hasWorkItemIDX()) {
    Info.setWorkItemIDX(Arg);
  if (Info.hasWorkItemIDY()) {
    Info.setWorkItemIDY(Arg);
  if (Info.hasWorkItemIDZ())

  const unsigned Mask = 0x3ff;

  auto &ArgInfo = Info.getArgInfo();

  if (Info.hasImplicitArgPtr())
  if (Info.hasWorkGroupIDX())
  if (Info.hasWorkGroupIDY())
  if (Info.hasWorkGroupIDZ())
  if (Info.hasLDSKernelId())

    Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);

    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);

    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);

    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);

    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);

    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);

    Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
    MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
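// Kernarg preloading: walk the in-reg kernel arguments in order, assigning
// them to consecutive user SGPRs and inserting padding SGPRs where argument
// offsets leave gaps; the sequence stops at the first argument that no longer
// fits.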
  unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();

  bool InPreloadSequence = true;
  bool AlignedForImplictArgs = false;
  unsigned ImplicitArgOffset = 0;
  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())

    unsigned ArgIdx = Arg.getArgNo();
    if (InIdx < Ins.size() &&
        (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))

    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           Ins[InIdx].getOrigArgIndex() == ArgIdx;
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];
      unsigned ArgOffset = ArgLoc.getLocMemOffset();
      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

      if (Arg.hasAttribute("amdgpu-hidden-argument")) {
        if (!AlignedForImplictArgs) {
              alignTo(LastExplicitArgOffset,
                      Subtarget->getAlignmentForImplicitArgPtr()) -
              LastExplicitArgOffset;
          AlignedForImplictArgs = true;
        ArgOffset += ImplicitArgOffset;

      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        assert(InIdx >= 1 && "No previous SGPR");
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);

      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
        InPreloadSequence = false;

          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);

      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {

      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;

  if (Info.hasLDSKernelId()) {
    Register Reg = Info.addLDSKernelId();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

                                          bool IsShader) const {
  bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
  if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");

    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
    unsigned NumRequiredSystemSGPRs =
        Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
        Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
      Register Reg = Info.addReservedUserSGPR();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      Register Reg = Info.addWorkGroupIDX();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDY()) {
      Register Reg = Info.addWorkGroupIDY();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDZ()) {
      Register Reg = Info.addWorkGroupIDZ();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupInfo()) {
    Register Reg = Info.addWorkGroupInfo();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    unsigned PrivateSegmentWaveByteOffsetReg;
      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);

  assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
         Info.getNumPreloadedSGPRs() >= 16);
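// Frame register selection and formal-argument lowering: pick the scratch
// RSRC, stack-pointer, and frame-pointer registers, then lower incoming
// arguments for kernels (kernarg loads or preloaded SGPRs) and for callable
// functions (register and stack locations).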
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

    HasStackObjects = true;

  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
      Info.setScratchRSrcReg(ReservedBufferReg);

    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);

      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);

      if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)

  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);

  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());

    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;

    Entry->addLiveIn(*I);

    for (auto *Exit : Exits)
              TII->get(TargetOpcode::COPY), *I)

  bool IsError = false;
        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));

           !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
           !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());

    if (!Subtarget->enableFlatScratch())
        !Subtarget->hasArchitectedSGPRs())
      assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
             !Info->hasWorkGroupIDZ());

  bool IsWholeWaveFunc = Info->isWholeWaveFunction();

    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);

    if (Subtarget->isAmdPalOS()) {
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
  } else if (IsKernel) {
    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
    Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),

    if (IsKernel && Subtarget->hasKernargPreload())
  } else if (!IsGraphics) {
    if (!Subtarget->enableFlatScratch())

    Info->setNumWaveDispatchSGPRs(
    Info->setNumWaveDispatchVGPRs(
  } else if (Info->getNumKernargPreloadedSGPRs()) {
    Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());

  if (IsWholeWaveFunc) {
        {MVT::i1, MVT::Other}, Chain);

  for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;

    if (IsEntryFunc && VA.isMemLoc()) {

      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
          int64_t OffsetDiff = Offset - AlignDownOffset;
              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];

          ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;

          if (PreloadRegs.size() == 1) {
            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
                                       TRI->getRegSizeInBits(*RC)));

            for (auto Reg : PreloadRegs) {
                                          PreloadRegs.size()),

          NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

              "hidden argument in kernel signature was not preloaded",

            lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                     Alignment, Ins[i].Flags.isSExt(), &Ins[i]);

    if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);

    if (AMDGPU::VGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::VGPR_32RegClass;
    else if (AMDGPU::SGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::SGPR_32RegClass;

    Val = convertABITypeToValueType(DAG, Val, VA, DL);

  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain
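// Return lowering: verify return values fit in the allocatable VGPRs for the
// calling convention and copy each value into its assigned return register.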
                                      const Type *RetTy) const {
  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);

  unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
  unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
    if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))

  Info->setIfReturnsVoid(Outs.empty());
  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {
    SDValue Arg = OutVals[RealRVLocIdx];
                        ReadFirstLane, Arg);

  if (!Info->isEntryFunction()) {
      if (AMDGPU::SReg_64RegClass.contains(*I))
      else if (AMDGPU::SReg_32RegClass.contains(*I))
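// Call lowering: forward the implicit special inputs the callee still needs
// (work-group and work-item IDs, implicit argument pointer), check tail-call
// eligibility, then emit argument copies, byval stack stores, and the call or
// tail-call node (including amdgcn.cs.chain calls with their EXEC/flags
// operands).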
    auto &ArgUsageInfo =
    CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);

    const auto [OutgoingArg, ArgRC, ArgTy] =
    const auto [IncomingArg, IncomingArgRC, Ty] =
    assert(IncomingArgRC == ArgRC);

    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
        InputReg = getImplicitArgPtr(DAG, DL);
        std::optional<uint32_t> Id =
        if (Id.has_value()) {

    if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
      unsigned SpecialArgOffset =

  auto [OutgoingArg, ArgRC, Ty] =
    std::tie(OutgoingArg, ArgRC, Ty) =
    std::tie(OutgoingArg, ArgRC, Ty) =

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");

    if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
      NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
      NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {

  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
                           : IncomingArgY ? *IncomingArgY

  if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);

  if (Callee->isDivergent())

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CallerPreserved)

  bool CCMatch = CallerCC == CalleeCC;

    if (Arg.hasByValAttr())

    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
    if (!CCVA.isRegLoc())

    if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
      dbgs() << "Cannot tail call due to divergent outgoing argument in "

enum ChainCallArgIdx {

  bool UsesDynamicVGPRs = false;
  if (IsChainCallConv) {
    auto RequestedExecIt =
          return Arg.OrigArgIndex == 2;
    assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");

    size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
    CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
           "Haven't popped all the special args");

        CLI.Args[ChainCallArgIdx::Exec];
    if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
            ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
        ChainCallSpecialArgs.push_back(Arg.Node);

    PushNodeOrTargetConstant(RequestedExecArg);

      if (FlagsValue.isZero()) {
        if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
              "no additional args allowed if flags == 0");
        if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
        if (!Subtarget->isWave32()) {
              CLI, InVals, "dynamic VGPR mode is only supported for wave32");
        UsesDynamicVGPRs = true;
        std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
                      CLI.Args.end(), PushNodeOrTargetConstant);

  bool IsSibCall = false;
        "unsupported call to variadic function ");
          "unsupported required tail call to function ");
        Outs, OutVals, Ins, DAG);
           "site marked musttail or on llvm.amdgcn.cs.chain");

    if (!TailCallOpt && IsTailCall)

  auto *TRI = Subtarget->getRegisterInfo();

  if (!IsSibCall || IsChainCallConv) {
    if (!Subtarget->enableFlatScratch()) {
      RegsToPass.emplace_back(IsChainCallConv
                                  ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                  : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,

  const unsigned NumSpecialInputs = RegsToPass.size();

  MVT PtrVT = MVT::i32;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
      int32_t Offset = LocMemOffset;

      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
                            ? Flags.getNonZeroByValAlign()

      if (Outs[i].Flags.isByVal()) {
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
            Outs[i].Flags.getNonZeroByValAlign(),
            nullptr, std::nullopt, DstInfo,
            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);

  if (!MemOpChains.empty())

    TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,

  unsigned ArgIdx = 0;
  for (auto [Reg, Val] : RegsToPass) {
    if (ArgIdx++ >= NumSpecialInputs &&
        (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {

  if (IsTailCall && !IsSibCall) {

  std::vector<SDValue> Ops({Chain});
    Ops.push_back(Callee);
    Ops.push_back(Callee);

  if (IsChainCallConv)

  for (auto &[Reg, Val] : RegsToPass)

  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");

                                  MVT::Glue, GlueOps),

    Ops.push_back(InGlue);

  if (Info->isWholeWaveFunction())

  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;
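// Dynamic stack allocation scales sizes by the wavefront size (the scratch
// offset is per-lane), and the rounding-mode / FP-environment lowerings read
// and write hardware registers via S_GETREG/S_SETREG-style nodes.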
  EVT VT = Op.getValueType();
         "Stack grows upwards for AMDGPU");
  Chain = BaseAddr.getValue(1);

  if (Alignment > StackAlign) {
                                << Subtarget->getWavefrontSizeLog2();
    uint64_t StackAlignMask = ScaledAlignment - 1;

  assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));

  if (Op.getValueType() != MVT::i32)

  assert(Op.getValueType() == MVT::i32);
                  Op.getOperand(0), IntrinID, GetRoundBothImm);

  SDValue RoundModeTimesNumBits =
                                   TableEntry, EnumOffset);
          static_cast<uint32_t>(ConstMode->getZExtValue()),

  if (UseReducedTable) {
    SDValue RoundModeTimesNumBits =
    SDValue RoundModeTimesNumBits =
    NewMode = TruncTable;
                          ReadFirstLaneID, NewMode);
                            IntrinID, RoundBothImm, NewMode);

  if (Op->isDivergent() &&
      (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))

  if (Subtarget->hasSafeSmemPrefetch())
  if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))

  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();

  EVT DstVT = Op.getValueType();
    return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);

  if (Op.getValueType() != MVT::i64)
                  Op.getOperand(0), IntrinID, ModeHwRegImm);
                  Op.getOperand(0), IntrinID, TrapHwRegImm);
  SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);

  if (Op.getOperand(1).getValueType() != MVT::i64)
                                   ReadFirstLaneID, NewModeReg);
                                   ReadFirstLaneID, NewTrapReg);

  unsigned ModeHwReg =
  unsigned TrapHwReg =
                                 IntrinID, ModeHwRegImm, NewModeReg);
                                 IntrinID, TrapHwRegImm, NewTrapReg);

          .Case("m0", AMDGPU::M0)
          .Case("exec", AMDGPU::EXEC)
          .Case("exec_lo", AMDGPU::EXEC_LO)
          .Case("exec_hi", AMDGPU::EXEC_HI)
          .Case("flat_scratch", AMDGPU::FLAT_SCR)
          .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
          .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)

  if (!Subtarget->hasFlatScrRegister() &&
      Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
                             "\" for subtarget."));

  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:
  case AMDGPU::FLAT_SCR:

    MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
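// Custom-inserter helpers for indirect vector indexing with a possibly
// divergent index: split the block, loop with V_READFIRSTLANE over the unique
// index values, and use either GPR-index mode or V_MOVREL to access the
// selected element.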
static std::pair<MachineBasicBlock *, MachineBasicBlock *>
  auto Next = std::next(I);

  MBB.addSuccessor(LoopBB);

  return std::pair(LoopBB, RemainderBB);

  auto I = MI.getIterator();
  auto E = std::next(I);

    Src->setIsKill(false);

  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))

  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)

    unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
    unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,

  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
      MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)

  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {
      SGPRIdxReg = CurrentIdxReg;
      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)

      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)

    unsigned InitResultReg, unsigned PhiReg, int Offset,
    bool UseGPRIdxMode, Register &SGPRIdxReg) {

  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);

                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);

  LoopBB->removeSuccessor(RemainderBB);
  LoopBB->addSuccessor(LandingPad);

static std::pair<unsigned, int>
  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;

    return std::pair(AMDGPU::sub0, Offset);

  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)

  MI.eraseFromParent();

  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (Idx->getReg() == AMDGPU::NoRegister) {
    MI.eraseFromParent();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);

      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          TRI.getRegSizeInBits(*VecRC), 32, false);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(VecRC);

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        TRI.getRegSizeInBits(*VecRC), 32, false);
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)

  MI.eraseFromParent();

  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
  if (ST.hasScalarAddSub64()) {
    unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  MI.eraseFromParent();
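// Wave-reduction expansion: each reduction starts from the identity value for
// its operation; uniform sources can be combined with a popcount of EXEC,
// while divergent sources are reduced in a loop over the active lanes.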
  case AMDGPU::S_MIN_U32:
    return std::numeric_limits<uint32_t>::max();
  case AMDGPU::S_MIN_I32:
    return std::numeric_limits<int32_t>::max();
  case AMDGPU::S_MAX_U32:
    return std::numeric_limits<uint32_t>::min();
  case AMDGPU::S_MAX_I32:
    return std::numeric_limits<int32_t>::min();
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_SUB_I32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
    return std::numeric_limits<uint32_t>::min();
  case AMDGPU::S_AND_B32:
    return std::numeric_limits<uint32_t>::max();
      "Unexpected opcode in getIdentityValueFor32BitWaveReduction");

  case AMDGPU::V_CMP_LT_U64_e64:
    return std::numeric_limits<uint64_t>::max();
  case AMDGPU::V_CMP_LT_I64_e64:
    return std::numeric_limits<int64_t>::max();
  case AMDGPU::V_CMP_GT_U64_e64:
    return std::numeric_limits<uint64_t>::min();
  case AMDGPU::V_CMP_GT_I64_e64:
    return std::numeric_limits<int64_t>::min();
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO:
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_XOR_B64:
    return std::numeric_limits<uint64_t>::min();
  case AMDGPU::S_AND_B64:
    return std::numeric_limits<uint64_t>::max();
      "Unexpected opcode in getIdentityValueFor64BitWaveReduction");

  return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
         Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
         Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
         Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
         Opc == AMDGPU::S_XOR_B32;

  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));

    case AMDGPU::S_MIN_U32:
    case AMDGPU::S_MIN_I32:
    case AMDGPU::S_MAX_U32:
    case AMDGPU::S_MAX_I32:
    case AMDGPU::S_AND_B32:
    case AMDGPU::S_OR_B32: {

    case AMDGPU::V_CMP_LT_U64_e64:
    case AMDGPU::V_CMP_LT_I64_e64:
    case AMDGPU::V_CMP_GT_U64_e64:
    case AMDGPU::V_CMP_GT_I64_e64:
    case AMDGPU::S_AND_B64:
    case AMDGPU::S_OR_B64: {

    case AMDGPU::S_XOR_B32:
    case AMDGPU::S_XOR_B64:
    case AMDGPU::S_ADD_I32:
    case AMDGPU::S_ADD_U64_PSEUDO:
    case AMDGPU::S_SUB_I32:
    case AMDGPU::S_SUB_U64_PSEUDO: {

      Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

      bool IsWave32 = ST.isWave32();
      unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
      unsigned BitCountOpc =
          IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;

      auto NewAccumulator =

      case AMDGPU::S_XOR_B32:
      case AMDGPU::S_XOR_B64: {
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            .addReg(NewAccumulator->getOperand(0).getReg())
        if (Opc == AMDGPU::S_XOR_B32) {
              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
              TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
          BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)

      case AMDGPU::S_SUB_I32: {
        Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
            .addReg(NewAccumulator->getOperand(0).getReg());

      case AMDGPU::S_ADD_I32: {
            .addReg(NewAccumulator->getOperand(0).getReg());

      case AMDGPU::S_ADD_U64_PSEUDO:
      case AMDGPU::S_SUB_U64_PSEUDO: {
        Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);

        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
              .addReg(NewAccumulator->getOperand(0).getReg())

        Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
                                 : NewAccumulator->getOperand(0).getReg();

        Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;

        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {

        BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)

    Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
    Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
    Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
    Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
    Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
    Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);

    bool IsWave32 = ST.isWave32();
    unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

      BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
      BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)

    I = ComputeLoop->begin();
        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
    I = ComputeLoop->end();

    unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;

      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),

          MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
          MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
          TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
          MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
          MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
      auto LaneValue = BuildMI(*ComputeLoop, I, DL,
                               TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)

      case AMDGPU::S_OR_B64:
      case AMDGPU::S_AND_B64:
      case AMDGPU::S_XOR_B64: {
                .addReg(LaneValue->getOperand(0).getReg())

      case AMDGPU::V_CMP_GT_I64_e64:
      case AMDGPU::V_CMP_GT_U64_e64:
      case AMDGPU::V_CMP_LT_I64_e64:
      case AMDGPU::V_CMP_LT_U64_e64: {
        Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
            MRI.createVirtualRegister(WaveMaskRegClass);
            TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
        Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
            VregClass, AMDGPU::sub0, VSubRegClass);
            VregClass, AMDGPU::sub1, VSubRegClass);
        BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),

            .addReg(LaneValue->getOperand(0).getReg())
            .addReg(AccumulatorVReg);

        unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
        BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)

        NewAccumulator = BuildMI(*ComputeLoop, I, DL,
                                 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
                             .addReg(LaneValue->getOperand(0).getReg())

      case AMDGPU::S_ADD_U64_PSEUDO:
      case AMDGPU::S_SUB_U64_PSEUDO: {
            .addReg(LaneValue->getOperand(0).getReg());

    unsigned BITSETOpc =
        IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
    BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)

    ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);

    unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
        .addReg(NewActiveBitsReg)
    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))

  MI.eraseFromParent();
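// EmitInstrWithCustomInserter: expand the remaining pseudo-instructions
// (wave reductions, 64-bit add/sub with carry, SI_INIT_M0,
// GET_GROUPSTATICSIZE, shader-cycle reads, and the indirect src/dst pseudos).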
5920 switch (
MI.getOpcode()) {
5921 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5923 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5925 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5927 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
5929 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5931 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
5933 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5935 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
5937 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5939 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
5941 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5943 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
5945 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5947 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
5949 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5951 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
5953 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5955 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
5957 case AMDGPU::S_UADDO_PSEUDO:
5958 case AMDGPU::S_USUBO_PSEUDO: {
5964 unsigned Opc = (
MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5966 : AMDGPU::S_SUB_U32;
5974 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5977 MI.eraseFromParent();
5980 case AMDGPU::S_ADD_U64_PSEUDO:
5981 case AMDGPU::S_SUB_U64_PSEUDO: {
5984 case AMDGPU::V_ADD_U64_PSEUDO:
5985 case AMDGPU::V_SUB_U64_PSEUDO: {
5986 bool IsAdd = (
MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5992 if (ST.hasAddSubU64Insts()) {
5994 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
5995 : AMDGPU::V_SUB_U64_e64),
6000 TII->legalizeOperands(*
I);
6001 MI.eraseFromParent();
6005 if (IsAdd && ST.hasLshlAddU64Inst()) {
6011 TII->legalizeOperands(*Add);
6012 MI.eraseFromParent();
6016 const auto *CarryRC = TRI->getWaveMaskRegClass();
6018 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6019 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6021 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6022 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6026 : &AMDGPU::VReg_64RegClass;
6029 : &AMDGPU::VReg_64RegClass;
6032 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6034 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6037 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6039 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6042 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6044 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6047 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6054 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6068 TII->legalizeOperands(*LoHalf);
6069 TII->legalizeOperands(*HiHalf);
6070 MI.eraseFromParent();
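// Note (editorial): when no native 64-bit VALU add/sub exists, the pseudo is
// split into two 32-bit halves chained through a carry register, conceptually:
//   %lo, %carry = V_ADD_CO_U32 %src0.sub0, %src1.sub0
//   %hi, %dead  = V_ADDC_U32   %src0.sub1, %src1.sub1, %carry
//   %dst        = REG_SEQUENCE %lo, sub0, %hi, sub1
// (V_SUB_CO_U32 / V_SUBB_U32 for the subtract form.) This is a sketch of the
// expansion above, not a literal MIR dump.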
6073 case AMDGPU::S_ADD_CO_PSEUDO:
6074 case AMDGPU::S_SUB_CO_PSEUDO: {
6084 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
6085     ? AMDGPU::S_ADDC_U32
6086     : AMDGPU::S_SUBB_U32;
6088 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6089 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6094 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6095 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6099 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6101 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6106 if (ST.isWave64()) {
6107 if (ST.hasScalarCompareEq64()) {
6114 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6116 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6118 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6119 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6121 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
6142 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6148 MI.eraseFromParent();
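// Note (editorial): S_ADDC_U32 / S_SUBB_U32 are scalar instructions, so any
// operand that lives in a VGPR is first copied to an SGPR with
// V_READFIRSTLANE_B32 (RegOp0..RegOp2 above). The carry-in is established in
// SCC before the add/sub-with-carry; on wave64 subtargets without a 64-bit
// scalar compare the carry value's sub0/sub1 halves are OR'd (Src2_32) to
// produce the 32-bit nonzero test.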
6151 case AMDGPU::SI_INIT_M0: {
6154 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6157 MI.eraseFromParent();
6160 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6163 TII->get(AMDGPU::S_CMP_EQ_U32))
6168 case AMDGPU::GET_GROUPSTATICSIZE: {
6172 .add(MI.getOperand(0))
6174 MI.eraseFromParent();
6177 case AMDGPU::GET_SHADERCYCLESHILO: {
6190 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6192 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6193 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6195 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6196 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6198 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6202 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6207 .add(MI.getOperand(0))
6212 MI.eraseFromParent();
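// Note (editorial): the three S_GETREG reads above (HI, LO, HI again) handle
// rollover of the 64-bit shader-cycles counter read as two 32-bit halves: if
// the two HI samples differ, the LO half wrapped between the reads, and the
// lowering pairs the second HI sample with a conservative LO value instead of
// a stale mix of halves.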
6215 case AMDGPU::SI_INDIRECT_SRC_V1:
6216 case AMDGPU::SI_INDIRECT_SRC_V2:
6217 case AMDGPU::SI_INDIRECT_SRC_V4:
6218 case AMDGPU::SI_INDIRECT_SRC_V8:
6219 case AMDGPU::SI_INDIRECT_SRC_V9:
6220 case AMDGPU::SI_INDIRECT_SRC_V10:
6221 case AMDGPU::SI_INDIRECT_SRC_V11:
6222 case AMDGPU::SI_INDIRECT_SRC_V12:
6223 case AMDGPU::SI_INDIRECT_SRC_V16:
6224 case AMDGPU::SI_INDIRECT_SRC_V32:
6226 case AMDGPU::SI_INDIRECT_DST_V1:
6227 case AMDGPU::SI_INDIRECT_DST_V2:
6228 case AMDGPU::SI_INDIRECT_DST_V4:
6229 case AMDGPU::SI_INDIRECT_DST_V8:
6230 case AMDGPU::SI_INDIRECT_DST_V9:
6231 case AMDGPU::SI_INDIRECT_DST_V10:
6232 case AMDGPU::SI_INDIRECT_DST_V11:
6233 case AMDGPU::SI_INDIRECT_DST_V12:
6234 case AMDGPU::SI_INDIRECT_DST_V16:
6235 case AMDGPU::SI_INDIRECT_DST_V32:
6237 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6238 case AMDGPU::SI_KILL_I1_PSEUDO:
6240 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6244 Register SrcCond = MI.getOperand(3).getReg();
6246 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6247 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6248 const auto *CondRC = TRI->getWaveMaskRegClass();
6249 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
6253 : &AMDGPU::VReg_64RegClass;
6256 : &AMDGPU::VReg_64RegClass;
6259 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6261 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6264 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6266 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6269 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6271 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6292 MI.eraseFromParent();
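// Note (editorial): V_CNDMASK_B64_PSEUDO has no hardware equivalent, so the
// 64-bit select is split into two 32-bit selects on the sub0 and sub1 halves
// (DstLo/DstHi above), both keyed off the same copied wave-mask condition, and
// the halves are reassembled into the 64-bit destination with a REG_SEQUENCE.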
6295 case AMDGPU::SI_BR_UNDEF: {
6297 .add(MI.getOperand(0));
6299 MI.eraseFromParent();
6302 case AMDGPU::ADJCALLSTACKUP:
6303 case AMDGPU::ADJCALLSTACKDOWN: {
6310 case AMDGPU::SI_CALL_ISEL: {
6311 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
6314 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6320 MI.eraseFromParent();
6323 case AMDGPU::V_ADD_CO_U32_e32:
6324 case AMDGPU::V_SUB_CO_U32_e32:
6325 case AMDGPU::V_SUBREV_CO_U32_e32: {
6327 unsigned Opc = MI.getOpcode();
6329 bool NeedClampOperand = false;
6330 if (TII->pseudoToMCOpcode(Opc) == -1) {
6332 NeedClampOperand = true;
6336 if (TII->isVOP3(*I)) {
6339 I.add(MI.getOperand(1)).add(MI.getOperand(2));
6340 if (NeedClampOperand)
6343 TII->legalizeOperands(*I);
6345 MI.eraseFromParent();
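// Note (editorial): if the VOP2 carry-out form has no MC encoding on this
// subtarget (pseudoToMCOpcode returns -1), the instruction is rebuilt in its
// VOP3 encoding; VOP3 carries an explicit clamp operand, which is why an extra
// clamp immediate is appended when NeedClampOperand is set.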
6348 case AMDGPU::V_ADDC_U32_e32:
6349 case AMDGPU::V_SUBB_U32_e32:
6350 case AMDGPU::V_SUBBREV_U32_e32:
6353 TII->legalizeOperands(MI);
6355 case AMDGPU::DS_GWS_INIT:
6356 case AMDGPU::DS_GWS_SEMA_BR:
6357 case AMDGPU::DS_GWS_BARRIER:
6358 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
6360 case AMDGPU::DS_GWS_SEMA_V:
6361 case AMDGPU::DS_GWS_SEMA_P:
6362 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6370 case AMDGPU::S_SETREG_B32: {
6386 const unsigned SetMask = WidthMask << Offset;
6389 unsigned SetDenormOp = 0;
6390 unsigned SetRoundOp = 0;
6398 SetRoundOp = AMDGPU::S_ROUND_MODE;
6399 SetDenormOp = AMDGPU::S_DENORM_MODE;
6401 SetRoundOp = AMDGPU::S_ROUND_MODE;
6403 SetDenormOp = AMDGPU::S_DENORM_MODE;
6406 if (SetRoundOp || SetDenormOp) {
6408 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6409 unsigned ImmVal = Def->getOperand(1).getImm();
6423 MI.eraseFromParent();
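// Note (editorial): an S_SETREG_B32 that writes only the FP rounding and/or
// denormal fields of the MODE register, and whose source is a move-immediate,
// is folded here into the dedicated S_ROUND_MODE / S_DENORM_MODE instructions;
// anything else falls through to the S_SETREG_B32_mode form selected below.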
6432 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
6436 case AMDGPU::S_INVERSE_BALLOT_U32:
6437 case AMDGPU::S_INVERSE_BALLOT_U64:
6440 MI.setDesc(TII->get(AMDGPU::COPY));
6442 case AMDGPU::ENDPGM_TRAP: {
6444 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6464 MI.eraseFromParent();
6467 case AMDGPU::SIMULATED_TRAP: {
6468 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6470 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6471 MI.eraseFromParent();
6474 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6475 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6481 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6482 Register OriginalExec = Setup->getOperand(0).getReg();
6484 MI.getOperand(0).setReg(OriginalExec);
6521 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6525 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6552 if (!Subtarget->hasMadMacF32Insts())
6553 return Subtarget->hasFastFMAF32();
6559 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6562 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6578 switch (Ty.getScalarSizeInBits()) {
6596 if (Ty.getScalarSizeInBits() == 16)
6598 if (Ty.getScalarSizeInBits() == 32)
6599 return Subtarget->hasMadMacF32Insts() &&
6609 EVT VT = N->getValueType(0);
6611 return Subtarget->hasMadMacF32Insts() &&
6613 if (VT == MVT::f16) {
6614 return Subtarget->hasMadF16() &&
6629 unsigned Opc = Op.getOpcode();
6630 EVT VT = Op.getValueType();
6631 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6632 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6633 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6634 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6635 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6636 VT == MVT::v32bf16);
6652 [[maybe_unused]] EVT VT = Op.getValueType();
6654 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6655 VT == MVT::v16i32) &&
6656 "Unexpected ValueType.");
6665 unsigned Opc = Op.getOpcode();
6666 EVT VT = Op.getValueType();
6667 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6668 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6669 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6670 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6671 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6672 VT == MVT::v32bf16);
6680 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6682 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6689 unsigned Opc = Op.getOpcode();
6690 EVT VT = Op.getValueType();
6691 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6692 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6693 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6694 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6695 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6696 VT == MVT::v32bf16);
6701 : std::pair(Op0, Op0);
6710 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6712 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
6718 switch (Op.getOpcode()) {
6722 return LowerBRCOND(Op, DAG);
6724 return LowerRETURNADDR(Op, DAG);
6727 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6728 "Load should return a value and a chain");
6732 EVT VT = Op.getValueType();
6734 return lowerFSQRTF32(Op, DAG);
6736 return lowerFSQRTF64(Op, DAG);
6741 return LowerTrig(Op, DAG);
6743 return LowerSELECT(Op, DAG);
6745 return LowerFDIV(Op, DAG);
6747 return LowerFFREXP(Op, DAG);
6748 case ISD::ATOMIC_CMP_SWAP:
6749 return LowerATOMIC_CMP_SWAP(Op, DAG);
6751 return LowerSTORE(Op, DAG);
6755 return LowerGlobalAddress(MFI, Op, DAG);
6758 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6760 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6762 return LowerINTRINSIC_VOID(Op, DAG);
6763 case ISD::ADDRSPACECAST:
6764 return lowerADDRSPACECAST(Op, DAG);
6766 return lowerINSERT_SUBVECTOR(Op, DAG);
6768 return lowerINSERT_VECTOR_ELT(Op, DAG);
6770 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6772 return lowerVECTOR_SHUFFLE(Op, DAG);
6774 return lowerSCALAR_TO_VECTOR(Op, DAG);
6776 return lowerBUILD_VECTOR(Op, DAG);
6779 return lowerFP_ROUND(Op, DAG);
6781 return lowerTRAP(Op, DAG);
6782 case ISD::DEBUGTRAP:
6783 return lowerDEBUGTRAP(Op, DAG);
6792 return lowerFMINNUM_FMAXNUM(Op, DAG);
6793 case ISD::FMINIMUMNUM:
6794 case ISD::FMAXIMUMNUM:
6795 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6798 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6801 return lowerFLDEXP(Op, DAG);
6818 case ISD::FMINNUM_IEEE:
6819 case ISD::FMAXNUM_IEEE:
6826 return lowerFCOPYSIGN(Op, DAG);
6828 return lowerMUL(Op, DAG);
6831 return lowerXMULO(Op, DAG);
6834 return lowerXMUL_LOHI(Op, DAG);
6835 case ISD::DYNAMIC_STACKALLOC:
6837 case ISD::STACKSAVE:
6841 case ISD::SET_ROUNDING:
6845 case ISD::FP_EXTEND:
6848 case ISD::GET_FPENV:
6850 case ISD::SET_FPENV:
6869 EVT FittingLoadVT = LoadVT;
6894 return DAG.
getNode(ISD::BITCAST,
DL, FittingLoadVT, Result);
6898 return DAG.
getNode(ISD::BITCAST,
DL, FittingLoadVT, Result);
6901SDValue SITargetLowering::adjustLoadValueType(
unsigned Opcode,
MemSDNode *M,
6904 bool IsIntrinsic)
const {
6907 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6908 EVT LoadVT =
M->getValueType(0);
6910 EVT EquivLoadVT = LoadVT;
6924 SDVTList VTList = DAG.
getVTList(EquivLoadVT, MVT::Other);
6928 M->getMemoryVT(),
M->getMemOperand());
6939 EVT LoadVT =
M->getValueType(0);
6945 assert(
M->getNumValues() == 2 ||
M->getNumValues() == 3);
6946 bool IsTFE =
M->getNumValues() == 3;
6959 return handleByteShortBufferLoads(DAG, LoadVT,
DL,
Ops,
M->getMemOperand(),
6963 return getMemIntrinsicNode(
Opc,
DL,
M->getVTList(),
Ops, IntVT,
6964 M->getMemOperand(), DAG);
6968 SDVTList VTList = DAG.
getVTList(CastVT, MVT::Other);
6970 M->getMemOperand(), DAG);
6978 EVT VT = N->getValueType(0);
6979 unsigned CondCode = N->getConstantOperandVal(3);
6990 EVT CmpVT =
LHS.getValueType();
6991 if (CmpVT == MVT::i16 && !TLI.
isTypeLegal(MVT::i16)) {
6992 unsigned PromoteOp =
7012 EVT VT = N->getValueType(0);
7014 unsigned CondCode = N->getConstantOperandVal(3);
7023 if (CmpVT == MVT::f16 && !TLI.
isTypeLegal(CmpVT)) {
7024 Src0 = DAG.
getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7025 Src1 = DAG.
getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7041 EVT VT =
N->getValueType(0);
7048 Src.getOperand(1), Src.getOperand(2));
7059 Exec = AMDGPU::EXEC_LO;
7061 Exec = AMDGPU::EXEC;
7078 EVT VT = N->getValueType(0);
7080 unsigned IID = N->getConstantOperandVal(0);
7081 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7082 IID == Intrinsic::amdgcn_permlanex16;
7083 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7084 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7088 unsigned SplitSize = 32;
7089 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7090 ST->hasDPALU_DPP() &&
7098 case Intrinsic::amdgcn_permlane16:
7099 case Intrinsic::amdgcn_permlanex16:
7100 case Intrinsic::amdgcn_update_dpp:
7105 case Intrinsic::amdgcn_writelane:
7108 case Intrinsic::amdgcn_readlane:
7109 case Intrinsic::amdgcn_set_inactive:
7110 case Intrinsic::amdgcn_set_inactive_chain_arg:
7111 case Intrinsic::amdgcn_mov_dpp8:
7114 case Intrinsic::amdgcn_readfirstlane:
7115 case Intrinsic::amdgcn_permlane64:
7125 if (
SDNode *GL =
N->getGluedNode()) {
7126 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7127 GL = GL->getOperand(0).getNode();
7128 Operands.push_back(DAG.
getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7137 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7138 IID == Intrinsic::amdgcn_mov_dpp8 ||
7139 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7140 Src1 =
N->getOperand(2);
7141 if (IID == Intrinsic::amdgcn_writelane ||
7142 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7143 Src2 =
N->getOperand(3);
7146 if (ValSize == SplitSize) {
7156 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7161 if (IID == Intrinsic::amdgcn_writelane) {
7166 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7168 return IsFloat ? DAG.
getBitcast(VT, Trunc) : Trunc;
7171 if (ValSize % SplitSize != 0)
7175 EVT VT =
N->getValueType(0);
7179 unsigned NumOperands =
N->getNumOperands();
7181 SDNode *GL =
N->getGluedNode();
7186 for (
unsigned i = 0; i != NE; ++i) {
7187 for (
unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7189 SDValue Operand =
N->getOperand(j);
7204 DAG.
getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7219 if (SplitSize == 32) {
7221 return unrollLaneOp(LaneOp.
getNode());
7227 unsigned SubVecNumElt =
7231 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7232 for (
unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7236 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7241 if (IID == Intrinsic::amdgcn_writelane)
7246 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7247 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7248 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7249 EltIdx += SubVecNumElt;
7263 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7266 if (IID == Intrinsic::amdgcn_writelane)
7269 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7277 switch (
N->getOpcode()) {
7289 unsigned IID =
N->getConstantOperandVal(0);
7291 case Intrinsic::amdgcn_make_buffer_rsrc:
7292 Results.push_back(lowerPointerAsRsrcIntrin(
N, DAG));
7294 case Intrinsic::amdgcn_cvt_pkrtz: {
7300 Results.push_back(DAG.
getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
7303 case Intrinsic::amdgcn_cvt_pknorm_i16:
7304 case Intrinsic::amdgcn_cvt_pknorm_u16:
7305 case Intrinsic::amdgcn_cvt_pk_i16:
7306 case Intrinsic::amdgcn_cvt_pk_u16: {
7312 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7314 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7316 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7321 EVT VT =
N->getValueType(0);
7326 Results.push_back(DAG.
getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
7330 case Intrinsic::amdgcn_s_buffer_load: {
7336 if (!Subtarget->hasScalarSubwordLoads())
7342 EVT VT =
Op.getValueType();
7343 assert(VT == MVT::i8 &&
"Expected 8-bit s_buffer_load intrinsics.\n");
7355 if (!
Offset->isDivergent()) {
7374 LoadVal = handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
7379 case Intrinsic::amdgcn_dead: {
7380 for (
unsigned I = 0, E =
N->getNumValues();
I < E; ++
I)
7391 for (
unsigned I = 0;
I < Res.getNumOperands();
I++) {
7392 Results.push_back(Res.getOperand(
I));
7396 Results.push_back(Res.getValue(1));
7405 EVT VT =
N->getValueType(0);
7410 EVT SelectVT = NewVT;
7411 if (NewVT.
bitsLT(MVT::i32)) {
7414 SelectVT = MVT::i32;
7420 if (NewVT != SelectVT)
7426 if (
N->getValueType(0) != MVT::v2f16)
7430 SDValue BC = DAG.
getNode(ISD::BITCAST, SL, MVT::i32,
N->getOperand(0));
7438 if (
N->getValueType(0) != MVT::v2f16)
7442 SDValue BC = DAG.
getNode(ISD::BITCAST, SL, MVT::i32,
N->getOperand(0));
7450 if (
N->getValueType(0) != MVT::f16)
7465 if (U.get() !=
Value)
7468 if (U.getUser()->getOpcode() == Opcode)
7474unsigned SITargetLowering::isCFIntrinsic(
const SDNode *Intr)
const {
7477 case Intrinsic::amdgcn_if:
7479 case Intrinsic::amdgcn_else:
7481 case Intrinsic::amdgcn_loop:
7483 case Intrinsic::amdgcn_end_cf:
7503 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7530 SDNode *Intr = BRCOND.getOperand(1).getNode();
7547 Intr =
LHS.getNode();
7555 assert(BR &&
"brcond missing unconditional branch user");
7560 unsigned CFNode = isCFIntrinsic(Intr);
7580 Ops.push_back(Target);
7603 for (
unsigned i = 1, e = Intr->
getNumValues() - 1; i != e; ++i) {
7622 MVT VT =
Op.getSimpleValueType();
7625 if (
Op.getConstantOperandVal(0) != 0)
7629 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
7631 if (
Info->isEntryFunction())
7648 return Op.getValueType().bitsLE(VT)
7656 EVT DstVT =
Op.getValueType();
7663 unsigned Opc =
Op.getOpcode();
7675 EVT SrcVT = Src.getValueType();
7676 EVT DstVT =
Op.getValueType();
7679 assert(Subtarget->hasCvtPkF16F32Inst() &&
"support v_cvt_pk_f16_f32");
7682 return SrcVT == MVT::v2f32 ?
Op : splitFP_ROUNDVectorOp(
Op, DAG);
7689 if (DstVT == MVT::f16) {
7694 if (!Subtarget->has16BitInsts()) {
7697 return DAG.
getNode(ISD::BITCAST,
DL, MVT::f16, Trunc);
7699 if (
Op->getFlags().hasApproximateFuncs()) {
7706 return DAG.
getNode(ISD::BITCAST,
DL, MVT::f16, Trunc);
7710 "custom lower FP_ROUND for f16 or bf16");
7711 assert(Subtarget->hasBF16ConversionInsts() &&
"f32 -> bf16 is legal");
7724 EVT VT =
Op.getValueType();
7726 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
7727 bool IsIEEEMode =
Info->getMode().IEEE;
7736 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7743SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(
SDValue Op,
7745 EVT VT =
Op.getValueType();
7747 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
7748 bool IsIEEEMode =
Info->getMode().IEEE;
7753 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7761 EVT VT =
Op.getValueType();
7765 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7766 !Subtarget->hasMinimum3Maximum3F16() &&
7767 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7768 "should not need to widen f16 minimum/maximum to v2f16");
7782 DAG.
getNode(
Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7790 EVT VT = Op.getValueType();
7794 EVT ExpVT = Exp.getValueType();
7795 if (ExpVT == MVT::i16)
7816 {Op.getOperand(0), Op.getOperand(1), TruncExp});
7819 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
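// Note (editorial): the f16 ldexp form takes a 16-bit exponent while the
// generic ISD::FLDEXP node may carry a wider one, so the exponent operand is
// narrowed here (TruncExp) and the node is recreated with the adjusted
// operand; exponents that are already i16 are returned unchanged above.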
7823 switch (
Op->getOpcode()) {
7853 DAGCombinerInfo &DCI)
const {
7854 const unsigned Opc =
Op.getOpcode();
7862 :
Op->getOperand(0).getValueType();
7865 if (DCI.isBeforeLegalizeOps() ||
7869 auto &DAG = DCI.DAG;
7875 LHS =
Op->getOperand(1);
7876 RHS =
Op->getOperand(2);
7878 LHS =
Op->getOperand(0);
7879 RHS =
Op->getOperand(1);
7918 if (MagVT == SignVT)
7925 SDValue SignAsInt32 = DAG.
getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
7928 SDValue SignAsHalf16 = DAG.
getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
7935 EVT VT = Op.getValueType();
7941 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
7968 if (Op->isDivergent())
7981 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7983 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7986 if (Op0SignBits >= 33 && Op1SignBits >= 33)
7988 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
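// Note (editorial): a uniform 64-bit multiply is narrowed when known-bits
// analysis proves both operands fit in 32 bits: at least 32 leading zeros on
// each side selects the unsigned S_MUL_U64_U32 pseudo, and at least 33 sign
// bits on each side selects the signed S_MUL_I64_I32 pseudo; divergent
// multiplies bail out earlier at the isDivergent() check, and everything else
// keeps the full 64-bit multiply.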
7994 EVT VT =
Op.getValueType();
8001 const APInt &
C = RHSC->getAPIntValue();
8003 if (
C.isPowerOf2()) {
8005 bool UseArithShift =
isSigned && !
C.isMinSignedValue();
8032 if (
Op->isDivergent()) {
8036 if (Subtarget->hasSMulHi()) {
8047 if (!Subtarget->isTrapHandlerEnabled() ||
8049 return lowerTrapEndpgm(
Op, DAG);
8051 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(
Op, DAG)
8052 : lowerTrapHsaQueuePtr(
Op, DAG);
8062SITargetLowering::loadImplicitKernelArgument(
SelectionDAG &DAG,
MVT VT,
8064 ImplicitParameter Param)
const {
8084 loadImplicitKernelArgument(DAG, MVT::i64, SL,
Align(8),
QUEUE_PTR);
8087 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8090 if (UserSGPR == AMDGPU::NoRegister) {
8116 if (Subtarget->hasPrivEnabledTrap2NopBug())
8129 if (!Subtarget->isTrapHandlerEnabled() ||
8133 "debugtrap handler not supported",
8144SDValue SITargetLowering::getSegmentAperture(
unsigned AS,
const SDLoc &
DL,
8146 if (Subtarget->hasApertureRegs()) {
8148 ? AMDGPU::SRC_SHARED_BASE
8149 : AMDGPU::SRC_PRIVATE_BASE;
8150 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8151 !Subtarget->hasGloballyAddressableScratch()) &&
8152 "Cannot use src_private_base with globally addressable scratch!");
8173 return loadImplicitKernelArgument(DAG, MVT::i32,
DL,
Align(4), Param);
8177 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8179 if (UserSGPR == AMDGPU::NoRegister) {
8213 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
8224 const AMDGPUTargetMachine &TM =
8227 unsigned DestAS, SrcAS;
8229 bool IsNonNull =
false;
8231 SrcAS = ASC->getSrcAddressSpace();
8232 Src = ASC->getOperand(0);
8233 DestAS = ASC->getDestAddressSpace();
8236 Op.getConstantOperandVal(0) ==
8237 Intrinsic::amdgcn_addrspacecast_nonnull);
8238 Src =
Op->getOperand(1);
8239 SrcAS =
Op->getConstantOperandVal(2);
8240 DestAS =
Op->getConstantOperandVal(3);
8253 Subtarget->hasGloballyAddressableScratch()) {
8258 AMDGPU::S_MOV_B32, SL, MVT::i32,
8259 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8267 unsigned NullVal = TM.getNullPointerValue(DestAS);
8282 Subtarget->hasGloballyAddressableScratch()) {
8291 if (Subtarget->isWave64())
8297 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8300 CvtPtr = DAG.
getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8305 AMDGPU::S_MOV_B64, SL, MVT::i64,
8306 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8308 CvtPtr = DAG.
getNode(
ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8310 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8312 CvtPtr = DAG.
getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8318 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8330 Op.getValueType() == MVT::i64) {
8331 const SIMachineFunctionInfo *
Info =
8335 return DAG.
getNode(ISD::BITCAST, SL, MVT::i64, Vec);
8339 Src.getValueType() == MVT::i64)
8359 EVT InsVT =
Ins.getValueType();
8367 assert(InsNumElts % 2 == 0 &&
"expect legal vector types");
8372 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8374 MVT::i32, InsNumElts / 2);
8376 Vec = DAG.
getNode(ISD::BITCAST, SL, NewVecVT, Vec);
8377 Ins = DAG.
getNode(ISD::BITCAST, SL, NewInsVT, Ins);
8379 for (
unsigned I = 0;
I != InsNumElts / 2; ++
I) {
8381 if (InsNumElts == 2) {
8391 return DAG.
getNode(ISD::BITCAST, SL, VecVT, Vec);
8394 for (
unsigned I = 0;
I != InsNumElts; ++
I) {
8417 if (NumElts == 4 && EltSize == 16 && KIdx) {
8425 SDValue LoVec = DAG.
getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8426 SDValue HiVec = DAG.
getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8428 unsigned Idx = KIdx->getZExtValue();
8429 bool InsertLo = Idx < 2;
8432 DAG.
getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8433 DAG.
getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8435 InsHalf = DAG.
getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8439 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8452 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
8480 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
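// Note (editorial): for a dynamically indexed insert into a vector that fits
// in 64 bits, the index is scaled to a bit offset, a mask of the element width
// is shifted into position, and the new element is merged into the underlying
// integer with a bitfield-insert style and/or sequence (BFI), which is finally
// bitcast back to the vector type in the return above.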
8487 EVT ResultVT =
Op.getValueType();
8500 if (
SDValue Combined = performExtractVectorEltCombine(
Op.getNode(), DCI))
8503 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8507 if (VecSize == 128) {
8515 }
else if (VecSize == 256) {
8518 for (
unsigned P = 0;
P < 4; ++
P) {
8524 Parts[0], Parts[1]));
8526 Parts[2], Parts[3]));
8532 for (
unsigned P = 0;
P < 8; ++
P) {
8539 Parts[0], Parts[1], Parts[2], Parts[3]));
8542 Parts[4], Parts[5], Parts[6], Parts[7]));
8562 Src = DAG.
getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8577 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8579 return DAG.
getNode(ISD::BITCAST, SL, ResultVT, Result);
8587 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8592 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8593 !(Mask[Elt + 1] & 1);
8599 EVT ResultVT =
Op.getValueType();
8602 const int NewSrcNumElts = 2;
8604 int SrcNumElts =
Op.getOperand(0).getValueType().getVectorNumElements();
8620 const bool ShouldUseConsecutiveExtract = EltVT.
getSizeInBits() == 16;
8642 if (ShouldUseConsecutiveExtract &&
8645 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8646 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8658 if (Idx0 >= SrcNumElts) {
8663 if (Idx1 >= SrcNumElts) {
8668 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8669 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8677 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8678 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8683 if (SubVec0 != SubVec1) {
8684 NewMaskIdx1 += NewSrcNumElts;
8691 {NewMaskIdx0, NewMaskIdx1});
8696 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8697 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8698 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8699 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8718 EVT ResultVT =
Op.getValueType();
8734 EVT VT = Op.getValueType();
8736 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8737 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8746 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
8755 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
8762 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
8771 for (unsigned P = 0; P < NumParts; ++P) {
8773 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
8779 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
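// Note (editorial): without VOP3P, a two-element 16-bit vector is built by
// zero-extending the low element, shifting the high element left by 16, and
// OR-ing the two i32 halves before bitcasting back to the vector type (the
// ExtLo / ShlHi / Or returns above); wider 16-bit vectors are assembled
// pairwise the same way and then concatenated in the NumParts loop.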
8792 if (!Subtarget->isAmdHsaOS())
8852 EVT PtrVT =
Op.getValueType();
8854 const GlobalValue *GV = GSD->
getGlobal();
8868 assert(PtrVT == MVT::i32 &&
"32-bit pointer is expected.");
8886 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8887 if (Subtarget->has64BitLiterals()) {
8918 MachinePointerInfo PtrInfo =
8946 SDValue Param = lowerKernargMemParameter(
8957 "non-hsa intrinsic with hsa target",
DL.getDebugLoc()));
8965 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
8973 unsigned NumElts = Elts.
size();
8975 if (NumElts <= 12) {
8984 for (
unsigned i = 0; i < Elts.
size(); ++i) {
8990 for (
unsigned i = Elts.
size(); i < NumElts; ++i)
9000 EVT SrcVT = Src.getValueType();
9021 bool Unpacked,
bool IsD16,
int DMaskPop,
9022 int NumVDataDwords,
bool IsAtomicPacked16Bit,
9026 EVT ReqRetVT = ResultTypes[0];
9028 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9029 ? (ReqRetNumElts + 1) / 2
9032 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9043 if (DMaskPop > 0 &&
Data.getValueType() != MaskPopVT) {
9054 if (DataDwordVT.
isVector() && !IsAtomicPacked16Bit)
9056 NumDataDwords - MaskPopDwords);
9061 EVT LegalReqRetVT = ReqRetVT;
9063 if (!
Data.getValueType().isInteger())
9065 Data.getValueType().changeTypeToInteger(),
Data);
9086 if (Result->getNumValues() == 1)
9093 SDValue *LWE,
bool &IsTexFail) {
9113 unsigned DimIdx,
unsigned EndIdx,
9114 unsigned NumGradients) {
9116 for (
unsigned I = DimIdx;
I < EndIdx;
I++) {
9124 if (((
I + 1) >= EndIdx) ||
9125 ((NumGradients / 2) % 2 == 1 && (
I == DimIdx + (NumGradients / 2) - 1 ||
9126 I == DimIdx + NumGradients - 1))) {
9145 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9159 int NumVDataDwords = 0;
9160 bool AdjustRetType =
false;
9161 bool IsAtomicPacked16Bit =
false;
9164 const unsigned ArgOffset = WithChain ? 2 : 1;
9167 unsigned DMaskLanes = 0;
9169 if (BaseOpcode->Atomic) {
9170 VData =
Op.getOperand(2);
9172 IsAtomicPacked16Bit =
9173 (Intr->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9174 Intr->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
9177 if (BaseOpcode->AtomicX2) {
9184 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9185 DMask = Is64Bit ? 0xf : 0x3;
9186 NumVDataDwords = Is64Bit ? 4 : 2;
9188 DMask = Is64Bit ? 0x3 : 0x1;
9189 NumVDataDwords = Is64Bit ? 2 : 1;
9192 DMask =
Op->getConstantOperandVal(ArgOffset + Intr->
DMaskIndex);
9195 if (BaseOpcode->Store) {
9196 VData =
Op.getOperand(2);
9200 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9204 VData = handleD16VData(VData, DAG,
true);
9207 NumVDataDwords = (VData.
getValueType().getSizeInBits() + 31) / 32;
9208 }
else if (!BaseOpcode->NoReturn) {
9213 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9221 (!LoadVT.
isVector() && DMaskLanes > 1))
9227 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9228 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9229 NumVDataDwords = (DMaskLanes + 1) / 2;
9231 NumVDataDwords = DMaskLanes;
9233 AdjustRetType =
true;
9237 unsigned VAddrEnd = ArgOffset + Intr->
VAddrEnd;
9244 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9245 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9247 VAddrVT =
Op.getOperand(ArgOffset + Intr->
CoordStart).getSimpleValueType();
9249 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9250 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9254 if (IsA16 && (
Op.getOperand(ArgOffset +
I).getValueType() == MVT::f16)) {
9260 {
Op.getOperand(ArgOffset +
I), DAG.
getPOISON(MVT::f16)});
9264 "Bias needs to be converted to 16 bit in A16 mode");
9269 if (BaseOpcode->Gradients && !
ST->hasG16() && (IsA16 != IsG16)) {
9273 dbgs() <<
"Failed to lower image intrinsic: 16 bit addresses "
9274 "require 16 bit args for both gradients and addresses");
9279 if (!
ST->hasA16()) {
9280 LLVM_DEBUG(
dbgs() <<
"Failed to lower image intrinsic: Target does not "
9281 "support 16 bit addresses\n");
9291 if (BaseOpcode->Gradients && IsG16 &&
ST->hasG16()) {
9293 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9295 IntrOpcode = G16MappingInfo->
G16;
9318 for (
unsigned I = ArgOffset + Intr->
CoordStart;
I < VAddrEnd;
I++)
9336 const unsigned NSAMaxSize =
ST->getNSAMaxSize(BaseOpcode->Sampler);
9337 const bool HasPartialNSAEncoding =
ST->hasPartialNSAEncoding();
9338 const bool UseNSA =
ST->hasNSAEncoding() &&
9339 VAddrs.
size() >=
ST->getNSAThreshold(MF) &&
9340 (VAddrs.
size() <= NSAMaxSize || HasPartialNSAEncoding);
9341 const bool UsePartialNSA =
9342 UseNSA && HasPartialNSAEncoding && VAddrs.
size() > NSAMaxSize;
9345 if (UsePartialNSA) {
9347 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9348 }
else if (!UseNSA) {
9355 if (!BaseOpcode->Sampler) {
9358 uint64_t UnormConst =
9359 Op.getConstantOperandVal(ArgOffset + Intr->
UnormIndex);
9361 Unorm = UnormConst ? True : False;
9367 bool IsTexFail =
false;
9368 if (!
parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9379 NumVDataDwords += 1;
9380 AdjustRetType =
true;
9385 if (AdjustRetType) {
9388 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9397 MVT::i32, NumVDataDwords)
9400 ResultTypes[0] = NewVT;
9401 if (ResultTypes.size() == 3) {
9405 ResultTypes.erase(&ResultTypes[1]);
9410 if (BaseOpcode->Atomic)
9417 if (BaseOpcode->Store || BaseOpcode->Atomic)
9418 Ops.push_back(VData);
9419 if (UsePartialNSA) {
9421 Ops.push_back(VAddr);
9425 Ops.push_back(VAddr);
9428 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9430 Ops.push_back(Rsrc);
9431 if (BaseOpcode->Sampler) {
9435 Ops.push_back(Samp);
9440 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9441 Ops.push_back(Unorm);
9443 Ops.push_back(IsA16 &&
9444 ST->hasFeature(AMDGPU::FeatureR128A16)
9448 Ops.push_back(IsA16 ? True : False);
9450 if (!Subtarget->hasGFX90AInsts())
9455 "TFE is not supported on this GPU",
DL.getDebugLoc()));
9458 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9461 Ops.push_back(DimInfo->
DA ? True : False);
9462 if (BaseOpcode->HasD16)
9463 Ops.push_back(IsD16 ? True : False);
9465 Ops.push_back(
Op.getOperand(0));
9467 int NumVAddrDwords =
9473 NumVDataDwords, NumVAddrDwords);
9474 }
else if (IsGFX11Plus) {
9476 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9477 : AMDGPU::MIMGEncGfx11Default,
9478 NumVDataDwords, NumVAddrDwords);
9479 }
else if (IsGFX10Plus) {
9481 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9482 : AMDGPU::MIMGEncGfx10Default,
9483 NumVDataDwords, NumVAddrDwords);
9485 if (Subtarget->hasGFX90AInsts()) {
9487 NumVDataDwords, NumVAddrDwords);
9491 "requested image instruction is not supported on this GPU",
9496 for (EVT VT : OrigResultTypes) {
9497 if (VT == MVT::Other)
9498 RetValues[Idx++] =
Op.getOperand(0);
9509 NumVDataDwords, NumVAddrDwords);
9512 NumVDataDwords, NumVAddrDwords);
9519 MachineMemOperand *MemRef = MemOp->getMemOperand();
9523 if (BaseOpcode->AtomicX2) {
9528 if (BaseOpcode->NoReturn)
9531 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9532 NumVDataDwords, IsAtomicPacked16Bit,
DL);
9545 MachinePointerInfo(),
9550 if (!
Offset->isDivergent()) {
9557 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9566 !Subtarget->hasScalarDwordx3Loads()) {
9593 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9595 return handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
9599 unsigned NumLoads = 1;
9605 if (NumElts == 8 || NumElts == 16) {
9606 NumLoads = NumElts / 4;
9610 SDVTList VTList = DAG.
getVTList({LoadVT, MVT::Other});
9615 NumLoads > 1 ?
Align(16 * NumLoads) :
Align(4));
9617 uint64_t InstOffset =
Ops[5]->getAsZExtVal();
9618 for (
unsigned i = 0; i < NumLoads; ++i) {
9624 if (NumElts == 8 || NumElts == 16)
9632 if (!Subtarget->hasArchitectedSGPRs())
9644 unsigned Width)
const {
9646 using namespace AMDGPU::Hwreg;
9648 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
9687 auto *MFI = MF.
getInfo<SIMachineFunctionInfo>();
9689 EVT VT =
Op.getValueType();
9691 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
9695 switch (IntrinsicID) {
9696 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9699 return getPreloadedValue(DAG, *MFI, VT,
9702 case Intrinsic::amdgcn_dispatch_ptr:
9703 case Intrinsic::amdgcn_queue_ptr: {
9704 if (!Subtarget->isAmdHsaOrMesa(MF.
getFunction())) {
9706 MF.
getFunction(),
"unsupported hsa intrinsic without hsa target",
9711 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9714 return getPreloadedValue(DAG, *MFI, VT, RegID);
9716 case Intrinsic::amdgcn_implicitarg_ptr: {
9718 return getImplicitArgPtr(DAG,
DL);
9719 return getPreloadedValue(DAG, *MFI, VT,
9722 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9728 return getPreloadedValue(DAG, *MFI, VT,
9731 case Intrinsic::amdgcn_dispatch_id: {
9734 case Intrinsic::amdgcn_rcp:
9736 case Intrinsic::amdgcn_rsq:
9738 case Intrinsic::amdgcn_rsq_legacy:
9742 case Intrinsic::amdgcn_rcp_legacy:
9746 case Intrinsic::amdgcn_rsq_clamp: {
9757 return DAG.
getNode(ISD::FMAXNUM,
DL, VT, Tmp,
9760 case Intrinsic::r600_read_ngroups_x:
9761 if (Subtarget->isAmdHsaOS())
9764 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9767 case Intrinsic::r600_read_ngroups_y:
9768 if (Subtarget->isAmdHsaOS())
9771 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9774 case Intrinsic::r600_read_ngroups_z:
9775 if (Subtarget->isAmdHsaOS())
9778 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9781 case Intrinsic::r600_read_local_size_x:
9782 if (Subtarget->isAmdHsaOS())
9785 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9787 case Intrinsic::r600_read_local_size_y:
9788 if (Subtarget->isAmdHsaOS())
9791 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9793 case Intrinsic::r600_read_local_size_z:
9794 if (Subtarget->isAmdHsaOS())
9797 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9799 case Intrinsic::amdgcn_workgroup_id_x:
9800 return lowerWorkGroupId(DAG, *MFI, VT,
9804 case Intrinsic::amdgcn_workgroup_id_y:
9805 return lowerWorkGroupId(DAG, *MFI, VT,
9809 case Intrinsic::amdgcn_workgroup_id_z:
9810 return lowerWorkGroupId(DAG, *MFI, VT,
9814 case Intrinsic::amdgcn_cluster_id_x:
9815 return Subtarget->hasClusters()
9816 ? getPreloadedValue(DAG, *MFI, VT,
9818 : DAG.getPOISON(VT);
9819 case Intrinsic::amdgcn_cluster_id_y:
9820 return Subtarget->hasClusters()
9821 ? getPreloadedValue(DAG, *MFI, VT,
9824 case Intrinsic::amdgcn_cluster_id_z:
9825 return Subtarget->hasClusters()
9826 ? getPreloadedValue(DAG, *MFI, VT,
9829 case Intrinsic::amdgcn_cluster_workgroup_id_x:
9830 return Subtarget->hasClusters()
9831 ? getPreloadedValue(
9835 case Intrinsic::amdgcn_cluster_workgroup_id_y:
9836 return Subtarget->hasClusters()
9837 ? getPreloadedValue(
9841 case Intrinsic::amdgcn_cluster_workgroup_id_z:
9842 return Subtarget->hasClusters()
9843 ? getPreloadedValue(
9847 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
9848 return Subtarget->hasClusters()
9851 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
9852 return Subtarget->hasClusters()
9853 ? getPreloadedValue(
9857 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
9858 return Subtarget->hasClusters()
9859 ? getPreloadedValue(
9863 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
9864 return Subtarget->hasClusters()
9865 ? getPreloadedValue(
9869 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
9870 return Subtarget->hasClusters()
9871 ? getPreloadedValue(
9875 case Intrinsic::amdgcn_wave_id:
9876 return lowerWaveID(DAG,
Op);
9877 case Intrinsic::amdgcn_lds_kernel_id: {
9879 return getLDSKernelId(DAG,
DL);
9880 return getPreloadedValue(DAG, *MFI, VT,
9883 case Intrinsic::amdgcn_workitem_id_x:
9884 return lowerWorkitemID(DAG,
Op, 0, MFI->getArgInfo().WorkItemIDX);
9885 case Intrinsic::amdgcn_workitem_id_y:
9886 return lowerWorkitemID(DAG,
Op, 1, MFI->getArgInfo().WorkItemIDY);
9887 case Intrinsic::amdgcn_workitem_id_z:
9888 return lowerWorkitemID(DAG,
Op, 2, MFI->getArgInfo().WorkItemIDZ);
9889 case Intrinsic::amdgcn_wavefrontsize:
9891 SDLoc(
Op), MVT::i32);
9892 case Intrinsic::amdgcn_s_buffer_load: {
9893 unsigned CPol =
Op.getConstantOperandVal(3);
9900 return lowerSBuffer(VT,
DL,
Op.getOperand(1),
Op.getOperand(2),
9901 Op.getOperand(3), DAG);
9903 case Intrinsic::amdgcn_fdiv_fast:
9904 return lowerFDIV_FAST(
Op, DAG);
9905 case Intrinsic::amdgcn_sin:
9908 case Intrinsic::amdgcn_cos:
9911 case Intrinsic::amdgcn_mul_u24:
9914 case Intrinsic::amdgcn_mul_i24:
9918 case Intrinsic::amdgcn_log_clamp: {
9924 case Intrinsic::amdgcn_fract:
9927 case Intrinsic::amdgcn_class:
9930 case Intrinsic::amdgcn_div_fmas:
9932 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
9934 case Intrinsic::amdgcn_div_fixup:
9936 Op.getOperand(2),
Op.getOperand(3));
9938 case Intrinsic::amdgcn_div_scale: {
9951 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
9954 Denominator, Numerator);
9956 case Intrinsic::amdgcn_icmp: {
9958 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
9959 Op.getConstantOperandVal(2) == 0 &&
9964 case Intrinsic::amdgcn_fcmp: {
9967 case Intrinsic::amdgcn_ballot:
9969 case Intrinsic::amdgcn_fmed3:
9971 Op.getOperand(2),
Op.getOperand(3));
9972 case Intrinsic::amdgcn_fdot2:
9974 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
9975 case Intrinsic::amdgcn_fmul_legacy:
9978 case Intrinsic::amdgcn_sffbh:
9980 case Intrinsic::amdgcn_sbfe:
9982 Op.getOperand(2),
Op.getOperand(3));
9983 case Intrinsic::amdgcn_ubfe:
9985 Op.getOperand(2),
Op.getOperand(3));
9986 case Intrinsic::amdgcn_cvt_pkrtz:
9987 case Intrinsic::amdgcn_cvt_pknorm_i16:
9988 case Intrinsic::amdgcn_cvt_pknorm_u16:
9989 case Intrinsic::amdgcn_cvt_pk_i16:
9990 case Intrinsic::amdgcn_cvt_pk_u16: {
9992 EVT VT =
Op.getValueType();
9995 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
9997 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
9999 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10001 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10007 return DAG.
getNode(Opcode,
DL, VT,
Op.getOperand(1),
Op.getOperand(2));
10010 DAG.
getNode(Opcode,
DL, MVT::i32,
Op.getOperand(1),
Op.getOperand(2));
10011 return DAG.
getNode(ISD::BITCAST,
DL, VT, Node);
10013 case Intrinsic::amdgcn_fmad_ftz:
10015 Op.getOperand(2),
Op.getOperand(3));
10017 case Intrinsic::amdgcn_if_break:
10019 Op->getOperand(1),
Op->getOperand(2)),
10022 case Intrinsic::amdgcn_groupstaticsize: {
10028 const GlobalValue *GV =
10034 case Intrinsic::amdgcn_is_shared:
10035 case Intrinsic::amdgcn_is_private: {
10038 DAG.
getNode(ISD::BITCAST,
DL, MVT::v2i32,
Op.getOperand(1));
10042 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10046 Subtarget->hasGloballyAddressableScratch()) {
10049 AMDGPU::S_MOV_B32,
DL, MVT::i32,
10050 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10059 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10062 case Intrinsic::amdgcn_perm:
10064 Op.getOperand(2),
Op.getOperand(3));
10065 case Intrinsic::amdgcn_reloc_constant: {
10075 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10076 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10077 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10078 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10079 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10080 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10081 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10082 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10083 if (
Op.getOperand(4).getValueType() == MVT::i32)
10089 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
10090 Op.getOperand(3), IndexKeyi32);
10092 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10093 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10094 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10095 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10096 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10097 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10098 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10099 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10100 if (
Op.getOperand(4).getValueType() == MVT::i64)
10106 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10107 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10108 Op.getOperand(6)});
10110 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10111 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10112 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10113 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10114 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10115 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10116 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10119 if (
Op.getOperand(6).getValueType() == IndexKeyTy)
10125 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10126 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10127 IndexKey, Op.getOperand(7),
10128 Op.getOperand(8)});
10130 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10131 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10132 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10133 if (
Op.getOperand(6).getValueType() == MVT::i32)
10139 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10140 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10141 IndexKeyi32, Op.getOperand(7)});
10143 case Intrinsic::amdgcn_addrspacecast_nonnull:
10144 return lowerADDRSPACECAST(
Op, DAG);
10145 case Intrinsic::amdgcn_readlane:
10146 case Intrinsic::amdgcn_readfirstlane:
10147 case Intrinsic::amdgcn_writelane:
10148 case Intrinsic::amdgcn_permlane16:
10149 case Intrinsic::amdgcn_permlanex16:
10150 case Intrinsic::amdgcn_permlane64:
10151 case Intrinsic::amdgcn_set_inactive:
10152 case Intrinsic::amdgcn_set_inactive_chain_arg:
10153 case Intrinsic::amdgcn_mov_dpp8:
10154 case Intrinsic::amdgcn_update_dpp:
10156 case Intrinsic::amdgcn_dead: {
10158 for (
const EVT ValTy :
Op.getNode()->values())
10163 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10165 return lowerImage(
Op, ImageDimIntr, DAG,
false);
10176 return DAG.
getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10182 unsigned NewOpcode)
const {
10186 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10187 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10205 M->getMemOperand());
10210 unsigned NewOpcode)
const {
10214 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10215 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10233 M->getMemOperand());
10238 unsigned IntrID =
Op.getConstantOperandVal(1);
10242 case Intrinsic::amdgcn_ds_ordered_add:
10243 case Intrinsic::amdgcn_ds_ordered_swap: {
10248 unsigned IndexOperand = M->getConstantOperandVal(7);
10249 unsigned WaveRelease = M->getConstantOperandVal(8);
10250 unsigned WaveDone = M->getConstantOperandVal(9);
10252 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10253 IndexOperand &= ~0x3f;
10254 unsigned CountDw = 0;
10257 CountDw = (IndexOperand >> 24) & 0xf;
10258 IndexOperand &= ~(0xf << 24);
10260 if (CountDw < 1 || CountDw > 4) {
10263 Fn,
"ds_ordered_count: dword count must be between 1 and 4",
10264 DL.getDebugLoc()));
10269 if (IndexOperand) {
10272 Fn,
"ds_ordered_count: bad index operand",
DL.getDebugLoc()));
10275 if (WaveDone && !WaveRelease) {
10279 Fn,
"ds_ordered_count: wave_done requires wave_release",
10280 DL.getDebugLoc()));
10283 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10284 unsigned ShaderType =
10286 unsigned Offset0 = OrderedCountIndex << 2;
10287 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10290 Offset1 |= (CountDw - 1) << 6;
10293 Offset1 |= ShaderType << 2;
10295 unsigned Offset = Offset0 | (Offset1 << 8);
10302 M->getVTList(), Ops, M->getMemoryVT(),
10303 M->getMemOperand());
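// Note (editorial): ds_ordered_count packs its configuration into the 16-bit
// DS offset: the low byte (Offset0) holds the ordered-count index shifted left
// by 2, and the high byte (Offset1) packs wave_release, wave_done, the shader
// type, the add/swap selector and, where supported, the dword count minus 1;
// the final immediate is Offset0 | (Offset1 << 8), exactly as computed above.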
10305 case Intrinsic::amdgcn_raw_buffer_load:
10306 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10307 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10308 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10309 case Intrinsic::amdgcn_raw_buffer_load_format:
10310 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10311 const bool IsFormat =
10312 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10313 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10315 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10316 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10330 return lowerIntrinsicLoad(M, IsFormat, DAG,
Ops);
10332 case Intrinsic::amdgcn_struct_buffer_load:
10333 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10334 case Intrinsic::amdgcn_struct_buffer_load_format:
10335 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10336 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10337 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10338 const bool IsFormat =
10339 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10340 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10342 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10343 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10358 case Intrinsic::amdgcn_raw_tbuffer_load:
10359 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10361 EVT LoadVT =
Op.getValueType();
10362 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10363 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10382 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
10385 case Intrinsic::amdgcn_struct_tbuffer_load:
10386 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10388 EVT LoadVT =
Op.getValueType();
10389 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10390 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10409 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
10412 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10413 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10415 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10416 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10417 return lowerStructBufferAtomicIntrin(
Op, DAG,
10419 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10420 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10422 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10423 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10424 return lowerStructBufferAtomicIntrin(
Op, DAG,
10426 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10427 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10429 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10430 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10431 return lowerStructBufferAtomicIntrin(
Op, DAG,
10433 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10434 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10436 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10437 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10439 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10440 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10442 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10443 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10445 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10446 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10448 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10449 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10451 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10452 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10454 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10455 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10457 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10458 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10460 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10461 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10463 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10464 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10466 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10467 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10469 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10470 return lowerRawBufferAtomicIntrin(
Op, DAG,
10472 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10473 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10474 return lowerStructBufferAtomicIntrin(
Op, DAG,
10476 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10477 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10479 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10480 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10482 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10483 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10484 return lowerStructBufferAtomicIntrin(
Op, DAG,
10486 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10487 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10488 return lowerStructBufferAtomicIntrin(
Op, DAG,
10490 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10491 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10492 return lowerStructBufferAtomicIntrin(
Op, DAG,
10494 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10495 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10496 return lowerStructBufferAtomicIntrin(
Op, DAG,
10498 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10499 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10501 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10502 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10504 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10505 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10507 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10508 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10510 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10511 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10513 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10514 return lowerStructBufferAtomicIntrin(
Op, DAG,
10517 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10518 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10519 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(4), DAG);
10520 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10534 EVT VT =
Op.getValueType();
10538 Op->getVTList(),
Ops, VT,
10539 M->getMemOperand());
10541 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10542 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10543 SDValue Rsrc = bufferRsrcPtrToVector(
Op->getOperand(4), DAG);
10544 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(6), DAG);
10558 EVT VT =
Op.getValueType();
10562 Op->getVTList(),
Ops, VT,
10563 M->getMemOperand());
10565 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10566 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10568 SDValue NodePtr =
M->getOperand(2);
10569 SDValue RayExtent =
M->getOperand(3);
10570 SDValue InstanceMask =
M->getOperand(4);
10571 SDValue RayOrigin =
M->getOperand(5);
10572 SDValue RayDir =
M->getOperand(6);
10574 SDValue TDescr =
M->getOperand(8);
10579 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10584 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10585 const unsigned NumVDataDwords = 10;
10586 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10588 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10589 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10590 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10594 Ops.push_back(NodePtr);
10597 {DAG.getBitcast(MVT::i32, RayExtent),
10598 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10599 Ops.push_back(RayOrigin);
10600 Ops.push_back(RayDir);
10601 Ops.push_back(Offsets);
10602 Ops.push_back(TDescr);
10603 Ops.push_back(
M->getChain());
10606 MachineMemOperand *MemRef =
M->getMemOperand();
10610 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10612 SDValue NodePtr =
M->getOperand(2);
10613 SDValue RayExtent =
M->getOperand(3);
10614 SDValue RayOrigin =
M->getOperand(4);
10615 SDValue RayDir =
M->getOperand(5);
10616 SDValue RayInvDir =
M->getOperand(6);
10617 SDValue TDescr =
M->getOperand(7);
10624 if (!Subtarget->hasGFX10_AEncoding()) {
10634 const unsigned NumVDataDwords = 4;
10635 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10636 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10637 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10640 const unsigned BaseOpcodes[2][2] = {
10641 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10642 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10643 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10647 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10648 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10649 : AMDGPU::MIMGEncGfx10NSA,
10650 NumVDataDwords, NumVAddrDwords);
10654 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10655 : AMDGPU::MIMGEncGfx10Default,
10656 NumVDataDwords, NumVAddrDwords);
10662 auto packLanes = [&DAG, &
Ops, &
DL](
SDValue Op,
bool IsAligned) {
10665 if (Lanes[0].getValueSizeInBits() == 32) {
10666 for (
unsigned I = 0;
I < 3; ++
I)
10673 Ops.push_back(Lanes[2]);
10685 if (UseNSA && IsGFX11Plus) {
10686 Ops.push_back(NodePtr);
10688 Ops.push_back(RayOrigin);
10693 for (
unsigned I = 0;
I < 3; ++
I) {
10696 {DirLanes[I], InvDirLanes[I]})));
10700 Ops.push_back(RayDir);
10701 Ops.push_back(RayInvDir);
10708 Ops.push_back(NodePtr);
10711 packLanes(RayOrigin,
true);
10712 packLanes(RayDir,
true);
10713 packLanes(RayInvDir,
false);
10718 if (NumVAddrDwords > 12) {
10720 Ops.append(16 -
Ops.size(), Undef);
10726 Ops.push_back(MergedOps);
10729 Ops.push_back(TDescr);
10731 Ops.push_back(
M->getChain());
10734 MachineMemOperand *MemRef =
M->getMemOperand();
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num: {
    unsigned Opcode = 0;
    case Intrinsic::amdgcn_global_atomic_fmin_num:
    case Intrinsic::amdgcn_flat_atomic_fmin_num: {
      Opcode = ISD::ATOMIC_LOAD_FMIN;
    case Intrinsic::amdgcn_global_atomic_fmax_num:
    case Intrinsic::amdgcn_flat_atomic_fmax_num: {
      Opcode = ISD::ATOMIC_LOAD_FMAX;
    return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
                         Ops, M->getMemOperand());
  case Intrinsic::amdgcn_s_get_barrier_state:
  case Intrinsic::amdgcn_s_get_named_barrier_state: {
    if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
      BarID = (BarID >> 4) & 0x3F;
    Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
    Ops.push_back(Chain);
    Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
    if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
    Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
    EVT VT = Op->getValueType(0);
  if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
    return lowerImage(Op, ImageDimIntr, DAG, true);
SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
  EVT VT = VTList.VTs[0];
  bool IsTFE = VTList.NumVTs == 3;
    unsigned NumOpDWords = NumValueDWords + 1;
    SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
    MachineMemOperand *OpDWordsMMO =
    SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
                                     OpDWordsVT, OpDWordsMMO, DAG);
        NumValueDWords == 1
  if (!Subtarget->hasDwordx3LoadStores() &&
      (VT == MVT::v3i32 || VT == MVT::v3f32)) {
    SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
                               WidenedMemVT, WidenedMMO);
                                         bool ImageStore) const {
  if (Subtarget->hasUnpackedD16VMem()) {
  if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
      for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
      if ((NumElements % 2) == 1) {
        unsigned I = Elts.size() / 2;
      if (NumElements == 3) {
      return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
  unsigned IntrinsicID = Op.getConstantOperandVal(1);
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_exp_compr: {
    if (!Subtarget->hasCompressedExport()) {
          "intrinsic not supported on subtarget", DL.getDebugLoc()));
        DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
        DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
    unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
                                   M->getMemoryVT(), M->getMemOperand());
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
                                   M->getMemoryVT(), M->getMemOperand());
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
        IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
      return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
                                   M->getMemoryVT(), M->getMemOperand());
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
      VData = handleD16VData(VData, DAG);
    auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
    EVT VDataType = VData.getValueType().getScalarType();
      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
                                   M->getMemoryVT(), M->getMemOperand());
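  // Buffer-to-LDS loads: the MUBUF ..._LDS opcode is chosen from the load
  // size (1/2/4/12/16 bytes) and from whether a VGPR index and/or VGPR
  // offset operand is present (BOTHEN / IDXEN / OFFEN / OFFSET variants).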
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    if (!Subtarget->hasVMemToLDSLoad())
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
    unsigned OpOffset = HasVIndex ? 1 : 0;
    SDValue VOffset = Op.getOperand(5 + OpOffset);
    unsigned Size = Op->getConstantOperandVal(4);
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
      if (!Subtarget->hasLDSLoadB96_B128())
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
      if (!Subtarget->hasLDSLoadB96_B128())
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
    if (HasVIndex && HasVOffset)
    else if (HasVIndex)
      Ops.push_back(Op.getOperand(5));
    else if (HasVOffset)
      Ops.push_back(VOffset);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    Ops.push_back(Rsrc);
    Ops.push_back(Op.getOperand(6 + OpOffset));
    Ops.push_back(Op.getOperand(7 + OpOffset));
    unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
    MachineMemOperand *LoadMMO = M->getMemOperand();
    MachinePointerInfo StorePtrI = LoadPtrI;
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds: {
    if (!Subtarget->hasVMemToLDSLoad())
    unsigned Size = Op->getConstantOperandVal(4);
      Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
      Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
      if (!Subtarget->hasLDSLoadB96_B128())
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
      if (!Subtarget->hasLDSLoadB96_B128())
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
    if (LHS->isDivergent())
        RHS.getOperand(0).getValueType() == MVT::i32) {
      VOffset = RHS.getOperand(0);
      Ops.push_back(Addr);
      Ops.push_back(VOffset);
    Ops.push_back(Op.getOperand(5));
    Ops.push_back(Op.getOperand(6));
    MachineMemOperand *LoadMMO = M->getMemOperand();
    LoadPtrI.Offset = Op->getConstantOperandVal(5);
    MachinePointerInfo StorePtrI = LoadPtrI;
  case Intrinsic::amdgcn_end_cf:
                       Op->getOperand(2), Chain),
  case Intrinsic::amdgcn_s_barrier_init:
  case Intrinsic::amdgcn_s_barrier_signal_var: {
    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
                       ? AMDGPU::S_BARRIER_INIT_M0
                       : AMDGPU::S_BARRIER_SIGNAL_M0;
    constexpr unsigned ShAmt = 16;
    Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
  case Intrinsic::amdgcn_s_barrier_join: {
      Opc = AMDGPU::S_BARRIER_JOIN_IMM;
      unsigned BarID = (BarVal >> 4) & 0x3F;
      Ops.push_back(Chain);
      Opc = AMDGPU::S_BARRIER_JOIN_M0;
      Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
  case Intrinsic::amdgcn_s_prefetch_data: {
      return Op.getOperand(0);
  case Intrinsic::amdgcn_s_buffer_prefetch_data: {
        Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
                                   Op->getVTList(), Ops, M->getMemoryVT(),
                                   M->getMemOperand());
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
      return lowerImage(Op, ImageDimIntr, DAG, true);
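// Split a combined buffer offset into an immediate offset and a remainder,
// keeping the immediate within the MUBUF limit; any overflow is folded back
// into the variable part.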
std::pair<SDValue, SDValue>
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      Overflow += ImmOffset;
    auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                        Align Alignment) const {
  SDLoc DL(CombinedOffset);
    uint32_t Imm = C->getZExtValue();
    uint32_t SOffset, ImmOffset;
    if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
    uint32_t SOffset, ImmOffset;
        TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
  SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
    return MaybePointer;
  SDValue NumRecords = Op->getOperand(3);
  if (Subtarget->has45BitNumRecordsBufferResource()) {
    SDValue ExtShiftedStrideVec =
        DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedStrideVec);
        DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedFlagsVec);
        DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
        DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
    auto [LowHalf, HighHalf] =
        DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
                             NumRecords, Flags);
  SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
                                             bool IsTFE) const {
    SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
  SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
    LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
  if (VDataType == MVT::f16 || VDataType == MVT::bf16)
  Ops[1] = BufferStoreExt;
                                 M->getMemOperand());
                                                  DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
         "unexpected vector extload");
         "unexpected fp extload");
    DCI.AddToWorklist(Cvt.getNode());
    DCI.AddToWorklist(Cvt.getNode());
    Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
  if (Info.isEntryFunction())
    return Info.getUserSGPRInfo().hasFlatScratchInit();
  EVT MemVT = Load->getMemoryVT();
  MachineMemOperand *MMO = Load->getMemOperand();
    EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
         "Custom lowering for non-i32 vectors hasn't been implemented.");
  unsigned AS = Load->getAddressSpace();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
      !Subtarget->hasMultiDwordFlatScratchAddressing())
      Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
      Alignment >= Align(4) && NumElements < 32) {
        (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
    if (NumElements > 4)
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
    switch (Subtarget->getMaxPrivateElementSize()) {
      if (NumElements > 2)
      if (NumElements > 4)
      if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
    auto Flags = Load->getMemOperand()->getFlags();
                                       Load->getAlign(), Flags, &Fast) &&
                                      MemVT, *Load->getMemOperand())) {
  EVT VT = Op.getValueType();
  return DAG.getNode(ISD::BITCAST, DL, VT, Res);
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();
  bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
  if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
    if (CLHS->isExactlyValue(1.0)) {
    if (CLHS->isExactlyValue(-1.0)) {
  if (!AllowInaccurateRcp &&
      ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();
  bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
  if (!AllowInaccurateDiv)
  return DAG.getNode(Opcode, SL, VT, A, B, Flags);
  return DAG.getNode(Opcode, SL, VTList,
  return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
  return DAG.getNode(Opcode, SL, VTList,
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;
  EVT VT = Op.getValueType();
  if (VT == MVT::bf16) {
  unsigned FMADOpCode =
  SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
  SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
  Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
  Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
  Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
  SDNodeFlags Flags = Op->getFlags();
  const APFloat K0Val(0x1p+96f);
  const APFloat K1Val(0x1p-32f);
  assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
  uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
  uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
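// f32 division: scale the operands with div_scale, refine a rcp approximation
// with a Newton-Raphson FMA chain, and, when f32 denormals are not already
// enabled, temporarily turn them on around the refinement via S_DENORM_MODE
// or S_SETREG of the MODE register.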
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;
  SDNodeFlags Flags = Op->getFlags();
  Flags.setNoFPExcept(true);
  SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
      DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
  using namespace AMDGPU::Hwreg;
  const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const DenormalMode DenormMode = Info->getMode().FP32Denormals;
  const bool HasDynamicDenormals =
  if (!PreservesDenormals) {
    SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    if (HasDynamicDenormals) {
      SavedDenormMode = SDValue(GetReg, 0);
    SDNode *EnableDenorm;
    if (Subtarget->hasDenormModeInst()) {
      const SDValue EnableDenormValue =
      const SDValue EnableDenormValue =
      EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
                                        {EnableDenormValue, BitField, Glue});
                            ApproxRcp, One, NegDivScale0, Flags);
                            ApproxRcp, Fma0, Flags);
                            NumeratorScaled, Mul, Flags);
                            NumeratorScaled, Fma3, Flags);
  if (!PreservesDenormals) {
    SDNode *DisableDenorm;
    if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
      SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      assert(HasDynamicDenormals == (bool)SavedDenormMode);
      const SDValue DisableDenormValue =
          HasDynamicDenormals
          AMDGPU::S_SETREG_B32, SL, MVT::Other,
                            {Fma4, Fma1, Fma3, Scale}, Flags);
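// f64 division follows the same div_scale/rcp/fma structure; when the
// div_scale condition output is not usable on the subtarget, the condition
// is reconstructed from the high halves of the scaled operands.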
  if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
    return FastLowered;
  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
  if (!Subtarget->hasUsableDivScaleConditionOutput()) {
    SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
    SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
  EVT VT = Op.getValueType();
  if (VT == MVT::f32)
    return LowerFDIV32(Op, DAG);
  if (VT == MVT::f64)
    return LowerFDIV64(Op, DAG);
  if (VT == MVT::f16 || VT == MVT::bf16)
    return LowerFDIV16(Op, DAG);
  EVT ResultExpVT = Op->getValueType(1);
  EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
  if (Subtarget->hasFractBug()) {
  EVT VT = Store->getMemoryVT();
  if (VT == MVT::i1) {
                               Store->getBasePtr(), MVT::i1,
                               Store->getMemOperand());
         Store->getValue().getValueType().getScalarType() == MVT::i32);
  unsigned AS = Store->getAddressSpace();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
      !Subtarget->hasMultiDwordFlatScratchAddressing())
    if (NumElements > 4)
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
                                        VT, *Store->getMemOperand()))
    switch (Subtarget->getMaxPrivateElementSize()) {
      if (NumElements > 2)
      if (NumElements > 4 ||
          (NumElements == 3 && !Subtarget->enableFlatScratch()))
    auto Flags = Store->getMemOperand()->getFlags();
  assert(!Subtarget->has16BitInsts());
  SDNodeFlags Flags = Op->getFlags();
      DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
  SDNodeFlags Flags = Op->getFlags();
  MVT VT = Op.getValueType().getSimpleVT();
  SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
      DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
  SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
  SDNodeFlags Flags = Op->getFlags();
  SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
  EVT VT = Op.getValueType();
  if (Subtarget->hasTrigReducedRange()) {
  switch (Op.getOpcode()) {
  EVT VT = Op.getValueType();
                                 Op->getVTList(), Ops, VT,
SITargetLowering::performUCharToFloatCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
  SelectionDAG &DAG = DCI.DAG;
  EVT SrcVT = Src.getValueType();
  if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
    DCI.AddToWorklist(Cvt.getNode());
    if (ScalarVT != MVT::f32) {
                                                   DAGCombinerInfo &DCI) const {
  if (SignOp.getOpcode() == ISD::FP_EXTEND ||
  SelectionDAG &DAG = DCI.DAG;
    for (unsigned I = 0; I != NumElts; ++I) {
    if (NewElts.size() == 1)
    for (unsigned I = 0; I != NumElts; ++I) {
SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  AM.BaseOffs = Offset.getSExtValue();
  EVT VT = N->getValueType(0);
  Flags.setNoUnsignedWrap(
      N->getFlags().hasNoUnsignedWrap() &&
  switch (N->getOpcode()) {
                                                DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
                                        N->getMemoryVT(), DCI);
  NewOps[PtrIdx] = NewPtr;
  return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
         (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
SDValue SITargetLowering::splitBinaryBitConstantOp(
  uint32_t ValLo = Lo_32(Val);
  uint32_t ValHi = Hi_32(Val);
  if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
  if (V.getValueType() != MVT::i1)
  switch (V.getOpcode()) {
    return V.getResNo() == 1;
    unsigned IntrinsicID = V.getConstantOperandVal(0);
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_is_shared:
    case Intrinsic::amdgcn_is_private:
  if (!(C & 0x000000ff))
    ZeroByteMask |= 0x000000ff;
  if (!(C & 0x0000ff00))
    ZeroByteMask |= 0x0000ff00;
  if (!(C & 0x00ff0000))
    ZeroByteMask |= 0x00ff0000;
  if (!(C & 0xff000000))
    ZeroByteMask |= 0xff000000;
  uint32_t NonZeroByteMask = ~ZeroByteMask;
  if ((NonZeroByteMask & C) != NonZeroByteMask)
  assert(V.getValueSizeInBits() == 32);
  if (V.getNumOperands() != 2)
  switch (V.getOpcode()) {
    return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
    return (0x03020100 & ~ConstMask) | ConstMask;
    return uint32_t((0x030201000c0c0c0cull << C) >> 32);
    return uint32_t(0x0c0c0c0c03020100ull >> C);
                                            DAGCombinerInfo &DCI) const {
  if (DCI.isBeforeLegalize())
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (VT == MVT::i64 && CRHS) {
            splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
  if (CRHS && VT == MVT::i32) {
      unsigned Shift = CShift->getZExtValue();
      unsigned Offset = NB + Shift;
      if ((Offset & (Bits - 1)) == 0) {
      Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
    if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
    if (X != LHS.getOperand(1))
    const ConstantFPSDNode *C1 =
        (RHS.getOperand(0) == LHS.getOperand(0) &&
         LHS.getOperand(0) == LHS.getOperand(1))) {
      unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
                                          : Mask->getZExtValue() & OrdMask;
      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
    if (LHSMask != ~0u && RHSMask != ~0u) {
      if (LHSMask > RHSMask) {
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        uint32_t Mask = LHSMask & RHSMask;
        for (unsigned I = 0; I < 32; I += 8) {
          uint32_t ByteSel = 0xff << I;
          if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
            Mask &= (0x0c << I) & 0xffffffff;
        uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
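// Byte-provider analysis: walk the DAG through truncates, shifts, extends and
// permutes to determine which byte of which source value supplies each byte
// of the result; this feeds the V_PERM_B32 matching below.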
static const std::optional<ByteProvider<SDValue>>
                                        unsigned Depth = 0) {
    return std::nullopt;
  if (Op.getValueSizeInBits() < 8)
    return std::nullopt;
  if (Op.getValueType().isVector())
  switch (Op->getOpcode()) {
    NarrowVT = VTSign->getVT();
      return std::nullopt;
    if (SrcIndex >= NarrowByteWidth)
      return std::nullopt;
      return std::nullopt;
    uint64_t BitShift = ShiftOp->getZExtValue();
    if (BitShift % 8 != 0)
      return std::nullopt;
    SrcIndex += BitShift / 8;
static const std::optional<ByteProvider<SDValue>>
                                       unsigned StartingIndex = 0) {
    return std::nullopt;
  unsigned BitWidth = Op.getScalarValueSizeInBits();
    return std::nullopt;
    return std::nullopt;
  bool IsVec = Op.getValueType().isVector();
  switch (Op.getOpcode()) {
      return std::nullopt;
      return std::nullopt;
      return std::nullopt;
    if (!LHS->isConstantZero() && !RHS->isConstantZero())
      return std::nullopt;
    if (!LHS || LHS->isConstantZero())
    if (!RHS || RHS->isConstantZero())
    return std::nullopt;
      return std::nullopt;
      return std::nullopt;
    uint32_t BitMask = BitMaskOp->getZExtValue();
    uint32_t IndexMask = 0xFF << (Index * 8);
    if ((IndexMask & BitMask) != IndexMask) {
      if (IndexMask & BitMask)
        return std::nullopt;
      return std::nullopt;
    if (!ShiftOp || Op.getValueType().isVector())
      return std::nullopt;
    uint64_t BitsProvided = Op.getValueSizeInBits();
    if (BitsProvided % 8 != 0)
      return std::nullopt;
    uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
      return std::nullopt;
    uint64_t ConcatSizeInBytes = BitsProvided / 4;
    uint64_t ByteShift = BitShift / 8;
    uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
    uint64_t BytesProvided = BitsProvided / 8;
    SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
    NewIndex %= BytesProvided;
      return std::nullopt;
      return std::nullopt;
    uint64_t BitShift = ShiftOp->getZExtValue();
      return std::nullopt;
    auto BitsProvided = Op.getScalarValueSizeInBits();
    if (BitsProvided % 8 != 0)
      return std::nullopt;
    uint64_t BytesProvided = BitsProvided / 8;
    uint64_t ByteShift = BitShift / 8;
    return BytesProvided - ByteShift > Index
      return std::nullopt;
      return std::nullopt;
    uint64_t BitShift = ShiftOp->getZExtValue();
    if (BitShift % 8 != 0)
      return std::nullopt;
    uint64_t ByteShift = BitShift / 8;
    return Index < ByteShift
                                     Depth + 1, StartingIndex);
      return std::nullopt;
      NarrowBitWidth = VTSign->getVT().getSizeInBits();
    if (NarrowBitWidth % 8 != 0)
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowBitWidth / 8;
    if (Index >= NarrowByteWidth)
                 ? std::optional<ByteProvider<SDValue>>(
      return std::nullopt;
    if (NarrowByteWidth >= Index) {
    return std::nullopt;
      return std::nullopt;
    unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
    if (NarrowBitWidth % 8 != 0)
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowBitWidth / 8;
    if (Index >= NarrowByteWidth) {
                 ? std::optional<ByteProvider<SDValue>>(
    if (NarrowByteWidth > Index) {
    return std::nullopt;
    return std::nullopt;
                                   Depth + 1, StartingIndex);
      return std::nullopt;
    auto VecIdx = IdxOp->getZExtValue();
    auto ScalarSize = Op.getScalarValueSizeInBits();
    if (ScalarSize < 32)
      Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
                            StartingIndex, Index);
      return std::nullopt;
      return std::nullopt;
        (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
    if (IdxMask > 0x07 && IdxMask != 0x0c)
      return std::nullopt;
    auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
    auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
    return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
    return std::nullopt;
  return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
    auto MemVT = L->getMemoryVT();
    return L->getMemoryVT().getSizeInBits() == 16;
  int Low8 = Mask & 0xff;
  int Hi8 = (Mask & 0xff00) >> 8;
  assert(Low8 < 8 && Hi8 < 8);
  bool IsConsecutive = (Hi8 - Low8 == 1);
  bool Is16Aligned = !(Low8 % 2);
  return IsConsecutive && Is16Aligned;
  int Low16 = PermMask & 0xffff;
  int Hi16 = (PermMask & 0xffff0000) >> 16;
  auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
  if (!OtherOpIs16Bit)
                              unsigned DWordOffset) {
  assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
  if (Src.getValueType().isVector()) {
    auto ScalarTySize = Src.getScalarValueSizeInBits();
    auto ScalarTy = Src.getValueType().getScalarType();
    if (ScalarTySize == 32) {
    if (ScalarTySize > 32) {
          DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
      auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
    assert(ScalarTySize < 32);
    auto NumElements = TypeSize / ScalarTySize;
    auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
    auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
    auto NumElementsIn32 = 32 / ScalarTySize;
    auto NumAvailElements = DWordOffset < Trunc32Elements
                                : NumElements - NormalizedTrunc;
  auto ShiftVal = 32 * DWordOffset;
  [[maybe_unused]] EVT VT = N->getValueType(0);
  for (int i = 0; i < 4; i++) {
    std::optional<ByteProvider<SDValue>> P =
    if (!P || P->isConstantZero())
  if (PermNodes.size() != 4)
  std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
  std::optional<std::pair<unsigned, unsigned>> SecondSrc;
  for (size_t i = 0; i < PermNodes.size(); i++) {
    auto PermOp = PermNodes[i];
    int SrcByteAdjust = 4;
    if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
        ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
      if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
          ((PermOp.SrcOffset / 4) != SecondSrc->second))
      SecondSrc = {i, PermNodes[i].SrcOffset / 4};
      assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
    assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
    PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
  SDValue Op = *PermNodes[FirstSrc.first].Src;
  assert(Op.getValueSizeInBits() == 32);
  int Low16 = PermMask & 0xffff;
  int Hi16 = (PermMask & 0xffff0000) >> 16;
  bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
  bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
  if (WellFormedLow && WellFormedHi)
  SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
  assert(Op.getValueType().isByteSized() &&
                                           DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (VT == MVT::i1) {
    if (Src != RHS.getOperand(0))
    if (!CLHS || !CRHS)
    static const uint32_t MaxMask = 0x3ff;
      Sel |= LHS.getConstantOperandVal(2);
      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
    auto usesCombinedOperand = [](SDNode *OrUse) {
      if (OrUse->getOpcode() != ISD::BITCAST ||
          !OrUse->getValueType(0).isVector())
      for (auto *VUser : OrUse->users()) {
        if (!VUser->getValueType(0).isVector())
        if (VUser->getOpcode() == VectorwiseOp)
    if (!any_of(N->users(), usesCombinedOperand))
    if (LHSMask != ~0u && RHSMask != ~0u) {
      if (LHSMask > RHSMask) {
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        LHSMask &= ~RHSUsedLanes;
        RHSMask &= ~LHSUsedLanes;
        LHSMask |= LHSUsedLanes & 0x04040404;
        uint32_t Sel = LHSMask | RHSMask;
    if (LHSMask == ~0u || RHSMask == ~0u) {
    return IdentitySrc;
  if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
  if (SrcVT == MVT::i32) {
    DCI.AddToWorklist(LowOr.getNode());
    DCI.AddToWorklist(HiBits.getNode());
    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
          N->getOperand(0), CRHS))
                                            DAGCombinerInfo &DCI) const {
  if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (CRHS && VT == MVT::i64) {
            splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
  unsigned Opc = LHS.getOpcode();
        DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
        DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
                                    LHS->getOperand(0), FNegLHS, FNegRHS);
    return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
                                                   DAGCombinerInfo &DCI) const {
  if (!Subtarget->has16BitInsts() ||
  EVT VT = N->getValueType(0);
  if (VT != MVT::i32)
  if (Src.getValueType() != MVT::i16)
SITargetLowering::performSignExtendInRegCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
        VTSign->getVT() == MVT::i8) ||
        VTSign->getVT() == MVT::i16))) {
    assert(Subtarget->hasScalarSubwordLoads() &&
           "s_buffer_load_{u8, i8} are supported "
           "in GFX12 (or newer) architectures.");
    EVT VT = Src.getValueType();
    SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
    SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
        Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
        VTSign->getVT() == MVT::i8) ||
        VTSign->getVT() == MVT::i16)) &&
                     Src.getOperand(6), Src.getOperand(7)};
        DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
    SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
        Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
    return DCI.DAG.getMergeValues(
        {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  if (N->getOperand(0).isUndef())
                                          DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
                                      unsigned MaxDepth) const {
  unsigned Opcode = Op.getOpcode();
    const auto &F = CFP->getValueAPF();
    if (F.isNaN() && F.isSignaling())
    if (!F.isDenormal())
  case ISD::FP_EXTEND:
  case ISD::FP16_TO_FP:
  case ISD::FP_TO_FP16:
  case ISD::BF16_TO_FP:
  case ISD::FP_TO_BF16:
    if (Op.getValueType() == MVT::i32) {
      if (RHS->getZExtValue() == 0xffff0000) {
    return Op.getValueType().getScalarType() != MVT::f16;
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
    if (Subtarget->supportsMinMaxDenormModes() ||
    for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
    if (Op.getValueType() == MVT::i16) {
          TruncSrc.getOpcode() == ISD::BITCAST &&
    unsigned IntrinsicID = Op.getConstantOperandVal(0);
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_tanh:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_sqrt:
                                       unsigned MaxDepth) const {
  unsigned Opcode = MI->getOpcode();
  if (Opcode == AMDGPU::G_FCANONICALIZE)
  std::optional<FPValueAndVReg> FCR;
    if (FCR->Value.isSignaling())
    if (!FCR->Value.isDenormal())
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FCEIL:
  case AMDGPU::G_FFLOOR:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FSQRT:
  case AMDGPU::G_FDIV:
  case AMDGPU::G_FREM:
  case AMDGPU::G_FPOW:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_FLOG:
  case AMDGPU::G_FLOG2:
  case AMDGPU::G_FLOG10:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
  case AMDGPU::G_FNEG:
  case AMDGPU::G_FABS:
  case AMDGPU::G_FCOPYSIGN:
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_FMINIMUMNUM:
  case AMDGPU::G_FMAXIMUMNUM: {
    if (Subtarget->supportsMinMaxDenormModes() ||
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_sqrt:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_div_scale:
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_div_fixup:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_tanh:
  if (C.isDenormal()) {
  if (C.isSignaling()) {
SITargetLowering::performFCanonicalizeCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
    EVT VT = N->getValueType(0);
    return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
    EVT EltVT = Lo.getValueType();
    for (unsigned I = 0; I != 2; ++I) {
            getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
      } else if (Op.isUndef()) {
  case ISD::FMAXNUM_IEEE:
  case ISD::FMAXIMUMNUM:
  case ISD::FMAXIMUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMINIMUMNUM:
  case ISD::FMINIMUM:
  if (!MinK || !MaxK)
  if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
    return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  if (Info->getMode().DX10Clamp) {
  if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
    return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
    return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
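// Min/max combines: min(max(x, K0), K1)-style patterns with constant bounds
// are folded into med3 for the integer and FP types the subtarget supports;
// FMINIMUM/FMAXIMUM fall back to the IEEE min/max nodes when no-NaNs is known
// and no dedicated IEEE minimum/maximum instructions exist.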
                                                  DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
    if (SDValue Med3 = performIntMed3ImmCombine(
    if (SDValue Med3 = performIntMed3ImmCombine(
    if (SDValue Med3 = performIntMed3ImmCombine(
    if (SDValue Med3 = performIntMed3ImmCombine(
  if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
       (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
       (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
      (VT == MVT::f32 || VT == MVT::f64 ||
       (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
       (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
       (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
       (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
    if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
  const SDNodeFlags Flags = N->getFlags();
  if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
      !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
        Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
    return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
  return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
         (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
                                           DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  if (Info->getMode().DX10Clamp) {
                                              DAGCombinerInfo &DCI) const {
    return DCI.DAG.getUNDEF(N->getValueType(0));
                                          bool IsDivergentIdx,
  unsigned VecSize = EltSize * NumElem;
  if (VecSize <= 64 && EltSize < 32)
  if (IsDivergentIdx)
  unsigned NumInsts = NumElem + ((EltSize + 31) / 32) * NumElem;
  if (Subtarget->useVGPRIndexMode())
    return NumInsts <= 16;
  if (Subtarget->hasMovrel())
    return NumInsts <= 15;
  SDValue Idx = N->getOperand(N->getNumOperands() - 1);
SITargetLowering::performExtractVectorEltCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  EVT ResVT = N->getValueType(0);
    if (!C || C->getZExtValue() != 0x1f)
  if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
    case ISD::FMAXNUM_IEEE:
    case ISD::FMINNUM_IEEE:
    case ISD::FMAXIMUM:
    case ISD::FMINIMUM: {
      DCI.AddToWorklist(Elt0.getNode());
      DCI.AddToWorklist(Elt1.getNode());
  if (!DCI.isBeforeLegalize())
      VecSize > 32 && VecSize % 32 == 0 && Idx) {
    unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
    unsigned EltIdx = BitIndex / 32;
    unsigned LeftoverBitIdx = BitIndex % 32;
    DCI.AddToWorklist(Cast.getNode());
    DCI.AddToWorklist(Elt.getNode());
    DCI.AddToWorklist(Srl.getNode());
    DCI.AddToWorklist(Trunc.getNode());
    if (VecEltVT == ResVT) {
      return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
SITargetLowering::performInsertVectorEltCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  if (Src.getOpcode() == ISD::FP_EXTEND &&
      Src.getOperand(0).getValueType() == MVT::f16) {
    return Src.getOperand(0);
    APFloat Val = CFP->getValueAPF();
    bool LosesInfo = true;
                                          DAGCombinerInfo &DCI) const {
  assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
         "combine only useful on gfx8");
  SDValue TruncSrc = N->getOperand(0);
  EVT VT = N->getValueType(0);
  if (VT != MVT::f16)
  SelectionDAG &DAG = DCI.DAG;
  return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
                                          const SDNode *N1) const {
  if (((VT == MVT::f32 &&
       (VT == MVT::f16 && Subtarget->hasMadF16() &&
  EVT VT = N->getValueType(0);
  if (VT != MVT::i32 && VT != MVT::i64)
  unsigned Opc = N->getOpcode();
    if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
                                                  DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (!N->isDivergent() && Subtarget->hasSMulHi())
  if (NumBits <= 32 || NumBits > 64)
  if (!Subtarget->hasFullRate64Ops()) {
    unsigned NumUsers = 0;
    for (SDNode *User : LHS->users()) {
      if (!User->isAnyAdd())
  bool MulSignedLo = false;
  if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
  if (VT != MVT::i64) {
      getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
  if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
    auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
    if (!MulLHSUnsigned32) {
    if (!MulRHSUnsigned32) {
  if (VT != MVT::i64)
SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  unsigned Opcode = N->getOpcode();
  if (Opcode == ISD::PTRADD)
      DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
static std::optional<ByteProvider<SDValue>>
  if (!Byte0 || Byte0->isConstantZero()) {
    return std::nullopt;
  if (Byte1 && !Byte1->isConstantZero()) {
    return std::nullopt;
  unsigned FirstCs = First & 0x0c0c0c0c;
  unsigned SecondCs = Second & 0x0c0c0c0c;
  unsigned FirstNoCs = First & ~0x0c0c0c0c;
  unsigned SecondNoCs = Second & ~0x0c0c0c0c;
  assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
  assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
  assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
  assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
  return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
  for (int BPI = 0; BPI < 2; BPI++) {
      BPP = {Src1, Src0};
    unsigned ZeroMask = 0x0c0c0c0c;
    unsigned FMask = 0xFF << (8 * (3 - Step));
    unsigned FirstMask =
        (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
    unsigned SecondMask =
        (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
    int FirstGroup = -1;
    for (int I = 0; I < 2; I++) {
      auto MatchesFirst = [&BPP](DotSrc &IterElt) {
        return IterElt.SrcOp == *BPP.first.Src &&
               (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
      if (Match != Srcs.end()) {
        Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
    if (FirstGroup != -1) {
      auto MatchesSecond = [&BPP](DotSrc &IterElt) {
        return IterElt.SrcOp == *BPP.second.Src &&
               (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
      if (Match != Srcs.end()) {
        Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
      Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
  unsigned ZeroMask = 0x0c0c0c0c;
  unsigned FMask = 0xFF << (8 * (3 - Step));
       ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
       ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
  if (Srcs.size() == 1) {
    auto *Elt = Srcs.begin();
    if (Elt->PermMask == 0x3020100)
  auto *FirstElt = Srcs.begin();
  auto *SecondElt = std::next(FirstElt);
    auto FirstMask = FirstElt->PermMask;
    auto SecondMask = SecondElt->PermMask;
    unsigned FirstCs = FirstMask & 0x0c0c0c0c;
    unsigned FirstPlusFour = FirstMask | 0x04040404;
    FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
    FirstElt = std::next(SecondElt);
    if (FirstElt == Srcs.end())
    SecondElt = std::next(FirstElt);
    if (SecondElt == Srcs.end()) {
          DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
  return Perms.size() == 2
  for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
    EntryMask = EntryMask >> ((4 - ChainLength) * 8);
    auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
    EntryMask += ZeroMask;
  auto Opcode = Op.getOpcode();
static std::optional<bool>
  bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
  bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
  assert(!(S0IsUnsigned && S0IsSigned));
  assert(!(S1IsUnsigned && S1IsSigned));
  if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
  if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
    return std::nullopt;
  if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
      ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
  if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
  if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
      ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
    return std::nullopt;
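// performAddCombine: after the mad64_32 and reassociation folds, chains of
// byte-wise multiplies feeding an add are matched into amdgcn.sdot4/udot4
// when the subtarget has dot-product instructions and the signedness of all
// multiply operands agrees.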
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (Subtarget->hasMad64_32()) {
    if (SDValue Folded = tryFoldToMad64_32(N, DCI))
  if (SDValue V = reassociateScalarOps(N, DAG)) {
  if (VT == MVT::i64) {
    if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
      (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
    std::optional<bool> IsSigned;
    int ChainLength = 0;
    for (int I = 0; I < 4; I++) {
      auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
      auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
          TempNode->getOperand(MulIdx), *Src0, *Src1,
          TempNode->getOperand(MulIdx)->getOperand(0),
          TempNode->getOperand(MulIdx)->getOperand(1), DAG);
        IsSigned = *IterIsSigned;
      if (*IterIsSigned != *IsSigned)
      auto AddIdx = 1 - MulIdx;
      if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
        Src2s.push_back(TempNode->getOperand(AddIdx));
            TempNode->getOperand(AddIdx), *Src0, *Src1,
            TempNode->getOperand(AddIdx)->getOperand(0),
            TempNode->getOperand(AddIdx)->getOperand(1), DAG);
        if (*IterIsSigned != *IsSigned)
        ChainLength = I + 2;
      TempNode = TempNode->getOperand(AddIdx);
      ChainLength = I + 1;
      if (TempNode->getNumOperands() < 2)
      LHS = TempNode->getOperand(0);
      RHS = TempNode->getOperand(1);
    if (ChainLength < 2)
    if (ChainLength < 4) {
    bool UseOriginalSrc = false;
    if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
        Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
        Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
        Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
      SmallVector<unsigned, 4> SrcBytes;
      auto Src0Mask = Src0s.begin()->PermMask;
      SrcBytes.push_back(Src0Mask & 0xFF000000);
      bool UniqueEntries = true;
      for (auto I = 1; I < 4; I++) {
        auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
          UniqueEntries = false;
      if (UniqueEntries) {
        UseOriginalSrc = true;
        auto *FirstElt = Src0s.begin();
        auto *SecondElt = Src1s.begin();
                                 SecondElt->DWordOffset);
    if (!UseOriginalSrc) {
        DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
                     : Intrinsic::amdgcn_udot4,
  if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
  unsigned Opc = LHS.getOpcode();
    auto Cond = RHS.getOperand(0);
    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
    SDNodeFlags ShlFlags = N1->getFlags();
    SDNodeFlags NewShlFlags =
    DCI.AddToWorklist(Inner.getNode());
  if (Subtarget->hasMad64_32()) {
    if (SDValue Folded = tryFoldToMad64_32(N, DCI))
  if (VT == MVT::i64) {
    if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
    if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
        Y->isDivergent() != Z->isDivergent()) {
      if (Y->isDivergent())
      SDNodeFlags ReassocFlags =
      DCI.AddToWorklist(UniformInner.getNode());
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (VT == MVT::i64) {
    if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
  if (VT != MVT::i32)
  unsigned Opc = RHS.getOpcode();
    auto Cond = RHS.getOperand(0);
    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  if (N->getValueType(0) != MVT::i32)
  SelectionDAG &DAG = DCI.DAG;
  unsigned LHSOpc = LHS.getOpcode();
  unsigned Opc = N->getOpcode();
    return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
    if (A == LHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
      if (FusedOp != 0) {
        return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
    if (A == RHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
      if (FusedOp != 0) {
        return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
    if (A == LHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
      if (FusedOp != 0) {
        return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
    if (A == RHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
      if (FusedOp != 0) {
        return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
  SDNodeFlags Flags = N->getFlags();
  SDNodeFlags RHSFlags = RHS->getFlags();
  bool IsNegative = false;
  if (CLHS->isExactlyValue(1.0) ||
      (IsNegative = CLHS->isExactlyValue(-1.0))) {
    if (RHS.getOpcode() == ISD::FSQRT) {
      return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
      (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
  if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
    const ConstantFPSDNode *FalseNode =
    if (ScalarVT == MVT::f32 &&
    if (TrueNodeExpVal == INT_MIN)
    if (FalseNodeExpVal == INT_MIN)
    return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
      (N->getFlags().hasAllowContract() &&
       FMA->getFlags().hasAllowContract())) {
    if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
    if (Vec1 == Vec2 || Vec3 == Vec4)
    if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = LHS.getValueType();
      return LHS.getOperand(0);
      LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
    const APInt &CT = LHS.getConstantOperandAPInt(1);
    const APInt &CF = LHS.getConstantOperandAPInt(2);
      return LHS.getOperand(0);
  if (VT != MVT::f32 && VT != MVT::f64 &&
      (!Subtarget->has16BitInsts() || VT != MVT::f16))
      LHS.getOpcode() == ISD::FABS) {
    const unsigned IsInfMask =
    const unsigned IsFiniteMask =
SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  unsigned ShiftOffset = 8 * Offset;
      ShiftOffset -= C->getZExtValue();
      ShiftOffset += C->getZExtValue();
    if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
                         MVT::f32, Shifted);
    DCI.AddToWorklist(N);
    return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
                                           DAGCombinerInfo &DCI) const {
  const MachineFunction &MF = DCI.DAG.getMachineFunction();
      (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
    return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
  APFloat One(F.getSemantics(), "1.0");
    return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
                                              DAGCombinerInfo &DCI) const {
  bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
  bool isInteger = LHS.getValueType().isInteger();
  if (!isFloatingPoint && !isInteger)
  if (!isEquality && !isNonEquality)
  if (isFloatingPoint) {
    if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
  if (!(isEquality && TrueVal == ConstVal) &&
      !(isNonEquality && FalseVal == ConstVal))
                     SelectLHS, SelectRHS);
  switch (N->getOpcode()) {
    if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
  switch (N->getOpcode()) {
    return performAddCombine(N, DCI);
    return performPtrAddCombine(N, DCI);
    return performSubCombine(N, DCI);
    return performAddCarrySubCarryCombine(N, DCI);
    return performFAddCombine(N, DCI);
    return performFSubCombine(N, DCI);
    return performFDivCombine(N, DCI);
    return performFMulCombine(N, DCI);
    return performSetCCCombine(N, DCI);
    if (auto Res = performSelectCombine(N, DCI))
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUMNUM:
  case ISD::FMINIMUMNUM:
    return performMinMaxCombine(N, DCI);
    return performFMACombine(N, DCI);
    return performAndCombine(N, DCI);
    return performOrCombine(N, DCI);
    if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
        TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
    return performXorCombine(N, DCI);
    return performZeroExtendCombine(N, DCI);
    return performSignExtendInRegCombine(N, DCI);
    return performClassCombine(N, DCI);
    return performFCanonicalizeCombine(N, DCI);
    return performRcpCombine(N, DCI);
    return performUCharToFloatCombine(N, DCI);
    return performFCopySignCombine(N, DCI);
    return performCvtF32UByteNCombine(N, DCI);
    return performFMed3Combine(N, DCI);
    return performCvtPkRTZCombine(N, DCI);
    return performClampCombine(N, DCI);
    EVT VT = N->getValueType(0);
    if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
      EVT EltVT = Src.getValueType();
      if (EltVT != MVT::i16)
        Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
      return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
    return performExtractVectorEltCombine(N, DCI);
    return performInsertVectorEltCombine(N, DCI);
    return performFPRoundCombine(N, DCI);
    return performMemSDNodeCombine(MemNode, DCI);
16945 unsigned Opcode = Node->getMachineOpcode();
16948 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
16949 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
16952 SDNode *Users[5] = {nullptr};
16954 unsigned DmaskIdx =
16955 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
16956 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
16957 unsigned NewDmask = 0;
16958 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
16959 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
16960 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
16961 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
16962 unsigned TFCLane = 0;
16963 bool HasChain = Node->getNumValues() > 1;
16965 if (OldDmask == 0) {
16973 TFCLane = OldBitsSet;
16977 for (SDUse &Use : Node->uses()) {
16980 if (Use.getResNo() != 0)
16983 SDNode *User = Use.getUser();
16986 if (!User->isMachineOpcode() ||
16987 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
16999 if (UsesTFC && Lane == TFCLane) {
17004 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17006 Dmask &= ~(1 << Comp);
17014 NewDmask |= 1 << Comp;
17019 bool NoChannels = !NewDmask;
17026 if (OldBitsSet == 1)
17032 if (NewDmask == OldDmask)
17041 unsigned NewChannels = BitsSet + UsesTFC;
17045 assert(NewOpcode != -1 &&
17046 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17047 "failed to find equivalent MIMG op");
17055 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17057 MVT ResultVT = NewChannels == 1
17060 : NewChannels == 5 ? 8
17062 SDVTList NewVTList =
17065 MachineSDNode *NewNode =
17074 if (NewChannels == 1) {
17084 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17089 if (i || !NoChannels)
17094 if (NewUser != User) {
17104 Idx = AMDGPU::sub1;
17107 Idx = AMDGPU::sub2;
17110 Idx = AMDGPU::sub3;
17113 Idx = AMDGPU::sub4;
17124 Op = Op.getOperand(0);
17145 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17149 Node->getOperand(0), SL, VReg, SrcVal,
17155 return ToResultReg.getNode();
17160 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17162 Ops.push_back(Node->getOperand(i));
17168 Node->getOperand(i).getValueType(),
17169 Node->getOperand(i)),
17181 unsigned Opcode = Node->getMachineOpcode();
17183 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17184 !TII->isGather4(Opcode) &&
17186 return adjustWritemask(Node, DAG);
17189 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17195 case AMDGPU::V_DIV_SCALE_F32_e64:
17196 case AMDGPU::V_DIV_SCALE_F64_e64: {
17206 (Src0 == Src1 || Src0 == Src2))
17262 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17263 unsigned InitIdx = 0;
17265 if (TII->isImage(MI)) {
17273 unsigned TFEVal = TFE ? TFE->getImm() : 0;
17274 unsigned LWEVal = LWE ? LWE->getImm() : 0;
17275 unsigned D16Val = D16 ? D16->getImm() : 0;
17277 if (!TFEVal && !LWEVal)
17288 assert(MO_Dmask && "Expected dmask operand in instruction");
17290 unsigned dmask = MO_Dmask->getImm();
17295 bool Packed = !Subtarget->hasUnpackedD16VMem();
17297 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17303 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17304 if (DstSize < InitIdx)
17307 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17315 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17316 unsigned NewDst = 0;
17321 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17322 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17325 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17326 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17346 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17359 if (TII->isVOP3(MI.getOpcode())) {
17361 TII->legalizeOperandsVOP3(MRI, MI);
17366 if (!MI.getDesc().operands().empty()) {
17367 unsigned Opc = MI.getOpcode();
17368 bool HasAGPRs = Info->mayNeedAGPRs();
17370 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
17372 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
17373 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
17376 if ((I == Src2Idx) && (HasAGPRs))
17379 if (!Op.isReg() || !Op.getReg().isVirtual())
17381 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
17382 if (!TRI->hasAGPRs(RC))
17384 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
17385 if (!Src || !Src->isCopy() ||
17386 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
17388 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
17392 MRI.setRegClass(Op.getReg(), NewRC);
17395 if (TII->isMAI(MI)) {
17400 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17401 AMDGPU::OpName::scale_src0);
17402 if (Src0Idx != -1) {
17403 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17404 AMDGPU::OpName::scale_src1);
17405 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17406 TII->usesConstantBus(MRI, MI, Src1Idx))
17407 TII->legalizeOpWithMove(MI, Src1Idx);
17415 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
17416 if (Src2->isReg() && Src2->getReg().isVirtual()) {
17417 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
17418 if (TRI->isVectorSuperClass(RC)) {
17419 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
17420 MRI.setRegClass(Src2->getReg(), NewRC);
17421 if (Src2->isTied())
17422 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
17431 if (TII->isImage(MI))
17432 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17506std::pair<unsigned, const TargetRegisterClass *>
17513 if (Constraint.size() == 1) {
17517 if (VT == MVT::Other)
17520 switch (Constraint[0]) {
17527 RC = &AMDGPU::SReg_32RegClass;
17530 RC = &AMDGPU::SGPR_64RegClass;
17535 return std::pair(0U, nullptr);
17542 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17543 : &AMDGPU::VGPR_32_Lo256RegClass;
17546 RC = Subtarget->has1024AddressableVGPRs()
17547 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17550 return std::pair(0U, nullptr);
17555 if (!Subtarget->hasMAIInsts())
17559 RC = &AMDGPU::AGPR_32RegClass;
17564 return std::pair(0U, nullptr);
17569 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17573 RC = &AMDGPU::AV_32RegClass;
17576 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17578 return std::pair(0U, nullptr);
17587 return std::pair(0U, RC);
17590 if (Kind != '\0') {
17592 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17593 } else if (Kind == 's') {
17594 RC = &AMDGPU::SGPR_32RegClass;
17595 } else if (Kind == 'a') {
17596 RC = &AMDGPU::AGPR_32RegClass;
17602 return std::pair(0U, nullptr);
17608 return std::pair(0U, nullptr);
17612 RC = TRI->getVGPRClassForBitWidth(Width);
17614 RC = TRI->getSGPRClassForBitWidth(Width);
17616 RC = TRI->getAGPRClassForBitWidth(Width);
17618 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17623 return std::pair(0U, nullptr);
17625 return std::pair(Reg, RC);
17631 return std::pair(0U, nullptr);
17632 if (Idx < RC->getNumRegs())
17634 return std::pair(0U, nullptr);
17640 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17646 if (Constraint.size() == 1) {
17647 switch (Constraint[0]) {
17657 } else if (Constraint == "DA" || Constraint == "DB") {
17665 if (Constraint.size() == 1) {
17666 switch (Constraint[0]) {
17674 } else if (Constraint.size() == 2) {
17675 if (Constraint == "VA")
17693 std::vector<SDValue> &Ops,
17708 unsigned Size = Op.getScalarValueSizeInBits();
17712 if (Size == 16 && !Subtarget->has16BitInsts())
17716 Val = C->getSExtValue();
17720 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17724 if (Size != 16 || Op.getNumOperands() != 2)
17726 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17729 Val = C->getSExtValue();
17733 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17743 if (Constraint.size() == 1) {
17744 switch (Constraint[0]) {
17759 } else if (Constraint.size() == 2) {
17760 if (Constraint == "DA") {
17761 int64_t HiBits = static_cast<int32_t>(Val >> 32);
17762 int64_t LoBits = static_cast<int32_t>(Val);
17766 if (Constraint == "DB") {
17774 unsigned MaxSize) const {
17775 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
17776 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17778 MVT VT = Op.getSimpleValueType();
17803 switch (UnalignedClassID) {
17804 case AMDGPU::VReg_64RegClassID:
17805 return AMDGPU::VReg_64_Align2RegClassID;
17806 case AMDGPU::VReg_96RegClassID:
17807 return AMDGPU::VReg_96_Align2RegClassID;
17808 case AMDGPU::VReg_128RegClassID:
17809 return AMDGPU::VReg_128_Align2RegClassID;
17810 case AMDGPU::VReg_160RegClassID:
17811 return AMDGPU::VReg_160_Align2RegClassID;
17812 case AMDGPU::VReg_192RegClassID:
17813 return AMDGPU::VReg_192_Align2RegClassID;
17814 case AMDGPU::VReg_224RegClassID:
17815 return AMDGPU::VReg_224_Align2RegClassID;
17816 case AMDGPU::VReg_256RegClassID:
17817 return AMDGPU::VReg_256_Align2RegClassID;
17818 case AMDGPU::VReg_288RegClassID:
17819 return AMDGPU::VReg_288_Align2RegClassID;
17820 case AMDGPU::VReg_320RegClassID:
17821 return AMDGPU::VReg_320_Align2RegClassID;
17822 case AMDGPU::VReg_352RegClassID:
17823 return AMDGPU::VReg_352_Align2RegClassID;
17824 case AMDGPU::VReg_384RegClassID:
17825 return AMDGPU::VReg_384_Align2RegClassID;
17826 case AMDGPU::VReg_512RegClassID:
17827 return AMDGPU::VReg_512_Align2RegClassID;
17828 case AMDGPU::VReg_1024RegClassID:
17829 return AMDGPU::VReg_1024_Align2RegClassID;
17830 case AMDGPU::AReg_64RegClassID:
17831 return AMDGPU::AReg_64_Align2RegClassID;
17832 case AMDGPU::AReg_96RegClassID:
17833 return AMDGPU::AReg_96_Align2RegClassID;
17834 case AMDGPU::AReg_128RegClassID:
17835 return AMDGPU::AReg_128_Align2RegClassID;
17836 case AMDGPU::AReg_160RegClassID:
17837 return AMDGPU::AReg_160_Align2RegClassID;
17838 case AMDGPU::AReg_192RegClassID:
17839 return AMDGPU::AReg_192_Align2RegClassID;
17840 case AMDGPU::AReg_256RegClassID:
17841 return AMDGPU::AReg_256_Align2RegClassID;
17842 case AMDGPU::AReg_512RegClassID:
17843 return AMDGPU::AReg_512_Align2RegClassID;
17844 case AMDGPU::AReg_1024RegClassID:
17845 return AMDGPU::AReg_1024_Align2RegClassID;
17861 if (Info->isEntryFunction()) {
17868 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
17870 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
17871 : TRI->getAlignedHighSGPRForRC(MF, 2,
17872 &AMDGPU::SGPR_64RegClass);
17873 Info->setSGPRForEXECCopy(SReg);
17875 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
17876 Info->getStackPtrOffsetReg()));
17877 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
17878 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
17882 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
17883 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
17885 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
17886 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
17888 Info->limitOccupancy(MF);
17890 if (ST.isWave32() && !MF.empty()) {
17891 for (auto &MBB : MF) {
17892 for (auto &MI : MBB) {
17893 TII->fixImplicitOperands(MI);
17903 if (ST.needsAlignedVGPRs()) {
17904 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
17910 if (NewClassID != -1)
17911 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
17920 const APInt &DemandedElts,
17922 unsigned Depth) const {
17924 unsigned Opc = Op.getOpcode();
17927 unsigned IID = Op.getConstantOperandVal(0);
17929 case Intrinsic::amdgcn_mbcnt_lo:
17930 case Intrinsic::amdgcn_mbcnt_hi: {
17936 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
17946 Op, Known, DemandedElts, DAG, Depth);
17962 unsigned MaxValue =
17969 unsigned BFEWidth, bool SExt, unsigned Depth) {
17973 unsigned Src1Cst = 0;
17974 if (Src1.isImm()) {
17975 Src1Cst = Src1.getImm();
17976 } else if (Src1.isReg()) {
17980 Src1Cst = Cst->Value.getZExtValue();
17991 if (Width >= BFEWidth)
18000 Known = Known.sext(BFEWidth);
18002 Known = Known.zext(BFEWidth);
18008 unsigned Depth) const {
18011 switch (MI->getOpcode()) {
18012 case AMDGPU::S_BFE_I32:
18015 case AMDGPU::S_BFE_U32:
18018 case AMDGPU::S_BFE_I64:
18021 case AMDGPU::S_BFE_U64:
18024 case AMDGPU::G_INTRINSIC:
18025 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18028 case Intrinsic::amdgcn_workitem_id_x:
18031 case Intrinsic::amdgcn_workitem_id_y:
18034 case Intrinsic::amdgcn_workitem_id_z:
18037 case Intrinsic::amdgcn_mbcnt_lo:
18038 case Intrinsic::amdgcn_mbcnt_hi: {
18050 case Intrinsic::amdgcn_groupstaticsize: {
18061 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18064 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18067 case AMDGPU::G_AMDGPU_SMED3:
18068 case AMDGPU::G_AMDGPU_UMED3: {
18069 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18096 unsigned Depth) const {
18103 AttributeList Attrs =
18105 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18132 if (Header->getAlignment() != PrefAlign)
18133 return Header->getAlignment();
18135 unsigned LoopSize = 0;
18140 LoopSize += MBB->getAlignment().value() / 2;
18143 LoopSize += TII->getInstSizeInBytes(MI);
18144 if (LoopSize > 192)
18149 if (LoopSize <= 64)
18152 if (LoopSize <= 128)
18153 return CacheLineAlign;
18159 auto I = Exit->getFirstNonDebugInstr();
18160 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18161 return CacheLineAlign;
18170 if (PreTerm == Pre->begin() ||
18171 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18175 auto ExitHead = Exit->getFirstNonDebugInstr();
18176 if (ExitHead == Exit->end() ||
18177 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18182 return CacheLineAlign;
18190 N = N->getOperand(0).getNode();
18191 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18200 switch (N->getOpcode()) {
18208 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18209 return !TRI->isSGPRReg(MRI, Reg);
18215 return !TRI->isSGPRReg(MRI, Reg);
18219 unsigned AS = L->getAddressSpace();
18223 case ISD::CALLSEQ_END:
18252 return A->readMem() && A->writeMem();
18273 switch (Ty.getScalarSizeInBits()) {
18285 const APInt &DemandedElts,
18288 unsigned Depth) const {
18293 if (Info->getMode().DX10Clamp)
18305 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18325 << "Hardware instruction generated for atomic "
18327 << " operation at memory scope " << MemScope;
18332 Type *EltTy = VT->getElementType();
18333 return VT->getNumElements() == 2 &&
18353 unsigned BW = IT->getBitWidth();
18354 return BW == 32 || BW == 64;
18368 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18369 return BW == 32 || BW == 64;
18372 if (Ty->isFloatTy() || Ty->isDoubleTy())
18376 return VT->getNumElements() == 2 &&
18377 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18387 bool HasSystemScope) {
18394 if (HasSystemScope) {
18403 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18416 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18442 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18455 bool HasSystemScope =
18481 if (Subtarget->hasEmulatedSystemScopeAtomics())
18497 if (!HasSystemScope &&
18498 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18510 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18518 ConstVal && ConstVal->isNullValue())
18556 if (Ty->isFloatTy()) {
18561 if (Ty->isDoubleTy()) {
18582 if (Ty->isFloatTy() &&
18583 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18596 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18600 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18604 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18609 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18614 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18618 if (Ty->isFloatTy()) {
18621 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18624 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18629 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18637 if (Subtarget->hasFlatAtomicFaddF32Inst())
18646 if (Subtarget->hasLDSFPAtomicAddF32()) {
18647 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18649 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18677 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18679 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18683 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18685 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18738 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18739 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18740 : &AMDGPU::SReg_32RegClass;
18741 if (!TRI->isSGPRClass(RC) && !isDivergent)
18742 return TRI->getEquivalentSGPRClass(RC);
18743 if (TRI->isSGPRClass(RC) && isDivergent)
18744 return TRI->getEquivalentVGPRClass(RC);
18756 unsigned WaveSize) {
18761 if (!IT || IT->getBitWidth() != WaveSize)
18766 if (!Visited.insert(V).second)
18768 bool Result = false;
18769 for (const auto *U : V->users()) {
18771 if (V == U->getOperand(1)) {
18776 case Intrinsic::amdgcn_if_break:
18777 case Intrinsic::amdgcn_if:
18778 case Intrinsic::amdgcn_else:
18783 if (V == U->getOperand(0)) {
18788 case Intrinsic::amdgcn_end_cf:
18789 case Intrinsic::amdgcn_loop:
18795 Result = hasCFUser(U, Visited, WaveSize);
18804 const Value *V) const {
18806 if (CI->isInlineAsm()) {
18815 for (auto &TC : TargetConstraints) {
18829 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
18857 return MRI.hasOneNonDBGUse(N0);
18864 if (I.getMetadata("amdgpu.noclobber"))
18866 if (I.getMetadata("amdgpu.last.use"))
18876 if (!Def->isMachineOpcode())
18886 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
18887 PhysReg = AMDGPU::SCC;
18889 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
18955 Alignment = RMW->getAlign();
18968 bool FullFlatEmulation =
18970 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
18971 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
18972 RMW->getType()->isDoubleTy()));
18975 bool ReturnValueIsUsed = !AI->use_empty();
18984 if (FullFlatEmulation) {
18995 std::prev(BB->end())->eraseFromParent();
18996 Builder.SetInsertPoint(BB);
18998 Value *LoadedShared = nullptr;
18999 if (FullFlatEmulation) {
19000 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
19001 {Addr}, nullptr, "is.shared");
19002 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
19003 Builder.SetInsertPoint(SharedBB);
19004 Value *CastToLocal = Builder.CreateAddrSpaceCast(
19010 LoadedShared = Clone;
19012 Builder.CreateBr(PhiBB);
19013 Builder.SetInsertPoint(CheckPrivateBB);
19016 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19017 {Addr}, nullptr, "is.private");
19018 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19020 Builder.SetInsertPoint(PrivateBB);
19022 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19025 Value *LoadedPrivate;
19027 LoadedPrivate = Builder.CreateAlignedLoad(
19028 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
19031 LoadedPrivate, RMW->getValOperand());
19033 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19035 auto [ResultLoad, Equal] =
19041 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19044 Builder.CreateBr(PhiBB);
19046 Builder.SetInsertPoint(GlobalBB);
19050 if (FullFlatEmulation) {
19051 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19060 if (!FullFlatEmulation) {
19065 MDNode *RangeNotPrivate =
19068 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19072 Builder.CreateBr(PhiBB);
19074 Builder.SetInsertPoint(PhiBB);
19076 if (ReturnValueIsUsed) {
19079 if (FullFlatEmulation)
19086 Builder.CreateBr(ExitBB);
19090 unsigned PtrOpIdx) {
19091 Value *PtrOp = I->getOperand(PtrOpIdx);
19098 I->setOperand(PtrOpIdx, ASCast);
19098 I->setOperand(PtrOpIdx, ASCast);
19110 ConstVal && ConstVal->isNullValue()) {
19140 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19148 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19163 LoadInst *LI = Builder.CreateAlignedLoad(
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm && (UImm != ~static_cast<T>(0)) && "Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
#define LLVM_ATTRIBUTE_UNUSED
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
iv Induction Variable Users
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static cl::opt< bool > UseSelectionDAGPTRADD("amdgpu-use-sdag-ptradd", cl::Hidden, cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the " "SelectionDAG ISel"), cl::init(false))
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad, TargetLoweringBase::IntrinsicInfo &Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static constexpr int Concat[]
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
const unsigned XorTermOpc
const unsigned AndSaveExecOpc
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
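A minimal standalone sketch (not part of this file) of how these APFloat factories behave; the IEEE-single semantics and the printed values are chosen purely for illustration:
#include "llvm/ADT/APFloat.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
int main() {
  const fltSemantics &Sem = APFloat::IEEEsingle();
  APFloat PosInf = APFloat::getInf(Sem);                       // +infinity
  APFloat NegZero = APFloat::getZero(Sem, /*Negative=*/true);  // -0.0
  APFloat QNaN = APFloat::getQNaN(Sem);                        // quiet NaN
  outs() << "inf? " << (PosInf.isInfinity() ? "yes" : "no") << "\n";
  outs() << "negative? " << (NegZero.isNegative() ? "yes" : "no") << "\n";
  // Reinterpret the value's bits as an integer, as the FP combines above do.
  outs() << "NaN bits: " << QNaN.bitcastToAPInt() << "\n";
  return 0;
}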
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
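A small self-contained sketch (not from this file) of the APInt mask helpers listed above; the widths and bit positions are arbitrary:
#include "llvm/ADT/APInt.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
int main() {
  APInt Mask = APInt::getBitsSet(32, /*loBit=*/8, /*hiBit=*/16);  // bits 8..15 set
  APInt Hi = APInt::getHighBitsSet(32, /*hiBitsSet=*/4);          // bits 28..31 set
  outs() << "trailing zeros: " << Mask.countr_zero() << "\n";     // 8
  outs() << "mask zero? " << (Mask.isZero() ? "yes" : "no") << "\n";
  outs() << "Hi uge Mask? " << (Hi.uge(Mask) ? "yes" : "no") << "\n"; // yes (unsigned)
  return 0;
}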
This class represents an incoming formal argument to a Function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Min
*p = old <signed v ? old : v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
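As a hedged illustration of the atomicrmw operations enumerated above (this is not code from the lowering itself), an fadd RMW of the kind the expansion logic classifies can be created through IRBuilder roughly as follows; the helper name, the 4-byte alignment, and the monotonic ordering are arbitrary choices:
#include "llvm/IR/IRBuilder.h"
using namespace llvm;
// Builder must already point at a valid insertion point; Ptr is a pointer
// value and Val a float value of the element type being updated.
static AtomicRMWInst *emitFAddRMW(IRBuilder<> &Builder, Value *Ptr, Value *Val) {
  // Roughly: atomicrmw fadd ptr %Ptr, float %Val monotonic, align 4
  return Builder.CreateAtomicRMW(AtomicRMWInst::FAdd, Ptr, Val,
                                 MaybeAlign(4), AtomicOrdering::Monotonic);
}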
This class holds the attributes for a particular argument, parameter, function, or return value.
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
const Function * getParent() const
Return the enclosing method, or null if none.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
static bool isFPPredicate(Predicate P)
static bool isIntPredicate(Predicate P)
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
A parsed version of the target data layout string in and methods for querying it.
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
int64_t getOffset() const
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
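A brief sketch (illustrative only; the header location for LLT differs across LLVM releases) of constructing the LLT shapes referenced above:
#include "llvm/CodeGen/LowLevelType.h"  // LLT; path varies by LLVM version
using namespace llvm;
static void lltExamples() {
  LLT S32 = LLT::scalar(32);        // plain 32-bit scalar
  LLT P1 = LLT::pointer(1, 64);     // 64-bit pointer in address space 1
  LLT S16 = S32.changeElementSize(16);
  (void)P1.getSizeInBits();         // 64 bits
  (void)S16.getScalarSizeInBits();  // 16
}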
This is an important class for using LLVM in a threaded context.
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
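For reference, a hedged sketch of how such range metadata can be built with MDBuilder::createRange; the [0, 1024) bounds are made up, and this is not code from this file:
#include "llvm/IR/MDBuilder.h"
using namespace llvm;
static MDNode *makeRange(LLVMContext &Ctx) {
  MDBuilder MDB(Ctx);
  // Describes the half-open interval [0, 1024).
  return MDB.createRange(APInt(32, 0), APInt(32, 1024));
}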
const MDOperand & getOperand(unsigned I) const
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
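An illustrative sketch of the MVT helpers listed above (the concrete types are chosen arbitrarily):
#include "llvm/CodeGen/ValueTypes.h"  // pulls in MVT alongside EVT
using namespace llvm;
static void mvtExamples() {
  MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);
  MVT I64 = MVT::getIntegerVT(64);
  (void)V4I32.getVectorNumElements();  // 4
  (void)V4I32.getScalarType();         // MVT::i32
  (void)I64.getSizeInBits();           // 64 bits
}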
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
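
CreateFixedObject is how calling-convention lowering materializes stack slots for incoming arguments. A minimal sketch; the size and offset are made-up values, not anything this file uses.

#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"

// Hypothetical helper: reserve an immutable 4-byte fixed stack object at
// byte offset 16 for an incoming argument and return its frame index.
static int createIncomingArgSlot(llvm::MachineFunction &MF) {
  llvm::MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.CreateFixedObject(/*Size=*/4, /*SPOffset=*/16,
                               /*IsImmutable=*/true);
}
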
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
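
Custom inserters use CreateMachineBasicBlock, insert and addSuccessor to grow the CFG around an instruction they expand. A minimal sketch; the helper and its placement are illustrative, not this file's actual emit routines.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include <iterator>

// Hypothetical helper: allocate a fresh block, link it into the function's
// block list right after BB, and record the CFG edge BB -> NewBB.
static llvm::MachineBasicBlock *appendBlockAfter(llvm::MachineFunction &MF,
                                                 llvm::MachineBasicBlock *BB) {
  llvm::MachineBasicBlock *NewBB =
      MF.CreateMachineBasicBlock(BB->getBasicBlock());
  llvm::MachineFunction::iterator It = std::next(BB->getIterator());
  MF.insert(It, NewBB);
  BB->addSuccessor(NewBB);
  return NewBB;
}
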
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
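
These builder methods are almost always reached through BuildMI, chaining operand adders onto a freshly created instruction. A minimal sketch; the opcode description, register and immediate are placeholders rather than anything specific to this file.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"

// Hypothetical helper: emit "DstReg = <Desc> Imm" before iterator I (e.g. a
// move-immediate) by chaining addImm onto the BuildMI result.
static void emitMoveImm(llvm::MachineBasicBlock &MBB,
                        llvm::MachineBasicBlock::iterator I,
                        const llvm::DebugLoc &DL, const llvm::MCInstrDesc &Desc,
                        llvm::Register DstReg, int64_t Imm) {
  llvm::BuildMI(MBB, I, DL, Desc, DstReg).addImm(Imm);
}
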
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
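
Lowering code allocates these descriptors through MachineFunction::getMachineMemOperand and ORs the flags together. A minimal sketch; the 32-bit size, the alignment and the flag choice are illustrative assumptions, and the LLT include path varies across LLVM versions.

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGenTypes/LowLevelType.h"

// Hypothetical helper: describe an invariant, 4-byte-aligned 32-bit load from
// the location named by PtrInfo.
static llvm::MachineMemOperand *makeLoadMMO(llvm::MachineFunction &MF,
                                            llvm::MachinePointerInfo PtrInfo) {
  return MF.getMachineMemOperand(
      PtrInfo,
      llvm::MachineMemOperand::MOLoad | llvm::MachineMemOperand::MOInvariant,
      llvm::LLT::scalar(32), llvm::Align(4));
}
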
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
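
DAG combines are built almost entirely from these accessors: check an opcode, peek at operands, and bail out unless the shape matches. A minimal sketch of that style; the helper and the specific pattern are hypothetical.

#include "llvm/CodeGen/SelectionDAGNodes.h"

// Hypothetical pattern test: is V a single-use (add X, C) with a constant
// right-hand side? If so, report C through CstOut.
static bool isAddOfConstant(llvm::SDValue V, uint64_t &CstOut) {
  if (V.getOpcode() != llvm::ISD::ADD || !V.hasOneUse())
    return false;
  if (auto *C = llvm::dyn_cast<llvm::ConstantSDNode>(V.getOperand(1))) {
    CstOut = C->getZExtValue();
    return true;
  }
  return false;
}
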
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool isWholeWaveFunction() const
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if N can be combined with another node to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
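
Each of the overrides listed above follows the same shape: take the generic hook's signature and return a target-specific answer. A minimal sketch of that shape for one hook, using a hypothetical MyTargetLowering subclass (its declaration is omitted); the f32/f64 policy is an assumption, not this target's.

#include "llvm/CodeGen/TargetLowering.h"

// Hypothetical override: report FMA formation as profitable only for scalar
// or vector f32/f64.
bool MyTargetLowering::isFMAFasterThanFMulAndFAdd(
    const llvm::MachineFunction &MF, llvm::EVT VT) const {
  VT = VT.getScalarType();
  return VT == llvm::MVT::f32 || VT == llvm::MVT::f64;
}
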
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns the sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
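
Custom lowering routines are essentially sequences of these SelectionDAG builder calls. A minimal sketch of the style; the transformation itself (x*2+1) is arbitrary and hypothetical, chosen only to exercise getNode, getConstant and getShiftAmountConstant.

#include "llvm/CodeGen/SelectionDAG.h"

// Hypothetical lowering helper: build (add (shl X, 1), 1) in X's type.
static llvm::SDValue buildShlPlusOne(llvm::SDValue X, llvm::SelectionDAG &DAG,
                                     const llvm::SDLoc &DL) {
  llvm::EVT VT = X.getValueType();
  llvm::SDValue Shl = DAG.getNode(llvm::ISD::SHL, DL, VT, X,
                                  DAG.getShiftAmountConstant(1, VT, DL));
  return DAG.getNode(llvm::ISD::ADD, DL, VT, Shl, DAG.getConstant(1, DL, VT));
}
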
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
constexpr bool empty() const
empty - Check if the string is empty.
constexpr size_t size() const
size - Get the string size.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
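
StringSwitch is the usual way lowering code maps textual names (register names, constraint strings) to values, chaining Case calls and ending with Default. A minimal sketch with made-up names.

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

// Hypothetical name lookup: map a register name to an index, ~0u if unknown.
static unsigned regIndexForName(llvm::StringRef Name) {
  return llvm::StringSwitch<unsigned>(Name)
      .Case("r0", 0)
      .Case("r1", 1)
      .Default(~0u);
}
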
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a type is legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
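
These TargetLoweringBase setters are the vocabulary a target's lowering constructor speaks: register classes first, then per-operation actions, then derived properties. A minimal sketch for a hypothetical target; the class name, register class and the particular choices are all assumptions, not this file's configuration.

#include "llvm/CodeGen/TargetLowering.h"

namespace {
// Hypothetical target lowering, shown only to illustrate the setup hooks.
class DemoTargetLowering : public llvm::TargetLowering {
public:
  DemoTargetLowering(const llvm::TargetMachine &TM,
                     const llvm::TargetRegisterInfo *TRI,
                     const llvm::TargetRegisterClass *GPR32RC)
      : TargetLowering(TM) {
    addRegisterClass(llvm::MVT::i32, GPR32RC);   // i32 lives in 32-bit GPRs
    setOperationAction(llvm::ISD::ROTR, llvm::MVT::i32, Expand);
    setTruncStoreAction(llvm::MVT::i64, llvm::MVT::i16, Expand);
    setBooleanContents(ZeroOrOneBooleanContent);
    setTargetDAGCombine(llvm::ISD::ADD);         // ask for PerformDAGCombine on ADD
    computeRegisterProperties(TRI);              // run after all regclasses are added
  }
};
} // namespace
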
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op; at this point, only the DemandedBits bits of Op's result are known to be used downstream.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
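
These expand* helpers let a target's custom lowering fall back to the generic expansions instead of reimplementing them. A minimal sketch, assuming the load really does need to be expanded; the helper name is hypothetical.

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"

// Hypothetical fallback: expand an unsupported load into smaller pieces and
// repackage the (value, chain) pair the way LowerOperation callers expect.
static llvm::SDValue expandLoadGenerically(const llvm::TargetLowering &TLI,
                                           llvm::LoadSDNode *LD,
                                           llvm::SelectionDAG &DAG) {
  auto [Value, Chain] = TLI.expandUnalignedLoad(LD, DAG);
  return DAG.getMergeValues({Value, Chain}, llvm::SDLoc(LD));
}
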
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
bool isIntegerTy() const
True if this is an instance of IntegerType.
LLVM_ABI const fltSemantics & getFltSemantics() const
bool isVoidTy() const
Return true if this is 'void'.
A Use represents the edge between a Value definition and its users.
LLVM_ABI void set(Value *Val)
User * getUser() const
Returns the User that contains this Use.
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
constexpr bool isZero() const
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
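
Predicates over these address-space enumerators (like isFlatGlobalAddrSpace or isNonGlobalAddrSpace above) are plain comparisons against the AMDGPUAS values. A minimal sketch of one such hypothetical predicate; the include path for the enum differs between LLVM versions.

#include "llvm/Support/AMDGPUAddrSpace.h"

// Hypothetical predicate: true for the two group-memory address spaces,
// LDS (LOCAL_ADDRESS) and GDS (REGION_ADDRESS).
static bool isGroupAddrSpace(unsigned AS) {
  return AS == llvm::AMDGPUAS::LOCAL_ADDRESS ||
         AS == llvm::AMDGPUAS::REGION_ADDRESS;
}
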
@ BUFFER_ATOMIC_COND_SUB_U32
@ TBUFFER_LOAD_FORMAT_D16
@ TBUFFER_STORE_FORMAT_D16
@ BUFFER_STORE_FORMAT_D16
@ TC_RETURN_GFX_WholeWave
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ SSUBO
Same for subtraction.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ SMULO
Same for multiplication.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
@ System
Synchronized with respect to all concurrently executing threads.
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
NodeAddr< NodeBase * > Node
friend class Instruction
Iterator for Instructions in a `BasicBlock.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
FunctionAddr VTableAddr Value
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
FunctionAddr VTableAddr uintptr_t uintptr_t Data
unsigned getUndefRegState(bool B)
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
FunctionAddr VTableAddr Next
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
int popcount(T Value) noexcept
Count the number of set bits in a value.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ CLUSTER_WORKGROUP_MAX_ID_X
@ CLUSTER_WORKGROUP_MAX_ID_Z
@ CLUSTER_WORKGROUP_MAX_FLAT_ID
@ CLUSTER_WORKGROUP_MAX_ID_Y
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
void resetAll()
Resets the known state of all bits.
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const