#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

    "amdgpu-global-isel-new-legality",
    cl::desc("Use GlobalISel desired legality, rather than try to use "
             "rules compatible with selection patterns"),
  unsigned Bits = Ty.getSizeInBits();

  const LLT Ty = Query.Types[TypeIdx];
  return Ty.getNumElements() % 2 != 0 && EltSize > 1 && EltSize < 32 &&
         Ty.getSizeInBits() % 32 != 0;
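// Predicate shown above: a vector counts as "small and odd" when it has an
// odd element count, sub-dword elements, and a total size that is not a
// multiple of 32 bits; such types are reshaped by the rules that follow.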
  const LLT Ty = Query.Types[TypeIdx];

  const LLT Ty = Query.Types[TypeIdx];
  return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;

  const LLT Ty = Query.Types[TypeIdx];
  return std::pair(TypeIdx,

  const LLT Ty = Query.Types[TypeIdx];
  unsigned Size = Ty.getSizeInBits();
  unsigned Pieces = (Size + 63) / 64;
  unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
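// Worked example for the split above: a <5 x s16> value has Size = 80, so
// Pieces = (80 + 63) / 64 = 2 and NewNumElts = (5 + 1) / 2 = 3, i.e. the
// vector is broken into pieces of at most 64 bits each.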
  const LLT Ty = Query.Types[TypeIdx];
  const int Size = Ty.getSizeInBits();
  const int NextMul32 = (Size + 31) / 32;
  const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  return std::make_pair(TypeIdx, LLT::scalar(MemSize));

  const LLT Ty = Query.Types[TypeIdx];
  const unsigned EltSize = Ty.getElementType().getSizeInBits();
  assert(EltSize == 32 || EltSize == 64);
  for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
  return std::pair(TypeIdx,
  const unsigned NumElems = Ty.getElementCount().getFixedValue();
  const unsigned Size = Ty.getSizeInBits();

  const LLT Ty = Query.Types[TypeIdx];

  const LLT Ty = Query.Types[TypeIdx];
  unsigned Size = Ty.getSizeInBits();

  const LLT QueryTy = Query.Types[TypeIdx];

  const LLT QueryTy = Query.Types[TypeIdx];

  const LLT QueryTy = Query.Types[TypeIdx];

  return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&

  return EltSize == 16 || EltSize % 32 == 0;

  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
  LLT Ty = Query.Types[TypeIdx];

  const LLT QueryTy = Query.Types[TypeIdx];

  if (Ty.isPointerOrPointerVector())
    Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));

  (ST.useRealTrue16Insts() && Ty == S16) ||

  const LLT Ty = Query.Types[TypeIdx];
  return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
         Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();

  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
                                 bool IsLoad, bool IsAtomic) {
    return ST.enableFlatScratch() ? 128 : 32;
    return ST.useDS128() ? 128 : 64;
    return IsLoad ? 512 : 128;
    return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
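// These returns appear to give the widest memory access, in bits, that is
// left unsplit per address space: 32 for private scratch (128 with flat
// scratch or multi-dword scratch addressing), 64 or 128 for LDS depending on
// ds128 support, and up to 512 for loads (128 for stores) elsewhere.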
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
  unsigned RegSize = Ty.getSizeInBits();
  unsigned AS = Query.Types[1].getAddressSpace();

  if (Ty.isVector() && MemSize != RegSize)

  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);

  if (!ST.hasDwordx3LoadStores())

  if (AlignBits < MemSize) {
          Align(AlignBits / 8)))
  const unsigned Size = Ty.getSizeInBits();
  if (Ty.isPointerVector())

  unsigned EltSize = Ty.getScalarSizeInBits();
  return EltSize != 32 && EltSize != 64;

  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&

                          uint64_t AlignInBits, unsigned AddrSpace,

  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())

  if (AlignInBits < RoundedSize)

      RoundedSize, AddrSpace, Align(AlignInBits / 8),

      Query.Types[1].getAddressSpace(), Opcode);
  const unsigned NumParts = PointerTy.getSizeInBits() / 32;

    Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
    std::array<Register, 4> VectorElems;
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    for (unsigned I = 0; I < NumParts; ++I)
          B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
    B.buildMergeValues(MO, VectorElems);

  Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
  B.buildIntToPtr(MO, Scalar);
  const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
    for (unsigned I = 0; I < NumParts; ++I)
    return B.buildBuildVector(VectorTy, PointerParts).getReg(0);

  Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
  return B.buildBitcast(VectorTy, Scalar).getReg(0);
  auto GetAddrSpacePtr = [&TM](unsigned AS) {

  const LLT BufferStridedPtr =

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};

  const std::initializer_list<LLT> FPTypesBase = {

  const std::initializer_list<LLT> FPTypes16 = {

  const std::initializer_list<LLT> FPTypesPK16 = {

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    if (ST.hasScalarAddSub64()) {
          .clampMaxNumElementsStrict(0, S16, 2)
          .clampMaxNumElementsStrict(0, S16, 2)
    if (ST.hasScalarSMulU64()) {
          .clampMaxNumElementsStrict(0, S16, 2)
          .clampMaxNumElementsStrict(0, S16, 2)
        .minScalarOrElt(0, S16)
  } else if (ST.has16BitInsts()) {
        .widenScalarToNextMultipleOf(0, 32)
    if (ST.hasMad64_32())
  if (ST.hasIntClamp()) {

      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})

  if (ST.hasVOP3PInsts()) {
        .clampMaxNumElements(0, S8, 2)

      {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})

                 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
      .clampScalar(0, S16, S64);
      {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16});
    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});

  if (ST.hasPackedFP32Ops()) {
    FPOpActions.legalFor({V2S32});
    FPOpActions.clampMaxNumElementsStrict(0, S32, 2);

      {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM, G_FMINNUM_IEEE,

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
        .clampMaxNumElements(0, S16, 2)
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
    MinNumMaxNum.customFor(FPTypesBase)

  if (ST.hasVOP3PInsts())
      .legalFor(FPTypesPK16)

  if (ST.has16BitInsts()) {
  if (ST.hasFractBug()) {
  if (ST.hasCvtPkF16F32Inst()) {
        .clampMaxNumElements(0, S16, 2);
  FPTruncActions.scalarize(0).lower();
  if (ST.has16BitInsts()) {
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});

  if (ST.has16BitInsts()) {
    FRem.minScalar(0, S32)
        .clampMaxNumElements(0, S16, 2)

  if (ST.has16BitInsts())
  if (ST.has16BitInsts())
      .clampScalar(0, S16, S64)
      .clampScalar(0, S16, S64)

  if (ST.has16BitInsts()) {
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S16, S64)
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)

      .scalarSameSizeAs(1, 0)
          {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
      .legalForCartesianProduct(
          {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});

          {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
  if (ST.hasSALUFloatInsts())

  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)

  if (ST.has16BitInsts())
      .clampScalar(0, S32, S32)

  if (ST.has16BitInsts())
      .widenScalarToNextPow2(1)
      .lowerFor({S1, S16})
      .widenScalarToNextPow2(1)
      .clampScalar(0, S32, S32)
      .clampScalar(0, S32, S64)
  if (ST.has16BitInsts()) {
        .clampMaxNumElementsStrict(0, S16, 2)
    if (ST.hasVOP3PInsts()) {
          .clampMaxNumElements(0, S16, 2)
  if (ST.hasIntMinMax64()) {
        .clampMaxNumElements(0, S16, 2)
        .clampMaxNumElements(0, S16, 2)
      .widenScalarToNextPow2(0)

      .legalForCartesianProduct(AddrSpaces32, {S32})
      .legalForCartesianProduct(AddrSpaces32, {S32})
  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
    unsigned NumRegs = (MemSize + 31) / 32;
    if (!ST.hasDwordx3LoadStores())

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
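// A required alignment of 0 in the memory-descriptor tables below means "no
// minimum alignment"; it is used when the subtarget reports unaligned buffer
// access as enabled.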
  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},
                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});

    Actions.unsupportedIf(
        typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));

    Actions.customIf(typeIs(1, Constant32Ptr));
          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (DstSize > MemSize)
          if (MemSize > MaxSize)

          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (MemSize > MaxSize) {
            if (MaxSize % EltSize == 0) {
            unsigned NumPieces = MemSize / MaxSize;
            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::pair(0, EltTy);
            return std::pair(0, EltTy);
          return std::pair(0, EltTy);
        .widenScalarToNextPow2(0)
      .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                 {S32, GlobalPtr, S16, 2 * 8},
                                 {S32, LocalPtr, S8, 8},
                                 {S32, LocalPtr, S16, 16},
                                 {S32, PrivatePtr, S8, 8},
                                 {S32, PrivatePtr, S16, 16},
                                 {S32, ConstantPtr, S8, 8},
                                 {S32, ConstantPtr, S16, 2 * 8}})

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
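// Each entry above is {result type, pointer type, memory type, minimum
// alignment in bits}: e.g. {S32, GlobalPtr, S8, 8} is an 8-bit extending load
// from global memory into a 32-bit register with byte alignment.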
      {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
       G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
       G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
       G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
      .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
                 {S64, GlobalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});

  if (ST.hasLDSFPAtomicAddF32()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasLdsAtomicAddF64())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasAtomicDsPkAdd16Insts())
      Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});

  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});
  if (ST.hasFlatAtomicFaddF32Inst())
    Atomic.legalFor({{S32, FlatPtr}});

  if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {

  if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
      ST.hasAtomicBufferGlobalPkAddF16Insts())
    Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
  if (ST.hasAtomicGlobalPkAddBF16Inst())
    Atomic.legalFor({{V2BF16, GlobalPtr}});
  if (ST.hasAtomicFlatPkAdd16Insts())
    Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
  auto &AtomicFMinFMax =
      .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});

  if (ST.hasAtomicFMinFMaxF32GlobalInsts())
  if (ST.hasAtomicFMinFMaxF64GlobalInsts())
    AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
  if (ST.hasAtomicFMinFMaxF32FlatInsts())
  if (ST.hasAtomicFMinFMaxF64FlatInsts())

                 {S32, FlatPtr}, {S64, FlatPtr}})
      .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});

                                  LocalPtr, FlatPtr, PrivatePtr,
      .clampScalar(0, S16, S64)
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
          .clampMaxNumElements(0, S16, 2);
      Shifts.legalFor({{S16, S16}});

    Shifts.widenScalarIf(
          const LLT AmountTy = Query.Types[1];
          return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);
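// Shift legalization in outline: with 16-bit instructions a 16-bit value
// shifted by a 16-bit amount stays legal; otherwise values are widened to at
// least 32 bits, and the shift-amount operand is always clamped to s32.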
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const bool isLegalVecType =
          return (EltSize == 32 || EltSize == 64) &&

          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::pair(VecTypeIdx,
        .clampScalar(EltTypeIdx, S32, S64)

        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

          const LLT BigTy = Query.Types[BigTyIdx];

          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];

          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
  if (ST.hasScalarPackInsts()) {
        .minScalarOrElt(0, S16)
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {

          const LLT BigTy = Query.Types[BigTyIdx];

          return notValidElt(Query, LitTyIdx);
          return notValidElt(Query, BigTyIdx);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
            const LLT Ty = Query.Types[LitTyIdx];
            return Ty.getSizeInBits() < 32;

          const LLT Ty = Query.Types[BigTyIdx];
          return Ty.getSizeInBits() % 16 != 0;

          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));

      .clampScalar(0, S32, S64);
  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
        .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
    SextInReg.lowerFor({{S32}, {S64}});

  FSHRActionDefs.legalFor({{S32, S32}})
      .clampMaxNumElementsStrict(0, S16, 2);
  if (ST.hasVOP3PInsts())
    FSHRActionDefs.scalarize(0).lower();

  if (ST.hasVOP3PInsts()) {
        .clampMaxNumElementsStrict(0, S16, 2)

      .clampScalar(1, S32, S32)

       G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
       G_READ_REGISTER, G_WRITE_REGISTER,

  if (ST.hasIEEEMinimumMaximumInsts()) {
        .legalFor(FPTypesPK16)

       G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
       G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})

      {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
       G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
       G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
       G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})

  verify(*ST.getInstrInfo());
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP:
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINIMUMNUM:
  case TargetOpcode::G_FMAXIMUMNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_INSERT_VECTOR_ELT:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_GLOBAL_VALUE:
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_STORE:
  case TargetOpcode::G_FMAD:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FFREXP:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP10:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_STACKSAVE:
  case TargetOpcode::G_GET_FPENV:
  case TargetOpcode::G_SET_FPENV:
  case TargetOpcode::G_TRAP:
  case TargetOpcode::G_DEBUGTRAP:
  if (ST.hasApertureRegs()) {
                                   ? AMDGPU::SRC_SHARED_BASE
                                   : AMDGPU::SRC_PRIVATE_BASE;
    assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
            !ST.hasGloballyAddressableScratch()) &&
           "Cannot use src_private_base with globally addressable scratch!");
    MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
    B.buildCopy({Dst}, {Register(ApertureRegNo)});
    return B.buildUnmerge(S32, Dst).getReg(1);

    Register LoadAddr = MRI.createGenericVirtualRegister(
        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
    Register KernargPtrReg = MRI.createGenericVirtualRegister(
    B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
    return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

  Register QueuePtr = MRI.createGenericVirtualRegister(
  B.buildObjectPtrOffset(
      B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
  switch (Def->getOpcode()) {
  case AMDGPU::G_FRAME_INDEX:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR:
  case AMDGPU::G_CONSTANT: {
    const ConstantInt *CI = Def->getOperand(1).getCImm();
    return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);

  assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
             Intrinsic::amdgcn_addrspacecast_nonnull));
                     : MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned SrcAS = SrcTy.getAddressSpace();

  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
  auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
        ST.hasGloballyAddressableScratch()) {
      Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
          B.buildInstr(AMDGPU::S_MOV_B32, {S32},
                       {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
      MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
      return B.buildIntToPtr(Dst, Sub).getReg(0);

    return B.buildExtract(Dst, Src, 0).getReg(0);

      castFlatToLocalOrPrivate(Dst);
      MI.eraseFromParent();

    unsigned NullVal = TM.getNullPointerValue(DestAS);
    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
    MI.eraseFromParent();

  auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
    Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
        ST.hasGloballyAddressableScratch()) {
      ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
      if (ST.isWave64()) {
        ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
          B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
      Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
          B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
            B.buildInstr(AMDGPU::S_MOV_B64, {S64},
                         {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
        MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
        return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);

    return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);

      castLocalOrPrivateToFlat(Dst);
      MI.eraseFromParent();

    Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
    auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
    auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
                              SegmentNull.getReg(0));
    B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
    MI.eraseFromParent();

      SrcTy.getSizeInBits() == 64) {
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();

    uint32_t AddrHiVal = Info->get32BitAddressHighBits();
    auto PtrLo = B.buildPtrToInt(S32, Src);
    auto HighAddr = B.buildConstant(S32, AddrHiVal);
    B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
    MI.eraseFromParent();

  MI.eraseFromParent();
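// Address-space cast lowering in outline: flat -> local/private keeps the low
// 32 bits of the pointer (optionally rebased against the flat-scratch base),
// local/private -> flat merges the 32-bit offset with the aperture high half,
// and both directions select the destination segment's null value when the
// source is the other segment's null pointer.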
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);

  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
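// The sequence above implements ceil(x) for f64 as trunc(x) plus 1.0 when x
// is greater than zero and not already equal to its truncation.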
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
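// G_FREM is expanded as fma(-trunc(x / y), y, x), i.e. x - trunc(x / y) * y.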
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  const unsigned FractBits = 52;

  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
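// f64 trunc, as expanded above: the biased exponent is extracted with ubfe,
// fraction bits below the exponent are masked off, and two selects handle the
// corner cases where the magnitude is below 1.0 (keep only the sign) or the
// exponent exceeds 51 (the value is already integral and passes through).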
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  if (MRI.getType(Dst) == S64) {
    auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                        : B.buildUITOFP(S64, Unmerge.getReg(1));

    auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
    auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);

    B.buildFAdd(Dst, LdExp, CvtLo);
    MI.eraseFromParent();

  auto One = B.buildConstant(S32, 1);

    auto ThirtyOne = B.buildConstant(S32, 31);
    auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
    auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
    auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
    auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
                  .addUse(Unmerge.getReg(1));
    auto LS2 = B.buildSub(S32, LS, One);
    ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
    ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildFLdexp(Dst, FVal, Scale);
  MI.eraseFromParent();
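// i64 -> f64 (above) converts the two 32-bit halves separately and combines
// them as hi * 2^32 + lo via ldexp; i64 -> f32 first normalizes the value by
// its leading-bit count (folding the low word into a sticky bit), converts
// the top word, and scales the result back with ldexp.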
  const LLT SrcLT = MRI.getType(Src);

  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);

    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);

    K0 = B.buildFConstant(
    K1 = B.buildFConstant(
    K0 = B.buildFConstant(
    K1 = B.buildFConstant(

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

                      : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

    Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});

    B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),

    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
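// fp -> i64 in outline: FloorMul = floor(trunc(x) * K0), Hi is FloorMul
// converted to integer, and Lo = fptoui(fma(FloorMul, K1, trunc(x))); K0 and
// K1 are presumably 2^-32 and -2^32. The signed case finishes with an
// xor/sub against the replicated sign word.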
  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  if (MI.getOpcode() == AMDGPU::G_FMINIMUMNUM ||
      MI.getOpcode() == AMDGPU::G_FMAXIMUMNUM)
  LLT VecTy = MRI.getType(Vec);

    auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
    auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
    B.buildIntToPtr(Dst, IntElt);

    MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =
  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

    auto Unmerge = B.buildUnmerge(EltTy, Vec);
    B.buildCopy(Dst, Unmerge.getReg(IdxVal));

  MI.eraseFromParent();

  LLT VecTy = MRI.getType(Vec);

    auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
    auto IntIns = B.buildPtrToInt(IntTy, Ins);
    auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
    B.buildIntToPtr(Dst, IntVecDest);
    MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =
  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

  if (IdxVal < NumElts) {
    for (unsigned i = 0; i < NumElts; ++i)
      SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
    B.buildUnmerge(SrcRegs, Vec);
    SrcRegs[IdxVal] = MI.getOperand(2).getReg();
    B.buildMergeLikeInstr(Dst, SrcRegs);

  MI.eraseFromParent();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
                  .addUse(MulVal.getReg(0))
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

      Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  MI.eraseFromParent();
                                              unsigned GAFlags) const {
      B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  if (ST.has64BitLiterals()) {
      B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
      B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);

  if (!B.getMRI()->getRegClassOrNull(PCReg))
    B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

    B.buildExtract(DstReg, PCReg, 0);

  if (RequiresHighHalf && ST.has64BitLiterals()) {
    if (!MRI.getRegClassOrNull(DstReg))
      MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64)

  Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
                        : MRI.createGenericVirtualRegister(S32);

  if (!MRI.getRegClassOrNull(AddrLo))
    MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);

  B.buildInstr(AMDGPU::S_MOV_B32)

  if (RequiresHighHalf) {
           "Must provide a 64-bit pointer type!");
    MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
    B.buildInstr(AMDGPU::S_MOV_B32)

    if (!MRI.getRegClassOrNull(AddrDst))
      MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);

    B.buildMergeValues(AddrDst, {AddrLo, AddrHi});

    if (AddrDst != DstReg)
      B.buildCast(DstReg, AddrDst);
  } else if (AddrLo != DstReg) {
    B.buildCast(DstReg, AddrLo);
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

        GV->getName() != "llvm.amdgcn.module.lds" &&
          Fn, "local memory global used by non-kernel function",

      B.buildUndef(DstReg);
      MI.eraseFromParent();

    if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
      auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
      B.buildIntToPtr(DstReg, Sz);
      MI.eraseFromParent();

    MI.eraseFromParent();

  if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
    MI.eraseFromParent();
    MI.eraseFromParent();
    MI.eraseFromParent();

  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  if (Ty.getSizeInBits() == 32) {
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);
  MI.eraseFromParent();
  LLT PtrTy = MRI.getType(PtrReg);

    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    MI.getOperand(1).setReg(Cast.getReg(0));

  if (MI.getOpcode() != AMDGPU::G_LOAD)

  LLT ValTy = MRI.getType(ValReg);

  const unsigned ValSize = ValTy.getSizeInBits();

  if (WideMemSize == ValSize) {
    MI.setMemRefs(MF, {WideMMO});

  if (ValSize > WideMemSize)

    WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
    B.buildTrunc(ValReg, WideLoad).getReg(0);

      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildExtract(ValReg, WideLoad, 0);

      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildDeleteTrailingVectorElements(ValReg, WideLoad);

  MI.eraseFromParent();
  Register DataReg = MI.getOperand(0).getReg();
  LLT DataTy = MRI.getType(DataReg);

  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);

  Register PackedVal = B.buildBuildVector(VecTy, {NewVal, CmpVal}).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
      .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  switch (DefMI->getOpcode()) {
  case TargetOpcode::G_INTRINSIC: {
    case Intrinsic::amdgcn_frexp_mant:
  case TargetOpcode::G_FFREXP: {
    if (DefMI->getOperand(0).getReg() == Src)
  case TargetOpcode::G_FPEXT: {
std::pair<Register, Register>
                                                  unsigned Flags) const {
  auto SmallestNormal = B.buildFConstant(
  auto IsLtSmallestNormal =

  auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
  auto One = B.buildFConstant(F32, 1.0);
      B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
  auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);

  return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
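// Denormal inputs to the hardware log are pre-scaled by 2^32 so the
// instruction sees a normal value; the helper returns both the scaled input
// and the "was scaled" predicate, which later code uses to subtract 32 (in
// log2 terms) from the result.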
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();

    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
                    .addUse(Ext.getReg(0))
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();

    B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
    MI.eraseFromParent();

  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
                  .addUse(ScaledInput)

  auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
  auto Zero = B.buildFConstant(Ty, 0.0);
      B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
  B.buildFSub(Dst, Log2, ResultOffset, Flags);

  MI.eraseFromParent();
  auto FMul = B.buildFMul(Ty, X, Y, Flags);
  return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);

  const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
  assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);

  unsigned Flags = MI.getFlags();
  const LLT Ty = MRI.getType(X);

  if (Ty == F16 && !ST.has16BitInsts()) {
    auto PromoteSrc = B.buildFPExt(F32, X);
    B.buildFPTrunc(Dst, LogVal);
    MI.eraseFromParent();

      B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);

  if (ST.hasFastFMAF32()) {
    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;

    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;

    auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
    auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);

    R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
    auto NegR = B.buildFNeg(Ty, R, Flags);
    auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
    auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
    R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);

    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;

    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;

    auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
    auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto YH = B.buildAnd(Ty, Y, MaskConst);
    auto YT = B.buildFSub(Ty, Y, YH, Flags);
    auto YTCT = B.buildFMul(Ty, YT, CT, Flags);

        getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
    R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);

  const bool IsFiniteOnly =

  if (!IsFiniteOnly) {
    auto Fabs = B.buildFAbs(Ty, Y);
    R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);

    auto Zero = B.buildFConstant(Ty, 0.0);
        B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
    auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
    B.buildFSub(Dst, R, Shift, Flags);
    B.buildCopy(Dst, R);

  MI.eraseFromParent();
                                       unsigned Flags) const {
  const double Log2BaseInverted =

  LLT Ty = B.getMRI()->getType(Dst);

    auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
    auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
    auto Zero = B.buildFConstant(Ty, 0.0);
        B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
    auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);

    if (ST.hasFastFMAF32())
      B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
      auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
      B.buildFAdd(Dst, Mul, ResultOffset, Flags);

          ? B.buildFLog2(Ty, Src, Flags)
          : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})

  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
                    .addUse(Ext.getReg(0))
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();

    MI.eraseFromParent();

  auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
                             RangeCheckConst, Flags);

  auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);
  auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
  auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(AddInput.getReg(0))

  auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
  auto One = B.buildFConstant(Ty, 1.0);
  auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
  B.buildFMul(Dst, Exp2, ResultScale, Flags);
  MI.eraseFromParent();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
          .addUse(Mul.getReg(0))
      B.buildFExp2(Dst, Mul.getReg(0), Flags);

  auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
  auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
  auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
  auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);

  auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(ExpInput.getReg(0))

  auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
  auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
  B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
  const unsigned Flags = MI.getFlags();
  LLT Ty = MRI.getType(Dst);

  const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;

    MI.eraseFromParent();

    auto Ext = B.buildFPExt(F32, X, Flags);
    B.buildFPTrunc(Dst, Lowered, Flags);
    MI.eraseFromParent();

    MI.eraseFromParent();
  const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;

  if (ST.hasFastFMAF32()) {
    const float cc_exp = 0x1.4ae0bep-26f;
    const float c_exp10 = 0x1.a934f0p+1f;
    const float cc_exp10 = 0x1.2f346ep-24f;

    auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
    PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
    auto NegPH = B.buildFNeg(Ty, PH, Flags);
    auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);

    auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
    PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);

    const float ch_exp = 0x1.714000p+0f;
    const float cl_exp = 0x1.47652ap-12f;

    const float ch_exp10 = 0x1.a92000p+1f;
    const float cl_exp10 = 0x1.4f0978p-11f;

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto XH = B.buildAnd(Ty, X, MaskConst);
    auto XL = B.buildFSub(Ty, X, XH, Flags);

    auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
    PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);

    auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
    auto XLCL = B.buildFMul(Ty, XL, CL, Flags);

        getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
    PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);

  auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);

  auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
  auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(A.getReg(0))
  auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);

  auto UnderflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);

  R = B.buildSelect(Ty, Underflow, Zero, R);

  auto OverflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);

  R = B.buildSelect(Ty, Overflow, Inf, R, Flags);

  B.buildCopy(Dst, R);
  MI.eraseFromParent();
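// exp/exp10 expansion above: x * log2(e) (or x * log2(10)) is split into a
// rounded integer part E and a small residual A accumulated in extended
// precision; the result is ldexp(exp2(A), E), with out-of-range inputs
// clamped to 0.0 or +infinity by the final selects.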
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Log = B.buildFLog2(F32, Src0, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Log.getReg(0))
    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == F16) {
    auto Log = B.buildFLog2(F16, Src0, Flags);
    auto Ext0 = B.buildFPExt(F32, Log, Flags);
    auto Ext1 = B.buildFPExt(F32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Ext0.getReg(0))
                   .addUse(Ext1.getReg(0))
    B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);

  MI.eraseFromParent();
    ModSrc = SrcFNeg->getOperand(1).getReg();
    ModSrc = SrcFAbs->getOperand(1).getReg();
    ModSrc = SrcFAbs->getOperand(1).getReg();

  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
         "this should not have been custom lowered");

  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})

    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
    B.buildFMinNum(Min, Fract, Const, Flags);

    CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);

  auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();
  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
    Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
    Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);

  auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
  B.buildBitcast(Dst, Merge);

  MI.eraseFromParent();
                                        bool UsePartialMad64_32,
                                        bool SeparateOddAlignedProducts) const {

  auto getZero32 = [&]() -> Register {
      Zero32 = B.buildConstant(S32, 0).getReg(0);
  auto getZero64 = [&]() -> Register {
      Zero64 = B.buildConstant(S64, 0).getReg(0);

  for (unsigned i = 0; i < Src0.size(); ++i) {

    if (CarryIn.empty())

    bool HaveCarryOut = true;
    if (CarryIn.size() == 1) {
        LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);

        CarryAccum = getZero32();
        CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
        for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
              B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])

        LocalAccum = getZero32();
        HaveCarryOut = false;

        B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
    LocalAccum = Add.getReg(0);

  auto buildMadChain =
    assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
           (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));

    if (LocalAccum.size() == 1 &&
        (!UsePartialMad64_32 || !CarryIn.empty())) {
        unsigned j1 = DstIndex - j0;
        if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {

        auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
          LocalAccum[0] = Mul.getReg(0);
          if (CarryIn.empty()) {
            LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
                B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
      } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
    if (j0 <= DstIndex) {
      bool HaveSmallAccum = false;

      if (LocalAccum[0]) {
        if (LocalAccum.size() == 1) {
          Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
          HaveSmallAccum = true;
        } else if (LocalAccum[1]) {
          Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
          HaveSmallAccum = false;
          Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
          HaveSmallAccum = true;
        assert(LocalAccum.size() == 1 || !LocalAccum[1]);
        HaveSmallAccum = true;

        unsigned j1 = DstIndex - j0;
        if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {

        auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
                                {Src0[j0], Src1[j1], Tmp});
        Tmp = Mad.getReg(0);
        if (!HaveSmallAccum)
          CarryOut.push_back(Mad.getReg(1));
        HaveSmallAccum = false;
      } while (j0 <= DstIndex);

      auto Unmerge = B.buildUnmerge(S32, Tmp);
      LocalAccum[0] = Unmerge.getReg(0);
      if (LocalAccum.size() > 1)
        LocalAccum[1] = Unmerge.getReg(1);
    for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
      Carry OddCarryIn = std::move(OddCarry);
      Carry EvenCarryIn = std::move(EvenCarry);

      if (2 * i < Accum.size()) {
        auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
        EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);

        if (!SeparateOddAlignedProducts) {
          auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
          OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
          bool IsHighest = 2 * i >= Accum.size();
                                .take_front(IsHighest ? 1 : 2);
          OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

            Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
              Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
              Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
          Accum[2 * i - 1] = Lo->getOperand(0).getReg();

            auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
                                   Lo->getOperand(1).getReg());
            Accum[2 * i] = Hi.getReg(0);
            SeparateOddCarry = Hi.getReg(1);

        if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
          EvenCarryIn.push_back(CarryOut);

        if (2 * i < Accum.size()) {
          if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
            OddCarry.push_back(CarryOut);
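// Wide multiplies are built from 32-bit limbs: products that land on aligned
// limb pairs are accumulated through mad_u64_u32 chains, while odd-aligned
// products are either folded in directly or, when SeparateOddAlignedProducts
// is set, kept in a separate accumulator and added back with carry chains.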
  assert(ST.hasMad64_32());
  assert(MI.getOpcode() == TargetOpcode::G_MUL);

  LLT Ty = MRI.getType(DstReg);
  unsigned Size = Ty.getSizeInBits();
  if (ST.hasVectorMulU64() && Size == 64)

  unsigned NumParts = Size / 32;

  const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();

  for (unsigned i = 0; i < NumParts; ++i) {

  B.buildUnmerge(Src0Parts, Src0);
  B.buildUnmerge(Src1Parts, Src1);
  buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
                SeparateOddAlignedProducts);
  B.buildMergeLikeInstr(DstReg, AccumRegs);
  MI.eraseFromParent();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
                        ? AMDGPU::G_AMDGPU_FFBH_U32
                        : AMDGPU::G_AMDGPU_FFBL_B32;
  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
  MI.eraseFromParent();

  LLT SrcTy = MRI.getType(Src);
  TypeSize NumBits = SrcTy.getSizeInBits();

  auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
  auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
  auto Shift = B.buildShl(S32, Extend, ShiftAmt);
  auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
  B.buildTrunc(Dst, Ctlz);
  MI.eraseFromParent();
  if (MI.getOpcode() != TargetOpcode::G_XOR)
  return ConstVal == -1;

  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))

    if (!MRI.hasOneNonDBGUse(NegatedCond))

    UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);

  if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)

    UncondBrTarget = &*NextMBB;
    if (Next->getOpcode() != AMDGPU::G_BR)

                                  *ArgRC, B.getDebugLoc(), ArgTy);

    const unsigned Mask = Arg->getMask();

      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
    B.buildCopy(DstReg, LiveIn);
  if (!ST.hasClusters()) {
    MI.eraseFromParent();

  Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(S32);
  Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(S32);

    auto One = B.buildConstant(S32, 1);
    auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
    auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
                                  B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));

      B.buildCopy(DstReg, GlobalIdXYZ);
      MI.eraseFromParent();

      B.buildCopy(DstReg, ClusterIdXYZ);
      MI.eraseFromParent();

    unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
    MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
    B.buildInstr(AMDGPU::S_GETREG_B32_const)
        .addImm(ClusterIdField);
    auto Zero = B.buildConstant(S32, 0);
    B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
    MI.eraseFromParent();
  auto LoadConstant = [&](unsigned N) {
    B.buildConstant(DstReg, N);

  if (ST.hasArchitectedSGPRs() &&

    Arg = &WorkGroupIDX;
    ArgRC = &AMDGPU::SReg_32RegClass;
    Arg = &WorkGroupIDY;
    ArgRC = &AMDGPU::SReg_32RegClass;
    Arg = &WorkGroupIDZ;
    ArgRC = &AMDGPU::SReg_32RegClass;

    if (HasFixedDims && ClusterDims.getDims()[0] == 1)
      return LoadConstant(0);
    Arg = &ClusterWorkGroupIDX;
    ArgRC = &AMDGPU::SReg_32RegClass;
    if (HasFixedDims && ClusterDims.getDims()[1] == 1)
      return LoadConstant(0);
    Arg = &ClusterWorkGroupIDY;
    ArgRC = &AMDGPU::SReg_32RegClass;
    if (HasFixedDims && ClusterDims.getDims()[2] == 1)
      return LoadConstant(0);
    Arg = &ClusterWorkGroupIDZ;
    ArgRC = &AMDGPU::SReg_32RegClass;

      return LoadConstant(ClusterDims.getDims()[0] - 1);
    Arg = &ClusterWorkGroupMaxIDX;
    ArgRC = &AMDGPU::SReg_32RegClass;
      return LoadConstant(ClusterDims.getDims()[1] - 1);
    Arg = &ClusterWorkGroupMaxIDY;
    ArgRC = &AMDGPU::SReg_32RegClass;
      return LoadConstant(ClusterDims.getDims()[2] - 1);
    Arg = &ClusterWorkGroupMaxIDZ;
    ArgRC = &AMDGPU::SReg_32RegClass;

    Arg = &ClusterWorkGroupMaxFlatID;
    ArgRC = &AMDGPU::SReg_32RegClass;

    return LoadConstant(0);

    B.buildUndef(DstReg);

  if (!Arg->isRegister() || !Arg->getRegister().isValid())

  MI.eraseFromParent();
  B.buildConstant(MI.getOperand(0).getReg(), C);
  MI.eraseFromParent();

  unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);

    B.buildUndef(DstReg);
    MI.eraseFromParent();

  if (Arg->isMasked()) {

  MI.eraseFromParent();

  Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
  return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);

                                          Align Alignment) const {
         "unexpected kernarg parameter type");

  B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
  MI.eraseFromParent();
  LLT DstTy = MRI.getType(Dst);

  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  auto One = B.buildConstant(S32, 1);

  Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);

    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 = B.buildFMul(

  auto Mul2 = B.buildFMul(
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  auto Mad2 = B.buildFMAD(

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
  auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});

  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
  auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  auto C1 = B.buildSExt(S32, CmpHi);

  auto C2 = B.buildSExt(S32, CmpLo);

  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  auto C6 = B.buildSelect(

  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});

  auto Sel1 = B.buildSelect(

  auto Sel2 = B.buildSelect(
  switch (MI.getOpcode()) {
  case AMDGPU::G_UDIV: {
    DstDivReg = MI.getOperand(0).getReg();
  case AMDGPU::G_UREM: {
    DstRemReg = MI.getOperand(0).getReg();
  case AMDGPU::G_UDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

  MI.eraseFromParent();
bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty != S32 && Ty != S64)
    return false;

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
  Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();

  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(0).getReg();
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  }

  if (Ty == S32)
    legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
  else
    legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);

  if (DstDivReg) {
    auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
    auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
    B.buildSub(DstDivReg, SignXor, Sign);
  }

  if (DstRemReg) {
    auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
    auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
    B.buildSub(DstRemReg, SignXor, Sign);
  }

  MI.eraseFromParent();
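// The signed variants reuse the unsigned expansion on |LHS| and |RHS|,
// computed branch-free as (x + s) ^ s with s = x >> (bits - 1). The quotient
// sign is sign(LHS) ^ sign(RHS) and the remainder takes the sign of LHS,
// which is what the final (Tmp ^ Sign) - Sign fixups above implement.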
  LLT ResTy = MRI.getType(Res);

  if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
    if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
      return false;

    // 1 / x -> RCP(x)
    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
          .addUse(RHS)
          .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1 / x -> RCP( FNEG(x) )
    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
          .addUse(FNeg.getReg(0))
          .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
                              !MI.getFlag(MachineInstr::FmArcp)))
    return false;

  // x / y -> x * (1.0 / y)
  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
                 .addUse(RHS)
                 .setMIFlags(Flags);
  B.buildFMul(Res, LHS, RCP, Flags);

  MI.eraseFromParent();
  LLT ResTy = MRI.getType(Res);

  if (!AllowInaccurateRcp)
    return false;

  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
               .addUse(Y)
               .setMIFlags(Flags);

  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
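// Each pair of FMAs above is one Newton-Raphson step for the reciprocal:
// r' = r + r * (1 - y * r), written as fma(fma(-y, r, 1), r, r). Two steps
// refine the hardware rcp estimate, and the trailing fma(fma(-y, q, x), r, q)
// applies the same style of correction to the quotient q = x * r.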
  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);
  auto NegRHSExt = B.buildFNeg(S32, RHSExt);
  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(RHSExt.getReg(0))
                 .setMIFlags(Flags);
  auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
  MachineInstrBuilder Err;
  if (ST.hasMadMacF32Insts()) {
    Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
    Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
    Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
  } else {
    Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
    Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
    Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
  }
  auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
  Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
  Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
  auto RDst = B.buildFPTrunc(S16, Quot, Flags);
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(RDst.getReg(0))
      .addUse(RHS)
      .addUse(LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  unsigned SPDenormMode =
      Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();

  if (ST.hasDenormModeInst()) {
    // Preserve default FP64FP16 denorm mode while updating FP32 mode.
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
        .addImm(NewDenormModeValue);
  } else {
    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
        .addImm(SPDenormMode)
        .addImm(SPDenormModeBitField);
  }
  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
          .addUse(LHS)
          .addUse(RHS)
          .addImm(0)
          .setMIFlags(Flags);
  auto NumeratorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
          .addUse(LHS)
          .addUse(RHS)
          .addImm(1)
          .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                       .addUse(DenominatorScaled.getReg(0))
                       .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
  const bool HasDynamicDenormals =
      (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
      (Mode.FP32Denormals.Output == DenormalMode::Dynamic);

  Register SavedSPDenormMode;
  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      B.buildInstr(AMDGPU::S_GETREG_B32)
          .addDef(SavedSPDenormMode)
          .addImm(SPDenormModeBitField);
    } else
      toggleSPDenormMode(true, B, ST, Mode);
  }

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      assert(SavedSPDenormMode);
      B.buildInstr(AMDGPU::S_SETREG_B32)
          .addReg(SavedSPDenormMode)
          .addImm(SPDenormModeBitField);
    } else
      toggleSPDenormMode(false, B, ST, Mode);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma1.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(NumeratorScaled.getReg(1))
                  .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(Fmas.getReg(0))
      .addUse(RHS)
      .addUse(LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
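// f32 fdiv lowering: div_scale pre-scales the operands to avoid intermediate
// over/underflow, the FMA chain runs Newton-Raphson refinement on the scaled
// reciprocal and quotient, div_fmas folds in the scale decision, and
// div_fixup restores the result for the special cases (zeros, infinities,
// NaNs). When the FP32 mode does not already keep denormals, flushing is
// temporarily disabled around the FMA chain via S_DENORM_MODE / S_SETREG.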
  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
                       .addUse(LHS)
                       .addUse(RHS)
                       .addImm(0)
                       .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
                 .addUse(DivScale0.getReg(0))
                 .setMIFlags(Flags);

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
                       .addUse(LHS)
                       .addUse(RHS)
                       .addImm(1)
                       .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(CmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(CmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
  } else {
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(Mul.getReg(0))
                  .addUse(Scale)
                  .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
      .addUse(Fmas.getReg(0))
      .addUse(RHS)
      .addUse(LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
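// The f64 path mirrors the f32 sequence with an extra refinement step. On
// subtargets where the div_scale condition output is unusable, the scale
// select is recomputed by comparing the high halves of the scaled operands
// against the original numerator and denominator.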
  LLT Ty = MRI.getType(Res0);
  LLT InstrExpTy = Ty == S16 ? S16 : S32;

  auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
                  .addUse(Val)
                  .setMIFlags(Flags);
  auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
                 .addUse(Val)
                 .setMIFlags(Flags);

  if (ST.hasFractBug()) {
    auto Fabs = B.buildFAbs(Ty, Val);
    auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
    auto IsFinite = B.buildFCmp(CmpInst::FCMP_OLT, S1, Fabs, Inf, Flags);
    auto Zero = B.buildConstant(InstrExpTy, 0);
    Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
    Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
  }

  B.buildCopy(Res0, Mant);
  B.buildSExtOrTrunc(Res1, Exp);

  MI.eraseFromParent();
  auto Abs = B.buildFAbs(S32, RHS, Flags);

  auto C0 = B.buildFConstant(S32, 0x1p+96f);
  auto C1 = B.buildFConstant(S32, 0x1p-32f);
  auto C2 = B.buildFConstant(S32, 1.0f);

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(Mul0.getReg(0))
                 .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  unsigned Flags = MI.getFlags();
  assert(!ST.has16BitInsts());

  auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
                  .addUse(Ext.getReg(0))
                  .setMIFlags(Flags);
  B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
  MI.eraseFromParent();
  const unsigned Flags = MI.getFlags();

  if (allowApproxFunc(MF, Flags)) {
    B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
        .addUse(X)
        .setMIFlags(Flags);
    MI.eraseFromParent();
    return true;
  }

  auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
  auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
  auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
  auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
  auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);

  Register SqrtS = MRI.createGenericVirtualRegister(F32);
  if (needsDenormHandlingF32(MF, X, Flags)) {
    B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
        .addUse(SqrtX.getReg(0))
        .setMIFlags(Flags);

    auto NegOne = B.buildConstant(I32, -1);
    auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);

    auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
    auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);

    auto PosOne = B.buildConstant(I32, 1);
    auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);

    auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
    auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);

    auto Zero = B.buildFConstant(F32, 0.0f);
    auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
    SqrtS =
        B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);

    auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
    SqrtS =
        B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
  } else {
    auto SqrtR =
        B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
    B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);

    auto Half = B.buildFConstant(F32, 0.5f);
    auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
    auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
    auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
    SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
    SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
    auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
    auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
    SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
  }

  auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);

  auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);

  SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);

  auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);

  MI.eraseFromParent();
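// f32 sqrt: inputs below 2^-96 are multiplied by 2^32 so the core sequence
// never sees a denormal; since sqrt halves the exponent, the result is scaled
// back by 2^-16. The final select passes zeros and infinities through
// unchanged.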
  assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");

  unsigned Flags = MI.getFlags();

  auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
  auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant, Flags);
  auto ZeroInt = B.buildConstant(S32, 0);

  // Scale up the input if it is too small.
  auto ScaleUpFactor = B.buildConstant(S32, 256);
  auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
  auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);

  auto SqrtY =
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));

  auto Half = B.buildFConstant(F64, 0.5);
  auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
  auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);

  auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
  auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);

  auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
  auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);

  auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
  auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);

  auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);

  auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
  auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);

  auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);

  // Scale the result back down if the input was scaled up.
  auto ScaleDownFactor = B.buildConstant(S32, -128);
  auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
  SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);

  auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);

  MI.eraseFromParent();
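// f64 sqrt: rsq provides the seed y ~= 1/sqrt(x); h = y/2 and s = x*y are
// refined with Goldschmidt / Newton-Raphson style FMA steps
// (r = fma(-h, s, 0.5), s = fma(s, r, s), ...), and the residual
// d = fma(-s, s, x) drives the final correction. Small inputs are pre-scaled
// by 2^256 via ldexp and the result is rescaled by 2^-128.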
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty == F64)
    return legalizeFSQRTF64(MI, MRI, B);
  if (Ty == F32)
    return legalizeFSQRTF32(MI, MRI, B);
  if (Ty == F16)
    return legalizeFSQRTF16(MI, MRI, B);
  return false;
}

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();

  LLT Ty = MRI.getType(Dst);

  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
                 .addUse(Src)
                 .setMIFlags(Flags);

  // Clamp +/- infinity to the largest representable value.
  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
                            B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

  if (UseIEEE)
    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
  else
    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;

  auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
                                      Register Src2, LLT VT) -> Register {
    auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
    switch (IID) {
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_permlane64:
      return LaneOp.getReg(0);
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_set_inactive:
    case Intrinsic::amdgcn_set_inactive_chain_arg:
      return LaneOp.addUse(Src1).getReg(0);
    case Intrinsic::amdgcn_writelane:
      return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      Register Src3 = MI.getOperand(5).getReg();
      int64_t Src4 = MI.getOperand(6).getImm();
      int64_t Src5 = MI.getOperand(7).getImm();
      return LaneOp.addUse(Src1)
          .addUse(Src2)
          .addUse(Src3)
          .addImm(Src4)
          .addImm(Src5)
          .getReg(0);
    }
    case Intrinsic::amdgcn_mov_dpp8:
      return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
    case Intrinsic::amdgcn_update_dpp:
      return LaneOp.addUse(Src1)
          .addImm(MI.getOperand(4).getImm())
          .addImm(MI.getOperand(5).getImm())
          .addImm(MI.getOperand(6).getImm())
          .addImm(MI.getOperand(7).getImm())
          .getReg(0);
    default:
      llvm_unreachable("unhandled lane op");
    }
  };

  Register DstReg = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register Src1, Src2;
  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    Src1 = MI.getOperand(3).getReg();
    if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
      Src2 = MI.getOperand(4).getReg();
    }
  }

  LLT Ty = MRI.getType(DstReg);
  unsigned Size = Ty.getSizeInBits();

  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
      ST.hasDPALU_DPP() &&
      AMDGPU::isLegalDPALU_DPPControl(MI.getOperand(4).getImm()))
    SplitSize = 64;

  if (Size == SplitSize) {
    // Already legal.
    return true;
  }

  if (Size < 32) {
    Src0 = B.buildAnyExt(S32, Src0).getReg(0);

    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
      Src1 = B.buildAnyExt(S32, Src1).getReg(0);

    if (IID == Intrinsic::amdgcn_writelane)
      Src2 = B.buildAnyExt(S32, Src2).getReg(0);

    Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
    B.buildTrunc(DstReg, LaneOpDst);
    MI.eraseFromParent();
    return true;
  }

  if (Size % SplitSize != 0)
    return false;

  LLT PartialResTy = LLT::scalar(SplitSize);
  bool NeedsBitcast = false;
  if (Ty.isVector()) {
    LLT EltTy = Ty.getElementType();
    unsigned EltSize = EltTy.getSizeInBits();
    if (EltSize == SplitSize) {
      PartialResTy = EltTy;
    } else if (EltSize == 16 || EltSize == 32) {
      unsigned NElem = SplitSize / EltSize;
      PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem));
    } else {
      // Handle all other cases via S32/S64 pieces.
      NeedsBitcast = true;
    }
  }

  SmallVector<Register, 4> PartialRes;
  unsigned NumParts = Size / SplitSize;
  MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
  MachineInstrBuilder Src1Parts, Src2Parts;

  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
    Src1Parts = B.buildUnmerge(PartialResTy, Src1);

  if (IID == Intrinsic::amdgcn_writelane)
    Src2Parts = B.buildUnmerge(PartialResTy, Src2);

  for (unsigned i = 0; i < NumParts; ++i) {
    Src0 = Src0Parts.getReg(i);

    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
      Src1 = Src1Parts.getReg(i);

    if (IID == Intrinsic::amdgcn_writelane)
      Src2 = Src2Parts.getReg(i);

    PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
  }

  if (NeedsBitcast)
    B.buildBitcast(DstReg, B.buildMergeLikeInstr(
                               LLT::scalar(Ty.getSizeInBits()), PartialRes));
  else
    B.buildMergeLikeInstr(DstReg, PartialRes);

  MI.eraseFromParent();
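// Lane/DPP operations only exist for 32-bit (and, for some DPP cases, 64-bit)
// values, so wider types are unmerged into SplitSize pieces, the intrinsic is
// rebuilt per piece, and the partial results are merged back (with a bitcast
// when the piece type does not match the element type). Sub-32-bit values are
// any-extended, processed as s32, and truncated back.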
  uint64_t Offset = ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B,
                      AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
    return false;

  B.buildObjectPtrOffset(DstReg, KernargPtrReg,
                         B.buildConstant(IdxTy, Offset).getReg(0));
  Register Result = MI.getOperand(0).getReg();
  Register Pointer = MI.getOperand(2).getReg();
  Register Stride = MI.getOperand(3).getReg();
  Register NumRecords = MI.getOperand(4).getReg();
  Register Flags = MI.getOperand(5).getReg();

  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto ExtStride = B.buildAnyExt(S32, Stride);

  if (ST.has45BitNumRecordsBufferResource()) {
    const LLT PtrIntTy = LLT::scalar(MRI.getType(Pointer).getSizeInBits());
    auto PointerInt = B.buildPtrToInt(PtrIntTy, Pointer);
    auto ExtPointer = B.buildAnyExtOrTrunc(S64, PointerInt);
    auto NumRecordsLHS =
        B.buildShl(S64, NumRecords, B.buildConstant(S32, 57));
    Register LowHalf = B.buildOr(S64, ExtPointer, NumRecordsLHS).getReg(0);

    Register Zero = B.buildConstant(S32, 0).getReg(0);
    auto NumRecordsRHS =
        B.buildLShr(S64, NumRecords, B.buildConstant(S32, 7));
    auto ShiftedStride =
        B.buildShl(S32, ExtStride, B.buildConstant(S32, 12));
    auto ExtShiftedStride =
        B.buildMergeValues(S64, {Zero, ShiftedStride.getReg(0)});
    auto ShiftedFlags = B.buildShl(S32, Flags, B.buildConstant(S32, 28));
    auto ExtShiftedFlags =
        B.buildMergeValues(S64, {Zero, ShiftedFlags.getReg(0)});
    auto CombinedFields = B.buildOr(S64, NumRecordsRHS, ExtShiftedStride);
    Register HighHalf =
        B.buildOr(S64, CombinedFields, ExtShiftedFlags).getReg(0);
    B.buildMergeValues(Result, {LowHalf, HighHalf});
  } else {
    NumRecords = B.buildTrunc(S32, NumRecords).getReg(0);
    auto Unmerge = B.buildUnmerge(S32, Pointer);
    auto LowHalf = Unmerge.getReg(0);
    auto HighHalf = Unmerge.getReg(1);

    auto AndMask = B.buildConstant(S32, 0x0000ffff);
    auto Masked = B.buildAnd(S32, HighHalf, AndMask);
    auto ShiftConst = B.buildConstant(S32, 16);
    auto ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
    auto NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
    Register NewHighHalfReg = NewHighHalf.getReg(0);
    B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
  }

  MI.eraseFromParent();
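// make.buffer.rsrc packing, as encoded by the shifts above: with 45-bit
// NumRecords the descriptor is built as two 64-bit halves, ORing the low
// NumRecords bits (<< 57) onto the extended base pointer and folding the
// remaining NumRecords bits, the stride (<< 12) and the flags (<< 28) into
// the high half. Otherwise the stride is inserted into bits [31:16] of the
// pointer's high word and the descriptor is {base.lo, base.hi | stride,
// NumRecords, Flags}.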
  MI.eraseFromParent();

  std::optional<uint32_t> KnownSize =
      AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
  if (KnownSize.has_value())
    B.buildConstant(DstReg, *KnownSize);

  MI.eraseFromParent();
bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());
  Register Hi32 = Unmerge.getReg(1);

  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
      ST.hasGloballyAddressableScratch()) {
    Register FlatScratchBaseHi =
        B.buildInstr(AMDGPU::S_MOV_B32, {S32},
                     {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
            .getReg(0);
    MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
    // Test the top bits against the flat scratch base.
    Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);
    B.buildICmp(ICmpInst::ICMP_ULT, MI.getOperand(0), XOR,
                B.buildConstant(S32, 1u << 26));
  } else {
    Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
    B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  }

  MI.eraseFromParent();
std::pair<Register, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
  Register BaseReg;
  unsigned ImmOffset;
  MachineRegisterInfo &MRI = *B.getMRI();

  std::tie(BaseReg, ImmOffset) = AMDGPU::getBaseWithConstantOffset(
      MRI, OrigOffset, /*KnownBits=*/nullptr, CheckNUW);

  if (MRI.getType(BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::pair(BaseReg, ImmOffset);
}
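// splitBufferOffsets returns the register and immediate parts of a buffer
// offset: the constant component is split so that at most MaxImm lands in the
// immediate field, and any overflow is added back onto (or becomes) the
// register operand so the effective address is unchanged.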
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg,
                                             bool ImageStore) const {
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  if (ST.hasUnpackedD16VMem()) {
    auto Unmerge = B.buildUnmerge(S16, Reg);

    SmallVector<Register, 4> WideRegs;
    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

    int NumElts = StoreVT.getNumElements();
    return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
        .getReg(0);
  }

  if (ImageStore && ST.hasImageStoreD16Bug()) {
    if (StoreVT.getNumElements() == 2) {
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(S32, Reg).getReg(0);
      PackedRegs.push_back(Reg);
      PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
          .getReg(0);
    }

    if (StoreVT.getNumElements() == 3) {
      SmallVector<Register, 4> PackedRegs;
      auto Unmerge = B.buildUnmerge(S16, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
      Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
      return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
    }

    if (StoreVT.getNumElements() == 4) {
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
      auto Unmerge = B.buildUnmerge(S32, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
          .getReg(0);
    }

    llvm_unreachable("invalid data type");
  }

  return Reg;
}
Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
                                                 Register VData, LLT MemTy,
                                                 bool IsFormat) const {
  MachineRegisterInfo *MRI = B.getMRI();
  LLT Ty = MRI->getType(VData);

  if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
    Ty = getBitcastRegisterType(Ty);
    VData = B.buildBitcast(Ty, VData).getReg(0);
  }

  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      if (IsFormat)
        return handleD16VData(B, *MRI, VData);
    }
  }

  return VData;
}
bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
                                              LegalizerHelper &Helper,
                                              bool IsTyped,
                                              bool IsFormat) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const LLT MemTy = MMO->getMemoryType();

  VData = fixStoreSourceType(B, VData, MemTy, IsFormat);
  Register RSrc = MI.getOperand(2).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  unsigned OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  } else {
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  unsigned ImmOffset;
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);

  unsigned Opc;
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
  } else {
    switch (MemTy.getSizeInBits()) {
    case 8:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      break;
    case 16:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
      break;
    }
  }

  auto MIB = B.buildInstr(Opc)
                 .addUse(VData)
                 .addUse(RSrc)
                 .addUse(VIndex)
                 .addUse(VOffset)
                 .addUse(SOffset)
                 .addImm(ImmOffset);

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
                            Register VIndex, Register VOffset, Register SOffset,
                            unsigned ImmOffset, unsigned Format,
                            unsigned AuxiliaryData, MachineMemOperand *MMO,
                            bool IsTyped, bool HasVIndex,
                            MachineIRBuilder &B) {
  auto MIB = B.buildInstr(Opc)
                 .addDef(LoadDstReg)
                 .addUse(RSrc)
                 .addUse(VIndex)
                 .addUse(VOffset)
                 .addUse(SOffset)
                 .addImm(ImmOffset);

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);
}
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
                                             LegalizerHelper &Helper,
                                             bool IsFormat,
                                             bool IsTyped) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const LLT MemTy = MMO->getMemoryType();

  Register Dst = MI.getOperand(0).getReg();
  Register StatusDst;
  unsigned OpOffset = 0;

  assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
  bool IsTFE = MI.getNumExplicitDefs() == 2;
  if (IsTFE) {
    StatusDst = MI.getOperand(1).getReg();
    OpOffset = 1;
  }

  castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
  Register RSrc = MI.getOperand(2 + OpOffset).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  } else {
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  LLT Ty = MRI.getType(Dst);
  if (hasBufferRsrcWorkaround(Ty)) {
    Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);
  }
  if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
    Ty = getBitcastRegisterType(Ty);
    Helper.bitcastDst(MI, Ty, 0);
    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);
  }

  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  unsigned ImmOffset;
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);

  unsigned Opc;
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    if (IsD16) {
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
    } else {
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
    }
  } else {
    switch (MemTy.getSizeInBits()) {
    case 8:
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 16:
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  if (IsTFE) {
    unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
    unsigned NumLoadDWords = NumValueDWords + 1;
    LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    if (MemTy.getSizeInBits() < 32) {
      Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
      B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
      B.buildTrunc(Dst, ExtDst);
    } else if (NumValueDWords == 1) {
      B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
    } else {
      SmallVector<Register, 5> LoadElts;
      for (unsigned I = 0; I != NumValueDWords; ++I)
        LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
      LoadElts.push_back(StatusDst);
      B.buildUnmerge(LoadElts, LoadDstReg);
      LoadElts.truncate(NumValueDWords);
      B.buildMergeLikeInstr(Dst, LoadElts);
    }
  } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
             (IsD16 && !Ty.isVector())) {
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(Dst, LoadDstReg);
  } else if (Unpacked && IsD16 && Ty.isVector()) {
    LLT UnpackedTy = Ty.changeElementSize(32);
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    // Repack the unpacked dwords into the expected 16-bit vector elements.
    auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
    SmallVector<Register, 4> Repack;
    for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
      Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
    B.buildMergeLikeInstr(Dst, Repack);
  } else {
    buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
                    AuxiliaryData, MMO, IsTyped, HasVIndex, B);
  }

  MI.eraseFromParent();
6456 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6457 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6458 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6459 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6460 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6461 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6462 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6463 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6464 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6465 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6466 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6467 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6468 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6469 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6470 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6471 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6472 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6473 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6474 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6475 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6476 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6477 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6478 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6479 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6480 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6481 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6482 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6483 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6484 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6485 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6486 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6487 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6488 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6489 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6490 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6491 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6492 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6493 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6494 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6495 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6496 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6497 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6498 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6499 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6500 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6501 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6502 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6503 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6504 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6505 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6506 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6507 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6508 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6509 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6510 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6511 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6512 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6513 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6514 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6515 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6516 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6517 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6518 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6519 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6520 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6521 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6522 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6523 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6524 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6525 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6526 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6527 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6528 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6529 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6530 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6531 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6532 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6533 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6534 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6535 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6536 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6537 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6538 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
  const bool IsCmpSwap =
      IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;

  Register Dst = MI.getOperand(0).getReg();
  Register VData = MI.getOperand(2).getReg();

  Register CmpVal;
  unsigned OpOffset = 0;
  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  } else {
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  unsigned ImmOffset;
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
                 .addDef(Dst)
                 .addUse(VData);
  if (IsCmpSwap)
    MIB.addReg(CmpVal);
  MIB.addUse(RSrc)
      .addUse(VIndex)
      .addUse(VOffset)
      .addUse(SOffset)
      .addImm(ImmOffset)
      .addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);

  MI.eraseFromParent();
6615 bool IsA16,
bool IsG16) {
6631 (
B.getMRI()->getType(AddrReg) ==
S16)) {
6636 B.buildBuildVector(
V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6640 "Bias needs to be converted to 16 bit in A16 mode");
6642 AddrReg =
B.buildBitcast(
V2S16, AddrReg).getReg(0);
6648 if (((
I + 1) >= EndIdx) ||
6655 !
MI.getOperand(ArgOffset +
I + 1).isReg()) {
6657 B.buildBuildVector(
V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6662 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6673 int DimIdx,
int NumVAddrs) {
6677 for (
int I = 0;
I != NumVAddrs; ++
I) {
6679 if (
SrcOp.isReg()) {
6685 int NumAddrRegs = AddrRegs.
size();
6686 if (NumAddrRegs != 1) {
6689 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
6692 for (
int I = 1;
I != NumVAddrs; ++
I) {
6695 MI.getOperand(DimIdx +
I).setReg(AMDGPU::NoRegister);
6717 const unsigned NumDefs =
MI.getNumExplicitDefs();
6718 const unsigned ArgOffset = NumDefs + 1;
6719 bool IsTFE = NumDefs == 2;
6737 VData =
MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
6738 Ty =
MRI->getType(VData);
6741 const bool IsAtomicPacked16Bit =
6742 (BaseOpcode->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6743 BaseOpcode->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6751 ST.hasG16() ? (BaseOpcode->
Gradients && GradTy ==
S16) : GradTy ==
S16;
6752 const bool IsA16 = AddrTy ==
S16;
6753 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() ==
S16;
6756 if (!BaseOpcode->
Atomic) {
6757 DMask =
MI.getOperand(ArgOffset + Intr->
DMaskIndex).getImm();
6760 }
else if (DMask != 0) {
6762 }
else if (!IsTFE && !BaseOpcode->
Store) {
6764 B.buildUndef(
MI.getOperand(0));
6765 MI.eraseFromParent();
6773 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6774 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6775 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6776 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6777 unsigned NewOpcode = LoadOpcode;
6778 if (BaseOpcode->
Store)
6779 NewOpcode = StoreOpcode;
6781 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
6784 MI.setDesc(
B.getTII().get(NewOpcode));
6788 if (IsTFE && DMask == 0) {
6791 MI.getOperand(ArgOffset + Intr->
DMaskIndex).setImm(DMask);
6794 if (BaseOpcode->
Atomic) {
6796 LLT Ty =
MRI->getType(VData0);
6799 if (Ty.isVector() && !IsAtomicPacked16Bit)
6806 auto Concat =
B.buildBuildVector(PackedTy, {VData0, VData1});
6807 MI.getOperand(2).setReg(
Concat.getReg(0));
6808 MI.getOperand(3).setReg(AMDGPU::NoRegister);
6812 unsigned CorrectedNumVAddrs = Intr->
NumVAddrs;
6815 if (BaseOpcode->
Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6821 if (IsA16 && !ST.hasA16()) {
6826 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->
Sampler);
6827 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
6829 if (IsA16 || IsG16) {
6837 const bool UseNSA = ST.hasNSAEncoding() &&
6838 PackedRegs.
size() >= ST.getNSAThreshold(MF) &&
6839 (PackedRegs.
size() <= NSAMaxSize || HasPartialNSA);
6840 const bool UsePartialNSA =
6841 UseNSA && HasPartialNSA && PackedRegs.
size() > NSAMaxSize;
6843 if (UsePartialNSA) {
6847 auto Concat =
B.buildConcatVectors(
6848 PackedAddrTy,
ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
6849 PackedRegs[NSAMaxSize - 1] =
Concat.getReg(0);
6850 PackedRegs.
resize(NSAMaxSize);
6851 }
else if (!UseNSA && PackedRegs.
size() > 1) {
6853 auto Concat =
B.buildConcatVectors(PackedAddrTy, PackedRegs);
6854 PackedRegs[0] =
Concat.getReg(0);
6858 const unsigned NumPacked = PackedRegs.
size();
6861 if (!
SrcOp.isReg()) {
6871 SrcOp.setReg(AMDGPU::NoRegister);
6888 const bool UseNSA = ST.hasNSAEncoding() &&
6889 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
6890 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
6891 const bool UsePartialNSA =
6892 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
6894 if (UsePartialNSA) {
6896 ArgOffset + Intr->
VAddrStart + NSAMaxSize - 1,
6898 }
else if (!UseNSA && Intr->
NumVAddrs > 1) {
6913 if (!Ty.isVector() || !IsD16)
6917 if (RepackedReg != VData) {
6918 MI.getOperand(1).setReg(RepackedReg);
6926 const int NumElts = Ty.
isVector() ? Ty.getNumElements() : 1;
6929 if (NumElts < DMaskLanes)
6932 if (NumElts > 4 || DMaskLanes > 4)
6942 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6943 const LLT AdjustedTy =
6959 if (IsD16 && ST.hasUnpackedD16VMem()) {
6966 unsigned RoundedElts = (AdjustedTy.
getSizeInBits() + 31) / 32;
6967 unsigned RoundedSize = 32 * RoundedElts;
6971 RegTy = !IsTFE && EltSize == 16 ?
V2S16 :
S32;
6976 if (!IsTFE && (RoundedTy == Ty || !Ty.
isVector()))
6982 B.setInsertPt(*
MI.getParent(), ++
MI.getIterator());
6986 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6987 const int ResultNumRegs = LoadResultTy.
getSizeInBits() / 32;
6989 Register NewResultReg =
MRI->createGenericVirtualRegister(LoadResultTy);
6991 MI.getOperand(0).setReg(NewResultReg);
6999 Dst1Reg =
MI.getOperand(1).getReg();
7000 if (
MRI->getType(Dst1Reg) !=
S32)
7004 MI.removeOperand(1);
7008 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
7017 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7019 if (ResultNumRegs == 1) {
7021 ResultRegs[0] = NewResultReg;
7024 for (
int I = 0;
I != NumDataRegs; ++
I)
7025 ResultRegs[
I] =
MRI->createGenericVirtualRegister(RegTy);
7026 B.buildUnmerge(ResultRegs, NewResultReg);
7031 ResultRegs.
resize(NumDataRegs);
7036 if (IsD16 && !Ty.isVector()) {
7037 B.buildTrunc(DstReg, ResultRegs[0]);
7042 if (Ty ==
V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7043 B.buildBitcast(DstReg, ResultRegs[0]);
7055 if (RegTy !=
V2S16 && !ST.hasUnpackedD16VMem()) {
7057 Reg =
B.buildBitcast(
V2S16, Reg).getReg(0);
7058 }
else if (ST.hasUnpackedD16VMem()) {
7060 Reg =
B.buildTrunc(
S16, Reg).getReg(0);
7064 auto padWithUndef = [&](
LLT Ty,
int NumElts) {
7067 Register Undef =
B.buildUndef(Ty).getReg(0);
7068 for (
int I = 0;
I != NumElts; ++
I)
7073 LLT ResTy =
MRI->getType(ResultRegs[0]);
7075 padWithUndef(ResTy, NumElts - ResultRegs.
size());
7076 B.buildBuildVector(DstReg, ResultRegs);
7080 assert(!ST.hasUnpackedD16VMem() && ResTy ==
V2S16);
7081 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7087 if (ResultRegs.
size() == 1) {
7088 NewResultReg = ResultRegs[0];
7089 }
else if (ResultRegs.
size() == 2) {
7091 NewResultReg =
B.buildConcatVectors(
V4S16, ResultRegs).getReg(0);
7097 if (
MRI->getType(DstReg).getNumElements() <
7098 MRI->getType(NewResultReg).getNumElements()) {
7099 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
7101 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
7106 padWithUndef(ResTy, RegsToCover - ResultRegs.
size());
7107 B.buildConcatVectors(DstReg, ResultRegs);
  Register OrigDst = MI.getOperand(0).getReg();
  Register Dst;
  LLT Ty = B.getMRI()->getType(OrigDst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();
  unsigned Opc = 0;
  if (Size < 32 && ST.hasScalarSubwordLoads()) {
    assert(Size == 8 || Size == 16);
    Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
                    : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
    // The 8-bit and 16-bit scalar buffer loads produce a 32-bit result.
    Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
  } else {
    Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
    Dst = OrigDst;
  }

  // Handle needing to s.buffer.load() a p8 value or a type that must be
  // bitcast to a register type.
  if (hasBufferRsrcWorkaround(Ty)) {
    Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
    B.setInsertPt(B.getMBB(), MI);
  }
  if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
    Ty = getBitcastRegisterType(Ty);
    Helper.bitcastDst(MI, Ty, 0);
    B.setInsertPt(B.getMBB(), MI);
  }

  MI.setDesc(B.getTII().get(Opc));
  MI.removeOperand(1); // Remove the intrinsic ID.

  // The intrinsic is readnone, so attach the memory operand here.
  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign = B.getDataLayout().getABITypeAlign(
      getTypeForLLT(Ty, MF.getFunction().getContext()));
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);
  if (Dst != OrigDst) {
    MI.getOperand(0).setReg(Dst);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(OrigDst, Dst);
  }

  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
  MI.removeOperand(0);
7199 if (!ST.isTrapHandlerEnabled() ||
7203 return ST.supportsGetDoorbellID() ?
7216 MI.eraseFromParent();
7226 BuildMI(*TrapBB, TrapBB->
end(),
DL,
B.getTII().get(AMDGPU::S_ENDPGM))
7228 BuildMI(BB, &
MI,
DL,
B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7232 MI.eraseFromParent();
7241 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7248 ST.getTargetLowering()->getImplicitParameterOffset(
B.getMF(), Param);
7250 Register KernargPtrReg =
MRI.createGenericVirtualRegister(
7266 Register LoadAddr =
MRI.createGenericVirtualRegister(
7268 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7271 Register Temp =
B.buildLoad(
S64, LoadAddr, *MMO).getReg(0);
7272 B.buildCopy(SGPR01, Temp);
7273 B.buildInstr(AMDGPU::S_TRAP)
7276 MI.eraseFromParent();
7287 B.buildCopy(SGPR01, LiveIn);
7288 B.buildInstr(AMDGPU::S_TRAP)
7292 MI.eraseFromParent();
7301 if (ST.hasPrivEnabledTrap2NopBug()) {
7302 ST.getInstrInfo()->insertSimulatedTrap(
MRI,
B.getMBB(),
MI,
7304 MI.eraseFromParent();
7308 B.buildInstr(AMDGPU::S_TRAP)
7310 MI.eraseFromParent();
7319 if (!ST.isTrapHandlerEnabled() ||
7323 Fn,
"debugtrap handler not supported",
MI.getDebugLoc(),
DS_Warning));
7326 B.buildInstr(AMDGPU::S_TRAP)
7330 MI.eraseFromParent();
7343 Register NodePtr =
MI.getOperand(2).getReg();
7344 Register RayExtent =
MI.getOperand(3).getReg();
7345 Register RayOrigin =
MI.getOperand(4).getReg();
7347 Register RayInvDir =
MI.getOperand(6).getReg();
7350 if (!ST.hasGFX10_AEncoding()) {
7353 Fn,
"intrinsic not supported on subtarget",
MI.getDebugLoc()));
7360 const bool IsA16 =
MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
7361 const bool Is64 =
MRI.getType(NodePtr).getSizeInBits() == 64;
7362 const unsigned NumVDataDwords = 4;
7363 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7364 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7366 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7368 const unsigned BaseOpcodes[2][2] = {
7369 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7370 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7371 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7375 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7376 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7377 : AMDGPU::MIMGEncGfx10NSA,
7378 NumVDataDwords, NumVAddrDwords);
7382 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7383 : AMDGPU::MIMGEncGfx10Default,
7384 NumVDataDwords, NumVAddrDwords);
7389 if (UseNSA && IsGFX11Plus) {
7391 auto Unmerge =
B.buildUnmerge({
S32,
S32,
S32}, Src);
7392 auto Merged =
B.buildMergeLikeInstr(
7393 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7394 Ops.push_back(Merged.getReg(0));
7397 Ops.push_back(NodePtr);
7398 Ops.push_back(RayExtent);
7399 packLanes(RayOrigin);
7402 auto UnmergeRayDir =
B.buildUnmerge({
S16,
S16,
S16}, RayDir);
7403 auto UnmergeRayInvDir =
B.buildUnmerge({
S16,
S16,
S16}, RayInvDir);
7404 auto MergedDir =
B.buildMergeLikeInstr(
7407 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(0),
7408 UnmergeRayDir.getReg(0)}))
7411 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(1),
7412 UnmergeRayDir.getReg(1)}))
7415 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(2),
7416 UnmergeRayDir.getReg(2)}))
7418 Ops.push_back(MergedDir.getReg(0));
7421 packLanes(RayInvDir);
7425 auto Unmerge =
B.buildUnmerge({
S32,
S32}, NodePtr);
7426 Ops.push_back(Unmerge.getReg(0));
7427 Ops.push_back(Unmerge.getReg(1));
7429 Ops.push_back(NodePtr);
7431 Ops.push_back(RayExtent);
7434 auto Unmerge =
B.buildUnmerge({
S32,
S32,
S32}, Src);
7435 Ops.push_back(Unmerge.getReg(0));
7436 Ops.push_back(Unmerge.getReg(1));
7437 Ops.push_back(Unmerge.getReg(2));
7440 packLanes(RayOrigin);
7442 auto UnmergeRayDir =
B.buildUnmerge({
S16,
S16,
S16}, RayDir);
7443 auto UnmergeRayInvDir =
B.buildUnmerge({
S16,
S16,
S16}, RayInvDir);
7447 B.buildMergeLikeInstr(R1,
7448 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7449 B.buildMergeLikeInstr(
7450 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7451 B.buildMergeLikeInstr(
7452 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7458 packLanes(RayInvDir);
7465 Register MergedOps =
B.buildMergeLikeInstr(OpTy,
Ops).getReg(0);
7467 Ops.push_back(MergedOps);
7470 auto MIB =
B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7479 .addImm(IsA16 ? 1 : 0)
7482 MI.eraseFromParent();
7492 Register DstOrigin =
MI.getOperand(1).getReg();
7494 Register NodePtr =
MI.getOperand(4).getReg();
7495 Register RayExtent =
MI.getOperand(5).getReg();
7496 Register InstanceMask =
MI.getOperand(6).getReg();
7497 Register RayOrigin =
MI.getOperand(7).getReg();
7499 Register Offsets =
MI.getOperand(9).getReg();
7500 Register TDescr =
MI.getOperand(10).getReg();
7502 if (!ST.hasBVHDualAndBVH8Insts()) {
7505 Fn,
"intrinsic not supported on subtarget",
MI.getDebugLoc()));
7510 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7511 const unsigned NumVDataDwords = 10;
7512 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7514 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7515 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7516 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7519 auto RayExtentInstanceMaskVec =
B.buildMergeLikeInstr(
7520 V2S32, {RayExtent,
B.buildAnyExt(
S32, InstanceMask)});
7522 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7523 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7529 .addUse(RayExtentInstanceMaskVec.getReg(0))
7536 MI.eraseFromParent();
  B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
  MI.eraseFromParent();

  // With architected SGPRs, the wave ID lives in TTMP8[29:25].
  if (!ST.hasArchitectedSGPRs())
    return false;

  auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
  auto LSB = B.buildConstant(S32, 25);
  auto Width = B.buildConstant(S32, 5);
  B.buildUbfx(DstReg, TTMP8, LSB, Width);
  MI.eraseFromParent();
7569 unsigned Width)
const {
7572 if (!
MRI.getRegClassOrNull(DstReg))
7573 MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
7574 B.buildInstr(AMDGPU::S_GETREG_B32_const)
7577 MI.eraseFromParent();
7591 if (
MRI.getType(Src) !=
S64)
7595 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {
S32},
7599 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {
S32},
7602 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
7603 MI.eraseFromParent();
7611 if (
MRI.getType(Src) !=
S64)
7614 auto Unmerge =
B.buildUnmerge({
S32,
S32},
MI.getOperand(0));
7618 .addReg(Unmerge.getReg(0));
7622 .addReg(Unmerge.getReg(1));
7623 MI.eraseFromParent();
7635 case Intrinsic::amdgcn_if:
7636 case Intrinsic::amdgcn_else: {
7639 bool Negated =
false;
7651 std::swap(CondBrTarget, UncondBrTarget);
7653 B.setInsertPt(
B.getMBB(), BrCond->getIterator());
7654 if (IntrID == Intrinsic::amdgcn_if) {
7655 B.buildInstr(AMDGPU::SI_IF)
7658 .addMBB(UncondBrTarget);
7660 B.buildInstr(AMDGPU::SI_ELSE)
7663 .addMBB(UncondBrTarget);
7672 B.buildBr(*CondBrTarget);
7675 MRI.setRegClass(Def,
TRI->getWaveMaskRegClass());
7676 MRI.setRegClass(
Use,
TRI->getWaveMaskRegClass());
7677 MI.eraseFromParent();
7678 BrCond->eraseFromParent();
7684 case Intrinsic::amdgcn_loop: {
7687 bool Negated =
false;
7697 std::swap(CondBrTarget, UncondBrTarget);
7699 B.setInsertPt(
B.getMBB(), BrCond->getIterator());
7700 B.buildInstr(AMDGPU::SI_LOOP)
7702 .addMBB(UncondBrTarget);
7707 B.buildBr(*CondBrTarget);
7709 MI.eraseFromParent();
7710 BrCond->eraseFromParent();
7711 MRI.setRegClass(Reg,
TRI->getWaveMaskRegClass());
  case Intrinsic::amdgcn_addrspacecast_nonnull:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case Intrinsic::amdgcn_make_buffer_rsrc:
    return legalizePointerAsRsrcIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }
7731 case Intrinsic::amdgcn_implicitarg_ptr:
7733 case Intrinsic::amdgcn_workitem_id_x:
7736 case Intrinsic::amdgcn_workitem_id_y:
7739 case Intrinsic::amdgcn_workitem_id_z:
7742 case Intrinsic::amdgcn_workgroup_id_x:
7747 case Intrinsic::amdgcn_workgroup_id_y:
7752 case Intrinsic::amdgcn_workgroup_id_z:
7757 case Intrinsic::amdgcn_cluster_id_x:
7758 return ST.hasClusters() &&
7761 case Intrinsic::amdgcn_cluster_id_y:
7762 return ST.hasClusters() &&
7765 case Intrinsic::amdgcn_cluster_id_z:
7766 return ST.hasClusters() &&
7769 case Intrinsic::amdgcn_cluster_workgroup_id_x:
7770 return ST.hasClusters() &&
7773 case Intrinsic::amdgcn_cluster_workgroup_id_y:
7774 return ST.hasClusters() &&
7777 case Intrinsic::amdgcn_cluster_workgroup_id_z:
7778 return ST.hasClusters() &&
7781 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
7782 return ST.hasClusters() &&
7784 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
7785 return ST.hasClusters() &&
7788 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
7789 return ST.hasClusters() &&
7792 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
7793 return ST.hasClusters() &&
7796 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
7797 return ST.hasClusters() &&
7801 case Intrinsic::amdgcn_wave_id:
7803 case Intrinsic::amdgcn_lds_kernel_id:
7806 case Intrinsic::amdgcn_dispatch_ptr:
7809 case Intrinsic::amdgcn_queue_ptr:
7812 case Intrinsic::amdgcn_implicit_buffer_ptr:
7815 case Intrinsic::amdgcn_dispatch_id:
7818 case Intrinsic::r600_read_ngroups_x:
7822 case Intrinsic::r600_read_ngroups_y:
7825 case Intrinsic::r600_read_ngroups_z:
7828 case Intrinsic::r600_read_local_size_x:
7831 case Intrinsic::r600_read_local_size_y:
7835 case Intrinsic::r600_read_local_size_z:
7838 case Intrinsic::amdgcn_fdiv_fast:
7840 case Intrinsic::amdgcn_is_shared:
7842 case Intrinsic::amdgcn_is_private:
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
7849 case Intrinsic::amdgcn_s_buffer_load:
7851 case Intrinsic::amdgcn_raw_buffer_store:
7852 case Intrinsic::amdgcn_raw_ptr_buffer_store:
7853 case Intrinsic::amdgcn_struct_buffer_store:
7854 case Intrinsic::amdgcn_struct_ptr_buffer_store:
7856 case Intrinsic::amdgcn_raw_buffer_store_format:
7857 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
7858 case Intrinsic::amdgcn_struct_buffer_store_format:
7859 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
7861 case Intrinsic::amdgcn_raw_tbuffer_store:
7862 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
7863 case Intrinsic::amdgcn_struct_tbuffer_store:
7864 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
7866 case Intrinsic::amdgcn_raw_buffer_load:
7867 case Intrinsic::amdgcn_raw_ptr_buffer_load:
7868 case Intrinsic::amdgcn_raw_atomic_buffer_load:
7869 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
7870 case Intrinsic::amdgcn_struct_buffer_load:
7871 case Intrinsic::amdgcn_struct_ptr_buffer_load:
7872 case Intrinsic::amdgcn_struct_atomic_buffer_load:
7873 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
7875 case Intrinsic::amdgcn_raw_buffer_load_format:
7876 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
7877 case Intrinsic::amdgcn_struct_buffer_load_format:
7878 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
7880 case Intrinsic::amdgcn_raw_tbuffer_load:
7881 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
7882 case Intrinsic::amdgcn_struct_tbuffer_load:
7883 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
7885 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
7886 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
7887 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7888 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
7889 case Intrinsic::amdgcn_raw_buffer_atomic_add:
7890 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
7891 case Intrinsic::amdgcn_struct_buffer_atomic_add:
7892 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
7893 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
7894 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
7895 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7896 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
7897 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
7898 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
7899 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7900 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
7901 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
7902 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
7903 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7904 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
7905 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
7906 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
7907 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7908 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
7909 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
7910 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
7911 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7912 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
7913 case Intrinsic::amdgcn_raw_buffer_atomic_and:
7914 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
7915 case Intrinsic::amdgcn_struct_buffer_atomic_and:
7916 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
7917 case Intrinsic::amdgcn_raw_buffer_atomic_or:
7918 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
7919 case Intrinsic::amdgcn_struct_buffer_atomic_or:
7920 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
7921 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
7922 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
7923 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7924 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
7925 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
7926 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
7927 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7928 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
7929 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
7930 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
7931 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
7932 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
7933 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
7934 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
7935 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
7936 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
7937 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
7938 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7939 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
7940 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7941 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
7942 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7943 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
7944 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
7945 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
7946 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7947 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
7948 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
7950 case Intrinsic::amdgcn_rsq_clamp:
7952 case Intrinsic::amdgcn_image_bvh_intersect_ray:
7954 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
7955 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
7957 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
7958 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
7959 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
7960 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
7961 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
7962 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
7963 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
7964 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
7967 if (
MRI.getType(Index) !=
S64)
7968 MI.getOperand(5).setReg(
B.buildAnyExt(
S64, Index).getReg(0));
7971 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
7972 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
7973 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
7974 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
7975 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
7976 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
7977 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
7978 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
7981 if (
MRI.getType(Index) !=
S32)
7982 MI.getOperand(5).setReg(
B.buildAnyExt(
S32, Index).getReg(0));
7985 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
7986 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
7987 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
7988 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
7989 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
7990 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
7991 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
7992 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
7993 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
7995 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
7998 if (
MRI.getType(Index) != IdxTy)
7999 MI.getOperand(7).setReg(
B.buildAnyExt(IdxTy, Index).getReg(0));
8003 case Intrinsic::amdgcn_fmed3: {
8009 MI.setDesc(
B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
8010 MI.removeOperand(1);
8014 case Intrinsic::amdgcn_readlane:
8015 case Intrinsic::amdgcn_writelane:
8016 case Intrinsic::amdgcn_readfirstlane:
8017 case Intrinsic::amdgcn_permlane16:
8018 case Intrinsic::amdgcn_permlanex16:
8019 case Intrinsic::amdgcn_permlane64:
8020 case Intrinsic::amdgcn_set_inactive:
8021 case Intrinsic::amdgcn_set_inactive_chain_arg:
8022 case Intrinsic::amdgcn_mov_dpp8:
8023 case Intrinsic::amdgcn_update_dpp:
8025 case Intrinsic::amdgcn_s_buffer_prefetch_data:
  case Intrinsic::amdgcn_dead: {
    // Replace the results with undef and drop the instruction.
    for (const MachineOperand &Def : MI.defs())
      B.buildUndef(Def);
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
    MI.eraseFromParent();
    return true;
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
    MI.eraseFromParent();
    return true;
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
static bool isRegisterVectorType(LLT Ty)
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Interface for Targets to specify which operations they can successfully select and how the others sho...
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define FP_DENORM_FLUSH_NONE
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static constexpr int Concat[]
bool legalizeConstHwRegRead(MachineInstr &MI, MachineIRBuilder &B, AMDGPU::Hwreg::Id HwReg, unsigned LowBit, unsigned Width) const
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeBVHIntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper, bool IsTyped, bool IsFormat) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBVHDualOrBVH8IntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper, bool IsFormat, bool IsTyped) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkGroupId(MachineInstr &MI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool isEntryFunction() const
bool isModuleEntryFunction() const
const std::array< unsigned, 3 > & getDims() const
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
@ ICMP_SLT
signed less than
@ FCMP_OLT
0 1 0 0 True if ordered and less than
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
@ ICMP_UGE
unsigned greater or equal
@ ICMP_SGT
signed greater than
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
@ ICMP_ULT
unsigned less than
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
This is the shared class of boolean and integer constants.
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
Simple wrapper observer that takes several observers, and calls each one for each event.
KnownBits getKnownBits(Register R)
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
static constexpr LLT float64()
Get a 64-bit IEEE double value.
constexpr unsigned getScalarSizeInBits() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
static constexpr LLT float16()
Get a 16-bit IEEE half value.
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr LLT getScalarType() const
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
static constexpr LLT float32()
Get a 32-bit IEEE float value.
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI void computeTables()
Compute any ancillary tables needed to quickly decide how an operation should be handled.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & unsupported()
The instruction is unsupported.
LegalizeRuleSet & scalarSameSizeAs(unsigned TypeIdx, unsigned SameSizeIdx)
Change the type TypeIdx to have the same scalar size as type SameSizeIdx.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & bitcastIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
The specified type index is coerced if predicate is true.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & minScalarOrElt(unsigned TypeIdx, const LLT Ty)
Ensure the scalar or element is at least as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & unsupportedFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & lowerFor(std::initializer_list< LLT > Types)
The instruction is lowered when type index 0 is any type in the given list.
LegalizeRuleSet & lowerIf(LegalityPredicate Predicate)
The instruction is lowered if predicate is true.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & unsupportedIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Widen the scalar to the one selected by the mutation if the predicate is true.
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & clampNumElements(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the number of elements for the given vectors to at least MinTy's number of elements and at most...
LegalizeRuleSet & maxScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Conditionally limit the maximum size of the scalar.
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalForCartesianProduct(std::initializer_list< LLT > Types)
The instruction is legal when type indexes 0 and 1 are both in the given list.
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
LLVM_ABI void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
GISelValueTracking * getValueTracking() const
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
LLVM_ABI void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI)
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
LLVM_ABI void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
const LegacyLegalizerInfo & getLegacyLegalizerInfo() const
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Wrapper class representing virtual and physical registers.
constexpr bool isValid() const
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void truncate(size_type N)
Like resize, but requires that N is less than size().
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
The instances of the Type class are immutable: once they are created, they are never changed.
A Use represents the edge between a Value definition and its users.
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
bool isGFX1250(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
LLVM_ABI LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LLVM_ABI LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LLVM_ABI LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LLVM_ABI LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LLVM_ABI LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than second type index.
LLVM_ABI LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than second type index.
LLVM_ABI LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LLVM_ABI LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LLVM_ABI LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LLVM_ABI LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size)
True if the total bitwidth of the specified type index is Size bits.
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LLVM_ABI LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LLVM_ABI LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LLVM_ABI LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LLVM_ABI LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LLVM_ABI LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
Invariant opcodes: All instruction sets have these as their low opcodes.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
LLVM_ABI Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by an single def instruction that is Opcode.
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
LLVM_ABI const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT fits in int64_t returns it.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
constexpr bool has_single_bit(T Value) noexcept
std::function< bool(const LegalityQuery &)> LegalityPredicate
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
To bit_cast(const From &from) noexcept
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
FunctionAddr VTableAddr Next
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
unsigned Log2(Align A)
Returns the log2 of the alignment.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
int popcount(T Value) noexcept
Count the number of set bits in a value.
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ CLUSTER_WORKGROUP_MAX_ID_X
@ CLUSTER_WORKGROUP_MAX_ID_Z
@ CLUSTER_WORKGROUP_MAX_FLAT_ID
@ CLUSTER_WORKGROUP_MAX_ID_Y
static constexpr uint64_t encode(Fields... Values)
MIMGBaseOpcode BaseOpcode
static LLVM_ABI const fltSemantics & IEEEsingle() LLVM_READNONE
static LLVM_ABI const fltSemantics & IEEEdouble() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input or output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input or output denormals are flushed for most f32 instructions.