LLVM 22.0.0git
AMDGPULegalizerInfo.cpp
1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUTargetMachine.h"
22#include "SIInstrInfo.h"
24#include "SIRegisterInfo.h"
26#include "llvm/ADT/ScopeExit.h"
35#include "llvm/IR/IntrinsicsAMDGPU.h"
36#include "llvm/IR/IntrinsicsR600.h"
37
38#define DEBUG_TYPE "amdgpu-legalinfo"
39
40using namespace llvm;
41using namespace LegalizeActions;
42using namespace LegalizeMutations;
43using namespace LegalityPredicates;
44using namespace MIPatternMatch;
45
46// Hack until load/store selection patterns support any tuple of legal types.
48 "amdgpu-global-isel-new-legality",
49 cl::desc("Use GlobalISel desired legality, rather than try to use"
50 "rules compatible with selection patterns"),
51 cl::init(false),
53
54static constexpr unsigned MaxRegisterSize = 1024;
55
56// Round the number of elements to the next power of two elements
57static LLT getPow2VectorType(LLT Ty) {
58 unsigned NElts = Ty.getNumElements();
59 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
60 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
61}
62
63// Round the number of bits to the next power of two bits
64static LLT getPow2ScalarType(LLT Ty) {
65 unsigned Bits = Ty.getSizeInBits();
66 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
67 return LLT::scalar(Pow2Bits);
68}
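// Example of the two helpers above: a <3 x s32> rounds up to <4 x s32>, and a
// 48-bit scalar rounds up to s64. Both are used below as widening targets.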
69
70/// \returns true if this is an odd sized vector which should widen by adding an
71/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
72/// excludes s1 vectors, which should always be scalarized.
73static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
74 return [=](const LegalityQuery &Query) {
75 const LLT Ty = Query.Types[TypeIdx];
76 if (!Ty.isVector())
77 return false;
78
79 const LLT EltTy = Ty.getElementType();
80 const unsigned EltSize = EltTy.getSizeInBits();
81 return Ty.getNumElements() % 2 != 0 &&
82 EltSize > 1 && EltSize < 32 &&
83 Ty.getSizeInBits() % 32 != 0;
84 };
85}
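// For example, <3 x s16> (48 bits: odd element count, not a multiple of 32)
// satisfies this predicate, while <3 x s32> (96 bits) and <4 x s16> do not.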
86
87static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
88 return [=](const LegalityQuery &Query) {
89 const LLT Ty = Query.Types[TypeIdx];
90 return Ty.getSizeInBits() % 32 == 0;
91 };
92}
93
94static LegalityPredicate isWideVec16(unsigned TypeIdx) {
95 return [=](const LegalityQuery &Query) {
96 const LLT Ty = Query.Types[TypeIdx];
97 const LLT EltTy = Ty.getScalarType();
98 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
99 };
100}
101
102static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
103 return [=](const LegalityQuery &Query) {
104 const LLT Ty = Query.Types[TypeIdx];
105 const LLT EltTy = Ty.getElementType();
106 return std::pair(TypeIdx,
107 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
108 };
109}
110
111static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
112 return [=](const LegalityQuery &Query) {
113 const LLT Ty = Query.Types[TypeIdx];
114 const LLT EltTy = Ty.getElementType();
115 unsigned Size = Ty.getSizeInBits();
116 unsigned Pieces = (Size + 63) / 64;
117 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
118 return std::pair(TypeIdx, LLT::scalarOrVector(
119 ElementCount::getFixed(NewNumElts), EltTy));
120 };
121}
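// Example: a <4 x s32> query (128 bits) is split into two 64-bit pieces, so
// the mutation above requests <2 x s32>; a <3 x s32> (96 bits) also maps to
// <2 x s32>.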
122
123// Increase the number of vector elements to reach the next multiple of 32-bit
124// type.
125static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
126 return [=](const LegalityQuery &Query) {
127 const LLT Ty = Query.Types[TypeIdx];
128
129 const LLT EltTy = Ty.getElementType();
130 const int Size = Ty.getSizeInBits();
131 const int EltSize = EltTy.getSizeInBits();
132 const int NextMul32 = (Size + 31) / 32;
133
134 assert(EltSize < 32);
135
136 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
137 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
138 };
139}
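// Example: <3 x s16> occupies 48 bits, so the next 32-bit multiple is 64 bits
// and the mutation widens it to <4 x s16>; a <5 x s8> (40 bits) becomes
// <8 x s8>.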
140
141// Retrieves the scalar type that's the same size as the mem desc
143 return [=](const LegalityQuery &Query) {
144 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
145 return std::make_pair(TypeIdx, LLT::scalar(MemSize));
146 };
147}
148
149// Increase the number of vector elements to reach the next legal RegClass.
150static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
151 return [=](const LegalityQuery &Query) {
152 const LLT Ty = Query.Types[TypeIdx];
153 const unsigned NumElts = Ty.getNumElements();
154 const unsigned EltSize = Ty.getElementType().getSizeInBits();
155 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
156
157 assert(EltSize == 32 || EltSize == 64);
158 assert(Ty.getSizeInBits() < MaxRegisterSize);
159
160 unsigned NewNumElts;
161 // Find the nearest legal RegClass that is larger than the current type.
162 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
163 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
164 break;
165 }
166 return std::pair(TypeIdx,
167 LLT::fixed_vector(NewNumElts, Ty.getElementType()));
168 };
169}
170
171static LLT getBufferRsrcScalarType(const LLT Ty) {
172 if (!Ty.isVector())
173 return LLT::scalar(128);
174 const ElementCount NumElems = Ty.getElementCount();
175 return LLT::vector(NumElems, LLT::scalar(128));
176}
177
178static LLT getBufferRsrcRegisterType(const LLT Ty) {
179 if (!Ty.isVector())
180 return LLT::fixed_vector(4, LLT::scalar(32));
181 const unsigned NumElems = Ty.getElementCount().getFixedValue();
182 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
183}
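// Taken together: a p8 buffer resource is carried as <4 x s32> (its register
// form) and flattened through s128 (its scalar form); a <2 x p8> becomes
// <8 x s32> via <2 x s128>.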
184
185static LLT getBitcastRegisterType(const LLT Ty) {
186 const unsigned Size = Ty.getSizeInBits();
187
188 if (Size <= 32) {
189 // <2 x s8> -> s16
190 // <4 x s8> -> s32
191 return LLT::scalar(Size);
192 }
193
194 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
195}
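// Example: <2 x s8> and <4 x s8> collapse to s16 and s32 respectively (as
// noted above), while a 64-bit <8 x s8> becomes <2 x s32>.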
196
197static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
198 return [=](const LegalityQuery &Query) {
199 const LLT Ty = Query.Types[TypeIdx];
200 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
201 };
202}
203
204static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
205 return [=](const LegalityQuery &Query) {
206 const LLT Ty = Query.Types[TypeIdx];
207 unsigned Size = Ty.getSizeInBits();
208 assert(Size % 32 == 0);
209 return std::pair(
210 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
211 };
212}
213
214static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
215 return [=](const LegalityQuery &Query) {
216 const LLT QueryTy = Query.Types[TypeIdx];
217 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
218 };
219}
220
221static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
222 return [=](const LegalityQuery &Query) {
223 const LLT QueryTy = Query.Types[TypeIdx];
224 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
225 };
226}
227
228static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
229 return [=](const LegalityQuery &Query) {
230 const LLT QueryTy = Query.Types[TypeIdx];
231 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
232 };
233}
234
235static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
236 return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
237 Size <= MaxRegisterSize;
238}
239
240static bool isRegisterVectorElementType(LLT EltTy) {
241 const int EltSize = EltTy.getSizeInBits();
242 return EltSize == 16 || EltSize % 32 == 0;
243}
244
245static bool isRegisterVectorType(LLT Ty) {
246 const int EltSize = Ty.getElementType().getSizeInBits();
247 return EltSize == 32 || EltSize == 64 ||
248 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
249 EltSize == 128 || EltSize == 256;
250}
251
252// TODO: replace all uses of isRegisterType with isRegisterClassType
253static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
254 if (!isRegisterSize(ST, Ty.getSizeInBits()))
255 return false;
256
257 if (Ty.isVector())
258 return isRegisterVectorType(Ty);
259
260 return true;
261}
262
263// Any combination of 32 or 64-bit elements up to the maximum register size, and
264// multiples of v2s16.
265static LegalityPredicate isRegisterType(const GCNSubtarget &ST,
266 unsigned TypeIdx) {
267 return [=, &ST](const LegalityQuery &Query) {
268 return isRegisterType(ST, Query.Types[TypeIdx]);
269 };
270}
271
272// RegisterType that doesn't have a corresponding RegClass.
273// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
274// should be removed.
275static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST,
276 unsigned TypeIdx) {
277 return [=, &ST](const LegalityQuery &Query) {
278 LLT Ty = Query.Types[TypeIdx];
279 return isRegisterType(ST, Ty) &&
280 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
281 };
282}
283
284static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
285 return [=](const LegalityQuery &Query) {
286 const LLT QueryTy = Query.Types[TypeIdx];
287 if (!QueryTy.isVector())
288 return false;
289 const LLT EltTy = QueryTy.getElementType();
290 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
291 };
292}
293
294constexpr LLT S1 = LLT::scalar(1);
295constexpr LLT S8 = LLT::scalar(8);
296constexpr LLT S16 = LLT::scalar(16);
297constexpr LLT S32 = LLT::scalar(32);
298constexpr LLT F32 = LLT::float32();
299constexpr LLT S64 = LLT::scalar(64);
300constexpr LLT F64 = LLT::float64();
301constexpr LLT S96 = LLT::scalar(96);
302constexpr LLT S128 = LLT::scalar(128);
303constexpr LLT S160 = LLT::scalar(160);
304constexpr LLT S192 = LLT::scalar(192);
305constexpr LLT S224 = LLT::scalar(224);
306constexpr LLT S256 = LLT::scalar(256);
307constexpr LLT S512 = LLT::scalar(512);
308constexpr LLT S1024 = LLT::scalar(1024);
310
311constexpr LLT V2S8 = LLT::fixed_vector(2, 8);
312constexpr LLT V2S16 = LLT::fixed_vector(2, 16);
313constexpr LLT V4S16 = LLT::fixed_vector(4, 16);
314constexpr LLT V6S16 = LLT::fixed_vector(6, 16);
315constexpr LLT V8S16 = LLT::fixed_vector(8, 16);
316constexpr LLT V10S16 = LLT::fixed_vector(10, 16);
317constexpr LLT V12S16 = LLT::fixed_vector(12, 16);
318constexpr LLT V16S16 = LLT::fixed_vector(16, 16);
319
320constexpr LLT V2F16 = LLT::fixed_vector(2, LLT::float16());
321constexpr LLT V2BF16 = V2F16; // FIXME
322
323constexpr LLT V2S32 = LLT::fixed_vector(2, 32);
324constexpr LLT V3S32 = LLT::fixed_vector(3, 32);
325constexpr LLT V4S32 = LLT::fixed_vector(4, 32);
326constexpr LLT V5S32 = LLT::fixed_vector(5, 32);
327constexpr LLT V6S32 = LLT::fixed_vector(6, 32);
328constexpr LLT V7S32 = LLT::fixed_vector(7, 32);
329constexpr LLT V8S32 = LLT::fixed_vector(8, 32);
330constexpr LLT V9S32 = LLT::fixed_vector(9, 32);
331constexpr LLT V10S32 = LLT::fixed_vector(10, 32);
332constexpr LLT V11S32 = LLT::fixed_vector(11, 32);
333constexpr LLT V12S32 = LLT::fixed_vector(12, 32);
334constexpr LLT V16S32 = LLT::fixed_vector(16, 32);
335constexpr LLT V32S32 = LLT::fixed_vector(32, 32);
336
337constexpr LLT V2S64 = LLT::fixed_vector(2, 64);
338constexpr LLT V3S64 = LLT::fixed_vector(3, 64);
339constexpr LLT V4S64 = LLT::fixed_vector(4, 64);
340constexpr LLT V5S64 = LLT::fixed_vector(5, 64);
341constexpr LLT V6S64 = LLT::fixed_vector(6, 64);
342constexpr LLT V7S64 = LLT::fixed_vector(7, 64);
343constexpr LLT V8S64 = LLT::fixed_vector(8, 64);
344constexpr LLT V16S64 = LLT::fixed_vector(16, 64);
345
346constexpr LLT V2S128 = LLT::fixed_vector(2, 128);
347constexpr LLT V4S128 = LLT::fixed_vector(4, 128);
348
349constexpr std::initializer_list<LLT> AllScalarTypes = {
350 S32, S64, S96, S128, S160, S192, S224, S256, S512, S1024};
351
352constexpr std::initializer_list<LLT> AllS16Vectors{
353 V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
354
355constexpr std::initializer_list<LLT> AllS32Vectors = {
356 V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
357 V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};
358
359constexpr std::initializer_list<LLT> AllS64Vectors = {
360 V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
361
367
368// Checks whether a type is in the list of legal register types.
369static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
370 if (Ty.isPointerOrPointerVector())
371 Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
372
373 return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
374 is_contained(AllScalarTypes, Ty) ||
375 (ST.useRealTrue16Insts() && Ty == S16) ||
376 is_contained(AllS16Vectors, Ty);
377}
378
379static LegalityPredicate isRegisterClassType(const GCNSubtarget &ST,
380 unsigned TypeIdx) {
381 return [&ST, TypeIdx](const LegalityQuery &Query) {
382 return isRegisterClassType(ST, Query.Types[TypeIdx]);
383 };
384}
385
386// If we have a truncating store or an extending load with a data size larger
387// than 32-bits, we need to reduce to a 32-bit type.
388static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
389 return [=](const LegalityQuery &Query) {
390 const LLT Ty = Query.Types[TypeIdx];
391 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
392 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
393 };
394}
395
396// If we have a truncating store or an extending load with a data size larger
397// than 32-bits and mem location is a power of 2
398static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx) {
399 return [=](const LegalityQuery &Query) {
400 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
401 return isWideScalarExtLoadTruncStore(TypeIdx)(Query) &&
402 isPowerOf2_64(MemSize);
403 };
404}
405
406// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
407// handle some operations by just promoting the register during
408// selection. There are also d16 loads on GFX9+ which preserve the high bits.
409static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
410 bool IsLoad, bool IsAtomic) {
411 switch (AS) {
412 case AMDGPUAS::PRIVATE_ADDRESS:
413 // FIXME: Private element size.
414 return ST.enableFlatScratch() ? 128 : 32;
415 case AMDGPUAS::LOCAL_ADDRESS:
416 return ST.useDS128() ? 128 : 64;
417 case AMDGPUAS::GLOBAL_ADDRESS:
418 case AMDGPUAS::CONSTANT_ADDRESS:
419 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
420 case AMDGPUAS::BUFFER_FAT_POINTER:
421 // Treat constant and global as identical. SMRD loads are sometimes usable for
422 // global loads (ideally constant address space should be eliminated)
423 // depending on the context. Legality cannot be context dependent, but
424 // RegBankSelect can split the load as necessary depending on the pointer
425 // register bank/uniformity and if the memory is invariant or not written in a
426 // kernel.
427 return IsLoad ? 512 : 128;
428 default:
429 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
430 // if they may alias scratch depending on the subtarget. This needs to be
431 // moved to custom handling to use addressMayBeAccessedAsPrivate
432 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
433 }
434}
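// Illustration of maxSizeForAddrSpace(): on a target without flat scratch, a
// private (scratch) access is capped at 32 bits, LDS at 64 or 128 bits
// depending on useDS128(), and a global load may be up to 512 bits so that
// RegBankSelect can still split SMRD-style loads later.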
435
436static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
437 const LegalityQuery &Query) {
438 const LLT Ty = Query.Types[0];
439
440 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
441 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
442
443 unsigned RegSize = Ty.getSizeInBits();
444 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
445 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
446 unsigned AS = Query.Types[1].getAddressSpace();
447
448 // All of these need to be custom lowered to cast the pointer operand.
449 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
450 return false;
451
452 // Do not handle extending vector loads.
453 if (Ty.isVector() && MemSize != RegSize)
454 return false;
455
456 // TODO: We should be able to widen loads if the alignment is high enough, but
457 // we also need to modify the memory access size.
458#if 0
459 // Accept widening loads based on alignment.
460 if (IsLoad && MemSize < Size)
461 MemSize = std::max(MemSize, Align);
462#endif
463
464 // Only 1-byte and 2-byte to 32-bit extloads are valid.
465 if (MemSize != RegSize && RegSize != 32)
466 return false;
467
468 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
469 Query.MMODescrs[0].Ordering !=
470 AtomicOrdering::NotAtomic))
471 return false;
472
473 switch (MemSize) {
474 case 8:
475 case 16:
476 case 32:
477 case 64:
478 case 128:
479 break;
480 case 96:
481 if (!ST.hasDwordx3LoadStores())
482 return false;
483 break;
484 case 256:
485 case 512:
486 // These may contextually need to be broken down.
487 break;
488 default:
489 return false;
490 }
491
492 assert(RegSize >= MemSize);
493
494 if (AlignBits < MemSize) {
495 const SITargetLowering *TLI = ST.getTargetLowering();
496 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
497 Align(AlignBits / 8)))
498 return false;
499 }
500
501 return true;
502}
503
504// The newer buffer intrinsic forms take their resource arguments as
505// pointers in address space 8, aka s128 values. However, in order to not break
506// SelectionDAG, the underlying operations have to continue to take v4i32
507// arguments. Therefore, we convert resource pointers - or vectors of them
508// to integer values here.
509static bool hasBufferRsrcWorkaround(const LLT Ty) {
510 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
511 return true;
512 if (Ty.isVector()) {
513 const LLT ElemTy = Ty.getElementType();
514 return hasBufferRsrcWorkaround(ElemTy);
515 }
516 return false;
517}
518
519// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
520// workaround this. Eventually it should ignore the type for loads and only care
521// about the size. Return true in cases where we will workaround this for now by
522// bitcasting.
523static bool loadStoreBitcastWorkaround(const LLT Ty) {
524 if (EnableNewLegality)
525 return false;
526
527 const unsigned Size = Ty.getSizeInBits();
528 if (Ty.isPointerVector())
529 return true;
530 if (Size <= 64)
531 return false;
532 // Address space 8 pointers get their own workaround.
533 if (hasBufferRsrcWorkaround(Ty))
534 return false;
535 if (!Ty.isVector())
536 return true;
537
538 unsigned EltSize = Ty.getScalarSizeInBits();
539 return EltSize != 32 && EltSize != 64;
540}
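// Example: <6 x s16> and s96 are larger than 64 bits and are not 32/64-bit
// element vectors, so they take the bitcast path; <2 x s32> and s64 do not.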
541
542static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
543 const LLT Ty = Query.Types[0];
544 return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
545 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
546}
547
548/// Return true if a load or store of the type should be lowered with a bitcast
549/// to a different type.
550static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
551 const LLT MemTy) {
552 const unsigned MemSizeInBits = MemTy.getSizeInBits();
553 const unsigned Size = Ty.getSizeInBits();
554 if (Size != MemSizeInBits)
555 return Size <= 32 && Ty.isVector();
556
557 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(ST, Ty))
558 return true;
559
560 // Don't try to handle bitcasting vector ext loads for now.
561 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
562 (Size <= 32 || isRegisterSize(ST, Size)) &&
563 !isRegisterVectorElementType(Ty.getElementType());
564}
565
566/// Return true if we should legalize a load by widening an odd sized memory
567/// access up to the alignment. Note this case when the memory access itself
568/// changes, not the size of the result register.
569static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
570 uint64_t AlignInBits, unsigned AddrSpace,
571 unsigned Opcode) {
572 unsigned SizeInBits = MemoryTy.getSizeInBits();
573 // We don't want to widen cases that are naturally legal.
574 if (isPowerOf2_32(SizeInBits))
575 return false;
576
577 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
578 // end up widening these for a scalar load during RegBankSelect, if we don't
579 // have 96-bit scalar loads.
580 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
581 return false;
582
583 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
584 return false;
585
586 // A load is known dereferenceable up to the alignment, so it's legal to widen
587 // to it.
588 //
589 // TODO: Could check dereferenceable for less aligned cases.
590 unsigned RoundedSize = NextPowerOf2(SizeInBits);
591 if (AlignInBits < RoundedSize)
592 return false;
593
594 // Do not widen if it would introduce a slow unaligned load.
595 const SITargetLowering *TLI = ST.getTargetLowering();
596 unsigned Fast = 0;
597 return TLI->allowsMisalignedMemoryAccessesImpl(
598 RoundedSize, AddrSpace, Align(AlignInBits / 8),
599 MachineMemOperand::MOLoad, &Fast) &&
600 Fast;
601}
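// For instance, a 96-bit load on a subtarget without dwordx3 load/store
// support can be widened to 128 bits when the access is 128-bit aligned and
// the target reports the resulting unaligned access as fast.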
602
603static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
604 unsigned Opcode) {
605 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
606 return false;
607
608 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
609 Query.MMODescrs[0].AlignInBits,
610 Query.Types[1].getAddressSpace(), Opcode);
611}
612
613/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
614/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
615/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
616static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
617 MachineRegisterInfo &MRI, unsigned Idx) {
618 MachineOperand &MO = MI.getOperand(Idx);
619
620 const LLT PointerTy = MRI.getType(MO.getReg());
621
622 // Paranoidly prevent us from doing this multiple times.
623 if (!PointerTy.isPointer())
624 return PointerTy;
625
626 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
627 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
628 if (!PointerTy.isVector()) {
629 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
630 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
631 const LLT S32 = LLT::scalar(32);
632
633 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
634 std::array<Register, 4> VectorElems;
635 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
636 for (unsigned I = 0; I < NumParts; ++I)
637 VectorElems[I] =
638 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
639 B.buildMergeValues(MO, VectorElems);
640 MO.setReg(VectorReg);
641 return VectorTy;
642 }
643 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
644 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
645 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
646 B.buildIntToPtr(MO, Scalar);
647 MO.setReg(BitcastReg);
648
649 return VectorTy;
650}
651
652/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
653/// the form in which the value must be in order to be passed to the low-level
654/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
655/// needed in order to account for the fact that we can't define a register
656/// class for s128 without breaking SelectionDAG.
657static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
658 MachineRegisterInfo &MRI = *B.getMRI();
659 const LLT PointerTy = MRI.getType(Pointer);
660 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
661 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
662
663 if (!PointerTy.isVector()) {
664 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
665 SmallVector<Register, 4> PointerParts;
666 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
667 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
668 for (unsigned I = 0; I < NumParts; ++I)
669 PointerParts.push_back(Unmerged.getReg(I));
670 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
671 }
672 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
673 return B.buildBitcast(VectorTy, Scalar).getReg(0);
674}
675
676static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
677 unsigned Idx) {
678 MachineOperand &MO = MI.getOperand(Idx);
679
680 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
681 // Paranoidly prevent us from doing this multiple times.
682 if (!PointerTy.isPointer())
683 return;
684 MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
685}
686
687AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
688 const GCNTargetMachine &TM)
689 : ST(ST_) {
690 using namespace TargetOpcode;
691
692 auto GetAddrSpacePtr = [&TM](unsigned AS) {
693 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
694 };
695
696 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
697 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
698 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
699 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
700 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
701 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
702 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
703 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
704 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
705 const LLT BufferStridedPtr =
706 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
707
708 const LLT CodePtr = FlatPtr;
709
710 const std::initializer_list<LLT> AddrSpaces64 = {
711 GlobalPtr, ConstantPtr, FlatPtr
712 };
713
714 const std::initializer_list<LLT> AddrSpaces32 = {
715 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
716 };
717
718 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
719
720 const std::initializer_list<LLT> FPTypesBase = {
721 S32, S64
722 };
723
724 const std::initializer_list<LLT> FPTypes16 = {
725 S32, S64, S16
726 };
727
728 const std::initializer_list<LLT> FPTypesPK16 = {
729 S32, S64, S16, V2S16
730 };
731
732 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
733
734 // s1 for VCC branches, s32 for SCC branches.
735 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
736
737 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
738 // elements for v3s16
739 getActionDefinitionsBuilder(G_PHI)
740 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
741 .legalFor(AllS32Vectors)
742 .legalFor(AllS64Vectors)
743 .legalFor(AddrSpaces64)
744 .legalFor(AddrSpaces32)
745 .legalFor(AddrSpaces128)
746 .legalIf(isPointer(0))
747 .clampScalar(0, S16, S256)
748 .widenScalarToNextPow2(0, 32)
749 .clampMaxNumElements(0, S32, 16)
750 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
751 .scalarize(0);
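// Net effect of the G_PHI rules above, roughly: odd 16-bit vectors such as
// <3 x s16> pick up one more element to become <4 x s16>, scalars are clamped
// into the s16..s256 range and widened to a 32-bit multiple, and anything not
// directly legal afterwards is scalarized.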
752
753 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
754 // Full set of gfx9 features.
755 if (ST.hasScalarAddSub64()) {
756 getActionDefinitionsBuilder({G_ADD, G_SUB})
757 .legalFor({S64, S32, S16, V2S16})
758 .clampMaxNumElementsStrict(0, S16, 2)
759 .scalarize(0)
760 .minScalar(0, S16)
761 .widenScalarToNextMultipleOf(0, 32)
762 .maxScalar(0, S32);
763 } else {
764 getActionDefinitionsBuilder({G_ADD, G_SUB})
765 .legalFor({S32, S16, V2S16})
766 .clampMaxNumElementsStrict(0, S16, 2)
767 .scalarize(0)
768 .minScalar(0, S16)
769 .widenScalarToNextMultipleOf(0, 32)
770 .maxScalar(0, S32);
771 }
772
773 if (ST.hasScalarSMulU64()) {
774 getActionDefinitionsBuilder(G_MUL)
775 .legalFor({S64, S32, S16, V2S16})
776 .clampMaxNumElementsStrict(0, S16, 2)
777 .scalarize(0)
778 .minScalar(0, S16)
779 .widenScalarToNextMultipleOf(0, 32)
780 .custom();
781 } else {
782 getActionDefinitionsBuilder(G_MUL)
783 .legalFor({S32, S16, V2S16})
784 .clampMaxNumElementsStrict(0, S16, 2)
785 .scalarize(0)
786 .minScalar(0, S16)
787 .widenScalarToNextMultipleOf(0, 32)
788 .custom();
789 }
790 assert(ST.hasMad64_32());
791
792 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
793 .legalFor({S32, S16, V2S16}) // Clamp modifier
794 .minScalarOrElt(0, S16)
795 .clampMaxNumElementsStrict(0, S16, 2)
796 .scalarize(0)
797 .widenScalarToNextPow2(0, 32)
798 .lower();
799 } else if (ST.has16BitInsts()) {
800 getActionDefinitionsBuilder({G_ADD, G_SUB})
801 .legalFor({S32, S16})
802 .minScalar(0, S16)
803 .widenScalarToNextMultipleOf(0, 32)
804 .maxScalar(0, S32)
805 .scalarize(0);
806
807 getActionDefinitionsBuilder(G_MUL)
808 .legalFor({S32, S16})
809 .scalarize(0)
810 .minScalar(0, S16)
811 .widenScalarToNextMultipleOf(0, 32)
812 .custom();
813 assert(ST.hasMad64_32());
814
815 // Technically the saturating operations require clamp bit support, but this
816 // was introduced at the same time as 16-bit operations.
817 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
818 .legalFor({S32, S16}) // Clamp modifier
819 .minScalar(0, S16)
820 .scalarize(0)
822 .lower();
823
824 // We're just lowering this, but it helps get a better result to try to
825 // coerce to the desired type first.
826 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
827 .minScalar(0, S16)
828 .scalarize(0)
829 .lower();
830 } else {
831 getActionDefinitionsBuilder({G_ADD, G_SUB})
832 .legalFor({S32})
833 .widenScalarToNextMultipleOf(0, 32)
834 .clampScalar(0, S32, S32)
835 .scalarize(0);
836
837 auto &Mul = getActionDefinitionsBuilder(G_MUL)
838 .legalFor({S32})
839 .scalarize(0)
840 .minScalar(0, S32)
841 .widenScalarToNextMultipleOf(0, 32);
842
843 if (ST.hasMad64_32())
844 Mul.custom();
845 else
846 Mul.maxScalar(0, S32);
847
848 if (ST.hasIntClamp()) {
849 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
850 .legalFor({S32}) // Clamp modifier.
851 .scalarize(0)
853 .lower();
854 } else {
855 // Clamp bit support was added in VI, along with 16-bit operations.
856 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
857 .minScalar(0, S32)
858 .scalarize(0)
859 .lower();
860 }
861
862 // FIXME: DAG expansion gets better results. The widening uses the smaller
863 // range values and goes for the min/max lowering directly.
864 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
865 .minScalar(0, S32)
866 .scalarize(0)
867 .lower();
868 }
869
870 getActionDefinitionsBuilder(
871 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
872 .customFor({S32, S64})
873 .clampScalar(0, S32, S64)
874 .widenScalarToNextPow2(0, 32)
875 .scalarize(0);
876
877 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
878 .legalFor({S32})
879 .maxScalar(0, S32);
880
881 if (ST.hasVOP3PInsts()) {
882 Mulh
883 .clampMaxNumElements(0, S8, 2)
884 .lowerFor({V2S8});
885 }
886
887 Mulh
888 .scalarize(0)
889 .lower();
890
891 // Report legal for any types we can handle anywhere. For the cases only legal
892 // on the SALU, RegBankSelect will be able to re-legalize.
893 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
894 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
895 .clampScalar(0, S32, S64)
901 .scalarize(0);
902
903 getActionDefinitionsBuilder(
904 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
905 .legalFor({{S32, S1}, {S32, S32}})
906 .clampScalar(0, S32, S32)
907 .scalarize(0);
908
909 getActionDefinitionsBuilder(G_BITCAST)
910 // Don't worry about the size constraint.
911 .legalIf(all(isRegisterClassType(ST, 0), isRegisterClassType(ST, 1)))
912 .lower();
913
914 getActionDefinitionsBuilder(G_CONSTANT)
915 .legalFor({S1, S32, S64, S16, GlobalPtr,
916 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
917 .legalIf(isPointer(0))
918 .clampScalar(0, S32, S64)
919 .widenScalarToNextPow2(0);
920
921 getActionDefinitionsBuilder(G_FCONSTANT)
922 .legalFor({S32, S64, S16})
923 .clampScalar(0, S16, S64);
924
925 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
926 .legalIf(isRegisterClassType(ST, 0))
927 // s1 and s16 are special cases because they have legal operations on
928 // them, but don't really occupy registers in the normal way.
929 .legalFor({S1, S16})
930 .clampNumElements(0, V16S32, V32S32)
934 .clampMaxNumElements(0, S32, 16);
935
936 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
937
938 // If the amount is divergent, we have to do a wave reduction to get the
939 // maximum value, so this is expanded during RegBankSelect.
940 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
941 .legalFor({{PrivatePtr, S32}});
942
943 getActionDefinitionsBuilder(G_STACKSAVE)
944 .customFor({PrivatePtr});
945 getActionDefinitionsBuilder(G_STACKRESTORE)
946 .legalFor({PrivatePtr});
947
948 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
949
950 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
951 .customIf(typeIsNot(0, PrivatePtr));
952
953 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
954
955 auto &FPOpActions = getActionDefinitionsBuilder(
956 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
957 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
958 .legalFor({S32, S64});
959 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
960 .customFor({S32, S64});
961 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
962 .customFor({S32, S64});
963
964 if (ST.has16BitInsts()) {
965 if (ST.hasVOP3PInsts())
966 FPOpActions.legalFor({S16, V2S16});
967 else
968 FPOpActions.legalFor({S16});
969
970 TrigActions.customFor({S16});
971 FDIVActions.customFor({S16});
972 }
973
974 if (ST.hasPackedFP32Ops()) {
975 FPOpActions.legalFor({V2S32});
976 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
977 }
978
979 auto &MinNumMaxNum = getActionDefinitionsBuilder(
980 {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM, G_FMINNUM_IEEE,
981 G_FMAXNUM_IEEE});
982
983 if (ST.hasVOP3PInsts()) {
984 MinNumMaxNum.customFor(FPTypesPK16)
985 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
986 .clampMaxNumElements(0, S16, 2)
987 .clampScalar(0, S16, S64)
988 .scalarize(0);
989 } else if (ST.has16BitInsts()) {
990 MinNumMaxNum.customFor(FPTypes16)
991 .clampScalar(0, S16, S64)
992 .scalarize(0);
993 } else {
994 MinNumMaxNum.customFor(FPTypesBase)
995 .clampScalar(0, S32, S64)
996 .scalarize(0);
997 }
998
999 if (ST.hasVOP3PInsts())
1000 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
1001
1002 FPOpActions
1003 .scalarize(0)
1004 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1005
1006 TrigActions
1007 .scalarize(0)
1008 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1009
1010 FDIVActions
1011 .scalarize(0)
1012 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1013
1014 getActionDefinitionsBuilder({G_FNEG, G_FABS})
1015 .legalFor(FPTypesPK16)
1016 .clampMaxNumElementsStrict(0, S16, 2)
1017 .scalarize(0)
1018 .clampScalar(0, S16, S64);
1019
1020 if (ST.has16BitInsts()) {
1021 getActionDefinitionsBuilder(G_FSQRT)
1022 .legalFor({S16})
1023 .customFor({S32, S64})
1024 .scalarize(0)
1025 .unsupported();
1026 getActionDefinitionsBuilder(G_FFLOOR)
1027 .legalFor({S32, S64, S16})
1028 .scalarize(0)
1029 .clampScalar(0, S16, S64);
1030
1031 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1032 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
1033 .scalarize(0)
1034 .maxScalarIf(typeIs(0, S16), 1, S16)
1035 .clampScalar(1, S32, S32)
1036 .lower();
1037
1038 getActionDefinitionsBuilder(G_FFREXP)
1039 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1040 .scalarize(0)
1041 .lower();
1042 } else {
1043 getActionDefinitionsBuilder(G_FSQRT)
1044 .customFor({S32, S64, S16})
1045 .scalarize(0)
1046 .unsupported();
1047
1048
1049 if (ST.hasFractBug()) {
1050 getActionDefinitionsBuilder(G_FFLOOR)
1051 .customFor({S64})
1052 .legalFor({S32, S64})
1053 .scalarize(0)
1054 .clampScalar(0, S32, S64);
1055 } else {
1056 getActionDefinitionsBuilder(G_FFLOOR)
1057 .legalFor({S32, S64})
1058 .scalarize(0)
1059 .clampScalar(0, S32, S64);
1060 }
1061
1062 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1063 .legalFor({{S32, S32}, {S64, S32}})
1064 .scalarize(0)
1065 .clampScalar(0, S32, S64)
1066 .clampScalar(1, S32, S32)
1067 .lower();
1068
1069 getActionDefinitionsBuilder(G_FFREXP)
1070 .customFor({{S32, S32}, {S64, S32}})
1071 .scalarize(0)
1072 .minScalar(0, S32)
1073 .clampScalar(1, S32, S32)
1074 .lower();
1075 }
1076
1077 auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
1078 if (ST.hasCvtPkF16F32Inst()) {
1079 FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}})
1080 .clampMaxNumElements(0, S16, 2);
1081 } else {
1082 FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
1083 }
1084 FPTruncActions.scalarize(0).lower();
1085
1086 getActionDefinitionsBuilder(G_FPEXT)
1087 .legalFor({{S64, S32}, {S32, S16}})
1088 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1089 .scalarize(0);
1090
1091 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1092 if (ST.has16BitInsts()) {
1093 FSubActions
1094 // Use actual fsub instruction
1095 .legalFor({S32, S16})
1096 // Must use fadd + fneg
1097 .lowerFor({S64, V2S16});
1098 } else {
1099 FSubActions
1100 // Use actual fsub instruction
1101 .legalFor({S32})
1102 // Must use fadd + fneg
1103 .lowerFor({S64, S16, V2S16});
1104 }
1105
1106 FSubActions
1107 .scalarize(0)
1108 .clampScalar(0, S32, S64);
1109
1110 // Whether this is legal depends on the floating point mode for the function.
1111 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1112 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1113 FMad.customFor({S32, S16});
1114 else if (ST.hasMadMacF32Insts())
1115 FMad.customFor({S32});
1116 else if (ST.hasMadF16())
1117 FMad.customFor({S16});
1118 FMad.scalarize(0)
1119 .lower();
1120
1121 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1122 if (ST.has16BitInsts()) {
1123 FRem.customFor({S16, S32, S64});
1124 } else {
1125 FRem.minScalar(0, S32)
1126 .customFor({S32, S64});
1127 }
1128 FRem.scalarize(0);
1129
1130 // TODO: Do we need to clamp maximum bitwidth?
1131 getActionDefinitionsBuilder(G_TRUNC)
1132 .legalIf(isScalar(0))
1133 .legalFor({{V2S16, V2S32}})
1134 .clampMaxNumElements(0, S16, 2)
1135 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1136 // situations (like an invalid implicit use), we don't want to infinite loop
1137 // in the legalizer.
1139 .alwaysLegal();
1140
1141 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1142 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1143 {S32, S1}, {S64, S1}, {S16, S1}})
1144 .scalarize(0)
1145 .clampScalar(0, S32, S64)
1146 .widenScalarToNextPow2(1, 32);
1147
1148 // TODO: Split s1->s64 during regbankselect for VALU.
1149 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1150 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1151 .lowerIf(typeIs(1, S1))
1152 .customFor({{S32, S64}, {S64, S64}});
1153 if (ST.has16BitInsts())
1154 IToFP.legalFor({{S16, S16}});
1155 IToFP.clampScalar(1, S32, S64)
1156 .minScalar(0, S32)
1157 .scalarize(0)
1158 .widenScalarToNextPow2(1);
1159
1160 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1161 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1162 .customFor({{S64, S32}, {S64, S64}})
1163 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1164 if (ST.has16BitInsts())
1165 FPToI.legalFor({{S16, S16}});
1166 else
1167 FPToI.minScalar(1, S32);
1168
1169 FPToI.minScalar(0, S32)
1170 .widenScalarToNextPow2(0, 32)
1171 .scalarize(0)
1172 .lower();
1173
1174 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1175 .clampScalar(0, S16, S64)
1176 .scalarize(0)
1177 .lower();
1178
1179 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1180 .legalFor({S16, S32})
1181 .scalarize(0)
1182 .lower();
1183
1184 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1185 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1186 .scalarize(0)
1187 .lower();
1188
1189 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1190 .clampScalar(0, S16, S64)
1191 .scalarize(0)
1192 .lower();
1193
1194 if (ST.has16BitInsts()) {
1195 getActionDefinitionsBuilder(
1196 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1197 .legalFor({S16, S32, S64})
1198 .clampScalar(0, S16, S64)
1199 .scalarize(0);
1200 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1201 getActionDefinitionsBuilder(
1202 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1203 .legalFor({S32, S64})
1204 .clampScalar(0, S32, S64)
1205 .scalarize(0);
1206 } else {
1207 getActionDefinitionsBuilder(
1208 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1209 .legalFor({S32})
1210 .customFor({S64})
1211 .clampScalar(0, S32, S64)
1212 .scalarize(0);
1213 }
1214
1215 getActionDefinitionsBuilder(G_PTR_ADD)
1216 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1217 .legalIf(all(isPointer(0), sameSize(0, 1)))
1218 .scalarize(0)
1219 .scalarSameSizeAs(1, 0);
1220
1221 getActionDefinitionsBuilder(G_PTRMASK)
1222 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1223 .scalarSameSizeAs(1, 0)
1224 .scalarize(0);
1225
1226 auto &CmpBuilder =
1227 getActionDefinitionsBuilder(G_ICMP)
1228 // The compare output type differs based on the register bank of the output,
1229 // so make both s1 and s32 legal.
1230 //
1231 // Scalar compares producing output in scc will be promoted to s32, as that
1232 // is the allocatable register type that will be needed for the copy from
1233 // scc. This will be promoted during RegBankSelect, and we assume something
1234 // before that won't try to use s32 result types.
1235 //
1236 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1237 // bank.
1238 .legalForCartesianProduct(
1239 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1240 .legalForCartesianProduct(
1241 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1242 if (ST.has16BitInsts()) {
1243 CmpBuilder.legalFor({{S1, S16}});
1244 }
1245
1246 CmpBuilder
1247 .widenScalarToNextPow2(1)
1248 .clampScalar(1, S32, S64)
1249 .scalarize(0)
1250 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1251
1252 auto &FCmpBuilder =
1253 getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1254 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1255
1256 if (ST.hasSALUFloatInsts())
1257 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1258
1259 FCmpBuilder
1260 .widenScalarToNextPow2(1)
1261 .clampScalar(1, S32, S64)
1262 .scalarize(0);
1263
1264 // FIXME: fpow has a selection pattern that should move to custom lowering.
1265 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1266 if (ST.has16BitInsts())
1267 ExpOps.customFor({{S32}, {S16}});
1268 else
1269 ExpOps.customFor({S32});
1270 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1271 .scalarize(0);
1272
1273 getActionDefinitionsBuilder(G_FPOWI)
1274 .clampScalar(0, MinScalarFPTy, S32)
1275 .lower();
1276
1277 auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1278 Log2Ops.customFor({S32});
1279 if (ST.has16BitInsts())
1280 Log2Ops.legalFor({S16});
1281 else
1282 Log2Ops.customFor({S16});
1283 Log2Ops.scalarize(0)
1284 .lower();
1285
1286 auto &LogOps =
1287 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1288 LogOps.customFor({S32, S16});
1289 LogOps.clampScalar(0, MinScalarFPTy, S32)
1290 .scalarize(0);
1291
1292 // The 64-bit versions produce 32-bit results, but only on the SALU.
1293 getActionDefinitionsBuilder(G_CTPOP)
1294 .legalFor({{S32, S32}, {S32, S64}})
1295 .clampScalar(0, S32, S32)
1296 .widenScalarToNextPow2(1, 32)
1297 .clampScalar(1, S32, S64)
1298 .scalarize(0)
1299 .widenScalarToNextPow2(0, 32);
1300
1301 // If no 16 bit instr is available, lower into different instructions.
1302 if (ST.has16BitInsts())
1303 getActionDefinitionsBuilder(G_IS_FPCLASS)
1304 .legalForCartesianProduct({S1}, FPTypes16)
1305 .widenScalarToNextPow2(1)
1306 .scalarize(0)
1307 .lower();
1308 else
1309 getActionDefinitionsBuilder(G_IS_FPCLASS)
1310 .legalForCartesianProduct({S1}, FPTypesBase)
1311 .lowerFor({S1, S16})
1312 .widenScalarToNextPow2(1)
1313 .scalarize(0)
1314 .lower();
1315
1316 // The hardware instructions return a different result on 0 than the generic
1317 // instructions expect. The hardware produces -1, but these produce the
1318 // bitwidth.
1319 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1320 .scalarize(0)
1321 .clampScalar(0, S32, S32)
1322 .clampScalar(1, S32, S64)
1323 .widenScalarToNextPow2(0, 32)
1324 .widenScalarToNextPow2(1, 32)
1325 .custom();
1326
1327 // The 64-bit versions produce 32-bit results, but only on the SALU.
1328 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
1329 .legalFor({{S32, S32}, {S32, S64}})
1330 .customIf(scalarNarrowerThan(1, 32))
1331 .clampScalar(0, S32, S32)
1332 .clampScalar(1, S32, S64)
1333 .scalarize(0)
1334 .widenScalarToNextPow2(0, 32)
1335 .widenScalarToNextPow2(1, 32);
1336
1337 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
1338 .legalFor({{S32, S32}, {S32, S64}})
1339 .clampScalar(0, S32, S32)
1340 .clampScalar(1, S32, S64)
1341 .scalarize(0)
1342 .widenScalarToNextPow2(0, 32)
1343 .widenScalarToNextPow2(1, 32);
1344
1345 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1346 // RegBankSelect.
1347 getActionDefinitionsBuilder(G_BITREVERSE)
1348 .legalFor({S32, S64})
1349 .clampScalar(0, S32, S64)
1350 .scalarize(0)
1351 .widenScalarToNextPow2(0);
1352
1353 if (ST.has16BitInsts()) {
1354 getActionDefinitionsBuilder(G_BSWAP)
1355 .legalFor({S16, S32, V2S16})
1356 .clampMaxNumElementsStrict(0, S16, 2)
1357 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1358 // narrowScalar limitation.
1359 .widenScalarToNextPow2(0)
1360 .clampScalar(0, S16, S32)
1361 .scalarize(0);
1362
1363 if (ST.hasVOP3PInsts()) {
1365 .legalFor({S32, S16, V2S16})
1366 .clampMaxNumElements(0, S16, 2)
1367 .minScalar(0, S16)
1369 .scalarize(0)
1370 .lower();
1371 if (ST.hasIntMinMax64()) {
1372 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1373 .legalFor({S32, S16, S64, V2S16})
1374 .clampMaxNumElements(0, S16, 2)
1375 .minScalar(0, S16)
1377 .scalarize(0)
1378 .lower();
1379 } else {
1380 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1381 .legalFor({S32, S16, V2S16})
1382 .clampMaxNumElements(0, S16, 2)
1383 .minScalar(0, S16)
1385 .scalarize(0)
1386 .lower();
1387 }
1388 } else {
1389 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1390 .legalFor({S32, S16})
1391 .widenScalarToNextPow2(0)
1392 .minScalar(0, S16)
1393 .scalarize(0)
1394 .lower();
1395 }
1396 } else {
1397 // TODO: Should have same legality without v_perm_b32
1398 getActionDefinitionsBuilder(G_BSWAP)
1399 .legalFor({S32})
1400 .lowerIf(scalarNarrowerThan(0, 32))
1401 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1402 // narrowScalar limitation.
1403 .widenScalarToNextPow2(0)
1404 .maxScalar(0, S32)
1405 .scalarize(0)
1406 .lower();
1407
1408 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1409 .legalFor({S32})
1410 .minScalar(0, S32)
1411 .widenScalarToNextPow2(0)
1412 .scalarize(0)
1413 .lower();
1414 }
1415
1416 getActionDefinitionsBuilder(G_INTTOPTR)
1417 // List the common cases
1418 .legalForCartesianProduct(AddrSpaces64, {S64})
1419 .legalForCartesianProduct(AddrSpaces32, {S32})
1420 .scalarize(0)
1421 // Accept any address space as long as the size matches
1422 .legalIf(sameSize(0, 1))
1423 .widenScalarIf(smallerThan(1, 0),
1424 [](const LegalityQuery &Query) {
1425 return std::pair(
1426 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1427 })
1428 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1429 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1430 });
1431
1432 getActionDefinitionsBuilder(G_PTRTOINT)
1433 // List the common cases
1434 .legalForCartesianProduct(AddrSpaces64, {S64})
1435 .legalForCartesianProduct(AddrSpaces32, {S32})
1436 .scalarize(0)
1437 // Accept any address space as long as the size matches
1438 .legalIf(sameSize(0, 1))
1439 .widenScalarIf(smallerThan(0, 1),
1440 [](const LegalityQuery &Query) {
1441 return std::pair(
1442 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1443 })
1444 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1445 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1446 });
1447
1448 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1449 .scalarize(0)
1450 .custom();
1451
1452 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1453 bool IsLoad) -> bool {
1454 const LLT DstTy = Query.Types[0];
1455
1456 // Split vector extloads.
1457 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1458
1459 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1460 return true;
1461
1462 const LLT PtrTy = Query.Types[1];
1463 unsigned AS = PtrTy.getAddressSpace();
1464 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1465 Query.MMODescrs[0].Ordering !=
1467 return true;
1468
1469 // Catch weird sized loads that don't evenly divide into the access sizes
1470 // TODO: May be able to widen depending on alignment etc.
1471 unsigned NumRegs = (MemSize + 31) / 32;
1472 if (NumRegs == 3) {
1473 if (!ST.hasDwordx3LoadStores())
1474 return true;
1475 } else {
1476 // If the alignment allows, these should have been widened.
1477 if (!isPowerOf2_32(NumRegs))
1478 return true;
1479 }
1480
1481 return false;
1482 };
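// As a concrete case: a 96-bit (3-dword) access must be split on subtargets
// without dwordx3 load/store support, and any access wider than the
// maxSizeForAddrSpace() limit for its address space is split as well.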
1483
1484 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1485 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1486 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1487
1488 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1489 // LDS
1490 // TODO: Unsupported flat for SI.
1491
1492 for (unsigned Op : {G_LOAD, G_STORE}) {
1493 const bool IsStore = Op == G_STORE;
1494
1495 auto &Actions = getActionDefinitionsBuilder(Op);
1496 // Explicitly list some common cases.
1497 // TODO: Does this help compile time at all?
1498 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1499 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1500 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1501 {S64, GlobalPtr, S64, GlobalAlign32},
1502 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1503 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1504 {S32, GlobalPtr, S8, GlobalAlign8},
1505 {S32, GlobalPtr, S16, GlobalAlign16},
1506
1507 {S32, LocalPtr, S32, 32},
1508 {S64, LocalPtr, S64, 32},
1509 {V2S32, LocalPtr, V2S32, 32},
1510 {S32, LocalPtr, S8, 8},
1511 {S32, LocalPtr, S16, 16},
1512 {V2S16, LocalPtr, S32, 32},
1513
1514 {S32, PrivatePtr, S32, 32},
1515 {S32, PrivatePtr, S8, 8},
1516 {S32, PrivatePtr, S16, 16},
1517 {V2S16, PrivatePtr, S32, 32},
1518
1519 {S32, ConstantPtr, S32, GlobalAlign32},
1520 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1521 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1522 {S64, ConstantPtr, S64, GlobalAlign32},
1523 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1524 Actions.legalIf(
1525 [=](const LegalityQuery &Query) -> bool {
1526 return isLoadStoreLegal(ST, Query);
1527 });
1528
1529 // The custom pointers (fat pointers, buffer resources) don't work with load
1530 // and store at this level. Fat pointers should have been lowered to
1531 // intrinsics before the translation to MIR.
1532 Actions.unsupportedIf(
1533 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1534
1535 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1536 // ptrtoint. This is needed to account for the fact that we can't have i128
1537 // as a register class for SelectionDAG reasons.
1538 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1539 return hasBufferRsrcWorkaround(Query.Types[0]);
1540 });
1541
1542 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1543 // 64-bits.
1544 //
1545 // TODO: Should generalize bitcast action into coerce, which will also cover
1546 // inserting addrspacecasts.
1547 Actions.customIf(typeIs(1, Constant32Ptr));
1548
1549 // Turn any illegal element vectors into something easier to deal
1550 // with. These will ultimately produce 32-bit scalar shifts to extract the
1551 // parts anyway.
1552 //
1553 // For odd 16-bit element vectors, prefer to split those into pieces with
1554 // 16-bit vector parts.
1555 Actions.bitcastIf(
1556 [=](const LegalityQuery &Query) -> bool {
1557 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1558 Query.MMODescrs[0].MemoryTy);
1559 }, bitcastToRegisterType(0));
1560
1561 if (!IsStore) {
1562 // Widen suitably aligned loads by loading extra bytes. The standard
1563 // legalization actions can't properly express widening memory operands.
1564 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1565 return shouldWidenLoad(ST, Query, G_LOAD);
1566 });
1567 }
1568
1569 // FIXME: load/store narrowing should be moved to lower action
1570 Actions
1571 .narrowScalarIf(
1572 [=](const LegalityQuery &Query) -> bool {
1573 return !Query.Types[0].isVector() &&
1574 needToSplitMemOp(Query, Op == G_LOAD);
1575 },
1576 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1577 const LLT DstTy = Query.Types[0];
1578 const LLT PtrTy = Query.Types[1];
1579
1580 const unsigned DstSize = DstTy.getSizeInBits();
1581 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1582
1583 // Split extloads.
1584 if (DstSize > MemSize)
1585 return std::pair(0, LLT::scalar(MemSize));
1586
1587 unsigned MaxSize = maxSizeForAddrSpace(
1588 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1589 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1590 if (MemSize > MaxSize)
1591 return std::pair(0, LLT::scalar(MaxSize));
1592
1593 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1594 return std::pair(0, LLT::scalar(Align));
1595 })
1596 .fewerElementsIf(
1597 [=](const LegalityQuery &Query) -> bool {
1598 return Query.Types[0].isVector() &&
1599 needToSplitMemOp(Query, Op == G_LOAD);
1600 },
1601 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1602 const LLT DstTy = Query.Types[0];
1603 const LLT PtrTy = Query.Types[1];
1604
1605 LLT EltTy = DstTy.getElementType();
1606 unsigned MaxSize = maxSizeForAddrSpace(
1607 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1608 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1609
1610 // FIXME: Handle widened to power of 2 results better. This ends
1611 // up scalarizing.
1612 // FIXME: 3 element stores scalarized on SI
1613
1614 // Split if it's too large for the address space.
1615 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1616 if (MemSize > MaxSize) {
1617 unsigned NumElts = DstTy.getNumElements();
1618 unsigned EltSize = EltTy.getSizeInBits();
1619
1620 if (MaxSize % EltSize == 0) {
1621 return std::pair(
1622 0, LLT::scalarOrVector(
1623 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1624 }
1625
1626 unsigned NumPieces = MemSize / MaxSize;
1627
1628 // FIXME: Refine when odd breakdowns handled
1629 // The scalars will need to be re-legalized.
1630 if (NumPieces == 1 || NumPieces >= NumElts ||
1631 NumElts % NumPieces != 0)
1632 return std::pair(0, EltTy);
1633
1634 return std::pair(0,
1635 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1636 }
1637
1638 // FIXME: We could probably handle weird extending loads better.
1639 if (DstTy.getSizeInBits() > MemSize)
1640 return std::pair(0, EltTy);
1641
1642 unsigned EltSize = EltTy.getSizeInBits();
1643 unsigned DstSize = DstTy.getSizeInBits();
1644 if (!isPowerOf2_32(DstSize)) {
1645 // We're probably decomposing an odd sized store. Try to split
1646 // to the widest type. TODO: Account for alignment. As-is it
1647 // should be OK, since the new parts will be further legalized.
1648 unsigned FloorSize = llvm::bit_floor(DstSize);
1649 return std::pair(
1650 0, LLT::scalarOrVector(
1651 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1652 }
1653
1654 // May need relegalization for the scalars.
1655 return std::pair(0, EltTy);
1656 })
1657 .minScalar(0, S32)
1658 .narrowScalarIf(isTruncStoreToSizePowerOf2(0),
1660 .widenScalarToNextPow2(0)
1661 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1662 .lower();
1663 }
1664
1665 // FIXME: Unaligned accesses not lowered.
1666 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1667 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1668 {S32, GlobalPtr, S16, 2 * 8},
1669 {S32, LocalPtr, S8, 8},
1670 {S32, LocalPtr, S16, 16},
1671 {S32, PrivatePtr, S8, 8},
1672 {S32, PrivatePtr, S16, 16},
1673 {S32, ConstantPtr, S8, 8},
1674 {S32, ConstantPtr, S16, 2 * 8}})
1675 .legalIf(
1676 [=](const LegalityQuery &Query) -> bool {
1677 return isLoadStoreLegal(ST, Query);
1678 });
1679
1680 if (ST.hasFlatAddressSpace()) {
1681 ExtLoads.legalForTypesWithMemDesc(
1682 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1683 }
1684
1685 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1686 // 64-bits.
1687 //
1688 // TODO: Should generalize bitcast action into coerce, which will also cover
1689 // inserting addrspacecasts.
1690 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1691
1692 ExtLoads.clampScalar(0, S32, S32)
1693 .widenScalarToNextPow2(0)
1694 .lower();
1695
1696 auto &Atomics = getActionDefinitionsBuilder(
1697 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1698 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1699 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1700 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1701 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1702 {S64, GlobalPtr}, {S64, LocalPtr},
1703 {S32, RegionPtr}, {S64, RegionPtr}});
1704 if (ST.hasFlatAddressSpace()) {
1705 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1706 }
1707
1708 // TODO: v2bf16 operations, and fat buffer pointer support.
1709 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1710 if (ST.hasLDSFPAtomicAddF32()) {
1711 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1712 if (ST.hasLdsAtomicAddF64())
1713 Atomic.legalFor({{S64, LocalPtr}});
1714 if (ST.hasAtomicDsPkAdd16Insts())
1715 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1716 }
1717 if (ST.hasAtomicFaddInsts())
1718 Atomic.legalFor({{S32, GlobalPtr}});
1719 if (ST.hasFlatAtomicFaddF32Inst())
1720 Atomic.legalFor({{S32, FlatPtr}});
1721
1722 if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
1723 // These are legal with some caveats, and should have undergone expansion in
1724 // the IR in most situations
1725 // TODO: Move atomic expansion into legalizer
1726 Atomic.legalFor({
1727 {S32, GlobalPtr},
1728 {S64, GlobalPtr},
1729 {S64, FlatPtr}
1730 });
1731 }
1732
1733 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1734 ST.hasAtomicBufferGlobalPkAddF16Insts())
1735 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1736 if (ST.hasAtomicGlobalPkAddBF16Inst())
1737 Atomic.legalFor({{V2BF16, GlobalPtr}});
1738 if (ST.hasAtomicFlatPkAdd16Insts())
1739 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1740
1741
1742 // Most of the legalization work here is done by AtomicExpand. We could
1743 // probably use a simpler legality rule that just assumes anything is OK.
1744 auto &AtomicFMinFMax =
1745 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1746 .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1747
1748 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1749 AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}});
1750 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1751 AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1752 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1753 AtomicFMinFMax.legalFor({{F32, FlatPtr}});
1754 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1755 AtomicFMinFMax.legalFor({{F64, FlatPtr}});
1756
1757 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1758 // demarshalling
1759 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1760 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1761 {S32, FlatPtr}, {S64, FlatPtr}})
1762 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1763 {S32, RegionPtr}, {S64, RegionPtr}});
1764 // TODO: Pointer types, any 32-bit or 64-bit vector
1765
1766 // Condition should be s32 for scalar, s1 for vector.
1767 getActionDefinitionsBuilder(G_SELECT)
1768 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1769 LocalPtr, FlatPtr, PrivatePtr,
1770 LLT::fixed_vector(2, LocalPtr),
1771 LLT::fixed_vector(2, PrivatePtr)},
1772 {S1, S32})
1773 .clampScalar(0, S16, S64)
1774 .scalarize(1)
1775 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1776 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1777 .clampMaxNumElements(0, S32, 2)
1778 .clampMaxNumElements(0, LocalPtr, 2)
1779 .clampMaxNumElements(0, PrivatePtr, 2)
1780 .scalarize(0)
1781 .widenScalarToNextPow2(0)
1782 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1783
1784 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1785 // be more flexible with the shift amount type.
1786 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1787 .legalFor({{S32, S32}, {S64, S32}});
1788 if (ST.has16BitInsts()) {
1789 if (ST.hasVOP3PInsts()) {
1790 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1791 .clampMaxNumElements(0, S16, 2);
1792 } else
1793 Shifts.legalFor({{S16, S16}});
1794
1795 // TODO: Support 16-bit shift amounts for all types
1796 Shifts.widenScalarIf(
1797 [=](const LegalityQuery &Query) {
1798 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1799 // 32-bit amount.
1800 const LLT ValTy = Query.Types[0];
1801 const LLT AmountTy = Query.Types[1];
1802 return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
1803 AmountTy.getSizeInBits() < 16;
1804 }, changeTo(1, S16));
1805 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1806 Shifts.clampScalar(1, S32, S32);
1807 Shifts.widenScalarToNextPow2(0, 16);
1808 Shifts.clampScalar(0, S16, S64);
1809
1810 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1811 .minScalar(0, S16)
1812 .scalarize(0)
1813 .lower();
1814 } else {
1815 // Make sure we legalize the shift amount type first, as the general
1816 // expansion for the shifted type will produce much worse code if it hasn't
1817 // been truncated already.
1818 Shifts.clampScalar(1, S32, S32);
1819 Shifts.widenScalarToNextPow2(0, 32);
1820 Shifts.clampScalar(0, S32, S64);
1821
1822 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1823 .minScalar(0, S32)
1824 .scalarize(0)
1825 .lower();
1826 }
1827 Shifts.scalarize(0);
1828
1829 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1830 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1831 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1832 unsigned IdxTypeIdx = 2;
1833
1834 getActionDefinitionsBuilder(Op)
1835 .customIf([=](const LegalityQuery &Query) {
1836 const LLT EltTy = Query.Types[EltTypeIdx];
1837 const LLT VecTy = Query.Types[VecTypeIdx];
1838 const LLT IdxTy = Query.Types[IdxTypeIdx];
1839 const unsigned EltSize = EltTy.getSizeInBits();
1840 const bool isLegalVecType =
1842 // Address space 8 pointers are 128-bit wide values, but the logic
1843 // below will try to bitcast them to 2N x s64, which will fail.
1844 // Therefore, as an intermediate step, handle such extracts/insertions by
1845 // ptrtoint-ing the vector and scalar arguments (or inttoptr-ing the
1846 // extraction result) in order to produce a vector operation that can
1847 // be handled by the logic below.
1848 if (EltTy.isPointer() && EltSize > 64)
1849 return true;
1850 return (EltSize == 32 || EltSize == 64) &&
1851 VecTy.getSizeInBits() % 32 == 0 &&
1852 VecTy.getSizeInBits() <= MaxRegisterSize &&
1853 IdxTy.getSizeInBits() == 32 &&
1854 isLegalVecType;
1855 })
1856 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1857 scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1858 bitcastToVectorElement32(VecTypeIdx))
1859 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1860 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1861 scalarOrEltWiderThan(VecTypeIdx, 64)),
1862 [=](const LegalityQuery &Query) {
1863 // For > 64-bit element types, try to turn this into a
1864 // 64-bit element vector since we may be able to do better
1865 // indexing if this is scalar. If not, fall back to 32.
1866 const LLT EltTy = Query.Types[EltTypeIdx];
1867 const LLT VecTy = Query.Types[VecTypeIdx];
1868 const unsigned DstEltSize = EltTy.getSizeInBits();
1869 const unsigned VecSize = VecTy.getSizeInBits();
1870
1871 const unsigned TargetEltSize =
1872 DstEltSize % 64 == 0 ? 64 : 32;
1873 return std::pair(VecTypeIdx,
1874 LLT::fixed_vector(VecSize / TargetEltSize,
1875 TargetEltSize));
1876 })
1877 .clampScalar(EltTypeIdx, S32, S64)
1878 .clampScalar(VecTypeIdx, S32, S64)
1879 .clampScalar(IdxTypeIdx, S32, S32)
1880 .clampMaxNumElements(VecTypeIdx, S32, 32)
1881 // TODO: Clamp elements for 64-bit vectors?
1882 .moreElementsIf(isIllegalRegisterType(ST, VecTypeIdx),
1884 // It should only be necessary with variable indexes.
1885 // As a last resort, lower to the stack
1886 .lower();
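 // Illustrative example of the wide-element bitcast rule above: extracting an
 // s128 element from <2 x s128> (256 bits total) would be rewritten over a
 // <4 x s64> vector, since 128 % 64 == 0 picks a 64-bit target element size;
 // an element type whose size is not a multiple of 64 falls back to 32-bit
 // elements instead.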
1887 }
1888
1889 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1890 .unsupportedIf([=](const LegalityQuery &Query) {
1891 const LLT &EltTy = Query.Types[1].getElementType();
1892 return Query.Types[0] != EltTy;
1893 });
1894
1895 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1896 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1897 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1898
1899 // FIXME: Doesn't handle extract of illegal sizes.
1900 getActionDefinitionsBuilder(Op)
1901 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1902 .lowerIf([=](const LegalityQuery &Query) {
1903 // Sub-vector(or single element) insert and extract.
1904 // TODO: verify immediate offset here since lower only works with
1905 // whole elements.
1906 const LLT BigTy = Query.Types[BigTyIdx];
1907 return BigTy.isVector();
1908 })
1909 // FIXME: Multiples of 16 should not be legal.
1910 .legalIf([=](const LegalityQuery &Query) {
1911 const LLT BigTy = Query.Types[BigTyIdx];
1912 const LLT LitTy = Query.Types[LitTyIdx];
1913 return (BigTy.getSizeInBits() % 32 == 0) &&
1914 (LitTy.getSizeInBits() % 16 == 0);
1915 })
1916 .widenScalarIf(
1917 [=](const LegalityQuery &Query) {
1918 const LLT BigTy = Query.Types[BigTyIdx];
1919 return (BigTy.getScalarSizeInBits() < 16);
1920 },
1922 .widenScalarIf(
1923 [=](const LegalityQuery &Query) {
1924 const LLT LitTy = Query.Types[LitTyIdx];
1925 return (LitTy.getScalarSizeInBits() < 16);
1926 },
1928 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1929 .widenScalarToNextPow2(BigTyIdx, 32);
1930
1931 }
1932
1933 auto &BuildVector =
1934 getActionDefinitionsBuilder(G_BUILD_VECTOR)
1936 .legalForCartesianProduct(AllS64Vectors, {S64})
1937 .clampNumElements(0, V16S32, V32S32)
1942
1943 if (ST.hasScalarPackInsts()) {
1944 BuildVector
1945 // FIXME: Should probably widen s1 vectors straight to s32
1946 .minScalarOrElt(0, S16)
1947 .minScalar(1, S16);
1948
1949 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1950 .legalFor({V2S16, S32})
1951 .lower();
1952 } else {
1953 BuildVector.customFor({V2S16, S16});
1954 BuildVector.minScalarOrElt(0, S32);
1955
1956 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1957 .customFor({V2S16, S32})
1958 .lower();
1959 }
1960
1961 BuildVector.legalIf(isRegisterType(ST, 0));
1962
1963 // FIXME: Clamp maximum size
1964 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1965 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
1966 .clampMaxNumElements(0, S32, 32)
1967 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1968 .clampMaxNumElements(0, S16, 64);
1969
1970 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1971
1972 // Merge/Unmerge
1973 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1974 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1975 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1976
1977 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1978 const LLT Ty = Query.Types[TypeIdx];
1979 if (Ty.isVector()) {
1980 const LLT &EltTy = Ty.getElementType();
1981 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1982 return true;
1984 return true;
1985 }
1986 return false;
1987 };
1988
1989 auto &Builder =
1990 getActionDefinitionsBuilder(Op)
1991 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
1992 .lowerFor({{S16, V2S16}})
1993 .lowerIf([=](const LegalityQuery &Query) {
1994 const LLT BigTy = Query.Types[BigTyIdx];
1995 return BigTy.getSizeInBits() == 32;
1996 })
1997 // Try to widen to s16 first for small types.
1998 // TODO: Only do this on targets with legal s16 shifts
1999 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
2000 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
2002 oneMoreElement(BigTyIdx))
2004 elementTypeIs(1, S16)),
2005 changeTo(1, V2S16))
2006 // Clamp the little scalar to s8-s256 and make it a power of 2. It's
2007 // not worth considering the multiples of 64 since 2*192 and 2*384
2008 // are not valid.
2009 .clampScalar(LitTyIdx, S32, S512)
2010 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
2011 // Break up vectors with weird elements into scalars
2012 .fewerElementsIf(
2013 [=](const LegalityQuery &Query) {
2014 return notValidElt(Query, LitTyIdx);
2015 },
2016 scalarize(0))
2017 .fewerElementsIf(
2018 [=](const LegalityQuery &Query) {
2019 return notValidElt(Query, BigTyIdx);
2020 },
2021 scalarize(1))
2022 .clampScalar(BigTyIdx, S32, MaxScalar);
2023
2024 if (Op == G_MERGE_VALUES) {
2025 Builder.widenScalarIf(
2026 // TODO: Use 16-bit shifts if legal for 8-bit values?
2027 [=](const LegalityQuery &Query) {
2028 const LLT Ty = Query.Types[LitTyIdx];
2029 return Ty.getSizeInBits() < 32;
2030 },
2031 changeTo(LitTyIdx, S32));
2032 }
2033
2034 Builder.widenScalarIf(
2035 [=](const LegalityQuery &Query) {
2036 const LLT Ty = Query.Types[BigTyIdx];
2037 return Ty.getSizeInBits() % 16 != 0;
2038 },
2039 [=](const LegalityQuery &Query) {
2040 // Pick the next power of 2, or a multiple of 64 for sizes over 128,
2041 // whichever is smaller.
2042 const LLT &Ty = Query.Types[BigTyIdx];
2043 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
2044 if (NewSizeInBits >= 256) {
2045 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
2046 if (RoundedTo < NewSizeInBits)
2047 NewSizeInBits = RoundedTo;
2048 }
2049 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
2050 })
2051 // Any vectors left are the wrong size. Scalarize them.
2052 .scalarize(0)
2053 .scalarize(1);
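 // Illustrative arithmetic for the size mutation above: a 65-bit big type
 // widens to the next power of two, 128; a 129-bit type would round up to 256,
 // but since that is >= 256 the multiple-of-64 value alignTo<64>(130) = 192
 // is smaller and is chosen instead.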
2054 }
2055
2056 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
2057 // RegBankSelect.
2058 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
2059 .legalFor({{S32}, {S64}})
2060 .clampScalar(0, S32, S64);
2061
2062 if (ST.hasVOP3PInsts()) {
2063 SextInReg.lowerFor({{V2S16}})
2064 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2065 // get more vector shift opportunities, since we'll get those when
2066 // expanded.
2067 .clampMaxNumElementsStrict(0, S16, 2);
2068 } else if (ST.has16BitInsts()) {
2069 SextInReg.lowerFor({{S32}, {S64}, {S16}});
2070 } else {
2071 // Prefer to promote to s32 before lowering if we don't have 16-bit
2072 // shifts. This avoids a lot of intermediate truncate and extend operations.
2073 SextInReg.lowerFor({{S32}, {S64}});
2074 }
2075
2076 SextInReg
2077 .scalarize(0)
2078 .clampScalar(0, S32, S64)
2079 .lower();
2080
2081 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2082 .scalarize(0)
2083 .lower();
2084
2085 auto &FSHRActionDefs = getActionDefinitionsBuilder(G_FSHR);
2086 FSHRActionDefs.legalFor({{S32, S32}})
2087 .clampMaxNumElementsStrict(0, S16, 2);
2088 if (ST.hasVOP3PInsts())
2089 FSHRActionDefs.lowerFor({{V2S16, V2S16}});
2090 FSHRActionDefs.scalarize(0).lower();
2091
2092 if (ST.hasVOP3PInsts()) {
2094 .lowerFor({{V2S16, V2S16}})
2095 .clampMaxNumElementsStrict(0, S16, 2)
2096 .scalarize(0)
2097 .lower();
2098 } else {
2100 .scalarize(0)
2101 .lower();
2102 }
2103
2104 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2105 .legalFor({S64});
2106
2107 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2108
2110 .alwaysLegal();
2111
2112 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2113 .scalarize(0)
2114 .minScalar(0, S32)
2115 .lower();
2116
2117 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2118 .legalFor({{S32, S32}, {S64, S32}})
2119 .clampScalar(1, S32, S32)
2120 .clampScalar(0, S32, S64)
2122 .scalarize(0);
2123
2124 getActionDefinitionsBuilder(
2125 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2126 G_FCOPYSIGN,
2127
2128 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2129 G_READ_REGISTER, G_WRITE_REGISTER,
2130
2131 G_SADDO, G_SSUBO})
2132 .lower();
2133
2134 if (ST.hasIEEEMinimumMaximumInsts()) {
2135 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2136 .legalFor(FPTypesPK16)
2137 .clampMaxNumElements(0, S16, 2)
2138 .scalarize(0);
2139 } else {
2140 // TODO: Implement
2141 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
2142 }
2143
2144 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2145 .lower();
2146
2147 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2148
2149 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2150 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2151 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2152 .unsupported();
2153
2155
2157 {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2158 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2159 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2160 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2161 .legalFor(AllVectors)
2162 .scalarize(1)
2163 .lower();
2164
2165 getLegacyLegalizerInfo().computeTables();
2166 verify(*ST.getInstrInfo());
2167}
2168
2171 LostDebugLocObserver &LocObserver) const {
2172 MachineIRBuilder &B = Helper.MIRBuilder;
2173 MachineRegisterInfo &MRI = *B.getMRI();
2174
2175 switch (MI.getOpcode()) {
2176 case TargetOpcode::G_ADDRSPACE_CAST:
2177 return legalizeAddrSpaceCast(MI, MRI, B);
2178 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2179 return legalizeFroundeven(MI, MRI, B);
2180 case TargetOpcode::G_FCEIL:
2181 return legalizeFceil(MI, MRI, B);
2182 case TargetOpcode::G_FREM:
2183 return legalizeFrem(MI, MRI, B);
2184 case TargetOpcode::G_INTRINSIC_TRUNC:
2185 return legalizeIntrinsicTrunc(MI, MRI, B);
2186 case TargetOpcode::G_SITOFP:
2187 return legalizeITOFP(MI, MRI, B, true);
2188 case TargetOpcode::G_UITOFP:
2189 return legalizeITOFP(MI, MRI, B, false);
2190 case TargetOpcode::G_FPTOSI:
2191 return legalizeFPTOI(MI, MRI, B, true);
2192 case TargetOpcode::G_FPTOUI:
2193 return legalizeFPTOI(MI, MRI, B, false);
2194 case TargetOpcode::G_FMINNUM:
2195 case TargetOpcode::G_FMAXNUM:
2196 case TargetOpcode::G_FMINIMUMNUM:
2197 case TargetOpcode::G_FMAXIMUMNUM:
2198 case TargetOpcode::G_FMINNUM_IEEE:
2199 case TargetOpcode::G_FMAXNUM_IEEE:
2200 return legalizeMinNumMaxNum(Helper, MI);
2201 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2202 return legalizeExtractVectorElt(MI, MRI, B);
2203 case TargetOpcode::G_INSERT_VECTOR_ELT:
2204 return legalizeInsertVectorElt(MI, MRI, B);
2205 case TargetOpcode::G_FSIN:
2206 case TargetOpcode::G_FCOS:
2207 return legalizeSinCos(MI, MRI, B);
2208 case TargetOpcode::G_GLOBAL_VALUE:
2209 return legalizeGlobalValue(MI, MRI, B);
2210 case TargetOpcode::G_LOAD:
2211 case TargetOpcode::G_SEXTLOAD:
2212 case TargetOpcode::G_ZEXTLOAD:
2213 return legalizeLoad(Helper, MI);
2214 case TargetOpcode::G_STORE:
2215 return legalizeStore(Helper, MI);
2216 case TargetOpcode::G_FMAD:
2217 return legalizeFMad(MI, MRI, B);
2218 case TargetOpcode::G_FDIV:
2219 return legalizeFDIV(MI, MRI, B);
2220 case TargetOpcode::G_FFREXP:
2221 return legalizeFFREXP(MI, MRI, B);
2222 case TargetOpcode::G_FSQRT:
2223 return legalizeFSQRT(MI, MRI, B);
2224 case TargetOpcode::G_UDIV:
2225 case TargetOpcode::G_UREM:
2226 case TargetOpcode::G_UDIVREM:
2227 return legalizeUnsignedDIV_REM(MI, MRI, B);
2228 case TargetOpcode::G_SDIV:
2229 case TargetOpcode::G_SREM:
2230 case TargetOpcode::G_SDIVREM:
2231 return legalizeSignedDIV_REM(MI, MRI, B);
2232 case TargetOpcode::G_ATOMIC_CMPXCHG:
2233 return legalizeAtomicCmpXChg(MI, MRI, B);
2234 case TargetOpcode::G_FLOG2:
2235 return legalizeFlog2(MI, B);
2236 case TargetOpcode::G_FLOG:
2237 case TargetOpcode::G_FLOG10:
2238 return legalizeFlogCommon(MI, B);
2239 case TargetOpcode::G_FEXP2:
2240 return legalizeFExp2(MI, B);
2241 case TargetOpcode::G_FEXP:
2242 case TargetOpcode::G_FEXP10:
2243 return legalizeFExp(MI, B);
2244 case TargetOpcode::G_FPOW:
2245 return legalizeFPow(MI, B);
2246 case TargetOpcode::G_FFLOOR:
2247 return legalizeFFloor(MI, MRI, B);
2248 case TargetOpcode::G_BUILD_VECTOR:
2249 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2250 return legalizeBuildVector(MI, MRI, B);
2251 case TargetOpcode::G_MUL:
2252 return legalizeMul(Helper, MI);
2253 case TargetOpcode::G_CTLZ:
2254 case TargetOpcode::G_CTTZ:
2255 return legalizeCTLZ_CTTZ(MI, MRI, B);
2256 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2257 return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
2258 case TargetOpcode::G_STACKSAVE:
2259 return legalizeStackSave(MI, B);
2260 case TargetOpcode::G_GET_FPENV:
2261 return legalizeGetFPEnv(MI, MRI, B);
2262 case TargetOpcode::G_SET_FPENV:
2263 return legalizeSetFPEnv(MI, MRI, B);
2264 case TargetOpcode::G_TRAP:
2265 return legalizeTrap(MI, MRI, B);
2266 case TargetOpcode::G_DEBUGTRAP:
2267 return legalizeDebugTrap(MI, MRI, B);
2268 default:
2269 return false;
2270 }
2271
2272 llvm_unreachable("expected switch to return");
2273}
2274
2276 unsigned AS,
2278 MachineIRBuilder &B) const {
2279 MachineFunction &MF = B.getMF();
2280 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2281 const LLT S32 = LLT::scalar(32);
2282 const LLT S64 = LLT::scalar(64);
2283
2285
2286 if (ST.hasApertureRegs()) {
2287 // Note: this register is somewhat broken. When used as a 32-bit operand,
2288 // it only returns zeroes. The real value is in the upper 32 bits.
2289 // Thus, we must emit an extract of the high 32 bits.
2290 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2291 ? AMDGPU::SRC_SHARED_BASE
2292 : AMDGPU::SRC_PRIVATE_BASE;
2293 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
2294 !ST.hasGloballyAddressableScratch()) &&
2295 "Cannot use src_private_base with globally addressable scratch!");
2296 Register Dst = MRI.createGenericVirtualRegister(S64);
2297 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2298 B.buildCopy({Dst}, {Register(ApertureRegNo)});
2299 return B.buildUnmerge(S32, Dst).getReg(1);
2300 }
2301
2302 // TODO: can we be smarter about machine pointer info?
2304 Register LoadAddr = MRI.createGenericVirtualRegister(
2306 // For code object version 5, private_base and shared_base are passed through
2307 // implicit kernargs.
2314 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2315
2316 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2318
2319 if (!loadInputValue(KernargPtrReg, B,
2321 return Register();
2322
2324 PtrInfo,
2328
2329 // Pointer address
2330 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
2331 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2332 // Load address
2333 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2334 }
2335
2336 Register QueuePtr = MRI.createGenericVirtualRegister(
2338
2340 return Register();
2341
2342 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2343 // private_segment_aperture_base_hi.
2344 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2345
2347 PtrInfo,
2350 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
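 // Illustrative note on the alignment above: with the queue pointer assumed to
 // be 64-byte aligned, commonAlignment(Align(64), StructOffset) gives a known
 // alignment of 64 for offset 0x40 and 4 for offset 0x44.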
2351
2352 B.buildObjectPtrOffset(
2353 LoadAddr, QueuePtr,
2354 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2355 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2356}
2357
2358/// Return true if the value is a known valid address, such that a null check is
2359/// not necessary.
2361 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2362 MachineInstr *Def = MRI.getVRegDef(Val);
2363 switch (Def->getOpcode()) {
2364 case AMDGPU::G_FRAME_INDEX:
2365 case AMDGPU::G_GLOBAL_VALUE:
2366 case AMDGPU::G_BLOCK_ADDR:
2367 return true;
2368 case AMDGPU::G_CONSTANT: {
2369 const ConstantInt *CI = Def->getOperand(1).getCImm();
2370 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2371 }
2372 default:
2373 return false;
2374 }
2375
2376 return false;
2377}
2378
2381 MachineIRBuilder &B) const {
2382 MachineFunction &MF = B.getMF();
2383
2384 // MI can either be a G_ADDRSPACE_CAST or a
2385 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2386 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2387 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2388 Intrinsic::amdgcn_addrspacecast_nonnull));
2389
2390 const LLT S32 = LLT::scalar(32);
2391 Register Dst = MI.getOperand(0).getReg();
2392 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2393 : MI.getOperand(1).getReg();
2394 LLT DstTy = MRI.getType(Dst);
2395 LLT SrcTy = MRI.getType(Src);
2396 unsigned DestAS = DstTy.getAddressSpace();
2397 unsigned SrcAS = SrcTy.getAddressSpace();
2398
2399 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2400 // vector element.
2401 assert(!DstTy.isVector());
2402
2403 const AMDGPUTargetMachine &TM
2404 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2405
2406 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2407 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2408 return true;
2409 }
2410
2411 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2412 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2413 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2414 auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
2415 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2416 ST.hasGloballyAddressableScratch()) {
2417 // flat -> private with globally addressable scratch: subtract
2418 // src_flat_scratch_base_lo.
2419 const LLT S32 = LLT::scalar(32);
2420 Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
2421 Register FlatScratchBaseLo =
2422 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
2423 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
2424 .getReg(0);
2425 MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
2426 Register Sub = B.buildSub(S32, SrcLo, FlatScratchBaseLo).getReg(0);
2427 return B.buildIntToPtr(Dst, Sub).getReg(0);
2428 }
2429
2430 // Extract low 32-bits of the pointer.
2431 return B.buildExtract(Dst, Src, 0).getReg(0);
2432 };
2433
2434 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2435 // G_ADDRSPACE_CAST we need to guess.
2436 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2437 castFlatToLocalOrPrivate(Dst);
2438 MI.eraseFromParent();
2439 return true;
2440 }
2441
2442 unsigned NullVal = TM.getNullPointerValue(DestAS);
2443
2444 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2445 auto FlatNull = B.buildConstant(SrcTy, 0);
2446
2447 // Extract low 32-bits of the pointer.
2448 auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
2449
2450 auto CmpRes =
2451 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2452 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2453
2454 MI.eraseFromParent();
2455 return true;
2456 }
2457
2458 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2459 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2460 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2461 auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2462 // Coerce the type of the low half of the result so we can use
2463 // merge_values.
2464 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2465
2466 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
2467 ST.hasGloballyAddressableScratch()) {
2468 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
2469 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
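 // Illustrative note: the shift amount computed below is applied to the high
 // 32-bit half of the address, so for wave32 (wavefront size log2 = 5) it is
 // 57 - 32 - 5 = 20, i.e. bit 52 of the full 64-bit address, and for wave64
 // it is 19, i.e. bit 51, matching the formulas above.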
2470 Register AllOnes = B.buildConstant(S32, -1).getReg(0);
2471 Register ThreadID = B.buildConstant(S32, 0).getReg(0);
2472 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
2473 .addUse(AllOnes)
2474 .addUse(ThreadID)
2475 .getReg(0);
2476 if (ST.isWave64()) {
2477 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
2478 .addUse(AllOnes)
2479 .addUse(ThreadID)
2480 .getReg(0);
2481 }
2482 Register ShAmt =
2483 B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
2484 Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
2485 Register CvtPtr =
2486 B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
2487 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
2488 // 64-bit hi:lo value.
2489 Register FlatScratchBase =
2490 B.buildInstr(AMDGPU::S_MOV_B64, {S64},
2491 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
2492 .getReg(0);
2493 MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
2494 return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
2495 }
2496
2497 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2498 if (!ApertureReg.isValid())
2499 return false;
2500
2501 // TODO: Should we allow mismatched types but matching sizes in merges to
2502 // avoid the ptrtoint?
2503 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
2504 };
2505
2506 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2507 // G_ADDRSPACE_CAST we need to guess.
2508 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2509 castLocalOrPrivateToFlat(Dst);
2510 MI.eraseFromParent();
2511 return true;
2512 }
2513
2514 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2515
2516 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2517 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2518
2519 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2520 SegmentNull.getReg(0));
2521
2522 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2523
2524 MI.eraseFromParent();
2525 return true;
2526 }
2527
2528 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2529 SrcTy.getSizeInBits() == 64) {
2530 // Truncate.
2531 B.buildExtract(Dst, Src, 0);
2532 MI.eraseFromParent();
2533 return true;
2534 }
2535
2536 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2537 DstTy.getSizeInBits() == 64) {
2539 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2540 auto PtrLo = B.buildPtrToInt(S32, Src);
2541 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2542 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2543 MI.eraseFromParent();
2544 return true;
2545 }
2546
2547 // Invalid casts are poison.
2548 // TODO: Should return poison
2549 B.buildUndef(Dst);
2550 MI.eraseFromParent();
2551 return true;
2552}
2553
2556 MachineIRBuilder &B) const {
2557 Register Src = MI.getOperand(1).getReg();
2558 LLT Ty = MRI.getType(Src);
2559 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2560
2561 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2562 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2563
2564 auto C1 = B.buildFConstant(Ty, C1Val);
2565 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2566
2567 // TODO: Should this propagate fast-math-flags?
2568 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2569 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2570
2571 auto C2 = B.buildFConstant(Ty, C2Val);
2572 auto Fabs = B.buildFAbs(Ty, Src);
2573
2574 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2575 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2576 MI.eraseFromParent();
2577 return true;
2578}
2579
2582 MachineIRBuilder &B) const {
2583
2584 const LLT S1 = LLT::scalar(1);
2585 const LLT S64 = LLT::scalar(64);
2586
2587 Register Src = MI.getOperand(1).getReg();
2588 assert(MRI.getType(Src) == S64);
2589
2590 // result = trunc(src)
2591 // if (src > 0.0 && src != result)
2592 // result += 1.0
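 // Worked example of the expansion above (for illustration):
 //   src =  2.3 -> trunc =  2.0, src > 0.0 && src != trunc, so result = 3.0
 //   src = -2.3 -> trunc = -2.0, src > 0.0 is false,        so result = -2.0
 //   src =  4.0 -> trunc =  4.0, src != trunc is false,     so result = 4.0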
2593
2594 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2595
2596 const auto Zero = B.buildFConstant(S64, 0.0);
2597 const auto One = B.buildFConstant(S64, 1.0);
2598 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2599 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2600 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2601 auto Add = B.buildSelect(S64, And, One, Zero);
2602
2603 // TODO: Should this propagate fast-math-flags?
2604 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2605 MI.eraseFromParent();
2606 return true;
2607}
2608
2611 MachineIRBuilder &B) const {
2612 Register DstReg = MI.getOperand(0).getReg();
2613 Register Src0Reg = MI.getOperand(1).getReg();
2614 Register Src1Reg = MI.getOperand(2).getReg();
2615 auto Flags = MI.getFlags();
2616 LLT Ty = MRI.getType(DstReg);
2617
2618 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2619 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2620 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2621 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
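 // For illustration, the sequence above computes
 //   frem(x, y) = x - trunc(x / y) * y
 // with the final multiply-and-subtract fused into a single FMA.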
2622 MI.eraseFromParent();
2623 return true;
2624}
2625
2628 const unsigned FractBits = 52;
2629 const unsigned ExpBits = 11;
2630 LLT S32 = LLT::scalar(32);
2631
2632 auto Const0 = B.buildConstant(S32, FractBits - 32);
2633 auto Const1 = B.buildConstant(S32, ExpBits);
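 // Illustrative note: Const0 = 52 - 32 = 20 is the bit offset of the exponent
 // field within the high 32-bit word of an IEEE-754 double, and Const1 = 11 is
 // its width, so the ubfe below extracts the biased exponent and the final
 // subtraction of 1023 removes the bias.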
2634
2635 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2636 .addUse(Hi)
2637 .addUse(Const0.getReg(0))
2638 .addUse(Const1.getReg(0));
2639
2640 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2641}
2642
2645 MachineIRBuilder &B) const {
2646 const LLT S1 = LLT::scalar(1);
2647 const LLT S32 = LLT::scalar(32);
2648 const LLT S64 = LLT::scalar(64);
2649
2650 Register Src = MI.getOperand(1).getReg();
2651 assert(MRI.getType(Src) == S64);
2652
2653 // TODO: Should this use extract since the low half is unused?
2654 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2655 Register Hi = Unmerge.getReg(1);
2656
2657 // Extract the upper half, since this is where we will find the sign and
2658 // exponent.
2659 auto Exp = extractF64Exponent(Hi, B);
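 // Rough summary of the bit manipulation below, for illustration: with
 // unbiased exponent e, the low (52 - e) fraction bits are cleared to drop the
 // fractional part; if e < 0 (|src| < 1) only the sign bit survives, giving a
 // signed zero, and if e > 51 the value is already integral so src is returned
 // unchanged.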
2660
2661 const unsigned FractBits = 52;
2662
2663 // Extract the sign bit.
2664 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2665 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2666
2667 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2668
2669 const auto Zero32 = B.buildConstant(S32, 0);
2670
2671 // Extend back to 64-bits.
2672 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2673
2674 auto Shr = B.buildAShr(S64, FractMask, Exp);
2675 auto Not = B.buildNot(S64, Shr);
2676 auto Tmp0 = B.buildAnd(S64, Src, Not);
2677 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2678
2679 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2680 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2681
2682 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2683 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2684 MI.eraseFromParent();
2685 return true;
2686}
2687
2690 MachineIRBuilder &B, bool Signed) const {
2691
2692 Register Dst = MI.getOperand(0).getReg();
2693 Register Src = MI.getOperand(1).getReg();
2694
2695 const LLT S64 = LLT::scalar(64);
2696 const LLT S32 = LLT::scalar(32);
2697
2698 assert(MRI.getType(Src) == S64);
2699
2700 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2701 auto ThirtyTwo = B.buildConstant(S32, 32);
2702
2703 if (MRI.getType(Dst) == S64) {
2704 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2705 : B.buildUITOFP(S64, Unmerge.getReg(1));
2706
2707 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2708 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2709
2710 // TODO: Should this propagate fast-math-flags?
2711 B.buildFAdd(Dst, LdExp, CvtLo);
2712 MI.eraseFromParent();
2713 return true;
2714 }
2715
2716 assert(MRI.getType(Dst) == S32);
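 // Rough summary of the path below, for illustration: the 64-bit integer is
 // left-shifted so its significant bits land in the high word, any nonzero
 // low-word bits are OR-ed in as a sticky bit for correct rounding, the
 // resulting 32-bit value is converted, and ldexp rescales it by 2^(32 - shift).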
2717
2718 auto One = B.buildConstant(S32, 1);
2719
2720 MachineInstrBuilder ShAmt;
2721 if (Signed) {
2722 auto ThirtyOne = B.buildConstant(S32, 31);
2723 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2724 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2725 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2726 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2727 .addUse(Unmerge.getReg(1));
2728 auto LS2 = B.buildSub(S32, LS, One);
2729 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2730 } else
2731 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2732 auto Norm = B.buildShl(S64, Src, ShAmt);
2733 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2734 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2735 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2736 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2737 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2738 B.buildFLdexp(Dst, FVal, Scale);
2739 MI.eraseFromParent();
2740 return true;
2741}
2742
2743// TODO: Copied from DAG implementation. Verify logic and document how this
2744// actually works.
2748 bool Signed) const {
2749
2750 Register Dst = MI.getOperand(0).getReg();
2751 Register Src = MI.getOperand(1).getReg();
2752
2753 const LLT S64 = LLT::scalar(64);
2754 const LLT S32 = LLT::scalar(32);
2755
2756 const LLT SrcLT = MRI.getType(Src);
2757 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2758
2759 unsigned Flags = MI.getFlags();
2760
2761 // The basic idea of converting a floating point number into a pair of 32-bit
2762 // integers is illustrated as follows:
2763 //
2764 // tf := trunc(val);
2765 // hif := floor(tf * 2^-32);
2766 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2767 // hi := fptoi(hif);
2768 // lo := fptoi(lof);
2769 //
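 // Worked example for illustration, with val = 2^33 + 12 (exact in f64):
 //   tf  = 2^33 + 12
 //   hif = floor(tf * 2^-32) = 2
 //   lof = tf - hif * 2^32   = 12
 //   hi = 2, lo = 12, and merging {lo, hi} reassembles the 64-bit result.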
2770 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2771 MachineInstrBuilder Sign;
2772 if (Signed && SrcLT == S32) {
2773 // However, a 32-bit floating point number has only a 23-bit mantissa, which
2774 // is not enough to hold all the significant bits of `lof` if val is
2775 // negative. To avoid the loss of precision, we need to take the absolute
2776 // value after truncating and flip the result back based on the original
2777 // signedness.
2778 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2779 Trunc = B.buildFAbs(S32, Trunc, Flags);
2780 }
2781 MachineInstrBuilder K0, K1;
2782 if (SrcLT == S64) {
2783 K0 = B.buildFConstant(
2784 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2785 K1 = B.buildFConstant(
2786 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2787 } else {
2788 K0 = B.buildFConstant(
2789 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2790 K1 = B.buildFConstant(
2791 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2792 }
2793
2794 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2795 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2796 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2797
2798 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2799 : B.buildFPTOUI(S32, FloorMul);
2800 auto Lo = B.buildFPTOUI(S32, Fma);
2801
2802 if (Signed && SrcLT == S32) {
2803 // Flip the result based on the signedness, which is either all 0s or 1s.
2804 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2805 // r := xor({lo, hi}, sign) - sign;
2806 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2807 Sign);
2808 } else
2809 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2810 MI.eraseFromParent();
2811
2812 return true;
2813}
2814
2816 MachineInstr &MI) const {
2817 MachineFunction &MF = Helper.MIRBuilder.getMF();
2819
2820 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2821 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2822
2823 // With ieee_mode disabled, the instructions have the correct behavior
2824 // already for G_FMINIMUMNUM/G_FMAXIMUMNUM.
2825 //
2826 // FIXME: G_FMINNUM/G_FMAXNUM should match the behavior with ieee_mode
2827 // enabled.
2828 if (!MFI->getMode().IEEE) {
2829 if (MI.getOpcode() == AMDGPU::G_FMINIMUMNUM ||
2830 MI.getOpcode() == AMDGPU::G_FMAXIMUMNUM)
2831 return true;
2832
2833 return !IsIEEEOp;
2834 }
2835
2836 if (IsIEEEOp)
2837 return true;
2838
2839 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2840}
2841
2844 MachineIRBuilder &B) const {
2845 // TODO: Should move some of this into LegalizerHelper.
2846
2847 // TODO: Promote dynamic indexing of s16 to s32
2848
2849 Register Dst = MI.getOperand(0).getReg();
2850 Register Vec = MI.getOperand(1).getReg();
2851
2852 LLT VecTy = MRI.getType(Vec);
2853 LLT EltTy = VecTy.getElementType();
2854 assert(EltTy == MRI.getType(Dst));
2855
2856 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2857 // but we can't go directly to that logic because you can't bitcast a vector
2858 // of pointers to a vector of integers. Therefore, introduce an intermediate
2859 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2860 // drive the legalization forward.
2861 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2862 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2863 LLT IntVecTy = VecTy.changeElementType(IntTy);
2864
2865 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2866 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2867 B.buildIntToPtr(Dst, IntElt);
2868
2869 MI.eraseFromParent();
2870 return true;
2871 }
2872
2873 // FIXME: Artifact combiner probably should have replaced the truncated
2874 // constant before this, so we shouldn't need
2875 // getIConstantVRegValWithLookThrough.
2876 std::optional<ValueAndVReg> MaybeIdxVal =
2877 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2878 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2879 return true;
2880 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2881
2882 if (IdxVal < VecTy.getNumElements()) {
2883 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2884 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2885 } else {
2886 B.buildUndef(Dst);
2887 }
2888
2889 MI.eraseFromParent();
2890 return true;
2891}
2892
2895 MachineIRBuilder &B) const {
2896 // TODO: Should move some of this into LegalizerHelper.
2897
2898 // TODO: Promote dynamic indexing of s16 to s32
2899
2900 Register Dst = MI.getOperand(0).getReg();
2901 Register Vec = MI.getOperand(1).getReg();
2902 Register Ins = MI.getOperand(2).getReg();
2903
2904 LLT VecTy = MRI.getType(Vec);
2905 LLT EltTy = VecTy.getElementType();
2906 assert(EltTy == MRI.getType(Ins));
2907
2908 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2909 // but we can't go directly to that logic because you can't bitcast a vector
2910 // of pointers to a vector of integers. Therefore, make the pointer vector
2911 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2912 // new value, and then inttoptr the result vector back. This will then allow
2913 // the rest of legalization to take over.
2914 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2915 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2916 LLT IntVecTy = VecTy.changeElementType(IntTy);
2917
2918 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2919 auto IntIns = B.buildPtrToInt(IntTy, Ins);
2920 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2921 MI.getOperand(3));
2922 B.buildIntToPtr(Dst, IntVecDest);
2923 MI.eraseFromParent();
2924 return true;
2925 }
2926
2927 // FIXME: Artifact combiner probably should have replaced the truncated
2928 // constant before this, so we shouldn't need
2929 // getIConstantVRegValWithLookThrough.
2930 std::optional<ValueAndVReg> MaybeIdxVal =
2931 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2932 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2933 return true;
2934
2935 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2936
2937 unsigned NumElts = VecTy.getNumElements();
2938 if (IdxVal < NumElts) {
2940 for (unsigned i = 0; i < NumElts; ++i)
2941 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2942 B.buildUnmerge(SrcRegs, Vec);
2943
2944 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2945 B.buildMergeLikeInstr(Dst, SrcRegs);
2946 } else {
2947 B.buildUndef(Dst);
2948 }
2949
2950 MI.eraseFromParent();
2951 return true;
2952}
2953
2956 MachineIRBuilder &B) const {
2957
2958 Register DstReg = MI.getOperand(0).getReg();
2959 Register SrcReg = MI.getOperand(1).getReg();
2960 LLT Ty = MRI.getType(DstReg);
2961 unsigned Flags = MI.getFlags();
2962
2963 Register TrigVal;
2964 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
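 // Illustrative note: the hardware sin/cos intrinsics interpret their operand
 // in units of full turns (1.0 == 2*pi radians), hence the pre-multiplication
 // by 1/(2*pi); on subtargets with a reduced valid input range, fract() first
 // wraps the scaled operand into [0, 1).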
2965 if (ST.hasTrigReducedRange()) {
2966 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2967 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
2968 .addUse(MulVal.getReg(0))
2969 .setMIFlags(Flags)
2970 .getReg(0);
2971 } else
2972 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2973
2974 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2975 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2976 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
2977 .addUse(TrigVal)
2978 .setMIFlags(Flags);
2979 MI.eraseFromParent();
2980 return true;
2981}
2982
2985 const GlobalValue *GV,
2986 int64_t Offset,
2987 unsigned GAFlags) const {
2988 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2989 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2990 // to the following code sequence:
2991 //
2992 // For constant address space:
2993 // s_getpc_b64 s[0:1]
2994 // s_add_u32 s0, s0, $symbol
2995 // s_addc_u32 s1, s1, 0
2996 //
2997 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2998 // a fixup or relocation is emitted to replace $symbol with a literal
2999 // constant, which is a pc-relative offset from the encoding of the $symbol
3000 // operand to the global variable.
3001 //
3002 // For global address space:
3003 // s_getpc_b64 s[0:1]
3004 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
3005 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
3006 //
3007 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3008 // fixups or relocations are emitted to replace $symbol@*@lo and
3009 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
3010 // which is a 64-bit pc-relative offset from the encoding of the $symbol
3011 // operand to the global variable.
3012
3014
3015 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
3016 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
3017
3018 if (ST.has64BitLiterals()) {
3019 assert(GAFlags != SIInstrInfo::MO_NONE);
3020
3022 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
3023 MIB.addGlobalAddress(GV, Offset, GAFlags + 2);
3024 } else {
3026 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
3027
3028 MIB.addGlobalAddress(GV, Offset, GAFlags);
3029 if (GAFlags == SIInstrInfo::MO_NONE)
3030 MIB.addImm(0);
3031 else
3032 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
3033 }
3034
3035 if (!B.getMRI()->getRegClassOrNull(PCReg))
3036 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
3037
3038 if (PtrTy.getSizeInBits() == 32)
3039 B.buildExtract(DstReg, PCReg, 0);
3040 return true;
3041}
3042
3043// Emit a ABS32_LO / ABS32_HI relocation stub.
3045 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
3046 MachineRegisterInfo &MRI) const {
3047 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
3048
3049 if (RequiresHighHalf && ST.has64BitLiterals()) {
3050 if (!MRI.getRegClassOrNull(DstReg))
3051 MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
3052 B.buildInstr(AMDGPU::S_MOV_B64)
3053 .addDef(DstReg)
3054 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS64);
3055 return;
3056 }
3057
3058 LLT S32 = LLT::scalar(32);
3059
3060 // Use the destination directly if and only if we only store the lower
3061 // address part and no register class has been set on it.
3062 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
3063 ? DstReg
3064 : MRI.createGenericVirtualRegister(S32);
3065
3066 if (!MRI.getRegClassOrNull(AddrLo))
3067 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
3068
3069 // Write the lower half.
3070 B.buildInstr(AMDGPU::S_MOV_B32)
3071 .addDef(AddrLo)
3072 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
3073
3074 // If required, write the upper half as well.
3075 if (RequiresHighHalf) {
3076 assert(PtrTy.getSizeInBits() == 64 &&
3077 "Must provide a 64-bit pointer type!");
3078
3079 Register AddrHi = MRI.createGenericVirtualRegister(S32);
3080 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
3081
3082 B.buildInstr(AMDGPU::S_MOV_B32)
3083 .addDef(AddrHi)
3084 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
3085
3086 // Use the destination directly, if and only if we don't have a register
3087 // class being set.
3088 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
3089 ? DstReg
3090 : MRI.createGenericVirtualRegister(LLT::scalar(64));
3091
3092 if (!MRI.getRegClassOrNull(AddrDst))
3093 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
3094
3095 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
3096
3097 // If we created a new register for the destination, cast the result into
3098 // the final output.
3099 if (AddrDst != DstReg)
3100 B.buildCast(DstReg, AddrDst);
3101 } else if (AddrLo != DstReg) {
3102 // If we created a new register for the destination, cast the result into
3103 // the final output.
3104 B.buildCast(DstReg, AddrLo);
3105 }
3106}
3107
3110 MachineIRBuilder &B) const {
3111 Register DstReg = MI.getOperand(0).getReg();
3112 LLT Ty = MRI.getType(DstReg);
3113 unsigned AS = Ty.getAddressSpace();
3114
3115 const GlobalValue *GV = MI.getOperand(1).getGlobal();
3116 MachineFunction &MF = B.getMF();
3118
3120 if (!MFI->isModuleEntryFunction() &&
3121 GV->getName() != "llvm.amdgcn.module.lds" &&
3123 const Function &Fn = MF.getFunction();
3125 Fn, "local memory global used by non-kernel function",
3126 MI.getDebugLoc(), DS_Warning));
3127
3128 // We currently don't have a way to correctly allocate LDS objects that
3129 // aren't directly associated with a kernel. We do force inlining of
3130 // functions that use local objects. However, if these dead functions are
3131 // not eliminated, we don't want a compile time error. Just emit a warning
3132 // and a trap, since there should be no callable path here.
3133 B.buildTrap();
3134 B.buildUndef(DstReg);
3135 MI.eraseFromParent();
3136 return true;
3137 }
3138
3139 // TODO: We could emit code to handle the initialization somewhere.
3140 // We ignore the initializer for now and legalize it to allow selection.
3141 // The initializer will be diagnosed as an error during assembly emission anyway.
3142 const SITargetLowering *TLI = ST.getTargetLowering();
3143 if (!TLI->shouldUseLDSConstAddress(GV)) {
3144 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
3145 return true; // Leave in place;
3146 }
3147
3148 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3149 Type *Ty = GV->getValueType();
3150 // HIP uses an unsized array `extern __shared__ T s[]` or similar
3151 // zero-sized type in other languages to declare the dynamic shared
3152 // memory whose size is not known at compile time. They will be
3153 // allocated by the runtime and placed directly after the statically
3154 // allocated ones. They all share the same offset.
3155 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
3156 // Adjust alignment for that dynamic shared memory array.
3158 LLT S32 = LLT::scalar(32);
3159 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
3160 B.buildIntToPtr(DstReg, Sz);
3161 MI.eraseFromParent();
3162 return true;
3163 }
3164 }
3165
3166 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
3167 *cast<GlobalVariable>(GV)));
3168 MI.eraseFromParent();
3169 return true;
3170 }
3171
3172 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3173 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3174 MI.eraseFromParent();
3175 return true;
3176 }
3177
3178 const SITargetLowering *TLI = ST.getTargetLowering();
3179
3180 if (TLI->shouldEmitFixup(GV)) {
3181 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3182 MI.eraseFromParent();
3183 return true;
3184 }
3185
3186 if (TLI->shouldEmitPCReloc(GV)) {
3187 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3188 MI.eraseFromParent();
3189 return true;
3190 }
3191
3193 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3194
3195 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3200 LoadTy, Align(8));
3201
3202 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3203
3204 if (Ty.getSizeInBits() == 32) {
3205 // Truncate if this is a 32-bit constant address.
3206 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3207 B.buildExtract(DstReg, Load, 0);
3208 } else
3209 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3210
3211 MI.eraseFromParent();
3212 return true;
3213}
3214
3216 if (Ty.isVector())
3217 return Ty.changeElementCount(
3218 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
3219 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
3220}
3221
3223 MachineInstr &MI) const {
3224 MachineIRBuilder &B = Helper.MIRBuilder;
3225 MachineRegisterInfo &MRI = *B.getMRI();
3226 GISelChangeObserver &Observer = Helper.Observer;
3227
3228 Register PtrReg = MI.getOperand(1).getReg();
3229 LLT PtrTy = MRI.getType(PtrReg);
3230 unsigned AddrSpace = PtrTy.getAddressSpace();
3231
3232 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3234 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3235 Observer.changingInstr(MI);
3236 MI.getOperand(1).setReg(Cast.getReg(0));
3237 Observer.changedInstr(MI);
3238 return true;
3239 }
3240
3241 if (MI.getOpcode() != AMDGPU::G_LOAD)
3242 return false;
3243
3244 Register ValReg = MI.getOperand(0).getReg();
3245 LLT ValTy = MRI.getType(ValReg);
3246
3247 if (hasBufferRsrcWorkaround(ValTy)) {
3248 Observer.changingInstr(MI);
3250 Observer.changedInstr(MI);
3251 return true;
3252 }
3253
3254 MachineMemOperand *MMO = *MI.memoperands_begin();
3255 const unsigned ValSize = ValTy.getSizeInBits();
3256 const LLT MemTy = MMO->getMemoryType();
3257 const Align MemAlign = MMO->getAlign();
3258 const unsigned MemSize = MemTy.getSizeInBits();
3259 const uint64_t AlignInBits = 8 * MemAlign.value();
3260
3261 // Widen non-power-of-2 loads to the alignment if needed
3262 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3263 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
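 // Illustrative example: a 96-bit <3 x s32> load gets WideMemSize = 128 and is
 // emitted as a 128-bit load, with the extra element dropped below via
 // G_EXTRACT or by trimming trailing vector elements.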
3264
3265 // This was already the correct extending load result type, so just adjust
3266 // the memory type.
3267 if (WideMemSize == ValSize) {
3268 MachineFunction &MF = B.getMF();
3269
3270 MachineMemOperand *WideMMO =
3271 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3272 Observer.changingInstr(MI);
3273 MI.setMemRefs(MF, {WideMMO});
3274 Observer.changedInstr(MI);
3275 return true;
3276 }
3277
3278 // Don't bother handling edge case that should probably never be produced.
3279 if (ValSize > WideMemSize)
3280 return false;
3281
3282 LLT WideTy = widenToNextPowerOf2(ValTy);
3283
3284 Register WideLoad;
3285 if (!WideTy.isVector()) {
3286 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3287 B.buildTrunc(ValReg, WideLoad).getReg(0);
3288 } else {
3289 // Extract the subvector.
3290
3291 if (isRegisterType(ST, ValTy)) {
3292 // If this a case where G_EXTRACT is legal, use it.
3293 // (e.g. <3 x s32> -> <4 x s32>)
3294 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3295 B.buildExtract(ValReg, WideLoad, 0);
3296 } else {
3297 // For cases where the widened type isn't a nice register value, unmerge
3298 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3299 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3300 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3301 }
3302 }
3303
3304 MI.eraseFromParent();
3305 return true;
3306 }
3307
3308 return false;
3309}
3310
3312 MachineInstr &MI) const {
3313 MachineIRBuilder &B = Helper.MIRBuilder;
3314 MachineRegisterInfo &MRI = *B.getMRI();
3315 GISelChangeObserver &Observer = Helper.Observer;
3316
3317 Register DataReg = MI.getOperand(0).getReg();
3318 LLT DataTy = MRI.getType(DataReg);
3319
3320 if (hasBufferRsrcWorkaround(DataTy)) {
3321 Observer.changingInstr(MI);
3323 Observer.changedInstr(MI);
3324 return true;
3325 }
3326 return false;
3327}
3328
3331 MachineIRBuilder &B) const {
3332 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3333 assert(Ty.isScalar());
3334
3335 MachineFunction &MF = B.getMF();
3337
3338 // TODO: Always legal with future ftz flag.
3339 // FIXME: Do we need just output?
3340 if (Ty == LLT::float32() &&
3342 return true;
3343 if (Ty == LLT::float16() &&
3345 return true;
3346
3347 MachineIRBuilder HelperBuilder(MI);
3348 GISelObserverWrapper DummyObserver;
3349 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3350 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3351}
3352
3355 Register DstReg = MI.getOperand(0).getReg();
3356 Register PtrReg = MI.getOperand(1).getReg();
3357 Register CmpVal = MI.getOperand(2).getReg();
3358 Register NewVal = MI.getOperand(3).getReg();
3359
3360 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3361 "this should not have been custom lowered");
3362
3363 LLT ValTy = MRI.getType(CmpVal);
3364 LLT VecTy = LLT::fixed_vector(2, ValTy);
3365
3366 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
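 // Illustrative note: this is the input marshalling mentioned where the rule
 // was declared; the new value and the compare value appear to be packed into
 // adjacent elements (new value in element 0) so the target cmpswap
 // instruction sees them as one wide data operand.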
3367
3368 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3369 .addDef(DstReg)
3370 .addUse(PtrReg)
3371 .addUse(PackedVal)
3372 .setMemRefs(MI.memoperands());
3373
3374 MI.eraseFromParent();
3375 return true;
3376}
3377
3378/// Return true if it's known that \p Src can never be an f32 denormal value.
3380 Register Src) {
3381 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3382 switch (DefMI->getOpcode()) {
3383 case TargetOpcode::G_INTRINSIC: {
3385 case Intrinsic::amdgcn_frexp_mant:
3386 return true;
3387 default:
3388 break;
3389 }
3390
3391 break;
3392 }
3393 case TargetOpcode::G_FFREXP: {
3394 if (DefMI->getOperand(0).getReg() == Src)
3395 return true;
3396 break;
3397 }
3398 case TargetOpcode::G_FPEXT: {
3399 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3400 }
3401 default:
3402 return false;
3403 }
3404
3405 return false;
3406}
3407
3408static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3409 return Flags & MachineInstr::FmAfn;
3410}
3411
3413 unsigned Flags) {
3414 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3417}
3418
3419std::pair<Register, Register>
3421 unsigned Flags) const {
3422 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3423 return {};
3424
3425 const LLT F32 = LLT::scalar(32);
3426 auto SmallestNormal = B.buildFConstant(
3428 auto IsLtSmallestNormal =
3429 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3430
3431 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3432 auto One = B.buildFConstant(F32, 1.0);
3433 auto ScaleFactor =
3434 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3435 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3436
3437 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3438}
3439
3441 MachineIRBuilder &B) const {
3442 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3443 // If we have to handle denormals, scale up the input and adjust the result.
3444
3445 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3446 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
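 // Worked example for illustration: for a denormal input x = 0x1.0p-130,
 // scaled = x * 0x1.0p+32 = 0x1.0p-98 is normal, amdgpu_log2 yields -98.0, and
 // subtracting 32.0 recovers log2(x) = -130.0.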
3447
3448 Register Dst = MI.getOperand(0).getReg();
3449 Register Src = MI.getOperand(1).getReg();
3450 LLT Ty = B.getMRI()->getType(Dst);
3451 unsigned Flags = MI.getFlags();
3452
3453 if (Ty == LLT::scalar(16)) {
3454 const LLT F32 = LLT::scalar(32);
3455 // Nothing in half is a denormal when promoted to f32.
3456 auto Ext = B.buildFPExt(F32, Src, Flags);
3457 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3458 .addUse(Ext.getReg(0))
3459 .setMIFlags(Flags);
3460 B.buildFPTrunc(Dst, Log2, Flags);
3461 MI.eraseFromParent();
3462 return true;
3463 }
3464
3465 assert(Ty == LLT::scalar(32));
3466
3467 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3468 if (!ScaledInput) {
3469 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3470 .addUse(Src)
3471 .setMIFlags(Flags);
3472 MI.eraseFromParent();
3473 return true;
3474 }
3475
3476 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3477 .addUse(ScaledInput)
3478 .setMIFlags(Flags);
3479
3480 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3481 auto Zero = B.buildFConstant(Ty, 0.0);
3482 auto ResultOffset =
3483 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3484 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3485
3486 MI.eraseFromParent();
3487 return true;
3488}
3489
3491 Register Z, unsigned Flags) {
3492 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3493 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3494}
3495
3497 MachineIRBuilder &B) const {
3498 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3499 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3500
3501 MachineRegisterInfo &MRI = *B.getMRI();
3502 Register Dst = MI.getOperand(0).getReg();
3503 Register X = MI.getOperand(1).getReg();
3504 unsigned Flags = MI.getFlags();
3505 const LLT Ty = MRI.getType(X);
3506 MachineFunction &MF = B.getMF();
3507
3508 const LLT F32 = LLT::scalar(32);
3509 const LLT F16 = LLT::scalar(16);
3510
3511 const AMDGPUTargetMachine &TM =
3512 static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3513
3514 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn)) {
3515 if (Ty == F16 && !ST.has16BitInsts()) {
3516 Register LogVal = MRI.createGenericVirtualRegister(F32);
3517 auto PromoteSrc = B.buildFPExt(F32, X);
3518 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3519 B.buildFPTrunc(Dst, LogVal);
3520 } else {
3521 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3522 }
3523
3524 MI.eraseFromParent();
3525 return true;
3526 }
3527
3528 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3529 if (ScaledInput)
3530 X = ScaledInput;
3531
3532 auto Y =
3533 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3534
3535 Register R;
3536 if (ST.hasFastFMAF32()) {
3537 // c + cc is ln(2)/ln(10) to more than 49 bits
3538 const float c_log10 = 0x1.344134p-2f;
3539 const float cc_log10 = 0x1.09f79ep-26f;
3540
3541 // c + cc is ln(2) to more than 49 bits
3542 const float c_log = 0x1.62e42ep-1f;
3543 const float cc_log = 0x1.efa39ep-25f;
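 // Illustrative note on the sequence below: R = y*c is the leading term,
 // fma(y, c, -R) recovers the rounding error of that product, fma(y, cc, ...)
 // adds the y*cc tail, and the final add yields y*(c + cc), i.e. y*ln(2) or
 // y*ln(2)/ln(10), to nearly twice single precision.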
3544
3545 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3546 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3547
3548 R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
3549 auto NegR = B.buildFNeg(Ty, R, Flags);
3550 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
3551 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
3552 R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3553 } else {
3554 // ch+ct is ln(2)/ln(10) to more than 36 bits
3555 const float ch_log10 = 0x1.344000p-2f;
3556 const float ct_log10 = 0x1.3509f6p-18f;
3557
3558 // ch + ct is ln(2) to more than 36 bits
3559 const float ch_log = 0x1.62e000p-1f;
3560 const float ct_log = 0x1.0bfbe8p-15f;
3561
3562 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3563 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3564
3565 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3566 auto YH = B.buildAnd(Ty, Y, MaskConst);
3567 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3568 auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
3569
3570 Register Mad0 =
3571 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3572 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
3573 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
3574 }
3575
3576 const bool IsFiniteOnly =
3577 (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3578 (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3579
3580 if (!IsFiniteOnly) {
3581 // Expand isfinite(x) => fabs(x) < inf
3582 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3583 auto Fabs = B.buildFAbs(Ty, Y);
3584 auto IsFinite =
3585 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3586 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3587 }
3588
3589 if (ScaledInput) {
3590 auto Zero = B.buildFConstant(Ty, 0.0);
3591 auto ShiftK =
3592 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3593 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3594 B.buildFSub(Dst, R, Shift, Flags);
3595 } else {
3596 B.buildCopy(Dst, R);
3597 }
3598
3599 MI.eraseFromParent();
3600 return true;
3601}
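
A standalone host-C++ illustration (not LLVM code) of the split-constant FMA chain above: ln(2) is carried as a high part c plus a correction cc, and the FMA recovers the rounding error of y*c, so the pair (R, Lo) approximates y*ln(2) to well beyond single precision:

#include <cmath>
#include <cstdio>

int main() {
  const float c  = 0x1.62e42ep-1f;   // high bits of ln(2)
  const float cc = 0x1.efa39ep-25f;  // low-order correction
  float y = 10.0f;                   // stand-in for the v_log_f32 result
  float R = y * c;
  float E = std::fmaf(y, c, -R);     // exact rounding error of y*c
  float Lo = std::fmaf(y, cc, E);    // fold in the correction term
  printf("%.17g\n%.17g\n", (double)R + (double)Lo, 10.0 * 0.69314718055994531);
  return 0;
}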
3602
3603 bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3604 Register Src, bool IsLog10,
3605 unsigned Flags) const {
3606 const double Log2BaseInverted =
3607 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
3608
3609 LLT Ty = B.getMRI()->getType(Dst);
3610
3611 if (Ty == LLT::scalar(32)) {
3612 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3613 if (ScaledInput) {
3614 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3615 .addUse(Src)
3616 .setMIFlags(Flags);
3617 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3618 auto Zero = B.buildFConstant(Ty, 0.0);
3619 auto ResultOffset =
3620 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3621 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3622
3623 if (ST.hasFastFMAF32())
3624 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3625 else {
3626 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3627 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3628 }
3629
3630 return true;
3631 }
3632 }
3633
3634 auto Log2Operand = Ty == LLT::scalar(16)
3635 ? B.buildFLog2(Ty, Src, Flags)
3636 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3637 .addUse(Src)
3638 .setMIFlags(Flags);
3639 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3640 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3641 return true;
3642}
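
The unsafe path reduces both G_FLOG and G_FLOG10 to the hardware log2 followed by one multiply with Log2BaseInverted. A quick host-side check of the base-conversion identity (constants assumed to match numbers::ln2 and numbers::ln10):

#include <cmath>
#include <cstdio>

int main() {
  const double Ln2 = 0.6931471805599453;   // numbers::ln2
  const double Ln10 = 2.302585092994046;   // numbers::ln10
  double x = 1000.0;
  printf("%f vs %f\n", std::log2(x) * (Ln2 / Ln10), std::log10(x));
  printf("%f vs %f\n", std::log2(x) * Ln2, std::log(x));
  return 0;
}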
3643
3644 bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3645 MachineIRBuilder &B) const {
3646 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3647 // If we have to handle denormals, scale up the input and adjust the result.
3648
3649 Register Dst = MI.getOperand(0).getReg();
3650 Register Src = MI.getOperand(1).getReg();
3651 unsigned Flags = MI.getFlags();
3652 LLT Ty = B.getMRI()->getType(Dst);
3653 const LLT F16 = LLT::scalar(16);
3654 const LLT F32 = LLT::scalar(32);
3655
3656 if (Ty == F16) {
3657 // Nothing in half is a denormal when promoted to f32.
3658 auto Ext = B.buildFPExt(F32, Src, Flags);
3659 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3660 .addUse(Ext.getReg(0))
3661 .setMIFlags(Flags);
3662 B.buildFPTrunc(Dst, Log2, Flags);
3663 MI.eraseFromParent();
3664 return true;
3665 }
3666
3667 assert(Ty == F32);
3668
3669 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3670 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3671 .addUse(Src)
3672 .setMIFlags(Flags);
3673 MI.eraseFromParent();
3674 return true;
3675 }
3676
3677 // bool needs_scaling = x < -0x1.f80000p+6f;
3678 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3679
3680 // -nextafter(128.0, -1)
3681 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3682 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3683 RangeCheckConst, Flags);
3684
3685 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3686 auto Zero = B.buildFConstant(Ty, 0.0);
3687 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3688 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3689
3690 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3691 .addUse(AddInput.getReg(0))
3692 .setMIFlags(Flags);
3693
3694 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3695 auto One = B.buildFConstant(Ty, 1.0);
3696 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3697 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3698 MI.eraseFromParent();
3699 return true;
3700}
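
The range check fires for inputs below -0x1.f8p+6 (-126.0), where the result would otherwise be flushed or lose precision; adding 64 before the intrinsic and multiplying by 2^-64 afterwards is exact. A standalone host-C++ sketch of the rewrite:

#include <cmath>
#include <cstdio>

int main() {
  float x = -130.0f;                                    // exp2(x) is subnormal
  float Rewritten = std::exp2f(x + 64.0f) * 0x1.0p-64f; // scale up, then undo
  printf("%a vs %a\n", Rewritten, std::exp2f(x));
  return 0;
}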
3701
3702 bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3703 Register X, unsigned Flags) const {
3704 LLT Ty = B.getMRI()->getType(Dst);
3705 LLT F32 = LLT::scalar(32);
3706
3707 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3708 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3709 auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
3710
3711 if (Ty == F32) {
3712 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3713 .addUse(Mul.getReg(0))
3714 .setMIFlags(Flags);
3715 } else {
3716 B.buildFExp2(Dst, Mul.getReg(0), Flags);
3717 }
3718
3719 return true;
3720 }
3721
3722 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3723 auto NeedsScaling =
3724 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3725 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3726 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3727 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3728
3729 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3730 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3731
3732 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3733 .addUse(ExpInput.getReg(0))
3734 .setMIFlags(Flags);
3735
3736 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3737 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3738 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3739 return true;
3740}
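
The result scale constant 0x1.969d48p-93f appears to be e^-64 rounded to f32, which undoes the +64 added to the input before the exp2 (e^x = e^(x+64) * e^-64). A host-side check of that reading:

#include <cmath>
#include <cstdio>

int main() {
  printf("%a vs %a\n", 0x1.969d48p-93, std::exp(-64.0));
  float x = -70.0f;
  float R = std::expf(x + 64.0f) * 0x1.969d48p-93f;
  printf("%g vs %g\n", R, std::expf(x));
  return 0;
}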
3741
3742 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3743 MachineIRBuilder &B) const {
3744 Register Dst = MI.getOperand(0).getReg();
3745 Register X = MI.getOperand(1).getReg();
3746 const unsigned Flags = MI.getFlags();
3747 MachineFunction &MF = B.getMF();
3748 MachineRegisterInfo &MRI = *B.getMRI();
3749 LLT Ty = MRI.getType(Dst);
3750 const LLT F16 = LLT::scalar(16);
3751 const LLT F32 = LLT::scalar(32);
3752 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3753
3754 if (Ty == F16) {
3755 // v_exp_f16 (fmul x, log2e)
3756 if (allowApproxFunc(MF, Flags)) {
3757 // TODO: Does this really require fast?
3758 legalizeFExpUnsafe(B, Dst, X, Flags);
3759 MI.eraseFromParent();
3760 return true;
3761 }
3762
3763 // exp(f16 x) ->
3764 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3765
3766 // Nothing in half is a denormal when promoted to f32.
3767 auto Ext = B.buildFPExt(F32, X, Flags);
3768 Register Lowered = MRI.createGenericVirtualRegister(F32);
3769 legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
3770 B.buildFPTrunc(Dst, Lowered, Flags);
3771 MI.eraseFromParent();
3772 return true;
3773 }
3774
3775 assert(Ty == F32);
3776
3777 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3778 // library behavior. Also, is known-not-daz source sufficient?
3779 if (allowApproxFunc(MF, Flags)) {
3780 legalizeFExpUnsafe(B, Dst, X, Flags);
3781 MI.eraseFromParent();
3782 return true;
3783 }
3784
3785 // Algorithm:
3786 //
3787 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3788 //
3789 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3790 // n = 64*m + j, 0 <= j < 64
3791 //
3792 // e^x = 2^((64*m + j + f)/64)
3793 // = (2^m) * (2^(j/64)) * 2^(f/64)
3794 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3795 //
3796 // f = x*(64/ln(2)) - n
3797 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3798 //
3799 // e^x = (2^m) * (2^(j/64)) * e^r
3800 //
3801 // (2^(j/64)) is precomputed
3802 //
3803 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3804 // e^r = 1 + q
3805 //
3806 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3807 //
3808 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3809 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3810 Register PH, PL;
3811
3812 if (ST.hasFastFMAF32()) {
3813 const float c_exp = numbers::log2ef;
3814 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3815 const float c_exp10 = 0x1.a934f0p+1f;
3816 const float cc_exp10 = 0x1.2f346ep-24f;
3817
3818 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3819 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3820 auto NegPH = B.buildFNeg(Ty, PH, Flags);
3821 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3822
3823 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3824 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3825 } else {
3826 const float ch_exp = 0x1.714000p+0f;
3827 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3828
3829 const float ch_exp10 = 0x1.a92000p+1f;
3830 const float cl_exp10 = 0x1.4f0978p-11f;
3831
3832 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3833 auto XH = B.buildAnd(Ty, X, MaskConst);
3834 auto XL = B.buildFSub(Ty, X, XH, Flags);
3835
3836 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3837 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3838
3839 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3840 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3841
3842 Register Mad0 =
3843 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3844 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3845 }
3846
3847 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
3848
3849 // It is unsafe to contract this fsub into the PH multiply.
3850 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3851 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3852 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3853
3854 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3855 .addUse(A.getReg(0))
3856 .setMIFlags(Flags);
3857 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
3858
3859 auto UnderflowCheckConst =
3860 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3861 auto Zero = B.buildFConstant(Ty, 0.0);
3862 auto Underflow =
3863 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3864
3865 R = B.buildSelect(Ty, Underflow, Zero, R);
3866
3867 const auto &Options = MF.getTarget().Options;
3868
3869 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3870 auto OverflowCheckConst =
3871 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3872
3873 auto Overflow =
3874 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3875 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3876 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3877 }
3878
3879 B.buildCopy(Dst, R);
3880 MI.eraseFromParent();
3881 return true;
3882}
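
A standalone host-C++ sketch of the reduction performed above: split x*log2(e) into a nearest integer E and a residual F, evaluate 2^F, and scale by 2^E with ldexp. The real lowering also carries a low-order product term PL for extra precision, which this sketch omits:

#include <cmath>
#include <cstdio>

int main() {
  float x = 3.7f;
  float PH = x * 0x1.715476p+0f;    // x * log2(e), high product only
  float E = std::nearbyintf(PH);    // roundeven under the default FP mode
  float F = PH - E;                 // |F| <= 0.5
  float R = std::ldexpf(std::exp2f(F), (int)E);
  printf("%f vs %f\n", R, std::expf(x));
  return 0;
}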
3883
3884 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3885 MachineIRBuilder &B) const {
3886 Register Dst = MI.getOperand(0).getReg();
3887 Register Src0 = MI.getOperand(1).getReg();
3888 Register Src1 = MI.getOperand(2).getReg();
3889 unsigned Flags = MI.getFlags();
3890 LLT Ty = B.getMRI()->getType(Dst);
3891 const LLT F16 = LLT::float16();
3892 const LLT F32 = LLT::float32();
3893
3894 if (Ty == F32) {
3895 auto Log = B.buildFLog2(F32, Src0, Flags);
3896 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3897 .addUse(Log.getReg(0))
3898 .addUse(Src1)
3899 .setMIFlags(Flags);
3900 B.buildFExp2(Dst, Mul, Flags);
3901 } else if (Ty == F16) {
3902 // There's no f16 fmul_legacy, so we need to convert for it.
3903 auto Log = B.buildFLog2(F16, Src0, Flags);
3904 auto Ext0 = B.buildFPExt(F32, Log, Flags);
3905 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
3906 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3907 .addUse(Ext0.getReg(0))
3908 .addUse(Ext1.getReg(0))
3909 .setMIFlags(Flags);
3910 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
3911 } else
3912 return false;
3913
3914 MI.eraseFromParent();
3915 return true;
3916}
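
The pow lowering above is the usual exp2(y * log2(x)) identity, with fmul_legacy supplying the legacy zero/infinity semantics for the multiply. A host-side check of just the identity, valid for positive finite x:

#include <cmath>
#include <cstdio>

int main() {
  float x = 2.5f, y = 3.0f;
  printf("%f vs %f\n", std::exp2f(y * std::log2f(x)), std::pow(x, y));
  return 0;
}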
3917
3918// Find a source register, ignoring any possible source modifiers.
3919 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3920 Register ModSrc = OrigSrc;
3921 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
3922 ModSrc = SrcFNeg->getOperand(1).getReg();
3923 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3924 ModSrc = SrcFAbs->getOperand(1).getReg();
3925 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3926 ModSrc = SrcFAbs->getOperand(1).getReg();
3927 return ModSrc;
3928}
3929
3930 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3931 MachineRegisterInfo &MRI,
3932 MachineIRBuilder &B) const {
3933
3934 const LLT S1 = LLT::scalar(1);
3935 const LLT F64 = LLT::float64();
3936 Register Dst = MI.getOperand(0).getReg();
3937 Register OrigSrc = MI.getOperand(1).getReg();
3938 unsigned Flags = MI.getFlags();
3939 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3940 "this should not have been custom lowered");
3941
3942 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3943 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3944 // efficient way to implement it is using V_FRACT_F64. The workaround for the
3945 // V_FRACT bug is:
3946 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3947 //
3948 // Convert floor(x) to (x - fract(x))
3949
3950 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
3951 .addUse(OrigSrc)
3952 .setMIFlags(Flags);
3953
3954 // Give source modifier matching some assistance before obscuring a foldable
3955 // pattern.
3956
3957 // TODO: We can avoid the neg on the fract? The input sign to fract
3958 // shouldn't matter?
3959 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3960
3961 auto Const =
3962 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
3963
3964 Register Min = MRI.createGenericVirtualRegister(F64);
3965
3966 // We don't need to concern ourselves with the snan handling difference, so
3967 // use the one which will directly select.
3968 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3969 if (MFI->getMode().IEEE)
3970 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3971 else
3972 B.buildFMinNum(Min, Fract, Const, Flags);
3973
3974 Register CorrectedFract = Min;
3975 if (!MI.getFlag(MachineInstr::FmNoNans)) {
3976 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
3977 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
3978 }
3979
3980 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
3981 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3982
3983 MI.eraseFromParent();
3984 return true;
3985}
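
The clamp constant 0x3fefffffffffffff is the largest double below 1.0 (0x1.fffffffffffffp-1); clamping fract(x) to it is the documented V_FRACT workaround, and floor is then x minus the corrected fract. A standalone host-C++ sketch:

#include <cmath>
#include <cstdio>

int main() {
  double x = -2.25;
  double Fract = x - std::floor(x);                 // what V_FRACT_F64 computes
  double Clamped = std::fmin(Fract, 0x1.fffffffffffffp-1);
  printf("%f vs %f\n", x - Clamped, std::floor(x)); // floor(x) = x - fract(x)
  return 0;
}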
3986
3987// Turn an illegal packed v2s16 build vector into bit operations.
3988// TODO: This should probably be a bitcast action in LegalizerHelper.
3989 bool AMDGPULegalizerInfo::legalizeBuildVector(
3990 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3991 Register Dst = MI.getOperand(0).getReg();
3992 const LLT S32 = LLT::scalar(32);
3993 const LLT S16 = LLT::scalar(16);
3994 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3995
3996 Register Src0 = MI.getOperand(1).getReg();
3997 Register Src1 = MI.getOperand(2).getReg();
3998
3999 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
4000 assert(MRI.getType(Src0) == S32);
4001 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
4002 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
4003 }
4004
4005 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
4006 B.buildBitcast(Dst, Merge);
4007
4008 MI.eraseFromParent();
4009 return true;
4010}
4011
4012// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
4013//
4014// Source and accumulation registers must all be 32-bits.
4015//
4016// TODO: When the multiply is uniform, we should produce a code sequence
4017// that is better suited to instruction selection on the SALU. Instead of
4018// the outer loop going over parts of the result, the outer loop should go
4019// over parts of one of the factors. This should result in instruction
4020// selection that makes full use of S_ADDC_U32 instructions.
4021 void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
4022 MutableArrayRef<Register> Accum,
4023 ArrayRef<Register> Src0,
4024 ArrayRef<Register> Src1,
4025 bool UsePartialMad64_32,
4026 bool SeparateOddAlignedProducts) const {
4027 // Use (possibly empty) vectors of S1 registers to represent the set of
4028 // carries from one pair of positions to the next.
4029 using Carry = SmallVector<Register, 2>;
4030
4031 MachineIRBuilder &B = Helper.MIRBuilder;
4032 GISelValueTracking &VT = *Helper.getValueTracking();
4033
4034 const LLT S1 = LLT::scalar(1);
4035 const LLT S32 = LLT::scalar(32);
4036 const LLT S64 = LLT::scalar(64);
4037
4038 Register Zero32;
4039 Register Zero64;
4040
4041 auto getZero32 = [&]() -> Register {
4042 if (!Zero32)
4043 Zero32 = B.buildConstant(S32, 0).getReg(0);
4044 return Zero32;
4045 };
4046 auto getZero64 = [&]() -> Register {
4047 if (!Zero64)
4048 Zero64 = B.buildConstant(S64, 0).getReg(0);
4049 return Zero64;
4050 };
4051
4052 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
4053 for (unsigned i = 0; i < Src0.size(); ++i) {
4054 Src0KnownZeros.push_back(VT.getKnownBits(Src0[i]).isZero());
4055 Src1KnownZeros.push_back(VT.getKnownBits(Src1[i]).isZero());
4056 }
4057
4058 // Merge the given carries into the 32-bit LocalAccum, which is modified
4059 // in-place.
4060 //
4061 // Returns the carry-out, which is a single S1 register or null.
4062 auto mergeCarry =
4063 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
4064 if (CarryIn.empty())
4065 return Register();
4066
4067 bool HaveCarryOut = true;
4068 Register CarryAccum;
4069 if (CarryIn.size() == 1) {
4070 if (!LocalAccum) {
4071 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4072 return Register();
4073 }
4074
4075 CarryAccum = getZero32();
4076 } else {
4077 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4078 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
4079 CarryAccum =
4080 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
4081 .getReg(0);
4082 }
4083
4084 if (!LocalAccum) {
4085 LocalAccum = getZero32();
4086 HaveCarryOut = false;
4087 }
4088 }
4089
4090 auto Add =
4091 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
4092 LocalAccum = Add.getReg(0);
4093 return HaveCarryOut ? Add.getReg(1) : Register();
4094 };
4095
4096 // Build a multiply-add chain to compute
4097 //
4098 // LocalAccum + (partial products at DstIndex)
4099 // + (opportunistic subset of CarryIn)
4100 //
4101 // LocalAccum is an array of one or two 32-bit registers that are updated
4102 // in-place. The incoming registers may be null.
4103 //
4104 // In some edge cases, carry-ins can be consumed "for free". In that case,
4105 // the consumed carry bits are removed from CarryIn in-place.
4106 auto buildMadChain =
4107 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
4108 -> Carry {
4109 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
4110 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
4111
4112 Carry CarryOut;
4113 unsigned j0 = 0;
4114
4115 // Use plain 32-bit multiplication for the most significant part of the
4116 // result by default.
4117 if (LocalAccum.size() == 1 &&
4118 (!UsePartialMad64_32 || !CarryIn.empty())) {
4119 do {
4120 // Skip multiplication if one of the operands is 0
4121 unsigned j1 = DstIndex - j0;
4122 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4123 ++j0;
4124 continue;
4125 }
4126 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
4127 if (!LocalAccum[0] || VT.getKnownBits(LocalAccum[0]).isZero()) {
4128 LocalAccum[0] = Mul.getReg(0);
4129 } else {
4130 if (CarryIn.empty()) {
4131 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
4132 } else {
4133 LocalAccum[0] =
4134 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
4135 .getReg(0);
4136 CarryIn.pop_back();
4137 }
4138 }
4139 ++j0;
4140 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4141 }
4142
4143 // Build full 64-bit multiplies.
4144 if (j0 <= DstIndex) {
4145 bool HaveSmallAccum = false;
4146 Register Tmp;
4147
4148 if (LocalAccum[0]) {
4149 if (LocalAccum.size() == 1) {
4150 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
4151 HaveSmallAccum = true;
4152 } else if (LocalAccum[1]) {
4153 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
4154 HaveSmallAccum = false;
4155 } else {
4156 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
4157 HaveSmallAccum = true;
4158 }
4159 } else {
4160 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4161 Tmp = getZero64();
4162 HaveSmallAccum = true;
4163 }
4164
4165 do {
4166 unsigned j1 = DstIndex - j0;
4167 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4168 ++j0;
4169 continue;
4170 }
4171 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
4172 {Src0[j0], Src1[j1], Tmp});
4173 Tmp = Mad.getReg(0);
4174 if (!HaveSmallAccum)
4175 CarryOut.push_back(Mad.getReg(1));
4176 HaveSmallAccum = false;
4177
4178 ++j0;
4179 } while (j0 <= DstIndex);
4180
4181 auto Unmerge = B.buildUnmerge(S32, Tmp);
4182 LocalAccum[0] = Unmerge.getReg(0);
4183 if (LocalAccum.size() > 1)
4184 LocalAccum[1] = Unmerge.getReg(1);
4185 }
4186
4187 return CarryOut;
4188 };
4189
4190 // Outer multiply loop, iterating over destination parts from least
4191 // significant to most significant parts.
4192 //
4193 // The columns of the following diagram correspond to the destination parts
4194 // affected by one iteration of the outer loop (ignoring boundary
4195 // conditions).
4196 //
4197 // Dest index relative to 2 * i: 1 0 -1
4198 // ------
4199 // Carries from previous iteration: e o
4200 // Even-aligned partial product sum: E E .
4201 // Odd-aligned partial product sum: O O
4202 //
4203 // 'o' is OddCarry, 'e' is EvenCarry.
4204 // EE and OO are computed from partial products via buildMadChain and use
4205 // accumulation where possible and appropriate.
4206 //
4207 Register SeparateOddCarry;
4208 Carry EvenCarry;
4209 Carry OddCarry;
4210
4211 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4212 Carry OddCarryIn = std::move(OddCarry);
4213 Carry EvenCarryIn = std::move(EvenCarry);
4214 OddCarry.clear();
4215 EvenCarry.clear();
4216
4217 // Partial products at offset 2 * i.
4218 if (2 * i < Accum.size()) {
4219 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4220 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4221 }
4222
4223 // Partial products at offset 2 * i - 1.
4224 if (i > 0) {
4225 if (!SeparateOddAlignedProducts) {
4226 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4227 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4228 } else {
4229 bool IsHighest = 2 * i >= Accum.size();
4230 Register SeparateOddOut[2];
4231 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4232 .take_front(IsHighest ? 1 : 2);
4233 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4234
4235 MachineInstr *Lo;
4236
4237 if (i == 1) {
4238 if (!IsHighest)
4239 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4240 else
4241 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4242 } else {
4243 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4244 SeparateOddCarry);
4245 }
4246 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4247
4248 if (!IsHighest) {
4249 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4250 Lo->getOperand(1).getReg());
4251 Accum[2 * i] = Hi.getReg(0);
4252 SeparateOddCarry = Hi.getReg(1);
4253 }
4254 }
4255 }
4256
4257 // Add in the carries from the previous iteration
4258 if (i > 0) {
4259 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4260 EvenCarryIn.push_back(CarryOut);
4261
4262 if (2 * i < Accum.size()) {
4263 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4264 OddCarry.push_back(CarryOut);
4265 }
4266 }
4267 }
4268}
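
For the smallest case (a 64-bit result from two-part sources) the partial-product layout above reduces to one full-width product plus two products whose high halves are discarded, which is the shape the MAD chains compute. A standalone host-C++ sketch with made-up operands:

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t A = 0x123456789abcdef0ull, B = 0xfedcba9876543210ull;
  uint32_t A0 = (uint32_t)A, A1 = (uint32_t)(A >> 32);
  uint32_t B0 = (uint32_t)B, B1 = (uint32_t)(B >> 32);

  uint64_t Acc = (uint64_t)A0 * B0;     // one MAD_U64_U32 step
  Acc += ((uint64_t)A0 * B1) << 32;     // cross terms; high halves fall off
  Acc += ((uint64_t)A1 * B0) << 32;

  printf("%016llx vs %016llx\n", (unsigned long long)Acc,
         (unsigned long long)(A * B));
  return 0;
}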
4269
4270// Custom narrowing of wide multiplies using wide multiply-add instructions.
4271//
4272// TODO: If the multiply is followed by an addition, we should attempt to
4273// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4274 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4275 MachineInstr &MI) const {
4276 assert(ST.hasMad64_32());
4277 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4278
4279 MachineIRBuilder &B = Helper.MIRBuilder;
4280 MachineRegisterInfo &MRI = *B.getMRI();
4281
4282 Register DstReg = MI.getOperand(0).getReg();
4283 Register Src0 = MI.getOperand(1).getReg();
4284 Register Src1 = MI.getOperand(2).getReg();
4285
4286 LLT Ty = MRI.getType(DstReg);
4287 assert(Ty.isScalar());
4288
4289 unsigned Size = Ty.getSizeInBits();
4290 if (ST.hasVectorMulU64() && Size == 64)
4291 return true;
4292
4293 unsigned NumParts = Size / 32;
4294 assert((Size % 32) == 0);
4295 assert(NumParts >= 2);
4296
4297 // Whether to use MAD_64_32 for partial products whose high half is
4298 // discarded. This avoids some ADD instructions but risks false dependency
4299 // stalls on some subtargets in some cases.
4300 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4301
4302 // Whether to compute odd-aligned partial products separately. This is
4303 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4304 // in an even-aligned VGPR.
4305 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4306
4307 LLT S32 = LLT::scalar(32);
4308 SmallVector<Register, 2> Src0Parts, Src1Parts;
4309 for (unsigned i = 0; i < NumParts; ++i) {
4310 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
4311 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
4312 }
4313 B.buildUnmerge(Src0Parts, Src0);
4314 B.buildUnmerge(Src1Parts, Src1);
4315
4316 SmallVector<Register, 2> AccumRegs(NumParts);
4317 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4318 SeparateOddAlignedProducts);
4319
4320 B.buildMergeLikeInstr(DstReg, AccumRegs);
4321 MI.eraseFromParent();
4322 return true;
4323}
4324
4325// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4326// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4327// case with a single min instruction instead of a compare+select.
4328 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4329 MachineRegisterInfo &MRI,
4330 MachineIRBuilder &B) const {
4331 Register Dst = MI.getOperand(0).getReg();
4332 Register Src = MI.getOperand(1).getReg();
4333 LLT DstTy = MRI.getType(Dst);
4334 LLT SrcTy = MRI.getType(Src);
4335
4336 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4337 ? AMDGPU::G_AMDGPU_FFBH_U32
4338 : AMDGPU::G_AMDGPU_FFBL_B32;
4339 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4340 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4341
4342 MI.eraseFromParent();
4343 return true;
4344}
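
FFBH/FFBL return -1 (all ones) for a zero input, so a single unsigned min against the bit width produces the generic G_CTLZ/G_CTTZ result. A standalone host-C++ model of the ctlz case:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Models FFBH_U32: returns 0xffffffff for a zero input.
static uint32_t FFBH(uint32_t X) {
  for (uint32_t I = 0; I < 32; ++I)
    if (X & (0x80000000u >> I))
      return I;
  return 0xffffffffu;
}

int main() {
  for (uint32_t X : {0u, 1u, 0x80000000u})
    printf("ctlz(%#x) = %u\n", X, std::min(FFBH(X), 32u)); // min fixes X == 0
  return 0;
}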
4345
4346 bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4347 MachineRegisterInfo &MRI,
4348 MachineIRBuilder &B) const {
4349 Register Dst = MI.getOperand(0).getReg();
4350 Register Src = MI.getOperand(1).getReg();
4351 LLT SrcTy = MRI.getType(Src);
4352 TypeSize NumBits = SrcTy.getSizeInBits();
4353
4354 assert(NumBits < 32u);
4355
4356 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4357 auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4358 auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4359 auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4360 B.buildTrunc(Dst, Ctlz);
4361 MI.eraseFromParent();
4362 return true;
4363}
4364
4365// Check that this is a G_XOR x, -1
4366static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4367 if (MI.getOpcode() != TargetOpcode::G_XOR)
4368 return false;
4369 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4370 return ConstVal == -1;
4371}
4372
4373// Return the use branch instruction, otherwise null if the usage is invalid.
4374static MachineInstr *
4376 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4377 Register CondDef = MI.getOperand(0).getReg();
4378 if (!MRI.hasOneNonDBGUse(CondDef))
4379 return nullptr;
4380
4381 MachineBasicBlock *Parent = MI.getParent();
4382 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4383
4384 if (isNot(MRI, *UseMI)) {
4385 Register NegatedCond = UseMI->getOperand(0).getReg();
4386 if (!MRI.hasOneNonDBGUse(NegatedCond))
4387 return nullptr;
4388
4389 // We're deleting the def of this value, so we need to remove it.
4390 eraseInstr(*UseMI, MRI);
4391
4392 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4393 Negated = true;
4394 }
4395
4396 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4397 return nullptr;
4398
4399 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4400 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4401 if (Next == Parent->end()) {
4402 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4403 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4404 return nullptr;
4405 UncondBrTarget = &*NextMBB;
4406 } else {
4407 if (Next->getOpcode() != AMDGPU::G_BR)
4408 return nullptr;
4409 Br = &*Next;
4410 UncondBrTarget = Br->getOperand(0).getMBB();
4411 }
4412
4413 return UseMI;
4414}
4415
4416 void AMDGPULegalizerInfo::buildLoadInputValue(Register DstReg,
4417 MachineIRBuilder &B,
4418 const ArgDescriptor *Arg,
4419 const TargetRegisterClass *ArgRC,
4420 LLT ArgTy) const {
4421 MCRegister SrcReg = Arg->getRegister();
4422 assert(SrcReg.isPhysical() && "Physical register expected");
4423 assert(DstReg.isVirtual() && "Virtual register expected");
4424
4425 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4426 *ArgRC, B.getDebugLoc(), ArgTy);
4427 if (Arg->isMasked()) {
4428 // TODO: Should we try to emit this once in the entry block?
4429 const LLT S32 = LLT::scalar(32);
4430 const unsigned Mask = Arg->getMask();
4431 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4432
4433 Register AndMaskSrc = LiveIn;
4434
4435 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4436 // 0.
4437 if (Shift != 0) {
4438 auto ShiftAmt = B.buildConstant(S32, Shift);
4439 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4440 }
4441
4442 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4443 } else {
4444 B.buildCopy(DstReg, LiveIn);
4445 }
4446}
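
For a masked ArgDescriptor the sequence above amounts to (reg >> countr_zero(mask)) & (mask >> shift). A standalone host-C++ sketch; the packed layout here is hypothetical, chosen only to show the arithmetic:

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical packing: one id in bits 20+, one in bits 10..19, one in 0..9.
  uint32_t Packed = (7u << 20) | (3u << 10) | 5u;
  uint32_t Mask = 0x000FFC00u;            // descriptor mask for the middle field
  unsigned Shift = 0;
  while (!((Mask >> Shift) & 1u))         // countr_zero(Mask)
    ++Shift;
  uint32_t Id = (Packed >> Shift) & (Mask >> Shift);
  printf("%u\n", Id);                     // prints 3
  return 0;
}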
4447
4452 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
4453 Register DstReg = MI.getOperand(0).getReg();
4454 if (!ST.hasClusters()) {
4455 if (!loadInputValue(DstReg, B, WorkGroupIdPV))
4456 return false;
4457 MI.eraseFromParent();
4458 return true;
4459 }
4460
4461 // Clusters are supported. Return the global position in the grid. If clusters
4462 // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
4463
4464 // WorkGroupIdXYZ = ClusterId == 0 ?
4465 // ClusterIdXYZ :
4466 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
4467 MachineRegisterInfo &MRI = *B.getMRI();
4468 const LLT S32 = LLT::scalar(32);
4469 Register ClusterIdXYZ = MRI.createGenericVirtualRegister(S32);
4470 Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(S32);
4471 Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(S32);
4472 if (!loadInputValue(ClusterIdXYZ, B, WorkGroupIdPV) ||
4473 !loadInputValue(ClusterWorkGroupIdXYZ, B, ClusterWorkGroupIdPV) ||
4474 !loadInputValue(ClusterMaxIdXYZ, B, ClusterMaxIdPV))
4475 return false;
4476
4477 auto One = B.buildConstant(S32, 1);
4478 auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
4479 auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
4480 B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));
4481
4482 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4483
4484 switch (MFI->getClusterDims().getKind()) {
4487 B.buildCopy(DstReg, GlobalIdXYZ);
4488 MI.eraseFromParent();
4489 return true;
4490 }
4492 B.buildCopy(DstReg, ClusterIdXYZ);
4493 MI.eraseFromParent();
4494 return true;
4495 }
4497 using namespace AMDGPU::Hwreg;
4498 unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
4499 Register ClusterId = MRI.createGenericVirtualRegister(S32);
4500 MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
4501 B.buildInstr(AMDGPU::S_GETREG_B32_const)
4502 .addDef(ClusterId)
4503 .addImm(ClusterIdField);
4504 auto Zero = B.buildConstant(S32, 0);
4505 auto NoClusters =
4506 B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), ClusterId, Zero);
4507 B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
4508 MI.eraseFromParent();
4509 return true;
4510 }
4511 }
4512
4513 llvm_unreachable("nothing should reach here");
4514}
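
The recurrence in the comment above, evaluated once with made-up values just to show the arithmetic:

#include <cstdio>

int main() {
  // Assume 8 workgroups per cluster in X, i.e. ClusterMaxIdX == 7.
  unsigned ClusterIdX = 4, ClusterMaxIdX = 7, ClusterWorkGroupIdX = 3;
  unsigned WorkGroupIdX =
      ClusterIdX * (ClusterMaxIdX + 1) + ClusterWorkGroupIdX;
  printf("%u\n", WorkGroupIdX);  // prints 35
  return 0;
}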
4515
4516 bool AMDGPULegalizerInfo::loadInputValue(
4517 Register DstReg, MachineIRBuilder &B,
4518 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4519 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4520 const ArgDescriptor *Arg = nullptr;
4521 const TargetRegisterClass *ArgRC;
4522 LLT ArgTy;
4523
4524 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4525 const ArgDescriptor WorkGroupIDX =
4526 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4527 // If GridZ is not programmed in an entry function then the hardware will set
4528 // it to all zeros, so there is no need to mask the GridY value in the low
4529 // order bits.
4530 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4531 AMDGPU::TTMP7,
4532 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4533 const ArgDescriptor WorkGroupIDZ =
4534 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4535 const ArgDescriptor ClusterWorkGroupIDX =
4536 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
4537 const ArgDescriptor ClusterWorkGroupIDY =
4538 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
4539 const ArgDescriptor ClusterWorkGroupIDZ =
4540 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
4541 const ArgDescriptor ClusterWorkGroupMaxIDX =
4542 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
4543 const ArgDescriptor ClusterWorkGroupMaxIDY =
4544 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
4545 const ArgDescriptor ClusterWorkGroupMaxIDZ =
4546 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
4547 const ArgDescriptor ClusterWorkGroupMaxFlatID =
4548 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
4549
4550 auto LoadConstant = [&](unsigned N) {
4551 B.buildConstant(DstReg, N);
4552 return true;
4553 };
4554
4555 if (ST.hasArchitectedSGPRs() &&
4557 AMDGPU::ClusterDimsAttr ClusterDims = MFI->getClusterDims();
4558 bool HasFixedDims = ClusterDims.isFixedDims();
4559
4560 switch (ArgType) {
4562 Arg = &WorkGroupIDX;
4563 ArgRC = &AMDGPU::SReg_32RegClass;
4564 ArgTy = LLT::scalar(32);
4565 break;
4567 Arg = &WorkGroupIDY;
4568 ArgRC = &AMDGPU::SReg_32RegClass;
4569 ArgTy = LLT::scalar(32);
4570 break;
4572 Arg = &WorkGroupIDZ;
4573 ArgRC = &AMDGPU::SReg_32RegClass;
4574 ArgTy = LLT::scalar(32);
4575 break;
4577 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
4578 return LoadConstant(0);
4579 Arg = &ClusterWorkGroupIDX;
4580 ArgRC = &AMDGPU::SReg_32RegClass;
4581 ArgTy = LLT::scalar(32);
4582 break;
4584 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
4585 return LoadConstant(0);
4586 Arg = &ClusterWorkGroupIDY;
4587 ArgRC = &AMDGPU::SReg_32RegClass;
4588 ArgTy = LLT::scalar(32);
4589 break;
4591 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
4592 return LoadConstant(0);
4593 Arg = &ClusterWorkGroupIDZ;
4594 ArgRC = &AMDGPU::SReg_32RegClass;
4595 ArgTy = LLT::scalar(32);
4596 break;
4598 if (HasFixedDims)
4599 return LoadConstant(ClusterDims.getDims()[0] - 1);
4600 Arg = &ClusterWorkGroupMaxIDX;
4601 ArgRC = &AMDGPU::SReg_32RegClass;
4602 ArgTy = LLT::scalar(32);
4603 break;
4605 if (HasFixedDims)
4606 return LoadConstant(ClusterDims.getDims()[1] - 1);
4607 Arg = &ClusterWorkGroupMaxIDY;
4608 ArgRC = &AMDGPU::SReg_32RegClass;
4609 ArgTy = LLT::scalar(32);
4610 break;
4612 if (HasFixedDims)
4613 return LoadConstant(ClusterDims.getDims()[2] - 1);
4614 Arg = &ClusterWorkGroupMaxIDZ;
4615 ArgRC = &AMDGPU::SReg_32RegClass;
4616 ArgTy = LLT::scalar(32);
4617 break;
4619 Arg = &ClusterWorkGroupMaxFlatID;
4620 ArgRC = &AMDGPU::SReg_32RegClass;
4621 ArgTy = LLT::scalar(32);
4622 break;
4623 default:
4624 break;
4625 }
4626 }
4627
4628 if (!Arg)
4629 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4630
4631 if (!Arg) {
4632 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4633 // The intrinsic may appear when we have a 0 sized kernarg segment, in
4634 // which case the pointer argument may be missing and we use null.
4635 return LoadConstant(0);
4636 }
4637
4638 // It's undefined behavior if a function marked with the amdgpu-no-*
4639 // attributes uses the corresponding intrinsic.
4640 B.buildUndef(DstReg);
4641 return true;
4642 }
4643
4644 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4645 return false; // TODO: Handle these
4646 buildLoadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4647 return true;
4648}
4649
4650 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrinsic(
4651 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4652 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4653 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4654 return false;
4655
4656 MI.eraseFromParent();
4657 return true;
4658}
4659
4660 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4661 int64_t C) {
4662 B.buildConstant(MI.getOperand(0).getReg(), C);
4663 MI.eraseFromParent();
4664 return true;
4665}
4666
4667 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4668 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4669 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4670 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4671 if (MaxID == 0)
4672 return replaceWithConstant(B, MI, 0);
4673
4674 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4675 const ArgDescriptor *Arg;
4676 const TargetRegisterClass *ArgRC;
4677 LLT ArgTy;
4678 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4679
4680 Register DstReg = MI.getOperand(0).getReg();
4681 if (!Arg) {
4682 // It's undefined behavior if a function marked with the amdgpu-no-*
4683 // attributes uses the corresponding intrinsic.
4684 B.buildUndef(DstReg);
4685 MI.eraseFromParent();
4686 return true;
4687 }
4688
4689 if (Arg->isMasked()) {
4690 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4691 // masking operations anyway.
4692 //
4693 // TODO: We could assert the top bit is 0 for the source copy.
4694 if (!loadInputValue(DstReg, B, ArgType))
4695 return false;
4696 } else {
4697 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4698 if (!loadInputValue(TmpReg, B, ArgType))
4699 return false;
4700 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4701 }
4702
4703 MI.eraseFromParent();
4704 return true;
4705}
4706
4707 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4708 int64_t Offset) const {
4709 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
4710 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4711
4712 // TODO: If we passed in the base kernel offset we could have a better
4713 // alignment than 4, but we don't really need it.
4714 if (!loadInputValue(KernArgReg, B,
4715 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4716 llvm_unreachable("failed to find kernarg segment ptr");
4717
4718 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4719 return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);
4720}
4721
4722/// Legalize a value that's loaded from kernel arguments. This is only used by
4723/// legacy intrinsics.
4724 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4725 MachineIRBuilder &B,
4726 uint64_t Offset,
4727 Align Alignment) const {
4728 Register DstReg = MI.getOperand(0).getReg();
4729
4730 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4731 "unexpected kernarg parameter type");
4733 Register Ptr = getKernargParameterPtr(B, Offset);
4734 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4735 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
4736 MachineMemOperand::MODereferenceable |
4737 MachineMemOperand::MOInvariant);
4738 MI.eraseFromParent();
4739 return true;
4740}
4741
4742 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4743 MachineRegisterInfo &MRI,
4744 MachineIRBuilder &B) const {
4745 Register Dst = MI.getOperand(0).getReg();
4746 LLT DstTy = MRI.getType(Dst);
4747 LLT S16 = LLT::scalar(16);
4748 LLT S32 = LLT::scalar(32);
4749 LLT S64 = LLT::scalar(64);
4750
4751 if (DstTy == S16)
4752 return legalizeFDIV16(MI, MRI, B);
4753 if (DstTy == S32)
4754 return legalizeFDIV32(MI, MRI, B);
4755 if (DstTy == S64)
4756 return legalizeFDIV64(MI, MRI, B);
4757
4758 return false;
4759}
4760
4761 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4762 Register DstDivReg,
4763 Register DstRemReg,
4764 Register X,
4765 Register Y) const {
4766 const LLT S1 = LLT::scalar(1);
4767 const LLT S32 = LLT::scalar(32);
4768
4769 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4770 // algorithm used here.
4771
4772 // Initial estimate of inv(y).
4773 auto FloatY = B.buildUITOFP(S32, Y);
4774 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4775 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
4776 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
4777 auto Z = B.buildFPTOUI(S32, ScaledY);
4778
4779 // One round of UNR.
4780 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
4781 auto NegYZ = B.buildMul(S32, NegY, Z);
4782 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
4783
4784 // Quotient/remainder estimate.
4785 auto Q = B.buildUMulH(S32, X, Z);
4786 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
4787
4788 // First quotient/remainder refinement.
4789 auto One = B.buildConstant(S32, 1);
4790 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4791 if (DstDivReg)
4792 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
4793 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
4794
4795 // Second quotient/remainder refinement.
4796 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4797 if (DstDivReg)
4798 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
4799
4800 if (DstRemReg)
4801 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
4802}
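
A standalone host-C++ sketch of the same refinement, with 1.0f/y standing in for V_RCP_IFLAG_F32; one Newton-Raphson round plus the two conditional fixups is enough for an exact 32-bit quotient and remainder:

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t X = 1000000007u, Y = 12345u;

  // Initial estimate of 2^32 / Y.
  float Rcp = 1.0f / (float)Y;
  uint32_t Z = (uint32_t)(Rcp * 0x1.fffffcp+31f); // bit_cast<float>(0x4f7ffffe)

  // One Newton-Raphson round: Z += umulh(Z, -Y * Z).
  uint32_t NegYZ = (0u - Y) * Z;
  Z += (uint32_t)(((uint64_t)Z * NegYZ) >> 32);

  // Quotient/remainder estimate, then two conditional fixups.
  uint32_t Q = (uint32_t)(((uint64_t)X * Z) >> 32);
  uint32_t R = X - Q * Y;
  if (R >= Y) { ++Q; R -= Y; }
  if (R >= Y) { ++Q; R -= Y; }

  printf("%u %u (reference %u %u)\n", Q, R, X / Y, X % Y);
  return 0;
}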
4803
4804// Build integer reciprocal sequence around V_RCP_IFLAG_F32
4805//
4806// Return lo, hi of result
4807//
4808// %cvt.lo = G_UITOFP Val.lo
4809// %cvt.hi = G_UITOFP Val.hi
4810// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4811// %rcp = G_AMDGPU_RCP_IFLAG %mad
4812// %mul1 = G_FMUL %rcp, 0x5f7ffffc
4813// %mul2 = G_FMUL %mul1, 2**(-32)
4814// %trunc = G_INTRINSIC_TRUNC %mul2
4815// %mad2 = G_FMAD %trunc, -(2**32), %mul1
4816// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
4817static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4818 Register Val) {
4819 const LLT S32 = LLT::scalar(32);
4820 auto Unmerge = B.buildUnmerge(S32, Val);
4821
4822 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
4823 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
4824
4825 auto Mad = B.buildFMAD(
4826 S32, CvtHi, // 2**32
4827 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
4828
4829 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
4830 auto Mul1 = B.buildFMul(
4831 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
4832
4833 // 2**(-32)
4834 auto Mul2 = B.buildFMul(
4835 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
4836 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
4837
4838 // -(2**32)
4839 auto Mad2 = B.buildFMAD(
4840 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
4841 Mul1);
4842
4843 auto ResultLo = B.buildFPTOUI(S32, Mad2);
4844 auto ResultHi = B.buildFPTOUI(S32, Trunc);
4845
4846 return {ResultLo.getReg(0), ResultHi.getReg(0)};
4847}
4848
4849 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4850 Register DstDivReg,
4851 Register DstRemReg,
4852 Register Numer,
4853 Register Denom) const {
4854 const LLT S32 = LLT::scalar(32);
4855 const LLT S64 = LLT::scalar(64);
4856 const LLT S1 = LLT::scalar(1);
4857 Register RcpLo, RcpHi;
4858
4859 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
4860
4861 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
4862
4863 auto Zero64 = B.buildConstant(S64, 0);
4864 auto NegDenom = B.buildSub(S64, Zero64, Denom);
4865
4866 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
4867 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
4868
4869 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
4870 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
4871 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
4872
4873 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
4874 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4875 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
4876
4877 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
4878 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
4879 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
4880 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
4881 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
4882
4883 auto Zero32 = B.buildConstant(S32, 0);
4884 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
4885 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4886 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
4887
4888 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
4889 Register NumerLo = UnmergeNumer.getReg(0);
4890 Register NumerHi = UnmergeNumer.getReg(1);
4891
4892 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
4893 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
4894 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
4895 Register Mul3_Lo = UnmergeMul3.getReg(0);
4896 Register Mul3_Hi = UnmergeMul3.getReg(1);
4897 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
4898 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
4899 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
4900 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
4901
4902 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
4903 Register DenomLo = UnmergeDenom.getReg(0);
4904 Register DenomHi = UnmergeDenom.getReg(1);
4905
4906 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
4907 auto C1 = B.buildSExt(S32, CmpHi);
4908
4909 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
4910 auto C2 = B.buildSExt(S32, CmpLo);
4911
4912 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
4913 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
4914
4915 // TODO: Here and below portions of the code can be enclosed into if/endif.
4916 // Currently control flow is unconditional and we have 4 selects after
4917 // potential endif to substitute PHIs.
4918
4919 // if C3 != 0 ...
4920 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
4921 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
4922 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4923 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
4924
4925 auto One64 = B.buildConstant(S64, 1);
4926 auto Add3 = B.buildAdd(S64, MulHi3, One64);
4927
4928 auto C4 =
4929 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
4930 auto C5 =
4931 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
4932 auto C6 = B.buildSelect(
4933 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
4934
4935 // if (C6 != 0)
4936 auto Add4 = B.buildAdd(S64, Add3, One64);
4937 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
4938
4939 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
4940 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4941 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
4942
4943 // endif C6
4944 // endif C3
4945
4946 if (DstDivReg) {
4947 auto Sel1 = B.buildSelect(
4948 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
4949 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4950 Sel1, MulHi3);
4951 }
4952
4953 if (DstRemReg) {
4954 auto Sel2 = B.buildSelect(
4955 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
4956 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4957 Sel2, Sub1);
4958 }
4959}
4960
4961 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4962 MachineRegisterInfo &MRI,
4963 MachineIRBuilder &B) const {
4964 Register DstDivReg, DstRemReg;
4965 switch (MI.getOpcode()) {
4966 default:
4967 llvm_unreachable("Unexpected opcode!");
4968 case AMDGPU::G_UDIV: {
4969 DstDivReg = MI.getOperand(0).getReg();
4970 break;
4971 }
4972 case AMDGPU::G_UREM: {
4973 DstRemReg = MI.getOperand(0).getReg();
4974 break;
4975 }
4976 case AMDGPU::G_UDIVREM: {
4977 DstDivReg = MI.getOperand(0).getReg();
4978 DstRemReg = MI.getOperand(1).getReg();
4979 break;
4980 }
4981 }
4982
4983 const LLT S64 = LLT::scalar(64);
4984 const LLT S32 = LLT::scalar(32);
4985 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4986 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
4987 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4988 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4989
4990 if (Ty == S32)
4991 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
4992 else if (Ty == S64)
4993 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
4994 else
4995 return false;
4996
4997 MI.eraseFromParent();
4998 return true;
4999}
5000
5001 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
5002 MachineRegisterInfo &MRI,
5003 MachineIRBuilder &B) const {
5004 const LLT S64 = LLT::scalar(64);
5005 const LLT S32 = LLT::scalar(32);
5006
5007 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5008 if (Ty != S32 && Ty != S64)
5009 return false;
5010
5011 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5012 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
5013 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5014
5015 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
5016 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
5017 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
5018
5019 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
5020 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
5021
5022 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
5023 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
5024
5025 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
5026 switch (MI.getOpcode()) {
5027 default:
5028 llvm_unreachable("Unexpected opcode!");
5029 case AMDGPU::G_SDIV: {
5030 DstDivReg = MI.getOperand(0).getReg();
5031 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5032 break;
5033 }
5034 case AMDGPU::G_SREM: {
5035 DstRemReg = MI.getOperand(0).getReg();
5036 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5037 break;
5038 }
5039 case AMDGPU::G_SDIVREM: {
5040 DstDivReg = MI.getOperand(0).getReg();
5041 DstRemReg = MI.getOperand(1).getReg();
5042 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5043 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5044 break;
5045 }
5046 }
5047
5048 if (Ty == S32)
5049 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5050 else
5051 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5052
5053 if (DstDivReg) {
5054 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
5055 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
5056 B.buildSub(DstDivReg, SignXor, Sign);
5057 }
5058
5059 if (DstRemReg) {
5060 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
5061 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
5062 B.buildSub(DstRemReg, SignXor, Sign);
5063 }
5064
5065 MI.eraseFromParent();
5066 return true;
5067}
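
The signed variants reduce to the unsigned expansion: each operand is made non-negative with (x + sign) ^ sign, and the same xor/subtract pattern restores the signs afterwards (the quotient takes the xor of the operand signs, the remainder follows the dividend). A standalone host-C++ sketch, with / and % standing in for the unsigned expansion:

#include <cstdint>
#include <cstdio>

int main() {
  int32_t LHS = -77, RHS = 5;
  int32_t LSign = LHS >> 31, RSign = RHS >> 31;    // arithmetic shift assumed
  uint32_t UL = (uint32_t)((LHS + LSign) ^ LSign); // |LHS|
  uint32_t UR = (uint32_t)((RHS + RSign) ^ RSign); // |RHS|
  int32_t DSign = LSign ^ RSign;                   // quotient sign
  int32_t Q = (int32_t)((UL / UR) ^ (uint32_t)DSign) - DSign;
  int32_t R = (int32_t)((UL % UR) ^ (uint32_t)LSign) - LSign;
  printf("%d %d (reference %d %d)\n", Q, R, LHS / RHS, LHS % RHS);
  return 0;
}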
5068
5069 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
5070 MachineRegisterInfo &MRI,
5071 MachineIRBuilder &B) const {
5072 Register Res = MI.getOperand(0).getReg();
5073 Register LHS = MI.getOperand(1).getReg();
5074 Register RHS = MI.getOperand(2).getReg();
5075 uint16_t Flags = MI.getFlags();
5076 LLT ResTy = MRI.getType(Res);
5077
5078 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5079
5080 if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
5081 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
5082 return false;
5083
5084 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
5085 // the CI documentation has a worst case error of 1 ulp.
5086 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
5087 // use it as long as we aren't trying to use denormals.
5088 //
5089 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
5090
5091 // 1 / x -> RCP(x)
5092 if (CLHS->isExactlyValue(1.0)) {
5093 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5094 .addUse(RHS)
5095 .setMIFlags(Flags);
5096
5097 MI.eraseFromParent();
5098 return true;
5099 }
5100
5101 // -1 / x -> RCP( FNEG(x) )
5102 if (CLHS->isExactlyValue(-1.0)) {
5103 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
5104 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5105 .addUse(FNeg.getReg(0))
5106 .setMIFlags(Flags);
5107
5108 MI.eraseFromParent();
5109 return true;
5110 }
5111 }
5112
5113 // For f16 require afn or arcp.
5114 // For f32 require afn.
5115 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
5116 !MI.getFlag(MachineInstr::FmArcp)))
5117 return false;
5118
5119 // x / y -> x * (1.0 / y)
5120 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5121 .addUse(RHS)
5122 .setMIFlags(Flags);
5123 B.buildFMul(Res, LHS, RCP, Flags);
5124
5125 MI.eraseFromParent();
5126 return true;
5127}
5128
5129 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
5130 MachineRegisterInfo &MRI,
5131 MachineIRBuilder &B) const {
5132 Register Res = MI.getOperand(0).getReg();
5133 Register X = MI.getOperand(1).getReg();
5134 Register Y = MI.getOperand(2).getReg();
5135 uint16_t Flags = MI.getFlags();
5136 LLT ResTy = MRI.getType(Res);
5137
5138 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5139
5140 if (!AllowInaccurateRcp)
5141 return false;
5142
5143 auto NegY = B.buildFNeg(ResTy, Y);
5144 auto One = B.buildFConstant(ResTy, 1.0);
5145
5146 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5147 .addUse(Y)
5148 .setMIFlags(Flags);
5149
5150 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
5151 R = B.buildFMA(ResTy, Tmp0, R, R);
5152
5153 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
5154 R = B.buildFMA(ResTy, Tmp1, R, R);
5155
5156 auto Ret = B.buildFMul(ResTy, X, R);
5157 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
5158
5159 B.buildFMA(Res, Tmp2, R, Ret);
5160 MI.eraseFromParent();
5161 return true;
5162}
5163
5164 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
5165 MachineRegisterInfo &MRI,
5166 MachineIRBuilder &B) const {
5167 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5168 return true;
5169
5170 Register Res = MI.getOperand(0).getReg();
5171 Register LHS = MI.getOperand(1).getReg();
5172 Register RHS = MI.getOperand(2).getReg();
5173
5174 uint16_t Flags = MI.getFlags();
5175
5176 LLT S16 = LLT::scalar(16);
5177 LLT S32 = LLT::scalar(32);
5178
5179 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
5180 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
5181 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
5182 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
5183 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5184 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
5185 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5186 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
5187 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
5188 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
5189 // q16.u = opx(V_CVT_F16_F32, q32.u);
5190 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
5191
5192 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
5193 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
5194 auto NegRHSExt = B.buildFNeg(S32, RHSExt);
5195 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5196 .addUse(RHSExt.getReg(0))
5197 .setMIFlags(Flags);
5198 auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
5199 MachineInstrBuilder Err;
5200 if (ST.hasMadMacF32Insts()) {
5201 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5202 Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
5203 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5204 } else {
5205 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5206 Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
5207 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5208 }
5209 auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
5210 Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
5211 Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
5212 auto RDst = B.buildFPTrunc(S16, Quot, Flags);
5213 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5214 .addUse(RDst.getReg(0))
5215 .addUse(RHS)
5216 .addUse(LHS)
5217 .setMIFlags(Flags);
5218
5219 MI.eraseFromParent();
5220 return true;
5221}
5222
5223static constexpr unsigned SPDenormModeBitField =
5224 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 4, 2);
5225
5226// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
5227// to enable denorm mode. When 'Enable' is false, disable denorm mode.
5228 static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
5229 const GCNSubtarget &ST,
5230 SIModeRegisterDefaults Mode) {
5231 // Set SP denorm mode to this value.
5232 unsigned SPDenormMode =
5233 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
5234
5235 if (ST.hasDenormModeInst()) {
5236 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
5237 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
5238
5239 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5240 B.buildInstr(AMDGPU::S_DENORM_MODE)
5241 .addImm(NewDenormModeValue);
5242
5243 } else {
5244 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
5245 .addImm(SPDenormMode)
5246 .addImm(SPDenormModeBitField);
5247 }
5248}
5249
5250 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
5251 MachineRegisterInfo &MRI,
5252 MachineIRBuilder &B) const {
5253 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5254 return true;
5255
5256 Register Res = MI.getOperand(0).getReg();
5257 Register LHS = MI.getOperand(1).getReg();
5258 Register RHS = MI.getOperand(2).getReg();
5259 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5260 SIModeRegisterDefaults Mode = MFI->getMode();
5261
5262 uint16_t Flags = MI.getFlags();
5263
5264 LLT S32 = LLT::scalar(32);
5265 LLT S1 = LLT::scalar(1);
5266
5267 auto One = B.buildFConstant(S32, 1.0f);
5268
5269 auto DenominatorScaled =
5270 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5271 .addUse(LHS)
5272 .addUse(RHS)
5273 .addImm(0)
5274 .setMIFlags(Flags);
5275 auto NumeratorScaled =
5276 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5277 .addUse(LHS)
5278 .addUse(RHS)
5279 .addImm(1)
5280 .setMIFlags(Flags);
5281
5282 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5283 .addUse(DenominatorScaled.getReg(0))
5284 .setMIFlags(Flags);
5285 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
5286
5287 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
5288 const bool HasDynamicDenormals =
5289 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
5290 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
5291
5292 Register SavedSPDenormMode;
5293 if (!PreservesDenormals) {
5294 if (HasDynamicDenormals) {
5295 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5296 B.buildInstr(AMDGPU::S_GETREG_B32)
5297 .addDef(SavedSPDenormMode)
5298 .addImm(SPDenormModeBitField);
5299 }
5300 toggleSPDenormMode(true, B, ST, Mode);
5301 }
5302
5303 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
5304 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5305 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
5306 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
5307 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
5308 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5309
5310 if (!PreservesDenormals) {
5311 if (HasDynamicDenormals) {
5312 assert(SavedSPDenormMode);
5313 B.buildInstr(AMDGPU::S_SETREG_B32)
5314 .addReg(SavedSPDenormMode)
5315 .addImm(SPDenormModeBitField);
5316 } else
5317 toggleSPDenormMode(false, B, ST, Mode);
5318 }
5319
5320 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
5321 .addUse(Fma4.getReg(0))
5322 .addUse(Fma1.getReg(0))
5323 .addUse(Fma3.getReg(0))
5324 .addUse(NumeratorScaled.getReg(1))
5325 .setMIFlags(Flags);
5326
5327 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5328 .addUse(Fmas.getReg(0))
5329 .addUse(RHS)
5330 .addUse(LHS)
5331 .setMIFlags(Flags);
5332
5333 MI.eraseFromParent();
5334 return true;
5335}
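// Summary of the f32 expansion above: div_scale pre-scales the numerator and
// denominator so rcp stays in range, Fma0..Fma4 refine the reciprocal and
// quotient Newton-Raphson style, div_fmas folds in the final correction using
// the scale flag produced by div_scale, and div_fixup restores signs and
// special-case results. When the function's mode flushes FP32 denormals, they
// are temporarily enabled around the FMA chain (and the previous mode
// restored) because the intermediate terms may be denormal.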
5336
5337 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
5338 MachineRegisterInfo &MRI,
5339 MachineIRBuilder &B) const {
5340 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5341 return true;
5342
5343 Register Res = MI.getOperand(0).getReg();
5344 Register LHS = MI.getOperand(1).getReg();
5345 Register RHS = MI.getOperand(2).getReg();
5346
5347 uint16_t Flags = MI.getFlags();
5348
5349 LLT S64 = LLT::scalar(64);
5350 LLT S1 = LLT::scalar(1);
5351
5352 auto One = B.buildFConstant(S64, 1.0);
5353
5354 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5355 .addUse(LHS)
5356 .addUse(RHS)
5357 .addImm(0)
5358 .setMIFlags(Flags);
5359
5360 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
5361
5362 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
5363 .addUse(DivScale0.getReg(0))
5364 .setMIFlags(Flags);
5365
5366 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
5367 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
5368 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
5369
5370 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5371 .addUse(LHS)
5372 .addUse(RHS)
5373 .addImm(1)
5374 .setMIFlags(Flags);
5375
5376 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5377 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5378 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5379
5380 Register Scale;
5381 if (!ST.hasUsableDivScaleConditionOutput()) {
5382 // Workaround a hardware bug on SI where the condition output from div_scale
5383 // is not usable.
5384
5385 LLT S32 = LLT::scalar(32);
5386
5387 auto NumUnmerge = B.buildUnmerge(S32, LHS);
5388 auto DenUnmerge = B.buildUnmerge(S32, RHS);
5389 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5390 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5391
5392 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5393 Scale1Unmerge.getReg(1));
5394 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5395 Scale0Unmerge.getReg(1));
5396 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5397 } else {
5398 Scale = DivScale1.getReg(1);
5399 }
5400
5401 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5402 .addUse(Fma4.getReg(0))
5403 .addUse(Fma3.getReg(0))
5404 .addUse(Mul.getReg(0))
5405 .addUse(Scale)
5406 .setMIFlags(Flags);
5407
5408 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5409 .addUse(Fmas.getReg(0))
5410 .addUse(RHS)
5411 .addUse(LHS)
5412 .setMIFlags(Flags);
5413
5414 MI.eraseFromParent();
5415 return true;
5416}
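// The f64 expansion mirrors the f32 one (scale, refine, div_fmas, div_fixup)
// but needs no denorm-mode toggling. On subtargets where div_scale's condition
// output is unusable, the "which operand was scaled" flag is reconstructed by
// comparing the high dwords of the operands with those of the div_scale
// results and xor'ing the two compares.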
5417
5418 bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
5419 MachineRegisterInfo &MRI,
5420 MachineIRBuilder &B) const {
5421 Register Res0 = MI.getOperand(0).getReg();
5422 Register Res1 = MI.getOperand(1).getReg();
5423 Register Val = MI.getOperand(2).getReg();
5424 uint16_t Flags = MI.getFlags();
5425
5426 LLT Ty = MRI.getType(Res0);
5427 LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5428
5429 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5430 .addUse(Val)
5431 .setMIFlags(Flags);
5432 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5433 .addUse(Val)
5434 .setMIFlags(Flags);
5435
5436 if (ST.hasFractBug()) {
5437 auto Fabs = B.buildFAbs(Ty, Val);
5438 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5439 auto IsFinite =
5440 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5441 auto Zero = B.buildConstant(InstrExpTy, 0);
5442 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5443 Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5444 }
5445
5446 B.buildCopy(Res0, Mant);
5447 B.buildSExtOrTrunc(Res1, Exp);
5448
5449 MI.eraseFromParent();
5450 return true;
5451}
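// frexp maps directly onto the frexp_mant/frexp_exp intrinsics; on subtargets
// with the fract bug those only produce the expected values for finite
// inputs, so infinities and NaNs are routed back to (Val, 0) by the selects
// above.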
5452
5453 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5454 MachineRegisterInfo &MRI,
5455 MachineIRBuilder &B) const {
5456 Register Res = MI.getOperand(0).getReg();
5457 Register LHS = MI.getOperand(2).getReg();
5458 Register RHS = MI.getOperand(3).getReg();
5459 uint16_t Flags = MI.getFlags();
5460
5461 LLT S32 = LLT::scalar(32);
5462 LLT S1 = LLT::scalar(1);
5463
5464 auto Abs = B.buildFAbs(S32, RHS, Flags);
5465 const APFloat C0Val(1.0f);
5466
5467 auto C0 = B.buildFConstant(S32, 0x1p+96f);
5468 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5469 auto C2 = B.buildFConstant(S32, 1.0f);
5470
5471 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5472 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5473
5474 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5475
5476 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5477 .addUse(Mul0.getReg(0))
5478 .setMIFlags(Flags);
5479
5480 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5481
5482 B.buildFMul(Res, Sel, Mul1, Flags);
5483
5484 MI.eraseFromParent();
5485 return true;
5486}
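// The fdiv.fast expansion above keeps rcp in range for huge denominators:
// when |RHS| > 2^96, both the denominator (before rcp) and the final product
// are scaled by 2^-32, and the two factors cancel in the result; otherwise
// the scale is simply 1.0.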
5487
5488 bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5489 MachineRegisterInfo &MRI,
5490 MachineIRBuilder &B) const {
5491 // Bypass the correct expansion that a standard promotion through G_FSQRT
5492 // would get. The f32 op is accurate enough for the f16 case.
5493 unsigned Flags = MI.getFlags();
5494 assert(!ST.has16BitInsts());
5495 const LLT F32 = LLT::scalar(32);
5496 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5497 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5498 .addUse(Ext.getReg(0))
5499 .setMIFlags(Flags);
5500 B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
5501 MI.eraseFromParent();
5502 return true;
5503}
5504
5505 bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5506 MachineRegisterInfo &MRI,
5507 MachineIRBuilder &B) const {
5508 MachineFunction &MF = B.getMF();
5509 Register Dst = MI.getOperand(0).getReg();
5510 Register X = MI.getOperand(1).getReg();
5511 const unsigned Flags = MI.getFlags();
5512 const LLT S1 = LLT::scalar(1);
5513 const LLT F32 = LLT::scalar(32);
5514 const LLT I32 = LLT::scalar(32);
5515
5516 if (allowApproxFunc(MF, Flags)) {
5517 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5518 .addUse(X)
5519 .setMIFlags(Flags);
5520 MI.eraseFromParent();
5521 return true;
5522 }
5523
5524 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5525 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5526 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5527 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5528 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5529
5530 Register SqrtS = MRI.createGenericVirtualRegister(F32);
5531 if (needsDenormHandlingF32(MF, X, Flags)) {
5532 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5533 .addUse(SqrtX.getReg(0))
5534 .setMIFlags(Flags);
5535
5536 auto NegOne = B.buildConstant(I32, -1);
5537 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5538
5539 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5540 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5541
5542 auto PosOne = B.buildConstant(I32, 1);
5543 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5544
5545 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5546 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5547
5548 auto Zero = B.buildFConstant(F32, 0.0f);
5549 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5550
5551 SqrtS =
5552 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5553
5554 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5555 SqrtS =
5556 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5557 } else {
5558 auto SqrtR =
5559 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5560 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5561
5562 auto Half = B.buildFConstant(F32, 0.5f);
5563 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5564 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5565 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5566 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5567 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5568 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5569 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5570 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5571 }
5572
5573 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5574
5575 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5576
5577 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5578
5579 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5580 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5581
5582 MI.eraseFromParent();
5583 return true;
5584}
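// Two refinement paths are used above. When f32 denormals must be handled,
// the hardware sqrt result is nudged by checking the residuals of its
// integer-adjacent neighbours (SqrtS -/+ 1 ulp via the i32 add of -1/+1).
// Otherwise a single refinement of rsq is performed. Inputs below 2^-96 are
// pre-scaled by 2^32 and the result rescaled by 2^-16 (= sqrt(2^-32)); zero
// and +inf inputs pass through unchanged.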
5585
5586 bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
5587 MachineRegisterInfo &MRI,
5588 MachineIRBuilder &B) const {
5589 // For the double type, the SQRT and RSQ instructions don't have the
5590 // required precision, so we apply Goldschmidt's algorithm to improve the result:
5591 //
5592 // y0 = rsq(x)
5593 // g0 = x * y0
5594 // h0 = 0.5 * y0
5595 //
5596 // r0 = 0.5 - h0 * g0
5597 // g1 = g0 * r0 + g0
5598 // h1 = h0 * r0 + h0
5599 //
5600 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5601 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
5602 // h2 = h1 * r1 + h1
5603 //
5604 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5605 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
5606 //
5607 // sqrt(x) = g3
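// In the code below, SqrtS*, SqrtH*, SqrtR0 and SqrtD* correspond to the
// g, h, r and d terms of this iteration, and SqrtRet is g3.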
5608
5609 const LLT S1 = LLT::scalar(1);
5610 const LLT S32 = LLT::scalar(32);
5611 const LLT F64 = LLT::scalar(64);
5612
5613 Register Dst = MI.getOperand(0).getReg();
5614 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5615
5616 Register X = MI.getOperand(1).getReg();
5617 unsigned Flags = MI.getFlags();
5618
5619 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
5620
5621 auto ZeroInt = B.buildConstant(S32, 0);
5622 auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
5623
5624 // Scale up input if it is too small.
5625 auto ScaleUpFactor = B.buildConstant(S32, 256);
5626 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
5627 auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
5628
5629 auto SqrtY =
5630 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
5631
5632 auto Half = B.buildFConstant(F64, 0.5);
5633 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
5634 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
5635
5636 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
5637 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
5638
5639 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
5640 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
5641
5642 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
5643 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
5644
5645 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
5646
5647 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
5648 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
5649
5650 auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
5651
5652 // Scale down the result.
5653 auto ScaleDownFactor = B.buildConstant(S32, -128);
5654 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
5655 SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
5656
5657 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5658 // with finite only or nsz because rsq(+/-0) = +/-inf
5659
5660 // TODO: Check for DAZ and expand to subnormals
5661 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5662
5663 // If x is +INF, +0, or -0, use its original value
5664 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
5665
5666 MI.eraseFromParent();
5667 return true;
5668}
5669
5670 bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
5671 MachineRegisterInfo &MRI,
5672 MachineIRBuilder &B) const {
5673 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5674 if (Ty == LLT::scalar(32))
5675 return legalizeFSQRTF32(MI, MRI, B);
5676 if (Ty == LLT::scalar(64))
5677 return legalizeFSQRTF64(MI, MRI, B);
5678 if (Ty == LLT::scalar(16))
5679 return legalizeFSQRTF16(MI, MRI, B);
5680 return false;
5681}
5682
5683// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5684// FIXME: Why do we handle this one but not other removed instructions?
5685//
5686// Reciprocal square root. The clamp prevents infinite results, clamping
5687// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
5688// +-max_float.
5689 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
5690 MachineRegisterInfo &MRI,
5691 MachineIRBuilder &B) const {
5692 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5693 return true;
5694
5695 Register Dst = MI.getOperand(0).getReg();
5696 Register Src = MI.getOperand(2).getReg();
5697 auto Flags = MI.getFlags();
5698
5699 LLT Ty = MRI.getType(Dst);
5700
5701 const fltSemantics *FltSemantics;
5702 if (Ty == LLT::scalar(32))
5703 FltSemantics = &APFloat::IEEEsingle();
5704 else if (Ty == LLT::scalar(64))
5705 FltSemantics = &APFloat::IEEEdouble();
5706 else
5707 return false;
5708
5709 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
5710 .addUse(Src)
5711 .setMIFlags(Flags);
5712
5713 // We don't need to concern ourselves with the snan handling difference,
5714 // since the rsq is quieted (or not) either way; use the variant which will
5715 // select directly.
5715 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5716 const bool UseIEEE = MFI->getMode().IEEE;
5717
5718 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
5719 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
5720 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
5721
5722 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
5723
5724 if (UseIEEE)
5725 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
5726 else
5727 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
5728 MI.eraseFromParent();
5729 return true;
5730}
5731
5732// TODO: Fix pointer type handling
5733 bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
5734 MachineInstr &MI,
5735 Intrinsic::ID IID) const {
5736
5737 MachineIRBuilder &B = Helper.MIRBuilder;
5738 MachineRegisterInfo &MRI = *B.getMRI();
5739
5740 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5741 IID == Intrinsic::amdgcn_permlanex16;
5742 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
5743 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
5744
5745 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
5746 Register Src2, LLT VT) -> Register {
5747 auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
5748 switch (IID) {
5749 case Intrinsic::amdgcn_readfirstlane:
5750 case Intrinsic::amdgcn_permlane64:
5751 return LaneOp.getReg(0);
5752 case Intrinsic::amdgcn_readlane:
5753 case Intrinsic::amdgcn_set_inactive:
5754 case Intrinsic::amdgcn_set_inactive_chain_arg:
5755 return LaneOp.addUse(Src1).getReg(0);
5756 case Intrinsic::amdgcn_writelane:
5757 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
5758 case Intrinsic::amdgcn_permlane16:
5759 case Intrinsic::amdgcn_permlanex16: {
5760 Register Src3 = MI.getOperand(5).getReg();
5761 int64_t Src4 = MI.getOperand(6).getImm();
5762 int64_t Src5 = MI.getOperand(7).getImm();
5763 return LaneOp.addUse(Src1)
5764 .addUse(Src2)
5765 .addUse(Src3)
5766 .addImm(Src4)
5767 .addImm(Src5)
5768 .getReg(0);
5769 }
5770 case Intrinsic::amdgcn_mov_dpp8:
5771 return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
5772 case Intrinsic::amdgcn_update_dpp:
5773 return LaneOp.addUse(Src1)
5774 .addImm(MI.getOperand(4).getImm())
5775 .addImm(MI.getOperand(5).getImm())
5776 .addImm(MI.getOperand(6).getImm())
5777 .addImm(MI.getOperand(7).getImm())
5778 .getReg(0);
5779 default:
5780 llvm_unreachable("unhandled lane op");
5781 }
5782 };
5783
5784 Register DstReg = MI.getOperand(0).getReg();
5785 Register Src0 = MI.getOperand(2).getReg();
5786 Register Src1, Src2;
5787 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5788 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
5789 Src1 = MI.getOperand(3).getReg();
5790 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
5791 Src2 = MI.getOperand(4).getReg();
5792 }
5793 }
5794
5795 LLT Ty = MRI.getType(DstReg);
5796 unsigned Size = Ty.getSizeInBits();
5797
5798 unsigned SplitSize = 32;
5799 if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
5800 ST.hasDPALU_DPP() &&
5801 AMDGPU::isLegalDPALU_DPPControl(ST, MI.getOperand(4).getImm()))
5802 SplitSize = 64;
5803
5804 if (Size == SplitSize) {
5805 // Already legal
5806 return true;
5807 }
5808
5809 if (Size < 32) {
5810 Src0 = B.buildAnyExt(S32, Src0).getReg(0);
5811
5812 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5813 Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
5814
5815 if (IID == Intrinsic::amdgcn_writelane)
5816 Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
5817
5818 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
5819 B.buildTrunc(DstReg, LaneOpDst);
5820 MI.eraseFromParent();
5821 return true;
5822 }
5823
5824 if (Size % SplitSize != 0)
5825 return false;
5826
5827 LLT PartialResTy = LLT::scalar(SplitSize);
5828 bool NeedsBitcast = false;
5829 if (Ty.isVector()) {
5830 LLT EltTy = Ty.getElementType();
5831 unsigned EltSize = EltTy.getSizeInBits();
5832 if (EltSize == SplitSize) {
5833 PartialResTy = EltTy;
5834 } else if (EltSize == 16 || EltSize == 32) {
5835 unsigned NElem = SplitSize / EltSize;
5836 PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem));
5837 } else {
5838 // Handle all other cases via S32/S64 pieces
5839 NeedsBitcast = true;
5840 }
5841 }
5842
5843 SmallVector<Register, 4> PartialRes;
5844 unsigned NumParts = Size / SplitSize;
5845 MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
5846 MachineInstrBuilder Src1Parts, Src2Parts;
5847
5848 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5849 Src1Parts = B.buildUnmerge(PartialResTy, Src1);
5850
5851 if (IID == Intrinsic::amdgcn_writelane)
5852 Src2Parts = B.buildUnmerge(PartialResTy, Src2);
5853
5854 for (unsigned i = 0; i < NumParts; ++i) {
5855 Src0 = Src0Parts.getReg(i);
5856
5857 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5858 Src1 = Src1Parts.getReg(i);
5859
5860 if (IID == Intrinsic::amdgcn_writelane)
5861 Src2 = Src2Parts.getReg(i);
5862
5863 PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
5864 }
5865
5866 if (NeedsBitcast)
5867 B.buildBitcast(DstReg, B.buildMergeLikeInstr(
5868 LLT::scalar(Ty.getSizeInBits()), PartialRes));
5869 else
5870 B.buildMergeLikeInstr(DstReg, PartialRes);
5871
5872 MI.eraseFromParent();
5873 return true;
5874}
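// Splitting strategy used above: results narrower than 32 bits are
// any-extended to s32 and truncated back; wider values are unmerged into
// 32-bit pieces (or 64-bit pieces for update.dpp when the DP ALU DPP control
// is legal), the lane op is rebuilt per piece, and the pieces are re-merged,
// going through a scalar bitcast when the vector element size doesn't match
// the piece size.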
5875
5876 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
5877 MachineRegisterInfo &MRI,
5878 MachineIRBuilder &B) const {
5879 uint64_t Offset =
5880 ST.getTargetLowering()->getImplicitParameterOffset(
5881 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
5882 LLT DstTy = MRI.getType(DstReg);
5883 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
5884
5885 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
5886 if (!loadInputValue(KernargPtrReg, B,
5887 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5888 return false;
5889
5890 B.buildObjectPtrOffset(DstReg, KernargPtrReg,
5891 B.buildConstant(IdxTy, Offset).getReg(0));
5892 return true;
5893}
5894
5895/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
5896/// bits of the pointer and replace them with the stride argument, then
5897/// merge_values everything together. In the common case of a raw buffer (the
5898/// stride component is 0), we can just AND off the upper half.
5899 bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
5900 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5901 Register Result = MI.getOperand(0).getReg();
5902 Register Pointer = MI.getOperand(2).getReg();
5903 Register Stride = MI.getOperand(3).getReg();
5904 Register NumRecords = MI.getOperand(4).getReg();
5905 Register Flags = MI.getOperand(5).getReg();
5906
5907 LLT S32 = LLT::scalar(32);
5908 LLT S64 = LLT::scalar(64);
5909
5910 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5911
5912 auto ExtStride = B.buildAnyExt(S32, Stride);
5913
5914 if (ST.has45BitNumRecordsBufferResource()) {
5915 Register Zero = B.buildConstant(S32, 0).getReg(0);
5916 // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
5917 // num_records.
5918 LLT PtrIntTy = LLT::scalar(MRI.getType(Pointer).getSizeInBits());
5919 auto PointerInt = B.buildPtrToInt(PtrIntTy, Pointer);
5920 auto ExtPointer = B.buildAnyExtOrTrunc(S64, PointerInt);
5921 auto NumRecordsLHS = B.buildShl(S64, NumRecords, B.buildConstant(S32, 57));
5922 Register LowHalf = B.buildOr(S64, ExtPointer, NumRecordsLHS).getReg(0);
5923
5924 // Build the higher 64-bit value, which has the higher 38-bit num_records,
5925 // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
5926 auto NumRecordsRHS = B.buildLShr(S64, NumRecords, B.buildConstant(S32, 7));
5927 auto ShiftedStride = B.buildShl(S32, ExtStride, B.buildConstant(S32, 12));
5928 auto ExtShiftedStride =
5929 B.buildMergeValues(S64, {Zero, ShiftedStride.getReg(0)});
5930 auto ShiftedFlags = B.buildShl(S32, Flags, B.buildConstant(S32, 28));
5931 auto ExtShiftedFlags =
5932 B.buildMergeValues(S64, {Zero, ShiftedFlags.getReg(0)});
5933 auto CombinedFields = B.buildOr(S64, NumRecordsRHS, ExtShiftedStride);
5934 Register HighHalf =
5935 B.buildOr(S64, CombinedFields, ExtShiftedFlags).getReg(0);
5936 B.buildMergeValues(Result, {LowHalf, HighHalf});
5937 } else {
5938 NumRecords = B.buildTrunc(S32, NumRecords).getReg(0);
5939 auto Unmerge = B.buildUnmerge(S32, Pointer);
5940 auto LowHalf = Unmerge.getReg(0);
5941 auto HighHalf = Unmerge.getReg(1);
5942
5943 auto AndMask = B.buildConstant(S32, 0x0000ffff);
5944 auto Masked = B.buildAnd(S32, HighHalf, AndMask);
5945 auto ShiftConst = B.buildConstant(S32, 16);
5946 auto ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
5947 auto NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
5948 Register NewHighHalfReg = NewHighHalf.getReg(0);
5949 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
5950 }
5951
5952 MI.eraseFromParent();
5953 return true;
5954}
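// For the non-45-bit-num_records layout built above, the resulting v4i32
// descriptor is roughly: word0/word1 = 48-bit base address with the stride
// spliced into bits [63:48] (high half & 0xffff | stride << 16),
// word2 = num_records, word3 = flags.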
5955
5956 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
5957 MachineRegisterInfo &MRI,
5958 MachineIRBuilder &B) const {
5959 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5960 if (!MFI->isEntryFunction()) {
5961 return legalizePreloadedArgIntrin(MI, MRI, B,
5962 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
5963 }
5964
5965 Register DstReg = MI.getOperand(0).getReg();
5966 if (!getImplicitArgPtr(DstReg, MRI, B))
5967 return false;
5968
5969 MI.eraseFromParent();
5970 return true;
5971}
5972
5973 bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
5974 MachineRegisterInfo &MRI,
5975 MachineIRBuilder &B) const {
5976 Function &F = B.getMF().getFunction();
5977 std::optional<uint32_t> KnownSize =
5978 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
5979 if (KnownSize.has_value())
5980 B.buildConstant(DstReg, *KnownSize);
5981 return false;
5982}
5983
5984 bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
5985 MachineRegisterInfo &MRI,
5986 MachineIRBuilder &B) const {
5987
5988 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5989 if (!MFI->isEntryFunction()) {
5990 return legalizePreloadedArgIntrin(MI, MRI, B,
5991 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5992 }
5993
5994 Register DstReg = MI.getOperand(0).getReg();
5995 if (!getLDSKernelId(DstReg, MRI, B))
5996 return false;
5997
5998 MI.eraseFromParent();
5999 return true;
6000}
6001
6002 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
6003 MachineRegisterInfo &MRI,
6004 MachineIRBuilder &B,
6005 unsigned AddrSpace) const {
6006 const LLT S32 = LLT::scalar(32);
6007 auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());
6008 Register Hi32 = Unmerge.getReg(1);
6009
6010 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
6011 ST.hasGloballyAddressableScratch()) {
6012 Register FlatScratchBaseHi =
6013 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
6014 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
6015 .getReg(0);
6016 MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
6017 // Test bits 63..58 against the aperture address.
6018 Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);
6019 B.buildICmp(ICmpInst::ICMP_ULT, MI.getOperand(0), XOR,
6020 B.buildConstant(S32, 1u << 26));
6021 } else {
6022 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
6023 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
6024 }
6025 MI.eraseFromParent();
6026 return true;
6027}
6028
6029// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6030// offset (the offset that is included in bounds checking and swizzling, to be
6031// split between the instruction's voffset and immoffset fields) and soffset
6032// (the offset that is excluded from bounds checking and swizzling, to go in
6033// the instruction's soffset field). This function takes the first kind of
6034// offset and figures out how to split it between voffset and immoffset.
6035std::pair<Register, unsigned>
6036 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
6037 Register OrigOffset) const {
6038 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
6039 Register BaseReg;
6040 unsigned ImmOffset;
6041 const LLT S32 = LLT::scalar(32);
6042 MachineRegisterInfo &MRI = *B.getMRI();
6043
6044 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
6045 // being added, so we can only safely match a 32-bit addition with no unsigned
6046 // overflow.
6047 bool CheckNUW = AMDGPU::isGFX1250(ST);
6048 std::tie(BaseReg, ImmOffset) = AMDGPU::getBaseWithConstantOffset(
6049 MRI, OrigOffset, /*KnownBits=*/nullptr, CheckNUW);
6050
6051 // If BaseReg is a pointer, convert it to int.
6052 if (MRI.getType(BaseReg).isPointer())
6053 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
6054
6055 // If the immediate value is too big for the immoffset field, put only bits
6056 // that would normally fit in the immoffset field. The remaining value that
6057 // is copied/added for the voffset field is a large power of 2, and it
6058 // stands more chance of being CSEd with the copy/add for another similar
6059 // load/store.
6060 // However, do not do that rounding down if that is a negative
6061 // number, as it appears to be illegal to have a negative offset in the
6062 // vgpr, even if adding the immediate offset makes it positive.
6063 unsigned Overflow = ImmOffset & ~MaxImm;
6064 ImmOffset -= Overflow;
6065 if ((int32_t)Overflow < 0) {
6066 Overflow += ImmOffset;
6067 ImmOffset = 0;
6068 }
6069
6070 if (Overflow != 0) {
6071 if (!BaseReg) {
6072 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
6073 } else {
6074 auto OverflowVal = B.buildConstant(S32, Overflow);
6075 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
6076 }
6077 }
6078
6079 if (!BaseReg)
6080 BaseReg = B.buildConstant(S32, 0).getReg(0);
6081
6082 return std::pair(BaseReg, ImmOffset);
6083}
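// Worked example (assuming a subtarget where the maximum MUBUF immediate
// offset is 4095): a constant offset of 5000 gives Overflow = 4096, so the
// voffset register becomes base + 4096 and the instruction's immoffset field
// gets 904.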
6084
6085/// Handle register layout difference for f16 images for some subtargets.
6086 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
6087 MachineRegisterInfo &MRI,
6088 Register Reg,
6089 bool ImageStore) const {
6090 const LLT S16 = LLT::scalar(16);
6091 const LLT S32 = LLT::scalar(32);
6092 LLT StoreVT = MRI.getType(Reg);
6093 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
6094
6095 if (ST.hasUnpackedD16VMem()) {
6096 auto Unmerge = B.buildUnmerge(S16, Reg);
6097
6098 SmallVector<Register, 4> WideRegs;
6099 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6100 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
6101
6102 int NumElts = StoreVT.getNumElements();
6103
6104 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
6105 .getReg(0);
6106 }
6107
6108 if (ImageStore && ST.hasImageStoreD16Bug()) {
6109 if (StoreVT.getNumElements() == 2) {
6110 SmallVector<Register, 4> PackedRegs;
6111 Reg = B.buildBitcast(S32, Reg).getReg(0);
6112 PackedRegs.push_back(Reg);
6113 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
6114 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
6115 .getReg(0);
6116 }
6117
6118 if (StoreVT.getNumElements() == 3) {
6119 SmallVector<Register, 4> PackedRegs;
6120 auto Unmerge = B.buildUnmerge(S16, Reg);
6121 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6122 PackedRegs.push_back(Unmerge.getReg(I));
6123 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
6124 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
6125 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
6126 }
6127
6128 if (StoreVT.getNumElements() == 4) {
6129 SmallVector<Register, 4> PackedRegs;
6130 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
6131 auto Unmerge = B.buildUnmerge(S32, Reg);
6132 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6133 PackedRegs.push_back(Unmerge.getReg(I));
6134 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
6135 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
6136 .getReg(0);
6137 }
6138
6139 llvm_unreachable("invalid data type");
6140 }
6141
6142 if (StoreVT == LLT::fixed_vector(3, S16)) {
6143 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
6144 .getReg(0);
6145 }
6146 return Reg;
6147}
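// In other words: with unpacked D16, each 16-bit element gets its own 32-bit
// register; with the image-store D16 bug, 2/3/4 element vectors are bitcast
// and padded out to a whole number of dwords; otherwise only <3 x s16> needs
// padding to <4 x s16>.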
6148
6149 Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
6150 Register VData, LLT MemTy,
6151 bool IsFormat) const {
6152 MachineRegisterInfo *MRI = B.getMRI();
6153 LLT Ty = MRI->getType(VData);
6154
6155 const LLT S16 = LLT::scalar(16);
6156
6157 // Fixup buffer resources themselves needing to be v4i128.
6158 if (hasBufferRsrcWorkaround(Ty))
6159 return castBufferRsrcToV4I32(VData, B);
6160
6161 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6162 Ty = getBitcastRegisterType(Ty);
6163 VData = B.buildBitcast(Ty, VData).getReg(0);
6164 }
6165 // Fixup illegal register types for i8 stores.
6166 if (Ty == LLT::scalar(8) || Ty == S16) {
6167 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
6168 return AnyExt;
6169 }
6170
6171 if (Ty.isVector()) {
6172 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
6173 if (IsFormat)
6174 return handleD16VData(B, *MRI, VData);
6175 }
6176 }
6177
6178 return VData;
6179}
6180
6181 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
6182 LegalizerHelper &Helper,
6183 bool IsTyped,
6184 bool IsFormat) const {
6185 MachineIRBuilder &B = Helper.MIRBuilder;
6186 MachineRegisterInfo &MRI = *B.getMRI();
6187
6188 Register VData = MI.getOperand(1).getReg();
6189 LLT Ty = MRI.getType(VData);
6190 LLT EltTy = Ty.getScalarType();
6191 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6192 const LLT S32 = LLT::scalar(32);
6193
6194 MachineMemOperand *MMO = *MI.memoperands_begin();
6195 const int MemSize = MMO->getSize().getValue();
6196 LLT MemTy = MMO->getMemoryType();
6197
6198 VData = fixStoreSourceType(B, VData, MemTy, IsFormat);
6199
6200 castBufferRsrcArgToV4I32(MI, B, 2);
6201 Register RSrc = MI.getOperand(2).getReg();
6202
6203 unsigned ImmOffset;
6204
6205 // The typed intrinsics add an immediate after the registers.
6206 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6207
6208 // The struct intrinsic variants add one additional operand over raw.
6209 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6210 Register VIndex;
6211 int OpOffset = 0;
6212 if (HasVIndex) {
6213 VIndex = MI.getOperand(3).getReg();
6214 OpOffset = 1;
6215 } else {
6216 VIndex = B.buildConstant(S32, 0).getReg(0);
6217 }
6218
6219 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6220 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6221
6222 unsigned Format = 0;
6223 if (IsTyped) {
6224 Format = MI.getOperand(5 + OpOffset).getImm();
6225 ++OpOffset;
6226 }
6227
6228 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6229
6230 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6231
6232 unsigned Opc;
6233 if (IsTyped) {
6234 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6235 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6236 } else if (IsFormat) {
6237 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6238 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6239 } else {
6240 switch (MemSize) {
6241 case 1:
6242 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6243 break;
6244 case 2:
6245 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6246 break;
6247 default:
6248 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6249 break;
6250 }
6251 }
6252
6253 auto MIB = B.buildInstr(Opc)
6254 .addUse(VData) // vdata
6255 .addUse(RSrc) // rsrc
6256 .addUse(VIndex) // vindex
6257 .addUse(VOffset) // voffset
6258 .addUse(SOffset) // soffset
6259 .addImm(ImmOffset); // offset(imm)
6260
6261 if (IsTyped)
6262 MIB.addImm(Format);
6263
6264 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6265 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6266 .addMemOperand(MMO);
6267
6268 MI.eraseFromParent();
6269 return true;
6270}
6271
6272static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
6273 Register VIndex, Register VOffset, Register SOffset,
6274 unsigned ImmOffset, unsigned Format,
6275 unsigned AuxiliaryData, MachineMemOperand *MMO,
6276 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
6277 auto MIB = B.buildInstr(Opc)
6278 .addDef(LoadDstReg) // vdata
6279 .addUse(RSrc) // rsrc
6280 .addUse(VIndex) // vindex
6281 .addUse(VOffset) // voffset
6282 .addUse(SOffset) // soffset
6283 .addImm(ImmOffset); // offset(imm)
6284
6285 if (IsTyped)
6286 MIB.addImm(Format);
6287
6288 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6289 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6290 .addMemOperand(MMO);
6291}
6292
6293 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
6294 LegalizerHelper &Helper,
6295 bool IsFormat,
6296 bool IsTyped) const {
6297 MachineIRBuilder &B = Helper.MIRBuilder;
6298 MachineRegisterInfo &MRI = *B.getMRI();
6299 GISelChangeObserver &Observer = Helper.Observer;
6300
6301 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
6302 MachineMemOperand *MMO = *MI.memoperands_begin();
6303 const LLT MemTy = MMO->getMemoryType();
6304 const LLT S32 = LLT::scalar(32);
6305
6306 Register Dst = MI.getOperand(0).getReg();
6307
6308 Register StatusDst;
6309 int OpOffset = 0;
6310 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
6311 bool IsTFE = MI.getNumExplicitDefs() == 2;
6312 if (IsTFE) {
6313 StatusDst = MI.getOperand(1).getReg();
6314 ++OpOffset;
6315 }
6316
6317 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
6318 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
6319
6320 // The typed intrinsics add an immediate after the registers.
6321 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6322
6323 // The struct intrinsic variants add one additional operand over raw.
6324 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
6325 Register VIndex;
6326 if (HasVIndex) {
6327 VIndex = MI.getOperand(3 + OpOffset).getReg();
6328 ++OpOffset;
6329 } else {
6330 VIndex = B.buildConstant(S32, 0).getReg(0);
6331 }
6332
6333 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6334 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6335
6336 unsigned Format = 0;
6337 if (IsTyped) {
6338 Format = MI.getOperand(5 + OpOffset).getImm();
6339 ++OpOffset;
6340 }
6341
6342 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6343 unsigned ImmOffset;
6344
6345 LLT Ty = MRI.getType(Dst);
6346 // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
6347 // logic doesn't have to handle that case.
6348 if (hasBufferRsrcWorkaround(Ty)) {
6349 Observer.changingInstr(MI);
6350 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
6351 Observer.changedInstr(MI);
6352 Dst = MI.getOperand(0).getReg();
6353 B.setInsertPt(B.getMBB(), MI);
6354 }
6355 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6356 Ty = getBitcastRegisterType(Ty);
6357 Observer.changingInstr(MI);
6358 Helper.bitcastDst(MI, Ty, 0);
6359 Observer.changedInstr(MI);
6360 Dst = MI.getOperand(0).getReg();
6361 B.setInsertPt(B.getMBB(), MI);
6362 }
6363
6364 LLT EltTy = Ty.getScalarType();
6365 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6366 const bool Unpacked = ST.hasUnpackedD16VMem();
6367
6368 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6369
6370 unsigned Opc;
6371
6372 // TODO: Support TFE for typed and narrow loads.
6373 if (IsTyped) {
6374 if (IsTFE)
6375 return false;
6376 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6377 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6378 } else if (IsFormat) {
6379 if (IsD16) {
6380 if (IsTFE)
6381 return false;
6382 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6383 } else {
6384 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6385 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6386 }
6387 } else {
6388 switch (MemTy.getSizeInBits()) {
6389 case 8:
6390 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6391 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6392 break;
6393 case 16:
6394 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6395 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6396 break;
6397 default:
6398 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6399 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6400 break;
6401 }
6402 }
6403
6404 if (IsTFE) {
6405 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
6406 unsigned NumLoadDWords = NumValueDWords + 1;
6407 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
6408 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6409 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6410 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6411 if (MemTy.getSizeInBits() < 32) {
6412 Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6413 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6414 B.buildTrunc(Dst, ExtDst);
6415 } else if (NumValueDWords == 1) {
6416 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6417 } else {
6418 SmallVector<Register, 5> LoadElts;
6419 for (unsigned I = 0; I != NumValueDWords; ++I)
6420 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6421 LoadElts.push_back(StatusDst);
6422 B.buildUnmerge(LoadElts, LoadDstReg);
6423 LoadElts.truncate(NumValueDWords);
6424 B.buildMergeLikeInstr(Dst, LoadElts);
6425 }
6426 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6427 (IsD16 && !Ty.isVector())) {
6428 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6429 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6430 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6431 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6432 B.buildTrunc(Dst, LoadDstReg);
6433 } else if (Unpacked && IsD16 && Ty.isVector()) {
6434 LLT UnpackedTy = Ty.changeElementSize(32);
6435 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6436 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6437 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6438 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6439 // FIXME: G_TRUNC should work, but legalization currently fails
6440 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
6441 SmallVector<Register, 4> Repack;
6442 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6443 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
6444 B.buildMergeLikeInstr(Dst, Repack);
6445 } else {
6446 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6447 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6448 }
6449
6450 MI.eraseFromParent();
6451 return true;
6452}
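// For TFE loads the result is widened by one extra dword that receives the
// status value, and the pieces are unmerged afterwards. Sub-dword results are
// loaded into an s32 and truncated, and unpacked D16 vectors are reassembled
// from per-element 32-bit loads.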
6453
6454static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6455 switch (IntrID) {
6456 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6457 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6458 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6459 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6460 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6461 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6462 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6463 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6464 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6465 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6466 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6467 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6468 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6469 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6470 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6471 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6472 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6473 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6474 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6475 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6476 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6477 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6478 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6479 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6480 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6481 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6482 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6483 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6484 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6485 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6486 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6487 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6488 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6489 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6490 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6491 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6492 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6493 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6494 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6495 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6496 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6497 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6498 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6499 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6500 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6501 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6502 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6503 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6504 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6505 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6506 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6507 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6508 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6509 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6510 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6511 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6512 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6513 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6514 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6515 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6516 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6517 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6518 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6519 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6520 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6521 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6522 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6523 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6524 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6525 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6526 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6527 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6528 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6529 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6530 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6531 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6532 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6533 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6534 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6535 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6536 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6537 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6538 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6539 default:
6540 llvm_unreachable("unhandled atomic opcode");
6541 }
6542}
6543
6544 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
6545 MachineIRBuilder &B,
6546 Intrinsic::ID IID) const {
6547 const bool IsCmpSwap =
6548 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6549 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6550 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6551 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6552
6553 Register Dst = MI.getOperand(0).getReg();
6554 // Since we don't have 128-bit atomics, we don't need to handle the case of
6555 // p8 arguments to the atomic itself.
6556 Register VData = MI.getOperand(2).getReg();
6557
6558 Register CmpVal;
6559 int OpOffset = 0;
6560
6561 if (IsCmpSwap) {
6562 CmpVal = MI.getOperand(3).getReg();
6563 ++OpOffset;
6564 }
6565
6566 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
6567 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
6568 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6569
6570 // The struct intrinsic variants add one additional operand over raw.
6571 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6572 Register VIndex;
6573 if (HasVIndex) {
6574 VIndex = MI.getOperand(4 + OpOffset).getReg();
6575 ++OpOffset;
6576 } else {
6577 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
6578 }
6579
6580 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
6581 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
6582 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
6583
6584 MachineMemOperand *MMO = *MI.memoperands_begin();
6585
6586 unsigned ImmOffset;
6587 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6588
6589 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
6590 .addDef(Dst)
6591 .addUse(VData); // vdata
6592
6593 if (IsCmpSwap)
6594 MIB.addReg(CmpVal);
6595
6596 MIB.addUse(RSrc) // rsrc
6597 .addUse(VIndex) // vindex
6598 .addUse(VOffset) // voffset
6599 .addUse(SOffset) // soffset
6600 .addImm(ImmOffset) // offset(imm)
6601 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6602 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6603 .addMemOperand(MMO);
6604
6605 MI.eraseFromParent();
6606 return true;
6607}
6608
6609/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6610/// vector with s16 typed elements.
6611 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
6612 SmallVectorImpl<Register> &PackedAddrs,
6613 unsigned ArgOffset,
6614 const AMDGPU::ImageDimIntrinsicInfo *Intr,
6615 bool IsA16, bool IsG16) {
6616 const LLT S16 = LLT::scalar(16);
6617 const LLT V2S16 = LLT::fixed_vector(2, 16);
6618 auto EndIdx = Intr->VAddrEnd;
6619
6620 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6621 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6622 if (!SrcOp.isReg())
6623 continue; // _L to _LZ may have eliminated this.
6624
6625 Register AddrReg = SrcOp.getReg();
6626
6627 if ((I < Intr->GradientStart) ||
6628 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6629 (I >= Intr->CoordStart && !IsA16)) {
6630 if ((I < Intr->GradientStart) && IsA16 &&
6631 (B.getMRI()->getType(AddrReg) == S16)) {
6632 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6633 // Special handling of bias when A16 is on. Bias is of type half but
6634 // occupies full 32-bit.
6635 PackedAddrs.push_back(
6636 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6637 .getReg(0));
6638 } else {
6639 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6640 "Bias needs to be converted to 16 bit in A16 mode");
6641 // Handle any gradient or coordinate operands that should not be packed
6642 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
6643 PackedAddrs.push_back(AddrReg);
6644 }
6645 } else {
6646 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6647 // derivatives dx/dh and dx/dv are packed with undef.
6648 if (((I + 1) >= EndIdx) ||
6649 ((Intr->NumGradients / 2) % 2 == 1 &&
6650 (I == static_cast<unsigned>(Intr->GradientStart +
6651 (Intr->NumGradients / 2) - 1) ||
6652 I == static_cast<unsigned>(Intr->GradientStart +
6653 Intr->NumGradients - 1))) ||
6654 // Check for _L to _LZ optimization
6655 !MI.getOperand(ArgOffset + I + 1).isReg()) {
6656 PackedAddrs.push_back(
6657 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6658 .getReg(0));
6659 } else {
6660 PackedAddrs.push_back(
6661 B.buildBuildVector(
6662 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6663 .getReg(0));
6664 ++I;
6665 }
6666 }
6667 }
6668}
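// Packing rule used above: 16-bit gradients/coordinates are paired into
// v2s16 registers; the bias argument under A16 and any trailing odd element
// are padded with an undef high half.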
6669
6670/// Convert from separate vaddr components to a single vector address register,
6671/// and replace the remaining operands with $noreg.
6672 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
6673 int DimIdx, int NumVAddrs) {
6674 const LLT S32 = LLT::scalar(32);
6675 (void)S32;
6676 SmallVector<Register, 8> AddrRegs;
6677 for (int I = 0; I != NumVAddrs; ++I) {
6678 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6679 if (SrcOp.isReg()) {
6680 AddrRegs.push_back(SrcOp.getReg());
6681 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6682 }
6683 }
6684
6685 int NumAddrRegs = AddrRegs.size();
6686 if (NumAddrRegs != 1) {
6687 auto VAddr =
6688 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
6689 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
6690 }
6691
6692 for (int I = 1; I != NumVAddrs; ++I) {
6693 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6694 if (SrcOp.isReg())
6695 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
6696 }
6697}
6698
6699/// Rewrite image intrinsics to use register layouts expected by the subtarget.
6700///
6701/// Depending on the subtarget, load/store with 16-bit element data need to be
6702/// rewritten to use the low half of 32-bit registers, or directly use a packed
6703/// layout. 16-bit addresses should also sometimes be packed into 32-bit
6704/// registers.
6705///
6706/// We don't want to directly select image instructions just yet, but also want
6707 /// to expose all register repacking to the legalizer/combiners. We also don't
6708/// want a selected instruction entering RegBankSelect. In order to avoid
6709/// defining a multitude of intermediate image instructions, directly hack on
6710/// the intrinsic's arguments. In cases like a16 addresses, this requires
6711/// padding now unnecessary arguments with $noreg.
6712 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
6713 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
6714 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6715
6716 const MachineFunction &MF = *MI.getMF();
6717 const unsigned NumDefs = MI.getNumExplicitDefs();
6718 const unsigned ArgOffset = NumDefs + 1;
6719 bool IsTFE = NumDefs == 2;
6720 // We are only processing the operands of d16 image operations on subtargets
6721 // that use the unpacked register layout, or need to repack the TFE result.
6722
6723 // TODO: Do we need to guard against already legalized intrinsics?
6724 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6725 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
6726
6727 MachineRegisterInfo *MRI = B.getMRI();
6728 const LLT S32 = LLT::scalar(32);
6729 const LLT S16 = LLT::scalar(16);
6730 const LLT V2S16 = LLT::fixed_vector(2, 16);
6731
6732 unsigned DMask = 0;
6733 Register VData;
6734 LLT Ty;
6735
6736 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
6737 VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
6738 Ty = MRI->getType(VData);
6739 }
6740
6741 const bool IsAtomicPacked16Bit =
6742 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6743 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6744
6745 // Check for 16 bit addresses and pack if true.
6746 LLT GradTy =
6747 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
6748 LLT AddrTy =
6749 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
6750 const bool IsG16 =
6751 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6752 const bool IsA16 = AddrTy == S16;
6753 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
6754
6755 int DMaskLanes = 0;
6756 if (!BaseOpcode->Atomic) {
6757 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
6758 if (BaseOpcode->Gather4) {
6759 DMaskLanes = 4;
6760 } else if (DMask != 0) {
6761 DMaskLanes = llvm::popcount(DMask);
6762 } else if (!IsTFE && !BaseOpcode->Store) {
6763 // If dmask is 0, this is a no-op load. This can be eliminated.
6764 B.buildUndef(MI.getOperand(0));
6765 MI.eraseFromParent();
6766 return true;
6767 }
6768 }
6769
6770 Observer.changingInstr(MI);
6771 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
6772
6773 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6774 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6775 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6776 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6777 unsigned NewOpcode = LoadOpcode;
6778 if (BaseOpcode->Store)
6779 NewOpcode = StoreOpcode;
6780 else if (BaseOpcode->NoReturn)
6781 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
6782
6783 // Track that we legalized this
6784 MI.setDesc(B.getTII().get(NewOpcode));
6785
6786 // Expecting to get an error flag since TFC is on and dmask is 0. Force
6787 // dmask to be at least 1, otherwise the instruction will fail.
6788 if (IsTFE && DMask == 0) {
6789 DMask = 0x1;
6790 DMaskLanes = 1;
6791 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
6792 }
6793
6794 if (BaseOpcode->Atomic) {
6795 Register VData0 = MI.getOperand(2).getReg();
6796 LLT Ty = MRI->getType(VData0);
6797
6798 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6799 if (Ty.isVector() && !IsAtomicPacked16Bit)
6800 return false;
6801
6802 if (BaseOpcode->AtomicX2) {
6803 Register VData1 = MI.getOperand(3).getReg();
6804 // The two values are packed in one register.
6805 LLT PackedTy = LLT::fixed_vector(2, Ty);
6806 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
6807 MI.getOperand(2).setReg(Concat.getReg(0));
6808 MI.getOperand(3).setReg(AMDGPU::NoRegister);
6809 }
6810 }
6811
6812 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6813
6814 // Rewrite the addressing register layout before doing anything else.
6815 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6816 // 16 bit gradients are supported, but are tied to the A16 control
6817 // so both gradients and addresses must be 16 bit
6818 return false;
6819 }
6820
6821 if (IsA16 && !ST.hasA16()) {
6822 // A16 not supported
6823 return false;
6824 }
6825
6826 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
6827 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
6828
6829 if (IsA16 || IsG16) {
6830 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6831 // instructions expect VGPR_32
6832 SmallVector<Register, 4> PackedRegs;
6833
6834 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6835
6836 // See also below in the non-a16 branch
6837 const bool UseNSA = ST.hasNSAEncoding() &&
6838 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6839 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6840 const bool UsePartialNSA =
6841 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6842
6843 if (UsePartialNSA) {
6844 // Pack registers that would go over NSAMaxSize into last VAddr register
6845 LLT PackedAddrTy =
6846 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
6847 auto Concat = B.buildConcatVectors(
6848 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
6849 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
6850 PackedRegs.resize(NSAMaxSize);
6851 } else if (!UseNSA && PackedRegs.size() > 1) {
6852 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
6853 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
6854 PackedRegs[0] = Concat.getReg(0);
6855 PackedRegs.resize(1);
6856 }
6857
6858 const unsigned NumPacked = PackedRegs.size();
6859 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6860 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6861 if (!SrcOp.isReg()) {
6862 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
6863 continue;
6864 }
6865
6866 assert(SrcOp.getReg() != AMDGPU::NoRegister);
6867
6868 if (I - Intr->VAddrStart < NumPacked)
6869 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6870 else
6871 SrcOp.setReg(AMDGPU::NoRegister);
6872 }
6873 } else {
6874 // If the register allocator cannot place the address registers contiguously
6875 // without introducing moves, then using the non-sequential address encoding
6876 // is always preferable, since it saves VALU instructions and is usually a
6877 // wash in terms of code size or even better.
6878 //
6879 // However, we currently have no way of hinting to the register allocator
6880 // that MIMG addresses should be placed contiguously when it is possible to
6881 // do so, so force non-NSA for the common 2-address case as a heuristic.
6882 //
6883 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6884 // allocation when possible.
6885 //
6886 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
6887 // set of the remaining addresses.
6888 const bool UseNSA = ST.hasNSAEncoding() &&
6889 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
6890 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
6891 const bool UsePartialNSA =
6892 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
6893
6894 if (UsePartialNSA) {
6895 convertImageAddrToPacked(B, MI,
6896 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
6897 Intr->NumVAddrs - NSAMaxSize + 1);
6898 } else if (!UseNSA && Intr->NumVAddrs > 1) {
6899 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
6900 Intr->NumVAddrs);
6901 }
6902 }
6903
6904 int Flags = 0;
6905 if (IsA16)
6906 Flags |= 1;
6907 if (IsG16)
6908 Flags |= 2;
6909 MI.addOperand(MachineOperand::CreateImm(Flags));
6910
6911 if (BaseOpcode->NoReturn) { // No TFE for stores?
6912 // TODO: Handle dmask trim
6913 if (!Ty.isVector() || !IsD16)
6914 return true;
6915
6916 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
6917 if (RepackedReg != VData) {
6918 MI.getOperand(1).setReg(RepackedReg);
6919 }
6920
6921 return true;
6922 }
6923
6924 Register DstReg = MI.getOperand(0).getReg();
6925 const LLT EltTy = Ty.getScalarType();
6926 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
6927
6928 // Confirm that the return type is large enough for the dmask specified
6929 if (NumElts < DMaskLanes)
6930 return false;
6931
6932 if (NumElts > 4 || DMaskLanes > 4)
6933 return false;
6934
6935 // Image atomic instructions use DMask to specify how many bits of
6936 // input/output data they have: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
6937 // DMaskLanes defaults to '0' for image atomics.
6938 // We must make sure that atomic variants (especially packed ones) are not
6939 // truncated from v2s16 or v4s16 to s16.
6940 //
6941 // ChangeElementCount will be needed for image load where Ty is always scalar.
6942 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6943 const LLT AdjustedTy =
6944 DMaskLanes == 0
6945 ? Ty
6946 : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
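 // For example (illustrative values): a dmask of 0b0101 gives DMaskLanes = 2,
 // so with Ty = s32 the adjusted type becomes <2 x s32>; for an image atomic,
 // DMaskLanes stays 0 and Ty (e.g. v2s16) is kept unchanged.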
6947
6948 // The raw dword-aligned data component of the load. The only legal cases
6949 // where this matters should be when using the packed D16 format, for
6950 // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
6951 LLT RoundedTy;
6952
6953 // S32 vector to cover all data, plus TFE result element.
6954 LLT TFETy;
6955
6956 // Register type to use for each loaded component. Will be S32 or V2S16.
6957 LLT RegTy;
6958
6959 if (IsD16 && ST.hasUnpackedD16VMem()) {
6960 RoundedTy =
6961 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
6962 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
6963 RegTy = S32;
6964 } else {
6965 unsigned EltSize = EltTy.getSizeInBits();
6966 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
6967 unsigned RoundedSize = 32 * RoundedElts;
6968 RoundedTy = LLT::scalarOrVector(
6969 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
6970 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
6971 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
6972 }
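 // Worked example for the packed-D16 case above (assumed <3 x s16> result):
 // AdjustedTy is 48 bits, so RoundedElts = (48 + 31) / 32 = 2, RoundedSize = 64,
 // RoundedTy is <4 x s16>, TFETy is <3 x s32>, and RegTy is v2s16 when TFE is
 // off (s32 when it is on).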
6973
6974 // The return type does not need adjustment.
6975 // TODO: Should we change s16 case to s32 or <2 x s16>?
6976 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
6977 return true;
6978
6979 Register Dst1Reg;
6980
6981 // Insert after the instruction.
6982 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
6983
6984 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
6985 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
6986 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6987 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
6988
6989 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
6990
6991 MI.getOperand(0).setReg(NewResultReg);
6992
6993 // In the IR, TFE is supposed to be used with a 2 element struct return
6994 // type. The instruction really returns these two values in one contiguous
6995 // register, with one additional dword beyond the loaded data. Rewrite the
6996 // return type to use a single register result.
6997
6998 if (IsTFE) {
6999 Dst1Reg = MI.getOperand(1).getReg();
7000 if (MRI->getType(Dst1Reg) != S32)
7001 return false;
7002
7003 // TODO: Make sure the TFE operand bit is set.
7004 MI.removeOperand(1);
7005
7006 // Handle the easy case that requires no repack instructions.
7007 if (Ty == S32) {
7008 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
7009 return true;
7010 }
7011 }
7012
7013 // Now figure out how to copy the new result register back into the old
7014 // result.
7015 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
7016
7017 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7018
7019 if (ResultNumRegs == 1) {
7020 assert(!IsTFE);
7021 ResultRegs[0] = NewResultReg;
7022 } else {
7023 // We have to repack into a new vector of some kind.
7024 for (int I = 0; I != NumDataRegs; ++I)
7025 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
7026 B.buildUnmerge(ResultRegs, NewResultReg);
7027
7028 // Drop the final TFE element to get the data part. The TFE result is
7029 // directly written to the right place already.
7030 if (IsTFE)
7031 ResultRegs.resize(NumDataRegs);
7032 }
7033
7034 // For an s16 scalar result, we form an s32 result with a truncate regardless
7035 // of packed vs. unpacked.
7036 if (IsD16 && !Ty.isVector()) {
7037 B.buildTrunc(DstReg, ResultRegs[0]);
7038 return true;
7039 }
7040
7041 // Avoid a build/concat_vector of 1 entry.
7042 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7043 B.buildBitcast(DstReg, ResultRegs[0]);
7044 return true;
7045 }
7046
7047 assert(Ty.isVector());
7048
7049 if (IsD16) {
7050 // For packed D16 results with TFE enabled, all the data components are
7051 // S32. Cast back to the expected type.
7052 //
7053 // TODO: We don't really need to load s32 elements. We would only need one
7054 // cast for the TFE result if a multiple of v2s16 was used.
7055 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
7056 for (Register &Reg : ResultRegs)
7057 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
7058 } else if (ST.hasUnpackedD16VMem()) {
7059 for (Register &Reg : ResultRegs)
7060 Reg = B.buildTrunc(S16, Reg).getReg(0);
7061 }
7062 }
7063
7064 auto padWithUndef = [&](LLT Ty, int NumElts) {
7065 if (NumElts == 0)
7066 return;
7067 Register Undef = B.buildUndef(Ty).getReg(0);
7068 for (int I = 0; I != NumElts; ++I)
7069 ResultRegs.push_back(Undef);
7070 };
7071
7072 // Pad out any elements eliminated due to the dmask.
7073 LLT ResTy = MRI->getType(ResultRegs[0]);
7074 if (!ResTy.isVector()) {
7075 padWithUndef(ResTy, NumElts - ResultRegs.size());
7076 B.buildBuildVector(DstReg, ResultRegs);
7077 return true;
7078 }
7079
7080 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
7081 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7082
7083 // Deal with the one annoying legal case.
7084 const LLT V3S16 = LLT::fixed_vector(3, 16);
7085 if (Ty == V3S16) {
7086 if (IsTFE) {
7087 if (ResultRegs.size() == 1) {
7088 NewResultReg = ResultRegs[0];
7089 } else if (ResultRegs.size() == 2) {
7090 LLT V4S16 = LLT::fixed_vector(4, 16);
7091 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
7092 } else {
7093 return false;
7094 }
7095 }
7096
7097 if (MRI->getType(DstReg).getNumElements() <
7098 MRI->getType(NewResultReg).getNumElements()) {
7099 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
7100 } else {
7101 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
7102 }
7103 return true;
7104 }
7105
7106 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
7107 B.buildConcatVectors(DstReg, ResultRegs);
7108 return true;
7109}
7110
7111 bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
7112 MachineInstr &MI) const {
7113 MachineIRBuilder &B = Helper.MIRBuilder;
7114 GISelChangeObserver &Observer = Helper.Observer;
7115
7116 Register OrigDst = MI.getOperand(0).getReg();
7117 Register Dst;
7118 LLT Ty = B.getMRI()->getType(OrigDst);
7119 unsigned Size = Ty.getSizeInBits();
7120 MachineFunction &MF = B.getMF();
7121 unsigned Opc = 0;
7122 if (Size < 32 && ST.hasScalarSubwordLoads()) {
7123 assert(Size == 8 || Size == 16);
7124 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7125 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7126 // The 8-bit and 16-bit scalar buffer load instructions have a 32-bit
7127 // destination register.
7128 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
7129 } else {
7130 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7131 Dst = OrigDst;
7132 }
7133
7134 Observer.changingInstr(MI);
7135
7136 // Handle needing to s.buffer.load() a p8 value.
7137 if (hasBufferRsrcWorkaround(Ty)) {
7138 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
7139 B.setInsertPt(B.getMBB(), MI);
7140 }
7141 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
7142 Ty = getBitcastRegisterType(Ty);
7143 Helper.bitcastDst(MI, Ty, 0);
7144 B.setInsertPt(B.getMBB(), MI);
7145 }
7146
7147 // FIXME: We don't really need this intermediate instruction. The intrinsic
7148 // should be fixed to have a memory operand. Since it's readnone, we're not
7149 // allowed to add one.
7150 MI.setDesc(B.getTII().get(Opc));
7151 MI.removeOperand(1); // Remove intrinsic ID
7152
7153 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
7154 const unsigned MemSize = (Size + 7) / 8;
7155 const Align MemAlign = B.getDataLayout().getABITypeAlign(
7156 getTypeForLLT(Ty, MF.getFunction().getContext()));
7157 MachineMemOperand *MMO = MF.getMachineMemOperand(
7158 MachinePointerInfo(),
7159 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
7160 MachineMemOperand::MOInvariant,
7161 MemSize, MemAlign);
7162 MI.addMemOperand(MF, MMO);
7163 if (Dst != OrigDst) {
7164 MI.getOperand(0).setReg(Dst);
7165 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
7166 B.buildTrunc(OrigDst, Dst);
7167 }
7168
7169 // If we don't have 96-bit result scalar loads, widening to 128-bit should
7170 // always be legal. We may need to restore this to a 96-bit result if it turns
7171 // out this needs to be converted to a vector load during RegBankSelect.
7172 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
7173 if (Ty.isVector())
7174 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
7175 else
7176 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
7177 }
7178
7179 Observer.changedInstr(MI);
7180 return true;
7181}
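// Illustrative sketch of the subword path above (assumed s16 result on a
// subtarget with scalar subword loads): the intrinsic is rewritten to
// G_AMDGPU_S_BUFFER_LOAD_USHORT with a fresh s32 destination, an MMO covering
// 2 bytes is attached, and a G_TRUNC copies the value back into the original
// s16 register.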
7182
7183 bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper,
7184 MachineInstr &MI) const {
7185 MachineIRBuilder &B = Helper.MIRBuilder;
7186 GISelChangeObserver &Observer = Helper.Observer;
7187 Observer.changingInstr(MI);
7188 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7189 MI.removeOperand(0); // Remove intrinsic ID
7190 castBufferRsrcArgToV4I32(MI, B, 0);
7191 Observer.changedInstr(MI);
7192 return true;
7193}
7194
7195// TODO: Move to selection
7196 bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
7197 MachineRegisterInfo &MRI,
7198 MachineIRBuilder &B) const {
7199 if (!ST.isTrapHandlerEnabled() ||
7200 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7201 return legalizeTrapEndpgm(MI, MRI, B);
7202
7203 return ST.supportsGetDoorbellID() ?
7204 legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
7205}
7206
7207 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
7208 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
7209 const DebugLoc &DL = MI.getDebugLoc();
7210 MachineBasicBlock &BB = B.getMBB();
7211 MachineFunction *MF = BB.getParent();
7212
7213 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
7214 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7215 .addImm(0);
7216 MI.eraseFromParent();
7217 return true;
7218 }
7219
7220 // We need a block split to make the real endpgm a terminator. We also don't
7221 // want to break phis in successor blocks, so we can't just delete to the
7222 // end of the block.
7223 BB.splitAt(MI, false /*UpdateLiveIns*/);
7224 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
7225 MF->push_back(TrapBB);
7226 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7227 .addImm(0);
7228 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7229 .addMBB(TrapBB);
7230
7231 BB.addSuccessor(TrapBB);
7232 MI.eraseFromParent();
7233 return true;
7234}
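// Resulting CFG sketch for the split case above (block names illustrative):
// the original block ends with S_CBRANCH_EXECNZ %trap.bb, the new %trap.bb
// holds S_ENDPGM 0, and the block produced by splitAt() continues with the
// instructions that followed the trap.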
7235
7236 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
7237 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
7238 MachineFunction &MF = B.getMF();
7239 const LLT S64 = LLT::scalar(64);
7240
7241 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7242 // For code object version 5, queue_ptr is passed through implicit kernarg.
7243 if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
7244 AMDGPU::AMDHSA_COV5) {
7245 AMDGPUTargetLowering::ImplicitParameter Param =
7246 AMDGPUTargetLowering::QUEUE_PTR;
7247 uint64_t Offset =
7248 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
7249
7250 Register KernargPtrReg = MRI.createGenericVirtualRegister(
7252
7253 if (!loadInputValue(KernargPtrReg, B,
7255 return false;
7256
7257 // TODO: can we be smarter about machine pointer info?
7260 PtrInfo,
7264
7265 // Pointer address
7266 Register LoadAddr = MRI.createGenericVirtualRegister(
7268 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7269 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
7270 // Load address
7271 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
7272 B.buildCopy(SGPR01, Temp);
7273 B.buildInstr(AMDGPU::S_TRAP)
7274 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7275 .addReg(SGPR01, RegState::Implicit);
7276 MI.eraseFromParent();
7277 return true;
7278 }
7279
7280 // Pass queue pointer to trap handler as input, and insert trap instruction
7281 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
7282 Register LiveIn =
7283 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
7285 return false;
7286
7287 B.buildCopy(SGPR01, LiveIn);
7288 B.buildInstr(AMDGPU::S_TRAP)
7289 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7290 .addReg(SGPR01, RegState::Implicit);
7291
7292 MI.eraseFromParent();
7293 return true;
7294}
7295
7296 bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
7297 MachineRegisterInfo &MRI,
7298 MachineIRBuilder &B) const {
7299 // We need to simulate the 's_trap 2' instruction on targets that run in
7300 // PRIV=1 (where it is treated as a nop).
7301 if (ST.hasPrivEnabledTrap2NopBug()) {
7302 ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
7303 MI.getDebugLoc());
7304 MI.eraseFromParent();
7305 return true;
7306 }
7307
7308 B.buildInstr(AMDGPU::S_TRAP)
7309 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
7310 MI.eraseFromParent();
7311 return true;
7312}
7313
7314 bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
7315 MachineRegisterInfo &MRI,
7316 MachineIRBuilder &B) const {
7317 // If this is a non-HSA path or the trap handler is disabled, report a
7318 // warning accordingly.
7319 if (!ST.isTrapHandlerEnabled() ||
7320 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7321 Function &Fn = B.getMF().getFunction();
7322 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
7323 Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));
7324 } else {
7325 // Insert debug-trap instruction
7326 B.buildInstr(AMDGPU::S_TRAP)
7327 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
7328 }
7329
7330 MI.eraseFromParent();
7331 return true;
7332}
7333
7334 bool AMDGPULegalizerInfo::legalizeBVHIntersectRayIntrinsic(
7335 MachineInstr &MI, MachineIRBuilder &B) const {
7336 MachineRegisterInfo &MRI = *B.getMRI();
7337 const LLT S16 = LLT::scalar(16);
7338 const LLT S32 = LLT::scalar(32);
7339 const LLT V2S16 = LLT::fixed_vector(2, 16);
7340 const LLT V3S32 = LLT::fixed_vector(3, 32);
7341
7342 Register DstReg = MI.getOperand(0).getReg();
7343 Register NodePtr = MI.getOperand(2).getReg();
7344 Register RayExtent = MI.getOperand(3).getReg();
7345 Register RayOrigin = MI.getOperand(4).getReg();
7346 Register RayDir = MI.getOperand(5).getReg();
7347 Register RayInvDir = MI.getOperand(6).getReg();
7348 Register TDescr = MI.getOperand(7).getReg();
7349
7350 if (!ST.hasGFX10_AEncoding()) {
7351 Function &Fn = B.getMF().getFunction();
7352 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
7353 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7354 return false;
7355 }
7356
7357 const bool IsGFX11 = AMDGPU::isGFX11(ST);
7358 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
7359 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
7360 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
7361 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
7362 const unsigned NumVDataDwords = 4;
7363 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7364 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7365 const bool UseNSA =
7366 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7367
7368 const unsigned BaseOpcodes[2][2] = {
7369 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7370 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7371 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7372 int Opcode;
7373 if (UseNSA) {
7374 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7375 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7376 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7377 : AMDGPU::MIMGEncGfx10NSA,
7378 NumVDataDwords, NumVAddrDwords);
7379 } else {
7380 assert(!IsGFX12Plus);
7381 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7382 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7383 : AMDGPU::MIMGEncGfx10Default,
7384 NumVDataDwords, NumVAddrDwords);
7385 }
7386 assert(Opcode != -1);
7387
7388 SmallVector<Register, 12> Ops;
7389 if (UseNSA && IsGFX11Plus) {
7390 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
7391 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7392 auto Merged = B.buildMergeLikeInstr(
7393 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7394 Ops.push_back(Merged.getReg(0));
7395 };
7396
7397 Ops.push_back(NodePtr);
7398 Ops.push_back(RayExtent);
7399 packLanes(RayOrigin);
7400
7401 if (IsA16) {
7402 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7403 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7404 auto MergedDir = B.buildMergeLikeInstr(
7405 V3S32,
7406 {B.buildBitcast(
7407 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
7408 UnmergeRayDir.getReg(0)}))
7409 .getReg(0),
7410 B.buildBitcast(
7411 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
7412 UnmergeRayDir.getReg(1)}))
7413 .getReg(0),
7414 B.buildBitcast(
7415 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
7416 UnmergeRayDir.getReg(2)}))
7417 .getReg(0)});
7418 Ops.push_back(MergedDir.getReg(0));
7419 } else {
7420 packLanes(RayDir);
7421 packLanes(RayInvDir);
7422 }
7423 } else {
7424 if (Is64) {
7425 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
7426 Ops.push_back(Unmerge.getReg(0));
7427 Ops.push_back(Unmerge.getReg(1));
7428 } else {
7429 Ops.push_back(NodePtr);
7430 }
7431 Ops.push_back(RayExtent);
7432
7433 auto packLanes = [&Ops, &S32, &B](Register Src) {
7434 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7435 Ops.push_back(Unmerge.getReg(0));
7436 Ops.push_back(Unmerge.getReg(1));
7437 Ops.push_back(Unmerge.getReg(2));
7438 };
7439
7440 packLanes(RayOrigin);
7441 if (IsA16) {
7442 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7443 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7444 Register R1 = MRI.createGenericVirtualRegister(S32);
7445 Register R2 = MRI.createGenericVirtualRegister(S32);
7446 Register R3 = MRI.createGenericVirtualRegister(S32);
7447 B.buildMergeLikeInstr(R1,
7448 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7449 B.buildMergeLikeInstr(
7450 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7451 B.buildMergeLikeInstr(
7452 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7453 Ops.push_back(R1);
7454 Ops.push_back(R2);
7455 Ops.push_back(R3);
7456 } else {
7457 packLanes(RayDir);
7458 packLanes(RayInvDir);
7459 }
7460 }
7461
7462 if (!UseNSA) {
7463 // Build a single vector containing all the operands so far prepared.
7464 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
7465 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7466 Ops.clear();
7467 Ops.push_back(MergedOps);
7468 }
7469
7470 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7471 .addDef(DstReg)
7472 .addImm(Opcode);
7473
7474 for (Register R : Ops) {
7475 MIB.addUse(R);
7476 }
7477
7478 MIB.addUse(TDescr)
7479 .addImm(IsA16 ? 1 : 0)
7480 .cloneMemRefs(MI);
7481
7482 MI.eraseFromParent();
7483 return true;
7484}
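// Operand-count check (illustrative): with a 64-bit node pointer and full
// precision rays (IsA16 false), the address operands are node_ptr (2 dwords) +
// ray_extent (1) + origin (3) + dir (3) + inv_dir (3) = 12 = NumVAddrDwords;
// with IsA16 true the dir/inv_dir components pack into 3 dwords, giving 9.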
7485
7486 bool AMDGPULegalizerInfo::legalizeBVHDualOrBVH8IntersectRayIntrinsic(
7487 MachineInstr &MI, MachineIRBuilder &B) const {
7488 const LLT S32 = LLT::scalar(32);
7489 const LLT V2S32 = LLT::fixed_vector(2, 32);
7490
7491 Register DstReg = MI.getOperand(0).getReg();
7492 Register DstOrigin = MI.getOperand(1).getReg();
7493 Register DstDir = MI.getOperand(2).getReg();
7494 Register NodePtr = MI.getOperand(4).getReg();
7495 Register RayExtent = MI.getOperand(5).getReg();
7496 Register InstanceMask = MI.getOperand(6).getReg();
7497 Register RayOrigin = MI.getOperand(7).getReg();
7498 Register RayDir = MI.getOperand(8).getReg();
7499 Register Offsets = MI.getOperand(9).getReg();
7500 Register TDescr = MI.getOperand(10).getReg();
7501
7502 if (!ST.hasBVHDualAndBVH8Insts()) {
7503 Function &Fn = B.getMF().getFunction();
7504 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
7505 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7506 return false;
7507 }
7508
7509 bool IsBVH8 = cast<GIntrinsic>(MI).getIntrinsicID() ==
7510 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7511 const unsigned NumVDataDwords = 10;
7512 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7513 int Opcode = AMDGPU::getMIMGOpcode(
7514 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7515 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7516 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7517 assert(Opcode != -1);
7518
7519 auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
7520 V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)});
7521
7522 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7523 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7524 .addDef(DstReg)
7525 .addDef(DstOrigin)
7526 .addDef(DstDir)
7527 .addImm(Opcode)
7528 .addUse(NodePtr)
7529 .addUse(RayExtentInstanceMaskVec.getReg(0))
7530 .addUse(RayOrigin)
7531 .addUse(RayDir)
7532 .addUse(Offsets)
7533 .addUse(TDescr)
7534 .cloneMemRefs(MI);
7535
7536 MI.eraseFromParent();
7537 return true;
7538}
7539
7540 bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
7541 MachineIRBuilder &B) const {
7542 const SITargetLowering *TLI = ST.getTargetLowering();
7543 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
7544 Register DstReg = MI.getOperand(0).getReg();
7545 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7546 MI.eraseFromParent();
7547 return true;
7548}
7549
7550 bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
7551 MachineIRBuilder &B) const {
7552 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
7553 if (!ST.hasArchitectedSGPRs())
7554 return false;
7555 LLT S32 = LLT::scalar(32);
7556 Register DstReg = MI.getOperand(0).getReg();
7557 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
7558 auto LSB = B.buildConstant(S32, 25);
7559 auto Width = B.buildConstant(S32, 5);
7560 B.buildUbfx(DstReg, TTMP8, LSB, Width);
7561 MI.eraseFromParent();
7562 return true;
7563}
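// The UBFX above is equivalent to (TTMP8 >> 25) & 0x1f; e.g. a TTMP8 value of
// 0x3a000000 (assumed) yields wave ID 29.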
7564
7565 bool AMDGPULegalizerInfo::legalizeConstHwRegRead(MachineInstr &MI,
7566 MachineIRBuilder &B,
7567 AMDGPU::Hwreg::Id HwReg,
7568 unsigned LowBit,
7569 unsigned Width) const {
7570 MachineRegisterInfo &MRI = *B.getMRI();
7571 Register DstReg = MI.getOperand(0).getReg();
7572 if (!MRI.getRegClassOrNull(DstReg))
7573 MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
7574 B.buildInstr(AMDGPU::S_GETREG_B32_const)
7575 .addDef(DstReg)
7576 .addImm(AMDGPU::Hwreg::HwregEncoding::encode(HwReg, LowBit, Width));
7577 MI.eraseFromParent();
7578 return true;
7579}
7580
7581static constexpr unsigned FPEnvModeBitField =
7583
7584static constexpr unsigned FPEnvTrapBitField =
7586
7587 bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
7588 MachineRegisterInfo &MRI,
7589 MachineIRBuilder &B) const {
7590 Register Src = MI.getOperand(0).getReg();
7591 if (MRI.getType(Src) != S64)
7592 return false;
7593
7594 auto ModeReg =
7595 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7596 /*HasSideEffects=*/true, /*isConvergent=*/false)
7597 .addImm(FPEnvModeBitField);
7598 auto TrapReg =
7599 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7600 /*HasSideEffects=*/true, /*isConvergent=*/false)
7601 .addImm(FPEnvTrapBitField);
7602 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
7603 MI.eraseFromParent();
7604 return true;
7605}
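// Layout note: G_MERGE_VALUES places its first source in the low bits, so the
// s64 result holds the MODE fields in bits [31:0] and the trap status bits in
// bits [63:32]; legalizeSetFPEnv below unmerges in the same order.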
7606
7607 bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
7608 MachineRegisterInfo &MRI,
7609 MachineIRBuilder &B) const {
7610 Register Src = MI.getOperand(0).getReg();
7611 if (MRI.getType(Src) != S64)
7612 return false;
7613
7614 auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
7615 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7616 /*HasSideEffects=*/true, /*isConvergent=*/false)
7617 .addImm(static_cast<int16_t>(FPEnvModeBitField))
7618 .addReg(Unmerge.getReg(0));
7619 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7620 /*HasSideEffects=*/true, /*isConvergent=*/false)
7621 .addImm(static_cast<int16_t>(FPEnvTrapBitField))
7622 .addReg(Unmerge.getReg(1));
7623 MI.eraseFromParent();
7624 return true;
7625}
7626
7627 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
7628 MachineInstr &MI) const {
7629 MachineIRBuilder &B = Helper.MIRBuilder;
7630 MachineRegisterInfo &MRI = *B.getMRI();
7631
7632 // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
7633 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
7634 switch (IntrID) {
7635 case Intrinsic::amdgcn_if:
7636 case Intrinsic::amdgcn_else: {
7637 MachineInstr *Br = nullptr;
7638 MachineBasicBlock *UncondBrTarget = nullptr;
7639 bool Negated = false;
7640 if (MachineInstr *BrCond =
7641 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7642 const SIRegisterInfo *TRI
7643 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7644
7645 Register Def = MI.getOperand(1).getReg();
7646 Register Use = MI.getOperand(3).getReg();
7647
7648 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7649
7650 if (Negated)
7651 std::swap(CondBrTarget, UncondBrTarget);
7652
7653 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7654 if (IntrID == Intrinsic::amdgcn_if) {
7655 B.buildInstr(AMDGPU::SI_IF)
7656 .addDef(Def)
7657 .addUse(Use)
7658 .addMBB(UncondBrTarget);
7659 } else {
7660 B.buildInstr(AMDGPU::SI_ELSE)
7661 .addDef(Def)
7662 .addUse(Use)
7663 .addMBB(UncondBrTarget);
7664 }
7665
7666 if (Br) {
7667 Br->getOperand(0).setMBB(CondBrTarget);
7668 } else {
7669 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
7670 // since we're swapping branch targets it needs to be reinserted.
7671 // FIXME: IRTranslator should probably not do this
7672 B.buildBr(*CondBrTarget);
7673 }
7674
7675 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
7676 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
7677 MI.eraseFromParent();
7678 BrCond->eraseFromParent();
7679 return true;
7680 }
7681
7682 return false;
7683 }
7684 case Intrinsic::amdgcn_loop: {
7685 MachineInstr *Br = nullptr;
7686 MachineBasicBlock *UncondBrTarget = nullptr;
7687 bool Negated = false;
7688 if (MachineInstr *BrCond =
7689 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7690 const SIRegisterInfo *TRI
7691 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7692
7693 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7694 Register Reg = MI.getOperand(2).getReg();
7695
7696 if (Negated)
7697 std::swap(CondBrTarget, UncondBrTarget);
7698
7699 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7700 B.buildInstr(AMDGPU::SI_LOOP)
7701 .addUse(Reg)
7702 .addMBB(UncondBrTarget);
7703
7704 if (Br)
7705 Br->getOperand(0).setMBB(CondBrTarget);
7706 else
7707 B.buildBr(*CondBrTarget);
7708
7709 MI.eraseFromParent();
7710 BrCond->eraseFromParent();
7711 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
7712 return true;
7713 }
7714
7715 return false;
7716 }
7717 case Intrinsic::amdgcn_addrspacecast_nonnull:
7718 return legalizeAddrSpaceCast(MI, MRI, B);
7719 case Intrinsic::amdgcn_make_buffer_rsrc:
7721 case Intrinsic::amdgcn_kernarg_segment_ptr:
7722 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
7723 // This only makes sense to call in a kernel, so just lower to null.
7724 B.buildConstant(MI.getOperand(0).getReg(), 0);
7725 MI.eraseFromParent();
7726 return true;
7727 }
7728
7731 case Intrinsic::amdgcn_implicitarg_ptr:
7732 return legalizeImplicitArgPtr(MI, MRI, B);
7733 case Intrinsic::amdgcn_workitem_id_x:
7734 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
7736 case Intrinsic::amdgcn_workitem_id_y:
7737 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
7739 case Intrinsic::amdgcn_workitem_id_z:
7740 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
7742 case Intrinsic::amdgcn_workgroup_id_x:
7743 return legalizeWorkGroupId(
7747 case Intrinsic::amdgcn_workgroup_id_y:
7748 return legalizeWorkGroupId(
7752 case Intrinsic::amdgcn_workgroup_id_z:
7753 return legalizeWorkGroupId(
7757 case Intrinsic::amdgcn_cluster_id_x:
7758 return ST.hasClusters() &&
7761 case Intrinsic::amdgcn_cluster_id_y:
7762 return ST.hasClusters() &&
7765 case Intrinsic::amdgcn_cluster_id_z:
7766 return ST.hasClusters() &&
7769 case Intrinsic::amdgcn_cluster_workgroup_id_x:
7770 return ST.hasClusters() &&
7773 case Intrinsic::amdgcn_cluster_workgroup_id_y:
7774 return ST.hasClusters() &&
7777 case Intrinsic::amdgcn_cluster_workgroup_id_z:
7778 return ST.hasClusters() &&
7781 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
7782 return ST.hasClusters() &&
7784 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
7785 return ST.hasClusters() &&
7788 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
7789 return ST.hasClusters() &&
7792 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
7793 return ST.hasClusters() &&
7796 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
7797 return ST.hasClusters() &&
7799 MI, MRI, B,
7801 case Intrinsic::amdgcn_wave_id:
7802 return legalizeWaveID(MI, B);
7803 case Intrinsic::amdgcn_lds_kernel_id:
7806 case Intrinsic::amdgcn_dispatch_ptr:
7809 case Intrinsic::amdgcn_queue_ptr:
7812 case Intrinsic::amdgcn_implicit_buffer_ptr:
7815 case Intrinsic::amdgcn_dispatch_id:
7818 case Intrinsic::r600_read_ngroups_x:
7819 // TODO: Emit error for hsa
7822 case Intrinsic::r600_read_ngroups_y:
7825 case Intrinsic::r600_read_ngroups_z:
7828 case Intrinsic::r600_read_local_size_x:
7829 // TODO: Could insert G_ASSERT_ZEXT from s16
7831 case Intrinsic::r600_read_local_size_y:
7832 // TODO: Could insert G_ASSERT_ZEXT from s16
7834 // TODO: Could insert G_ASSERT_ZEXT from s16
7835 case Intrinsic::r600_read_local_size_z:
7838 case Intrinsic::amdgcn_fdiv_fast:
7839 return legalizeFDIVFastIntrin(MI, MRI, B);
7840 case Intrinsic::amdgcn_is_shared:
7842 case Intrinsic::amdgcn_is_private:
7844 case Intrinsic::amdgcn_wavefrontsize: {
7845 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
7846 MI.eraseFromParent();
7847 return true;
7848 }
7849 case Intrinsic::amdgcn_s_buffer_load:
7850 return legalizeSBufferLoad(Helper, MI);
7851 case Intrinsic::amdgcn_raw_buffer_store:
7852 case Intrinsic::amdgcn_raw_ptr_buffer_store:
7853 case Intrinsic::amdgcn_struct_buffer_store:
7854 case Intrinsic::amdgcn_struct_ptr_buffer_store:
7855 return legalizeBufferStore(MI, Helper, false, false);
7856 case Intrinsic::amdgcn_raw_buffer_store_format:
7857 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
7858 case Intrinsic::amdgcn_struct_buffer_store_format:
7859 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
7860 return legalizeBufferStore(MI, Helper, false, true);
7861 case Intrinsic::amdgcn_raw_tbuffer_store:
7862 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
7863 case Intrinsic::amdgcn_struct_tbuffer_store:
7864 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
7865 return legalizeBufferStore(MI, Helper, true, true);
7866 case Intrinsic::amdgcn_raw_buffer_load:
7867 case Intrinsic::amdgcn_raw_ptr_buffer_load:
7868 case Intrinsic::amdgcn_raw_atomic_buffer_load:
7869 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
7870 case Intrinsic::amdgcn_struct_buffer_load:
7871 case Intrinsic::amdgcn_struct_ptr_buffer_load:
7872 case Intrinsic::amdgcn_struct_atomic_buffer_load:
7873 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
7874 return legalizeBufferLoad(MI, Helper, false, false);
7875 case Intrinsic::amdgcn_raw_buffer_load_format:
7876 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
7877 case Intrinsic::amdgcn_struct_buffer_load_format:
7878 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
7879 return legalizeBufferLoad(MI, Helper, true, false);
7880 case Intrinsic::amdgcn_raw_tbuffer_load:
7881 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
7882 case Intrinsic::amdgcn_struct_tbuffer_load:
7883 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
7884 return legalizeBufferLoad(MI, Helper, true, true);
7885 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
7886 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
7887 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7888 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
7889 case Intrinsic::amdgcn_raw_buffer_atomic_add:
7890 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
7891 case Intrinsic::amdgcn_struct_buffer_atomic_add:
7892 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
7893 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
7894 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
7895 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7896 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
7897 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
7898 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
7899 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7900 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
7901 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
7902 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
7903 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7904 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
7905 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
7906 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
7907 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7908 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
7909 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
7910 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
7911 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7912 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
7913 case Intrinsic::amdgcn_raw_buffer_atomic_and:
7914 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
7915 case Intrinsic::amdgcn_struct_buffer_atomic_and:
7916 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
7917 case Intrinsic::amdgcn_raw_buffer_atomic_or:
7918 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
7919 case Intrinsic::amdgcn_struct_buffer_atomic_or:
7920 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
7921 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
7922 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
7923 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7924 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
7925 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
7926 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
7927 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7928 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
7929 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
7930 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
7931 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
7932 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
7933 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
7934 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
7935 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
7936 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
7937 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
7938 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7939 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
7940 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7941 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
7942 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7943 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
7944 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
7945 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
7946 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7947 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
7948 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
7949 return legalizeBufferAtomic(MI, B, IntrID);
7950 case Intrinsic::amdgcn_rsq_clamp:
7952 case Intrinsic::amdgcn_image_bvh_intersect_ray:
7954 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
7955 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
7957 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
7958 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
7959 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
7960 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
7961 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
7962 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
7963 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
7964 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
7965 Register Index = MI.getOperand(5).getReg();
7966 LLT S64 = LLT::scalar(64);
7967 if (MRI.getType(Index) != S64)
7968 MI.getOperand(5).setReg(B.buildAnyExt(S64, Index).getReg(0));
7969 return true;
7970 }
7971 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
7972 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
7973 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
7974 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
7975 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
7976 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
7977 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
7978 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
7979 Register Index = MI.getOperand(5).getReg();
7980 LLT S32 = LLT::scalar(32);
7981 if (MRI.getType(Index) != S32)
7982 MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
7983 return true;
7984 }
7985 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
7986 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
7987 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
7988 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
7989 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
7990 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
7991 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
7992 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
7993 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
7994 Register Index = MI.getOperand(7).getReg();
7995 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
7996 ? LLT::scalar(64)
7997 : LLT::scalar(32);
7998 if (MRI.getType(Index) != IdxTy)
7999 MI.getOperand(7).setReg(B.buildAnyExt(IdxTy, Index).getReg(0));
8000 return true;
8001 }
8002
8003 case Intrinsic::amdgcn_fmed3: {
8004 GISelChangeObserver &Observer = Helper.Observer;
8005
8006 // FIXME: This is to work around the inability of tablegen match combiners to
8007 // match intrinsics in patterns.
8008 Observer.changingInstr(MI);
8009 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
8010 MI.removeOperand(1);
8011 Observer.changedInstr(MI);
8012 return true;
8013 }
8014 case Intrinsic::amdgcn_readlane:
8015 case Intrinsic::amdgcn_writelane:
8016 case Intrinsic::amdgcn_readfirstlane:
8017 case Intrinsic::amdgcn_permlane16:
8018 case Intrinsic::amdgcn_permlanex16:
8019 case Intrinsic::amdgcn_permlane64:
8020 case Intrinsic::amdgcn_set_inactive:
8021 case Intrinsic::amdgcn_set_inactive_chain_arg:
8022 case Intrinsic::amdgcn_mov_dpp8:
8023 case Intrinsic::amdgcn_update_dpp:
8024 return legalizeLaneOp(Helper, MI, IntrID);
8025 case Intrinsic::amdgcn_s_buffer_prefetch_data:
8026 return legalizeSBufferPrefetch(Helper, MI);
8027 case Intrinsic::amdgcn_dead: {
8028 // TODO: Use poison instead of undef
8029 for (const MachineOperand &Def : MI.defs())
8030 B.buildUndef(Def);
8031 MI.eraseFromParent();
8032 return true;
8033 }
8034 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
8035 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
8036 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
8037 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8038 B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
8039 MI.eraseFromParent();
8040 return true;
8041 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
8042 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
8043 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
8044 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8045 B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
8046 MI.eraseFromParent();
8047 return true;
8048 default: {
8049 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8050 AMDGPU::getImageDimIntrinsicInfo(IntrID))
8051 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
8052 return true;
8053 }
8054 }
8055
8056 return true;
8057}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST, unsigned TypeIdx)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
constexpr std::initializer_list< LLT > AllVectors
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
constexpr LLT F64
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
constexpr LLT V2S8
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
constexpr LLT V4S128
constexpr LLT S16
constexpr LLT S1
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
constexpr LLT S1024
static constexpr unsigned FPEnvModeBitField
constexpr LLT V7S64
static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx)
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterVectorElementType(LLT EltTy)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllScalarTypes
static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx)
constexpr LLT V2S16
constexpr LLT V8S16
constexpr LLT V9S32
constexpr std::initializer_list< LLT > AllS32Vectors
constexpr LLT S224
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
constexpr LLT S512
constexpr LLT MaxScalar
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
constexpr LLT V11S32
static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty)
constexpr LLT V6S64
constexpr LLT V2S64
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
constexpr LLT S32
constexpr LLT V2F16
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
constexpr LLT V8S32
constexpr LLT V2BF16
constexpr LLT S192
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
constexpr LLT F32
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
constexpr LLT V6S32
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
constexpr LLT S160
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
constexpr LLT V4S16
constexpr LLT V2S128
constexpr LLT V10S16
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
constexpr LLT V4S32
constexpr LLT V3S32
constexpr LLT V6S16
constexpr std::initializer_list< LLT > AllS64Vectors
constexpr LLT S256
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
constexpr LLT V4S64
static constexpr unsigned FPEnvTrapBitField
constexpr LLT V10S32
constexpr LLT V16S32
static constexpr unsigned MaxRegisterSize
constexpr LLT V7S32
constexpr LLT S96
constexpr LLT V12S16
constexpr LLT V16S64
static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size)
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static bool hasBufferRsrcWorkaround(const LLT Ty)
constexpr LLT V32S32
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
constexpr LLT S64
constexpr std::initializer_list< LLT > AllS16Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
constexpr LLT V16S16
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
constexpr LLT V5S32
constexpr LLT V5S64
constexpr LLT V3S64
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
constexpr LLT V8S64
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
constexpr LLT V2S32
static bool isRegisterVectorType(LLT Ty)
constexpr LLT V12S32
constexpr LLT S128
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
constexpr LLT S8
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static Error unsupported(const char *Str, const Triple &T)
Definition MachO.cpp:71
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
@ Enable
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
Interface for Targets to specify which operations they can successfully select and how the others sho...
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
#define R2(n)
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
ppc ctr loops verify
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define CH(x, y, z)
Definition SHA256.cpp:34
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1254
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static constexpr int Concat[]
bool legalizeConstHwRegRead(MachineInstr &MI, MachineIRBuilder &B, AMDGPU::Hwreg::Id HwReg, unsigned LowBit, unsigned Width) const
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeBVHIntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper, bool IsTyped, bool IsFormat) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBVHDualOrBVH8IntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper, bool IsFormat, bool IsTyped) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkGroupId(MachineInstr &MI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
const std::array< unsigned, 3 > & getDims() const
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1158
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1138
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1098
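For illustration, the APFloat factories above produce special constants like these (f32 semantics; approximate values noted in comments):
const fltSemantics &Sem = APFloat::IEEEsingle();
APFloat SmallestNormal = APFloat::getSmallestNormalized(Sem);     // ~1.17549435e-38
APFloat MaxFinite      = APFloat::getLargest(Sem);                // ~3.40282347e+38
APFloat NegInf         = APFloat::getInf(Sem, /*Negative=*/true); // -inf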
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
@ ICMP_SLT
signed less than
Definition InstrTypes.h:707
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:684
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:682
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:702
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:705
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:686
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:703
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:685
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:687
@ ICMP_NE
not equal
Definition InstrTypes.h:700
This is the shared class of boolean and integer constants.
Definition Constants.h:87
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:169
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
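A sketch of the usual observer protocol when an instruction is mutated in place (NewReg is a placeholder register):
Observer.changingInstr(MI);          // MI is about to be mutated
MI.getOperand(1).setReg(NewReg);     // e.g. swap in a legalized source register
Observer.changedInstr(MI);           // notify listeners that MI was mutated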
Simple wrapper observer that takes several observers, and calls each one for each event.
KnownBits getKnownBits(Register R)
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
static constexpr LLT float64()
Get a 64-bit IEEE double value.
constexpr unsigned getScalarSizeInBits() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
static constexpr LLT float16()
Get a 16-bit IEEE half value.
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr LLT getScalarType() const
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
static constexpr LLT float32()
Get a 32-bit IEEE float value.
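A few values built with the LLT constructors above, together with the queries they satisfy (the address-space number is illustrative):
LLT S32       = LLT::scalar(32);
LLT V2S16     = LLT::fixed_vector(2, 16);
LLT GlobalPtr = LLT::pointer(1, 64);          // 64-bit pointer in address space 1
assert(V2S16.isVector() && V2S16.getNumElements() == 2);
assert(V2S16.getElementType() == LLT::scalar(16));
assert(GlobalPtr.isPointer() && GlobalPtr.getAddressSpace() == 1);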
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI void computeTables()
Compute any ancillary tables needed to quickly decide how an operation should be handled.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & unsupported()
The instruction is unsupported.
LegalizeRuleSet & scalarSameSizeAs(unsigned TypeIdx, unsigned SameSizeIdx)
Change the type TypeIdx to have the same scalar size as type SameSizeIdx.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & bitcastIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
The specified type index is coerced if predicate is true.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & minScalarOrElt(unsigned TypeIdx, const LLT Ty)
Ensure the scalar or element is at least as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & unsupportedFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & lowerFor(std::initializer_list< LLT > Types)
The instruction is lowered when type index 0 is any type in the given list.
LegalizeRuleSet & lowerIf(LegalityPredicate Predicate)
The instruction is lowered if predicate is true.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & unsupportedIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Widen the scalar to the one selected by the mutation if the predicate is true.
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & clampNumElements(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the number of elements for the given vectors to at least MinTy's number of elements and at most...
LegalizeRuleSet & maxScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Conditionally limit the maximum size of the scalar.
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalForCartesianProduct(std::initializer_list< LLT > Types)
The instruction is legal when type indexes 0 and 1 are both in the given list.
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
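For illustration only, the builder methods above chain into rule sets of the following shape; the opcode and the concrete rules here are made up and do not reproduce this file's actual tables:
getActionDefinitionsBuilder(TargetOpcode::G_AND)
    .legalFor({LLT::scalar(32), LLT::scalar(64)})     // directly selectable sizes
    .clampScalar(0, LLT::scalar(32), LLT::scalar(64)) // bound the scalar width
    .widenScalarToNextPow2(0)                         // e.g. s48 -> s64
    .scalarize(0);                                    // split any remaining vectors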
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
LLVM_ABI void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
GISelValueTracking * getValueTracking() const
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
LLVM_ABI void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI)
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
LLVM_ABI void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
const LegacyLegalizerInfo & getLegacyLegalizerInfo() const
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition MCRegister.h:64
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
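A sketch of the build-then-append pattern these helpers support; the opcodes, flags, DstReg, GV and TargetMBB are placeholders chosen for the example:
B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(DstReg)
    .addGlobalAddress(GV, /*Offset=*/0, SIInstrInfo::MO_ABS32_LO);
B.buildInstr(AMDGPU::S_CBRANCH_SCC1)
    .addMBB(TargetMBB);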
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
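As a sketch, an invariant, dereferenceable load MMO is assembled from these pieces (MF is a placeholder MachineFunction; the pointer info, type and alignment are illustrative):
MachineMemOperand *MMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
        MachineMemOperand::MOInvariant,
    LLT::scalar(64), Align(8));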
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:303
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:392
Wrapper class representing virtual and physical registers.
Definition Register.h:19
constexpr bool isValid() const
Definition Register.h:107
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:74
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void truncate(size_type N)
Like resize, but requires that N is less than size().
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
TargetOptions Options
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
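Pointer LLTs for a few of these address spaces, using their usual widths on this target (a sketch; widths follow the AMDGPU data layout):
const LLT GlobalPtr  = LLT::pointer(AMDGPUAS::GLOBAL_ADDRESS, 64);  // 64-bit global
const LLT LocalPtr   = LLT::pointer(AMDGPUAS::LOCAL_ADDRESS, 32);   // 32-bit LDS
const LLT PrivatePtr = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32); // 32-bit scratch
bool FlatOrGlobal = AMDGPU::isFlatGlobalAddrSpace(AMDGPUAS::GLOBAL_ADDRESS); // true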
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
bool isGFX1250(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LLVM_ABI LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LLVM_ABI LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LLVM_ABI LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LLVM_ABI LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than second type index.
LLVM_ABI LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than second type index.
LLVM_ABI LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LLVM_ABI LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LLVM_ABI LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LLVM_ABI LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size)
True if the total bitwidth of the specified type index is Size bits.
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LLVM_ABI LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LLVM_ABI LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LLVM_ABI LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LLVM_ABI LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LLVM_ABI LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
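A sketch of how the predicates and mutations above compose inside a rule; the opcode and types are illustrative only:
getActionDefinitionsBuilder(TargetOpcode::G_SELECT)
    .legalIf(all(isScalar(0), typeIs(1, LLT::scalar(1))))  // scalar value, s1 condition
    .widenScalarIf(scalarNarrowerThan(0, 32),
                   changeTo(0, LLT::scalar(32)))           // promote narrow scalars
    .scalarize(0);                                         // break up vectors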
@ Implicit
Not emitted register (e.g. carry, or temporary result).
Invariant opcodes: All instruction sets have these as their low opcodes.
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
Definition MathExtras.h:54
constexpr double ln2
Definition MathExtras.h:49
constexpr double ln10
Definition MathExtras.h:50
constexpr float log2ef
Definition MathExtras.h:66
constexpr double log2e
Definition MathExtras.h:51
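For illustration, constants like these become immediates during log/exp lowering; e.g. log10(x) = log2(x) * (ln 2 / ln 10), where the second factor is log10(2). B below is a placeholder MachineIRBuilder:
const double Log10Of2 = numbers::ln2 / numbers::ln10;      // ~0.30103
auto ScaleK = B.buildFConstant(LLT::scalar(32), Log10Of2); // materialize the factor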
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition Utils.cpp:916
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:355
@ Offset
Definition DWP.cpp:477
LLVM_ABI Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
Definition Utils.cpp:2033
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
Definition Utils.cpp:651
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition Utils.cpp:459
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition ScopeExit.h:59
LLVM_ABI const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:289
void * PointerTy
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:293
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:396
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT that fits in int64_t, returns it.
Definition Utils.cpp:314
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most, stopping at the first 1.
Definition bit.h:186
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
std::function< bool(const LegalityQuery &)> LegalityPredicate
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ FMul
Product of floats.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
Definition Utils.cpp:1720
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1877
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:299
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:384
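The power-of-two helpers above are easy to confuse; a quick comparison of their results:
unsigned A = 8;
bool IsPow2       = isPowerOf2_32(A); // true
uint64_t Ceil     = PowerOf2Ceil(A);  // 8  (smallest power of two >= A)
uint64_t Next     = NextPowerOf2(A);  // 16 (strictly greater than A)
unsigned CeilLog2 = Log2_32_Ceil(5);  // 3  (ceiling of log2)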
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
static constexpr uint64_t encode(Fields... Values)
static LLVM_ABI const fltSemantics & IEEEsingle() LLVM_READNONE
Definition APFloat.cpp:266
static LLVM_ABI const fltSemantics & IEEEdouble() LLVM_READNONE
Definition APFloat.cpp:267
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:80
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
ArrayRef< LLT > Types
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
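A sketch of how the denormal-mode fields above are typically queried through the subtarget-specific function info; MF is a placeholder MachineFunction, and the preserve-sign comparison is one common way to test for flushed f32 denormals:
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
SIModeRegisterDefaults Mode = MFI->getMode();
bool FlushF32Denormals =
    Mode.FP32Denormals == DenormalMode::getPreserveSign(); // denormals flushed to +/-0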