70#include "llvm/IR/IntrinsicsAMDGPU.h"
76#define DEBUG_TYPE "amdgpu-image-intrinsic-opt"
79class AMDGPUImageIntrinsicOptimizer :
public FunctionPass {
85 AMDGPUImageIntrinsicOptimizer(
const TargetMachine *TM =
nullptr)
94 "AMDGPU Image Intrinsic Optimizer",
false,
false)
96char AMDGPUImageIntrinsicOptimizer::
ID = 0;
101 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) {
104 if (IIList.front()->getIntrinsicID() !=
II->getIntrinsicID())
108 if (IIList.front()->getType() !=
II->getType())
112 bool AllEqual =
true;
113 assert(IIList.front()->arg_size() ==
II->arg_size());
114 for (
int I = 1,
E =
II->arg_size(); AllEqual &&
I !=
E; ++
I) {
115 Value *ArgList = IIList.front()->getArgOperand(
I);
117 if (
I == ImageDimIntr->VAddrEnd - 1) {
121 AllEqual = FragIdList->getValue().udiv(4) == FragId->getValue().udiv(4);
124 AllEqual = ArgList == Arg;
131 IIList.emplace_back(
II);
136 MergeableInsts.emplace_back(1,
II);
145 for (;
I !=
E; ++
I) {
148 if (
I->mayHaveSideEffects()) {
158 if (IntrinID != Intrinsic::amdgcn_image_load_2dmsaa &&
159 IntrinID != Intrinsic::amdgcn_image_load_2darraymsaa)
164 const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
180 for (
const auto &IIList : MergeableInsts) {
181 if (IIList.size() <= 1)
190 Function *
F = IIList.front()->getCalledFunction();
200 OverloadTys[0] = NewTy;
204 IIList.front()->getArgOperand(ImageDimIntr->
DMaskIndex));
206 unsigned NumElts =
popcount(DMaskVal);
210 unsigned NumLoads = IIList.size();
211 unsigned NumMsaas = NumElts;
212 unsigned NumVAddrLoads = 3 * NumLoads;
213 unsigned NumVDataLoads =
divideCeil(NumElts, isD16 ? 2 : 1) * NumLoads;
214 unsigned NumVAddrMsaas = 3 * NumMsaas;
215 unsigned NumVDataMsaas =
divideCeil(4, isD16 ? 2 : 1) * NumMsaas;
217 if (NumLoads < NumMsaas ||
218 (NumVAddrLoads + NumVDataLoads < NumVAddrMsaas + NumVDataMsaas))
224 const APInt &NewFragIdVal = FragId->getValue().
udiv(4) * 4;
231 while (DMaskVal != 0) {
235 if (IntrinID == Intrinsic::amdgcn_image_load_2dmsaa)
236 NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2dmsaa;
238 NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa;
241 ConstantInt::get(DMask->
getType(), NewMaskVal);
242 Args[FragIdIndex] = ConstantInt::get(FragId->getType(), NewFragIdVal);
243 CallInst *NewCall =
B.CreateIntrinsic(NewIntrinID, OverloadTys, Args);
247 DMaskVal -= NewMaskVal;
251 for (
auto &
II : IIList) {
252 Value *VecOp =
nullptr;
254 B.SetCurrentDebugLocation(
II->getDebugLoc());
256 VecOp =
B.CreateExtractElement(NewCalls[0], Idx->getValue().urem(4));
260 for (
unsigned I = 0;
I < NumElts; ++
I) {
261 VecOp =
B.CreateInsertElement(
263 B.CreateExtractElement(NewCalls[
I], Idx->getValue().urem(4)),
I);
269 II->replaceAllUsesWith(VecOp);
277 for (
auto *
I : InstrsToErase)
278 I->eraseFromParent();
296 return !
F.users().empty() &&
297 (
F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2dmsaa ||
298 F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2darraymsaa);
317bool AMDGPUImageIntrinsicOptimizer::runOnFunction(
Function &
F) {
326 return new AMDGPUImageIntrinsicOptimizer(TM);
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
bool optimizeSection(ArrayRef< SmallVector< IntrinsicInst *, 4 > > MergeableInsts)
INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE, "AMDGPU Image Intrinsic Optimizer", false, false) char AMDGPUImageIntrinsicOptimizer void addInstToMergeableList(IntrinsicInst *II, SmallVector< SmallVector< IntrinsicInst *, 4 > > &MergeableInsts, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr)
BasicBlock::iterator collectMergeableInsts(BasicBlock::iterator I, BasicBlock::iterator E, SmallVector< SmallVector< IntrinsicInst *, 4 > > &MergeableInsts)
static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
The AMDGPU TargetMachine interface definition for hw codegen targets.
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool runOnFunction(Function &F, bool PostInlining)
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Class for arbitrary precision integers.
LLVM_ABI APInt udiv(const APInt &RHS) const
Unsigned division operation.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
InstListType::iterator iterator
Instruction iterators...
This class represents a function call, abstracting a target machine's calling convention.
This is the shared class of boolean and integer constants.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
FunctionPass class - This class is used to implement most global optimizations.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
A wrapper class for inspecting calls to intrinsic functions.
A Module instance is used to store all the information related to an LLVM module.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Primary interface to the complete machine description for the target machine.
The instances of the Type class are immutable: once they are created, they are never changed.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
bool isGFX11Plus(const MCSubtargetInfo &STI)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
LLVM_ABI bool getIntrinsicSignature(Intrinsic::ID, FunctionType *FT, SmallVectorImpl< Type * > &ArgTys)
Gets the type arguments of an intrinsic call by matching type constraints specified by the ...
This is an optimization pass for GlobalISel generic memory operations.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
FunctionPass * createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *)
int popcount(T Value) noexcept
Count the number of set bits in a value.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)