-
Notifications
You must be signed in to change notification settings - Fork 13.4k
[AMDGPU][GlobalISel] Enable kernel argument preloading #134655
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
tgymnich
commented
Apr 7, 2025
- enable kernel argument preloading
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Tim Gymnich (tgymnich) Changes
Full diff: https://github.com/llvm/llvm-project/pull/134655.diff 6 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index a15f193549936..48c65f37d9ef8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -20,6 +20,7 @@
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/LowLevelTypeUtils.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -507,6 +508,7 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
const SITargetLowering &TLI = *getTLI<SITargetLowering>();
const DataLayout &DL = F.getDataLayout();
+ const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(F.getCallingConv());
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
@@ -520,7 +522,6 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
// TODO: Align down to dword alignment and extract bits for extending loads.
for (auto &Arg : F.args()) {
- // TODO: Add support for kernarg preload.
if (Arg.hasAttribute("amdgpu-hidden-argument")) {
LLVM_DEBUG(dbgs() << "Preloading hidden arguments is not supported\n");
return false;
@@ -545,11 +546,47 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);
- if (IsByRef) {
+ if (Arg.hasInRegAttr() && IsEntryFunc && Subtarget->hasKernargPreload()) {
+ unsigned NumAllocSGPRs = alignTo(DL.getTypeSizeInBits(ArgTy), 32) / 32;
+
+ unsigned Padding = ArgOffset - BaseOffset;
+ unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
+
+ // Preload this argument.
+ const TargetRegisterClass *RC =
+ TRI->getSGPRClassForBitWidth(NumAllocSGPRs * 32);
+ LLT RegTy = getLLTForType(*ArgTy, DL);
+ SmallVectorImpl<MCRegister> *PreloadRegs =
+ Info->addPreloadedKernArg(*TRI, RC, NumAllocSGPRs, i, PaddingSGPRs);
+
+ SmallVector<Register> SrcRegs;
+
+ if (PreloadRegs->size() > 1) {
+ RC = &AMDGPU::SGPR_32RegClass;
+ RegTy = LLT::scalar(32);
+ }
+
+ for (auto &PhysReg : *PreloadRegs) {
+ assert(PhysReg);
+ Register VReg = MF.addLiveIn(PhysReg, RC);
+ B.getMBB().addLiveIn(PhysReg);
+ MRI.setType(VReg, RegTy);
+ B.buildCopy(VReg, Register(PhysReg));
+ CCInfo.AllocateReg(PhysReg);
+ SrcRegs.push_back(VReg);
+ }
+
+ if (SrcRegs.size() > 1) {
+ B.buildMergeLikeInstr(VRegs[i][0], SrcRegs);
+ } else {
+ MRI.replaceRegWith(SrcRegs[0], VRegs[i][0]);
+ }
+ } else if (IsByRef) {
unsigned ByRefAS = cast<PointerType>(Arg.getType())->getAddressSpace();
assert(VRegs[i].size() == 1 &&
"expected only one register for byval pointers");
+
if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
lowerParameterPtr(VRegs[i][0], B, ArgOffset);
} else {
@@ -570,6 +607,7 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
}
TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
+
TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 533ad349f7500..0d5c0a71ce7af 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1193,9 +1193,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
/// for each individual part is i8. We pass the memory type as LocVT to the
/// calling convention analysis function and the register type (Ins[x].VT) as
/// the ValVT.
-void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
- CCState &State,
- const SmallVectorImpl<ISD::InputArg> &Ins) const {
+void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State) const {
const MachineFunction &MF = State.getMachineFunction();
const Function &Fn = MF.getFunction();
LLVMContext &Ctx = Fn.getParent()->getContext();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 6705f86e15fc2..2a69e8faab2a8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -172,13 +172,11 @@ class AMDGPUTargetLowering : public TargetLowering {
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &Results) const;
- void analyzeFormalArgumentsCompute(
- CCState &State,
- const SmallVectorImpl<ISD::InputArg> &Ins) const;
-
public:
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
+ void analyzeFormalArgumentsCompute(CCState &State) const;
+
bool mayIgnoreSignedZero(SDValue Op) const;
static inline SDValue stripBitcast(SDValue Val) {
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 157ca4b08020a..6c074ef8f396f 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1463,7 +1463,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
if (AMDGPU::isShader(CallConv)) {
CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
} else {
- analyzeFormalArgumentsCompute(CCInfo, Ins);
+ analyzeFormalArgumentsCompute(CCInfo);
}
for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 356040da95672..a2288d136418f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2929,7 +2929,7 @@ SDValue SITargetLowering::LowerFormalArguments(
}
if (IsKernel)
- analyzeFormalArgumentsCompute(CCInfo, Ins);
+ analyzeFormalArgumentsCompute(CCInfo);
if (IsEntryFunc) {
allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
index dd760c2a215ca..91fb3d0f84546 100644
--- a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
@@ -1,18 +1,32 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefix=OBJDUMP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefix=ASM %s
-
-; OBJDUMP: Contents of section .rodata:
-; OBJDUMP-NEXT: 0000 00000000 00000000 10010000 00000000 ................
-; OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 ................
-; OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 ................
-; OBJDUMP-NOT: 0030 0000af00 94130000 1a000400 00000000 ................
-; OBJDUMP-NEXT: 0030 8000af00 98130000 1e000400 00000000 ................
-
-; ASM-LABEL: amdhsa_kernarg_preload_4_implicit_6:
-; ASM: .amdhsa_user_sgpr_count 12
-; ASM: .amdhsa_next_free_sgpr 12
-; ASM: ; TotalNumSgprs: 18
-; ASM: ; NumSGPRsForWavesPerEU: 18
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefixes=OBJDUMP-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefixes=OBJDUMP-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefixes=ASM-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefixes=ASM-GISEL %s
+
+; OBJDUMP-SDAG: Contents of section .rodata:
+; OBJDUMP-SDAG-NEXT: 0000 00000000 00000000 10010000 00000000 ................
+; OBJDUMP-SDAG-NEXT: 0010 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-SDAG-NEXT: 0020 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-SDAG-NOT: 0030 0000af00 94130000 1a000400 00000000 ................
+; OBJDUMP-SDAG-NEXT: 0030 8000af00 98130000 1e000400 00000000 ................
+
+; OBJDUMP-GISEL: Contents of section .rodata:
+; OBJDUMP-GISEL-NEXT: 0000 00000000 00000000 10010000 00000000 ................
+; OBJDUMP-GISEL-NEXT: 0010 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-GISEL-NEXT: 0020 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-GISEL-NEXT: 0030 0000af00 90130000 1e000000 00000000 ................
+
+; ASM-SDAG-LABEL: amdhsa_kernarg_preload_4_implicit_6:
+; ASM-SDAG: .amdhsa_user_sgpr_count 12
+; ASM-SDAG: .amdhsa_next_free_sgpr 12
+; ASM-SDAG: ; TotalNumSgprs: 18
+; ASM-SDAG: ; NumSGPRsForWavesPerEU: 18
+
+; ASM-GISEL-LABEL: amdhsa_kernarg_preload_4_implicit_6:
+; ASM-GISEL: .amdhsa_user_sgpr_count 8
+; ASM-GISEL: .amdhsa_next_free_sgpr 0
+; ASM-GISEL: ; TotalNumSgprs: 6
+; ASM-GISEL: ; NumSGPRsForWavesPerEU: 6
; Test that we include preloaded SGPRs in the GRANULATED_WAVEFRONT_SGPR_COUNT
; field that are not explicitly referenced in the kernel. This test has 6 implicit
@@ -23,47 +37,80 @@
define amdgpu_kernel void @amdhsa_kernarg_preload_4_implicit_6(i128 inreg) { ret void }
-; OBJDUMP-NEXT: 0040 00000000 00000000 20010000 00000000 ........ .......
-; OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 ................
-; OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000 ................
-; OBJDUMP-NEXT: 0070 4000af00 94000000 08000800 00000000 @...............
+; OBJDUMP-SDAG-NEXT: 0040 00000000 00000000 20010000 00000000 ........ .......
+; OBJDUMP-SDAG-NEXT: 0050 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-SDAG-NEXT: 0060 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-SDAG-NEXT: 0070 4000af00 94000000 08000800 00000000 @...............
-; ASM-LABEL: amdhsa_kernarg_preload_8_implicit_2:
-; ASM: .amdhsa_user_sgpr_count 10
-; ASM: .amdhsa_next_free_sgpr 10
-; ASM: ; TotalNumSgprs: 16
-; ASM: ; NumSGPRsForWavesPerEU: 16
+; OBJDUMP-GISEL-NEXT: 0040 00000000 00000000 20010000 00000000 ........ .......
+; OBJDUMP-GISEL-NEXT: 0050 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-GISEL-NEXT: 0060 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-GISEL-NEXT: 0070 0000af00 84000000 08000000 00000000 ................
+
+; ASM-SDAG-LABEL: amdhsa_kernarg_preload_8_implicit_2:
+; ASM-SDAG: .amdhsa_user_sgpr_count 10
+; ASM-SDAG: .amdhsa_next_free_sgpr 10
+; ASM-SDAG: ; TotalNumSgprs: 16
+; ASM-SDAG: ; NumSGPRsForWavesPerEU: 16
+
+; ASM-GISEL-LABEL: amdhsa_kernarg_preload_8_implicit_2:
+; ASM-GISEL: .amdhsa_user_sgpr_count 2
+; ASM-GISEL: .amdhsa_next_free_sgpr 0
+; ASM-GISEL: ; TotalNumSgprs: 6
+; ASM-GISEL: ; NumSGPRsForWavesPerEU: 6
; Only the kernarg_ptr is enabled so we should have 8 preload kernarg SGPRs, 2
; implicit, and 6 extra.
define amdgpu_kernel void @amdhsa_kernarg_preload_8_implicit_2(i256 inreg) #0 { ret void }
-; OBJDUMP-NEXT: 0080 00000000 00000000 08010000 00000000 ................
-; OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000 ................
-; OBJDUMP-NEXT: 00a0 00000000 00000000 00000000 00000000 ................
-; OBJDUMP-NEXT: 00b0 4000af00 86000000 08000100 00000000 @...............
+; OBJDUMP-SDAG-NEXT: 0080 00000000 00000000 08010000 00000000 ................
+; OBJDUMP-SDAG-NEXT: 0090 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-SDAG-NEXT: 00a0 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-SDAG-NEXT: 00b0 4000af00 86000000 08000100 00000000 @...............
+
+; OBJDUMP-GISEL-NEXT: 0080 00000000 00000000 08010000 00000000 ................
+; OBJDUMP-GISEL-NEXT: 0090 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-GISEL-NEXT: 00a0 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-GISEL-NEXT: 00b0 0000af00 84000000 08000000 00000000 ................
-; ASM-LABEL: amdhsa_kernarg_preload_1_implicit_2:
-; ASM: .amdhsa_user_sgpr_count 3
-; ASM: .amdhsa_next_free_sgpr 3
-; ASM: ; TotalNumSgprs: 9
-; ASM: ; NumSGPRsForWavesPerEU: 9
+; ASM-SDAG-LABEL: amdhsa_kernarg_preload_1_implicit_2:
+; ASM-SDAG: .amdhsa_user_sgpr_count 3
+; ASM-SDAG: .amdhsa_next_free_sgpr 3
+; ASM-SDAG: ; TotalNumSgprs: 9
+; ASM-SDAG: ; NumSGPRsForWavesPerEU: 9
+
+; ASM-GISEL-LABEL: amdhsa_kernarg_preload_1_implicit_2:
+; ASM-GISEL: .amdhsa_user_sgpr_count 2
+; ASM-GISEL: .amdhsa_next_free_sgpr 0
+; ASM-GISEL: ; TotalNumSgprs: 6
+; ASM-GISEL: ; NumSGPRsForWavesPerEU: 6
; 1 preload, 2 implicit, 6 extra. Rounds up to 16 SGPRs in the KD.
define amdgpu_kernel void @amdhsa_kernarg_preload_1_implicit_2(i32 inreg) #0 { ret void }
-; OBJDUMP-NEXT: 00c0 00000000 00000000 08010000 00000000 ................
-; OBJDUMP-NEXT: 00d0 00000000 00000000 00000000 00000000 ................
-; OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000 ................
-; OBJDUMP-NEXT: 00f0 0000af00 84000000 08000000 00000000 ................
-
-; ASM-LABEL: amdhsa_kernarg_preload_0_implicit_2:
-; ASM: .amdhsa_user_sgpr_count 2
-; ASM: .amdhsa_next_free_sgpr 0
-; ASM: ; TotalNumSgprs: 6
-; ASM: ; NumSGPRsForWavesPerEU: 6
+; OBJDUMP-SDAG-NEXT: 00c0 00000000 00000000 08010000 00000000 ................
+; OBJDUMP-SDAG-NEXT: 00d0 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-SDAG-NEXT: 00e0 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-SDAG-NEXT: 00f0 0000af00 84000000 08000000 00000000 ................
+
+; OBJDUMP-GISEL-NEXT: 00c0 00000000 00000000 08010000 00000000 ................
+; OBJDUMP-GISEL-NEXT: 00d0 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-GISEL-NEXT: 00e0 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-GISEL-NEXT: 00f0 0000af00 84000000 08000000 00000000 ................
+
+; ASM-SDAG-LABEL: amdhsa_kernarg_preload_0_implicit_2:
+; ASM-SDAG: .amdhsa_user_sgpr_count 2
+; ASM-SDAG: .amdhsa_next_free_sgpr 0
+; ASM-SDAG: ; TotalNumSgprs: 6
+; ASM-SDAG: ; NumSGPRsForWavesPerEU: 6
+
+; ASM-GISEL-LABEL: amdhsa_kernarg_preload_0_implicit_2:
+; ASM-GISEL: .amdhsa_user_sgpr_count 2
+; ASM-GISEL: .amdhsa_next_free_sgpr 0
+; ASM-GISEL: ; TotalNumSgprs: 6
+; ASM-GISEL: ; NumSGPRsForWavesPerEU: 6
; 0 preload kernarg SGPRs, 2 implicit, 6 extra. Rounds up to 8 SGPRs in the KD.
; Encoded like '00'.
|
@arsenm do we want to skip kern arg preloading if the argument is not used? This kind of makes the tests in their current form a bit pointless. In that case we might want to test with this func instead: define amdgpu_kernel void @amdhsa_kernarg_preload_4_implicit_6(i128 inreg %x, ptr addrspace(1) %p) {
%res = add i128 %x, %x
store i128 %res, ptr addrspace(1) %p
ret void
} |
I'm pretty sure you need to preload a consecutive block, so you need to preload unused arguments if they are in the middle of other useful arguments to preload |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add gisel run lines or copies of the test `preload-kernargs.ll`? This is crashing for me with your current changes.
unsigned PaddingSGPRs = alignTo(Padding, 4) / 4; | ||
|
||
// Preload this argument. | ||
const TargetRegisterClass *RC = |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Need to handle sub-dword arguments.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Took a step back and tried to make `allocatePreloadKernArgSGPRs` work for both SDAG and GISel.
@arsenm Is there any documentation about the hardware and ABI besides the SDAG implementation? |
https://llvm.org/docs/AMDGPUUsage.html#preloaded-kernel-arguments |
5826b8c
to
68bb4b3
Compare
✅ With the latest revision this PR passed the C/C++ code formatter. |
a01f97a
to
65b1328
Compare
b7ec96e
to
c2fac73
Compare
@kerbowa could you please have another look. Hidden arguments are now handled and the code paths for SDAG and GISel are now mostly the same. |
f4a2d6a
to
d90c571
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks like it's in good shape, thanks! Appreciate the cleanup.
MemVT.getSimpleVT(), | ||
CCValAssign::Full)); | ||
State.addLoc(CCValAssign::getCustomMem( | ||
Arg.getArgNo(), RegisterVT, BasePartOffset + PartOffset, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Was ValNo just not used before and you have co-opted it to mean the original argument idx, or am I misunderstanding this change?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes. `ValNo` is not used in the AMD backend and the docs in `CCValAssign` suggest it should be an argument number.
d90c571
to
8650e2f
Compare
Register VReg = MRI.getLiveInVirtReg(PhysReg); | ||
TypeSize RegSize = TRI->getRegSizeInBits(VReg, MRI); | ||
|
||
if (!MRI.getVRegDef(VReg)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think this should ever succeed. I also thought we had a helper to do getLiveInVirtReg, then copy if it doesn't already exist but I can't seem to find it
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It does succeed in the case where we pack multiple args in one register and a COPY
from the physical register has already been generated in a previous iteration.
|
||
constexpr const unsigned SGPRSize = 4; | ||
// Arg is preloaded into the previous SGPR. | ||
if (DL.getTypeStoreSize(ArgTy) < SGPRSize && Alignment < SGPRSize) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Repeating the size check from above, but slightly different. Really should try to do this in terms of getNumRegsForCallingConv rather than type sizes
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a calling convention that packs s16 args?
int64_t AlignDownOffset = alignDown(ArgOffset, SGPRSize); | ||
int64_t OffsetDiff = ArgOffset - AlignDownOffset; | ||
auto ShiftAmt = B.buildConstant(LLT::scalar(32), OffsetDiff * 8); | ||
auto Shift = B.buildRotateLeft(LLT::scalar(RegSize), VReg, ShiftAmt); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this really need the fancy shift? Can this just use a regular one? I don't see this in the DAG path
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't really see the need for `SRL` instead of `SHL` either; maybe @kerbowa knows. Here is the SDAG path: https://github.com/llvm/llvm-project/blob/tim/kern-arg-preload/llvm/lib/Target/AMDGPU/SIISelLowering.cpp#L3044
if (SrcRegs.size() > 1) | ||
Res = B.buildMergeLikeInstr(MergeTy, SrcRegs).getReg(0); | ||
|
||
if (DL.getTypeStoreSizeInBits(ArgTy) < MergeTy.getSizeInBits()) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Avoid doing another IR type size query, keep this all in terms of the LLTs
if (SrcRegs.size() > 1) | ||
Res = B.buildMergeLikeInstr(MergeTy, SrcRegs).getReg(0); | ||
|
||
if (DL.getTypeStoreSizeInBits(ArgTy) < MergeTy.getSizeInBits()) | ||
Res = B.buildTrunc(ScalarTy, Res).getReg(0); | ||
|
||
if (ResTy.isVector()) | ||
Res = B.buildBitcast(ResTy, Res).getReg(0); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I thought MIRBuilder had nicer coercion helpers these days?
} | ||
|
||
if (Arg.hasAttribute("amdgpu-hidden-argument")) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Braces
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
added
} | ||
|
||
if (Arg.hasAttribute("amdgpu-hidden-argument")) | ||
F.getContext().diagnose(DiagnosticInfoUnsupported( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
DiagnosticInfoUnsupported is probably the wrong error kind, but this matches the DAG
LLT ResTy = getLLTForType(*ArgTy, DL); | ||
LLT ScalarTy = LLT::scalar(DL.getTypeSizeInBits(ArgTy)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This isn't the most reliable way to get the register size, it should go through the calling convention type legalization. I guess this works out if we only allow preloading on the most trivial types?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there even a documented underlying calling convention? This just mirrors what happens when we allocate in `allocatePreloadKernArgSGPRs`. Does it make sense to diverge from what SDAG is doing at this point?
723e4eb
to
d6b7aa0
Compare