
[AMDGPU][GlobalISel] Enable kernel argument preloading #134655


Open · tgymnich wants to merge 5 commits into main from tim/kern-arg-preload

Conversation

tgymnich (Member) commented Apr 7, 2025

  • enable kernel argument preloading

llvmbot (Member) commented Apr 7, 2025

@llvm/pr-subscribers-llvm-globalisel

@llvm/pr-subscribers-backend-amdgpu

Author: Tim Gymnich (tgymnich)

Changes
  • enable kernel argument preloading

Full diff: https://github.com/llvm/llvm-project/pull/134655.diff

6 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp (+40-2)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (+1-3)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h (+2-4)
  • (modified) llvm/lib/Target/AMDGPU/R600ISelLowering.cpp (+1-1)
  • (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+1-1)
  • (modified) llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll (+90-43)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index a15f193549936..48c65f37d9ef8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/LowLevelTypeUtils.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 
@@ -507,6 +508,7 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
   const DataLayout &DL = F.getDataLayout();
+  const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(F.getCallingConv());
 
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
@@ -520,7 +522,6 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
 
   // TODO: Align down to dword alignment and extract bits for extending loads.
   for (auto &Arg : F.args()) {
-    // TODO: Add support for kernarg preload.
     if (Arg.hasAttribute("amdgpu-hidden-argument")) {
       LLVM_DEBUG(dbgs() << "Preloading hidden arguments is not supported\n");
       return false;
@@ -545,11 +546,47 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
 
     Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);
 
-    if (IsByRef) {
+    if (Arg.hasInRegAttr() && IsEntryFunc && Subtarget->hasKernargPreload()) {
+      unsigned NumAllocSGPRs = alignTo(DL.getTypeSizeInBits(ArgTy), 32) / 32;
+
+      unsigned Padding = ArgOffset - BaseOffset;
+      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
+
+      // Preload this argument.
+      const TargetRegisterClass *RC =
+          TRI->getSGPRClassForBitWidth(NumAllocSGPRs * 32);
+      LLT RegTy = getLLTForType(*ArgTy, DL);
+      SmallVectorImpl<MCRegister> *PreloadRegs =
+          Info->addPreloadedKernArg(*TRI, RC, NumAllocSGPRs, i, PaddingSGPRs);
+
+      SmallVector<Register> SrcRegs;
+
+      if (PreloadRegs->size() > 1) {
+        RC = &AMDGPU::SGPR_32RegClass;
+        RegTy = LLT::scalar(32);
+      }
+
+      for (auto &PhysReg : *PreloadRegs) {
+        assert(PhysReg);
+        Register VReg = MF.addLiveIn(PhysReg, RC);
+        B.getMBB().addLiveIn(PhysReg);
+        MRI.setType(VReg, RegTy);
+        B.buildCopy(VReg, Register(PhysReg));
+        CCInfo.AllocateReg(PhysReg);
+        SrcRegs.push_back(VReg);
+      }
+
+      if (SrcRegs.size() > 1) {
+        B.buildMergeLikeInstr(VRegs[i][0], SrcRegs);
+      } else {
+        MRI.replaceRegWith(SrcRegs[0], VRegs[i][0]);
+      }
+    } else if (IsByRef) {
       unsigned ByRefAS = cast<PointerType>(Arg.getType())->getAddressSpace();
 
       assert(VRegs[i].size() == 1 &&
              "expected only one register for byval pointers");
+
       if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
         lowerParameterPtr(VRegs[i][0], B, ArgOffset);
       } else {
@@ -570,6 +607,7 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
   }
 
   TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
+
   TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
   return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 533ad349f7500..0d5c0a71ce7af 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1193,9 +1193,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
 /// for each individual part is i8.  We pass the memory type as LocVT to the
 /// calling convention analysis function and the register type (Ins[x].VT) as
 /// the ValVT.
-void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
-  CCState &State,
-  const SmallVectorImpl<ISD::InputArg> &Ins) const {
+void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State) const {
   const MachineFunction &MF = State.getMachineFunction();
   const Function &Fn = MF.getFunction();
   LLVMContext &Ctx = Fn.getParent()->getContext();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 6705f86e15fc2..2a69e8faab2a8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -172,13 +172,11 @@ class AMDGPUTargetLowering : public TargetLowering {
   void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &Results) const;
 
-  void analyzeFormalArgumentsCompute(
-    CCState &State,
-    const SmallVectorImpl<ISD::InputArg> &Ins) const;
-
 public:
   AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
 
+  void analyzeFormalArgumentsCompute(CCState &State) const;
+
   bool mayIgnoreSignedZero(SDValue Op) const;
 
   static inline SDValue stripBitcast(SDValue Val) {
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 157ca4b08020a..6c074ef8f396f 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1463,7 +1463,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
   if (AMDGPU::isShader(CallConv)) {
     CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
   } else {
-    analyzeFormalArgumentsCompute(CCInfo, Ins);
+    analyzeFormalArgumentsCompute(CCInfo);
   }
 
   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 356040da95672..a2288d136418f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2929,7 +2929,7 @@ SDValue SITargetLowering::LowerFormalArguments(
   }
 
   if (IsKernel)
-    analyzeFormalArgumentsCompute(CCInfo, Ins);
+    analyzeFormalArgumentsCompute(CCInfo);
 
   if (IsEntryFunc) {
     allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
index dd760c2a215ca..91fb3d0f84546 100644
--- a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
@@ -1,18 +1,32 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefix=OBJDUMP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefix=ASM %s
-
-; OBJDUMP: Contents of section .rodata:
-; OBJDUMP-NEXT: 0000 00000000 00000000 10010000 00000000  ................
-; OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000  ................
-; OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000  ................
-; OBJDUMP-NOT:  0030 0000af00 94130000 1a000400 00000000  ................
-; OBJDUMP-NEXT: 0030 8000af00 98130000 1e000400 00000000  ................
-
-; ASM-LABEL: amdhsa_kernarg_preload_4_implicit_6:
-; ASM: .amdhsa_user_sgpr_count 12
-; ASM: .amdhsa_next_free_sgpr 12
-; ASM: ; TotalNumSgprs: 18
-; ASM: ; NumSGPRsForWavesPerEU: 18
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefixes=OBJDUMP-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefixes=OBJDUMP-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefixes=ASM-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefixes=ASM-GISEL %s
+
+; OBJDUMP-SDAG: Contents of section .rodata:
+; OBJDUMP-SDAG-NEXT: 0000 00000000 00000000 10010000 00000000  ................
+; OBJDUMP-SDAG-NEXT: 0010 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-SDAG-NEXT: 0020 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-SDAG-NOT:  0030 0000af00 94130000 1a000400 00000000  ................
+; OBJDUMP-SDAG-NEXT: 0030 8000af00 98130000 1e000400 00000000  ................
+
+; OBJDUMP-GISEL: Contents of section .rodata:
+; OBJDUMP-GISEL-NEXT: 0000 00000000 00000000 10010000 00000000  ................
+; OBJDUMP-GISEL-NEXT: 0010 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-GISEL-NEXT: 0020 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-GISEL-NEXT: 0030 0000af00 90130000 1e000000 00000000  ................
+
+; ASM-SDAG-LABEL: amdhsa_kernarg_preload_4_implicit_6:
+; ASM-SDAG: .amdhsa_user_sgpr_count 12
+; ASM-SDAG: .amdhsa_next_free_sgpr 12
+; ASM-SDAG: ; TotalNumSgprs: 18
+; ASM-SDAG: ; NumSGPRsForWavesPerEU: 18
+
+; ASM-GISEL-LABEL: amdhsa_kernarg_preload_4_implicit_6:
+; ASM-GISEL: .amdhsa_user_sgpr_count 8
+; ASM-GISEL: .amdhsa_next_free_sgpr 0
+; ASM-GISEL: ; TotalNumSgprs: 6
+; ASM-GISEL: ; NumSGPRsForWavesPerEU: 6
 
 ; Test that we include preloaded SGPRs in the GRANULATED_WAVEFRONT_SGPR_COUNT
 ; field that are not explicitly referenced in the kernel. This test has 6 implicit
@@ -23,47 +37,80 @@
 
 define amdgpu_kernel void @amdhsa_kernarg_preload_4_implicit_6(i128 inreg) { ret void }
 
-; OBJDUMP-NEXT: 0040 00000000 00000000 20010000 00000000  ........ .......
-; OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000  ................
-; OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000  ................
-; OBJDUMP-NEXT: 0070 4000af00 94000000 08000800 00000000  @...............
+; OBJDUMP-SDAG-NEXT: 0040 00000000 00000000 20010000 00000000  ........ .......
+; OBJDUMP-SDAG-NEXT: 0050 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-SDAG-NEXT: 0060 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-SDAG-NEXT: 0070 4000af00 94000000 08000800 00000000  @...............
 
-; ASM-LABEL: amdhsa_kernarg_preload_8_implicit_2:
-; ASM: .amdhsa_user_sgpr_count 10
-; ASM: .amdhsa_next_free_sgpr 10
-; ASM: ; TotalNumSgprs: 16
-; ASM: ; NumSGPRsForWavesPerEU: 16
+; OBJDUMP-GISEL-NEXT: 0040 00000000 00000000 20010000 00000000  ........ .......
+; OBJDUMP-GISEL-NEXT: 0050 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-GISEL-NEXT: 0060 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-GISEL-NEXT: 0070 0000af00 84000000 08000000 00000000  ................
+
+; ASM-SDAG-LABEL: amdhsa_kernarg_preload_8_implicit_2:
+; ASM-SDAG: .amdhsa_user_sgpr_count 10
+; ASM-SDAG: .amdhsa_next_free_sgpr 10
+; ASM-SDAG: ; TotalNumSgprs: 16
+; ASM-SDAG: ; NumSGPRsForWavesPerEU: 16
+
+; ASM-GISEL-LABEL: amdhsa_kernarg_preload_8_implicit_2:
+; ASM-GISEL: .amdhsa_user_sgpr_count 2
+; ASM-GISEL: .amdhsa_next_free_sgpr 0
+; ASM-GISEL: ; TotalNumSgprs: 6
+; ASM-GISEL: ; NumSGPRsForWavesPerEU: 6
 
 ; Only the kernarg_ptr is enabled so we should have 8 preload kernarg SGPRs, 2
 ; implicit, and 6 extra.
 
 define amdgpu_kernel void @amdhsa_kernarg_preload_8_implicit_2(i256 inreg) #0 { ret void }
 
-; OBJDUMP-NEXT: 0080 00000000 00000000 08010000 00000000  ................
-; OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000  ................
-; OBJDUMP-NEXT: 00a0 00000000 00000000 00000000 00000000  ................
-; OBJDUMP-NEXT: 00b0 4000af00 86000000 08000100 00000000  @...............
+; OBJDUMP-SDAG-NEXT: 0080 00000000 00000000 08010000 00000000  ................
+; OBJDUMP-SDAG-NEXT: 0090 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-SDAG-NEXT: 00a0 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-SDAG-NEXT: 00b0 4000af00 86000000 08000100 00000000  @...............
+
+; OBJDUMP-GISEL-NEXT: 0080 00000000 00000000 08010000 00000000  ................
+; OBJDUMP-GISEL-NEXT: 0090 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-GISEL-NEXT: 00a0 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-GISEL-NEXT: 00b0 0000af00 84000000 08000000 00000000  ................
 
-; ASM-LABEL: amdhsa_kernarg_preload_1_implicit_2:
-; ASM: .amdhsa_user_sgpr_count 3
-; ASM: .amdhsa_next_free_sgpr 3
-; ASM: ; TotalNumSgprs: 9
-; ASM: ; NumSGPRsForWavesPerEU: 9
+; ASM-SDAG-LABEL: amdhsa_kernarg_preload_1_implicit_2:
+; ASM-SDAG: .amdhsa_user_sgpr_count 3
+; ASM-SDAG: .amdhsa_next_free_sgpr 3
+; ASM-SDAG: ; TotalNumSgprs: 9
+; ASM-SDAG: ; NumSGPRsForWavesPerEU: 9
+
+; ASM-GISEL-LABEL: amdhsa_kernarg_preload_1_implicit_2:
+; ASM-GISEL: .amdhsa_user_sgpr_count 2
+; ASM-GISEL: .amdhsa_next_free_sgpr 0
+; ASM-GISEL: ; TotalNumSgprs: 6
+; ASM-GISEL: ; NumSGPRsForWavesPerEU: 6
 
 ; 1 preload, 2 implicit, 6 extra. Rounds up to 16 SGPRs in the KD.
 
 define amdgpu_kernel void @amdhsa_kernarg_preload_1_implicit_2(i32 inreg) #0 { ret void }
 
-; OBJDUMP-NEXT: 00c0 00000000 00000000 08010000 00000000  ................
-; OBJDUMP-NEXT: 00d0 00000000 00000000 00000000 00000000  ................
-; OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000  ................
-; OBJDUMP-NEXT: 00f0 0000af00 84000000 08000000 00000000  ................
-
-; ASM-LABEL: amdhsa_kernarg_preload_0_implicit_2:
-; ASM: .amdhsa_user_sgpr_count 2
-; ASM: .amdhsa_next_free_sgpr 0
-; ASM: ; TotalNumSgprs: 6
-; ASM: ; NumSGPRsForWavesPerEU: 6
+; OBJDUMP-SDAG-NEXT: 00c0 00000000 00000000 08010000 00000000  ................
+; OBJDUMP-SDAG-NEXT: 00d0 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-SDAG-NEXT: 00e0 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-SDAG-NEXT: 00f0 0000af00 84000000 08000000 00000000  ................
+
+; OBJDUMP-GISEL-NEXT: 00c0 00000000 00000000 08010000 00000000  ................
+; OBJDUMP-GISEL-NEXT: 00d0 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-GISEL-NEXT: 00e0 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-GISEL-NEXT: 00f0 0000af00 84000000 08000000 00000000  ................
+
+; ASM-SDAG-LABEL: amdhsa_kernarg_preload_0_implicit_2:
+; ASM-SDAG: .amdhsa_user_sgpr_count 2
+; ASM-SDAG: .amdhsa_next_free_sgpr 0
+; ASM-SDAG: ; TotalNumSgprs: 6
+; ASM-SDAG: ; NumSGPRsForWavesPerEU: 6
+
+; ASM-GISEL-LABEL: amdhsa_kernarg_preload_0_implicit_2:
+; ASM-GISEL: .amdhsa_user_sgpr_count 2
+; ASM-GISEL: .amdhsa_next_free_sgpr 0
+; ASM-GISEL: ; TotalNumSgprs: 6
+; ASM-GISEL: ; NumSGPRsForWavesPerEU: 6
 
 ; 0 preload kernarg SGPRs, 2 implicit, 6 extra. Rounds up to 8 SGPRs in the KD.
 ; Encoded like '00'.

tgymnich (Member, Author) commented Apr 7, 2025

@arsenm do we want to skip kernarg preloading if the argument is not used? That would make the tests in their current form a bit pointless.

In that case we might want to test with this func instead:

define amdgpu_kernel void @amdhsa_kernarg_preload_4_implicit_6(i128 inreg %x, ptr addrspace(1) %p) {
    %res = add i128 %x, %x
    store i128 %res, ptr addrspace(1) %p
    ret void
}

arsenm (Contributor) commented Apr 7, 2025

@arsenm do we want to skip kern arg preloading if the argument is not used? This kind of makes the tests in their current form a bit pointless.

I'm pretty sure you need to preload a consecutive block, so you need to preload unused arguments if they are in the middle of other useful arguments to preload.
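
In the current patch that requirement shows up as the PaddingSGPRs computation (lines copied from the diff above with comments added; BaseOffset is assumed here to be the kernarg offset already covered by earlier preloads):

// Preloading fills one consecutive block of user SGPRs starting at the
// kernarg base, so any gap before this argument still has to be covered.
unsigned Padding = ArgOffset - BaseOffset;
unsigned PaddingSGPRs = alignTo(Padding, 4) / 4; // gap in bytes -> whole SGPRs
// The padding SGPRs are allocated together with the argument's own SGPRs.
Info->addPreloadedKernArg(*TRI, RC, NumAllocSGPRs, i, PaddingSGPRs);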

arsenm requested a review from kerbowa on April 7, 2025
kerbowa (Member) left a comment

Can you add GISel run lines or copies of the test preload-kernargs.ll? This is crashing for me with your current changes.

unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;

// Preload this argument.
const TargetRegisterClass *RC =
kerbowa (Member):
Need to handle sub-dword arguments.

tgymnich (Member, Author):
Took a step back and tried to make allocatePreloadKernArgSGPRs work for both SDAG and GISel.

tgymnich (Member, Author) commented Apr 8, 2025

I'm pretty sure you you need to preload a consecutive block, so you need to preload unused arguments if it is in the middle of other useful arguments to preload

@arsenm Is there any documentation about the hardware and ABI besides the SDAG implementation?

kerbowa (Member) commented Apr 8, 2025

I'm pretty sure you you need to preload a consecutive block, so you need to preload unused arguments if it is in the middle of other useful arguments to preload

@arsenm Is there any documentation about the hardware and ABI besides the SDAG implementation?

https://llvm.org/docs/AMDGPUUsage.html#preloaded-kernel-arguments

tgymnich force-pushed the tim/kern-arg-preload branch from 5826b8c to 68bb4b3 on April 10, 2025
github-actions bot commented Apr 10, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

tgymnich force-pushed the tim/kern-arg-preload branch 2 times, most recently from a01f97a to 65b1328, on April 11, 2025
tgymnich requested a review from kerbowa on April 11, 2025
tgymnich force-pushed the tim/kern-arg-preload branch from b7ec96e to c2fac73 on April 16, 2025
tgymnich (Member, Author):
@kerbowa could you please have another look? Hidden arguments are now handled, and the code paths for SDAG and GISel are now mostly the same.

tgymnich force-pushed the tim/kern-arg-preload branch from f4a2d6a to d90c571 on April 16, 2025
kerbowa (Member) left a comment
This looks like it's in good shape, thanks! Appreciate the cleanup.

MemVT.getSimpleVT(),
CCValAssign::Full));
State.addLoc(CCValAssign::getCustomMem(
Arg.getArgNo(), RegisterVT, BasePartOffset + PartOffset,
kerbowa (Member):
Was ValNo just not used before and you co-opted it to mean the original argument index, or am I misunderstanding this change?

tgymnich (Member, Author):
Yes. ValNo is not used in the AMD backend and the docs in CCValAssign suggest it should be an argument number.

tgymnich force-pushed the tim/kern-arg-preload branch from d90c571 to 8650e2f on April 25, 2025
Register VReg = MRI.getLiveInVirtReg(PhysReg);
TypeSize RegSize = TRI->getRegSizeInBits(VReg, MRI);

if (!MRI.getVRegDef(VReg)) {
arsenm (Contributor):
I don't think this should ever succeed. I also thought we had a helper that does getLiveInVirtReg and then creates the copy if it doesn't already exist, but I can't seem to find it.

tgymnich (Member, Author) commented Apr 25, 2025:
It does succeed in the case where we pack multiple args in one register and a COPY from the physical register has already been generated in a previous iteration.
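
For context, the reuse-or-copy pattern this guards looks roughly like the following (a sketch only; getOrCreateLiveInCopy is a hypothetical name, not an existing helper):

static Register getOrCreateLiveInCopy(MachineFunction &MF, MachineIRBuilder &B,
                                      MCRegister PhysReg,
                                      const TargetRegisterClass *RC, LLT Ty) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  // Reuse the live-in virtual register if one was already created.
  Register VReg = MRI.getLiveInVirtReg(PhysReg);
  if (!VReg) {
    VReg = MF.addLiveIn(PhysReg, RC);
    MRI.setType(VReg, Ty);
  }
  // Only emit the COPY once; a later argument packed into the same SGPR
  // reuses the copy generated for the earlier one.
  if (!MRI.getVRegDef(VReg)) {
    B.getMBB().addLiveIn(PhysReg);
    B.buildCopy(VReg, Register(PhysReg));
  }
  return VReg;
}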


constexpr const unsigned SGPRSize = 4;
// Arg is preloaded into the previous SGPR.
if (DL.getTypeStoreSize(ArgTy) < SGPRSize && Alignment < SGPRSize) {
arsenm (Contributor):
This repeats the size check from above, but slightly differently. It really should be done in terms of getNumRegsForCallingConv rather than type sizes.

tgymnich (Member, Author):
Is there a calling convention that packs s16 args?

int64_t AlignDownOffset = alignDown(ArgOffset, SGPRSize);
int64_t OffsetDiff = ArgOffset - AlignDownOffset;
auto ShiftAmt = B.buildConstant(LLT::scalar(32), OffsetDiff * 8);
auto Shift = B.buildRotateLeft(LLT::scalar(RegSize), VReg, ShiftAmt);
arsenm (Contributor):
Does this really need the fancy shift? Can this just use a regular one? I don't see this in the DAG path

tgymnich (Member, Author):
I don't really see the need for SRL instead of SHL either; maybe @kerbowa knows. Here is the SDAG path: https://github.com/llvm/llvm-project/blob/tim/kern-arg-preload/llvm/lib/Target/AMDGPU/SIISelLowering.cpp#L3044
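
For reference, a plain-shift version of the extraction (a sketch; it reuses ArgOffset, SGPRSize, VReg, RegSize, and ScalarTy from the surrounding code and mirrors the SRL-then-truncate that the SDAG path performs):

int64_t AlignDownOffset = alignDown(ArgOffset, SGPRSize);
int64_t OffsetDiff = ArgOffset - AlignDownOffset;
// The preloaded SGPR holds the aligned-down dword; shift the argument's bytes
// down to bit 0, then truncate to the argument's scalar type.
auto ShiftAmt = B.buildConstant(LLT::scalar(32), OffsetDiff * 8);
auto Shifted = B.buildLShr(LLT::scalar(RegSize), VReg, ShiftAmt);
Register Res = B.buildTrunc(ScalarTy, Shifted).getReg(0);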

if (SrcRegs.size() > 1)
Res = B.buildMergeLikeInstr(MergeTy, SrcRegs).getReg(0);

if (DL.getTypeStoreSizeInBits(ArgTy) < MergeTy.getSizeInBits())
arsenm (Contributor):
Avoid doing another IR type size query, keep this all in terms of the LLTs
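
For reference, the LLT-only version of that check would look something like this (a sketch; it assumes ScalarTy was created with the same bit width the IR query returns, which is how the patch builds it):

if (SrcRegs.size() > 1)
  Res = B.buildMergeLikeInstr(MergeTy, SrcRegs).getReg(0);

// Compare the LLT sizes directly instead of re-querying the IR type size.
if (ScalarTy.getSizeInBits() < MergeTy.getSizeInBits())
  Res = B.buildTrunc(ScalarTy, Res).getReg(0);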

Comment on lines 550 to 557
if (SrcRegs.size() > 1)
Res = B.buildMergeLikeInstr(MergeTy, SrcRegs).getReg(0);

if (DL.getTypeStoreSizeInBits(ArgTy) < MergeTy.getSizeInBits())
Res = B.buildTrunc(ScalarTy, Res).getReg(0);

if (ResTy.isVector())
Res = B.buildBitcast(ResTy, Res).getReg(0);
arsenm (Contributor):
I thought MIRBuilder had nicer coercion helpers these days?

}

if (Arg.hasAttribute("amdgpu-hidden-argument"))
arsenm (Contributor):
Braces

tgymnich (Member, Author):
added

}

if (Arg.hasAttribute("amdgpu-hidden-argument"))
F.getContext().diagnose(DiagnosticInfoUnsupported(
arsenm (Contributor):
DiagnosticInfoUnsupported is probably the wrong error kind, but this matches the DAG

Comment on lines 512 to 513
LLT ResTy = getLLTForType(*ArgTy, DL);
LLT ScalarTy = LLT::scalar(DL.getTypeSizeInBits(ArgTy));
arsenm (Contributor):
This isn't the most reliable way to get the register size; it should go through the calling convention type legalization. I guess this works out if we only allow preloading of the most trivial types?

tgymnich (Member, Author) commented Apr 25, 2025:
Is there even a documented underlying calling convention? This just mirrors what happens when we allocate in allocatePreloadKernArgSGPRs. Does it make sense to diverge from what SDAG is doing at this point?
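
For reference, the calling-convention-aware query would look roughly like this (a sketch only; it is not what the patch currently does, and whether it is worth diverging from the SDAG allocation here is exactly the open question):

// Query the per-part register type/count through the calling convention
// instead of raw DataLayout sizes (TLI, F, ArgTy as in the surrounding code).
EVT OrigVT = EVT::getEVT(ArgTy);
MVT RegisterVT = TLI.getRegisterTypeForCallingConv(F.getContext(),
                                                   F.getCallingConv(), OrigVT);
unsigned NumRegs = TLI.getNumRegistersForCallingConv(F.getContext(),
                                                     F.getCallingConv(), OrigVT);
LLT PartTy = getLLTForMVT(RegisterVT); // per-part register LLT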

tgymnich force-pushed the tim/kern-arg-preload branch from 723e4eb to d6b7aa0 on May 5, 2025