
[AMDGPU][GlobalISel] Enable kernel argument preloading #134655


Open · tgymnich wants to merge 5 commits into main from tim/kern-arg-preload

Conversation

tgymnich (Member) commented Apr 7, 2025

  • enable kernel argument preloading

llvmbot (Member) commented Apr 7, 2025

@llvm/pr-subscribers-llvm-globalisel

@llvm/pr-subscribers-backend-amdgpu

Author: Tim Gymnich (tgymnich)

Changes
  • enable kernel argument preloading

Full diff: https://github.com/llvm/llvm-project/pull/134655.diff

6 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp (+40-2)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (+1-3)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h (+2-4)
  • (modified) llvm/lib/Target/AMDGPU/R600ISelLowering.cpp (+1-1)
  • (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+1-1)
  • (modified) llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll (+90-43)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index a15f193549936..48c65f37d9ef8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/LowLevelTypeUtils.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 
@@ -507,6 +508,7 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
   const DataLayout &DL = F.getDataLayout();
+  const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(F.getCallingConv());
 
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
@@ -520,7 +522,6 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
 
   // TODO: Align down to dword alignment and extract bits for extending loads.
   for (auto &Arg : F.args()) {
-    // TODO: Add support for kernarg preload.
     if (Arg.hasAttribute("amdgpu-hidden-argument")) {
       LLVM_DEBUG(dbgs() << "Preloading hidden arguments is not supported\n");
       return false;
@@ -545,11 +546,47 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
 
     Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);
 
-    if (IsByRef) {
+    if (Arg.hasInRegAttr() && IsEntryFunc && Subtarget->hasKernargPreload()) {
+      unsigned NumAllocSGPRs = alignTo(DL.getTypeSizeInBits(ArgTy), 32) / 32;
+
+      unsigned Padding = ArgOffset - BaseOffset;
+      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
+
+      // Preload this argument.
+      const TargetRegisterClass *RC =
+          TRI->getSGPRClassForBitWidth(NumAllocSGPRs * 32);
+      LLT RegTy = getLLTForType(*ArgTy, DL);
+      SmallVectorImpl<MCRegister> *PreloadRegs =
+          Info->addPreloadedKernArg(*TRI, RC, NumAllocSGPRs, i, PaddingSGPRs);
+
+      SmallVector<Register> SrcRegs;
+
+      if (PreloadRegs->size() > 1) {
+        RC = &AMDGPU::SGPR_32RegClass;
+        RegTy = LLT::scalar(32);
+      }
+
+      for (auto &PhysReg : *PreloadRegs) {
+        assert(PhysReg);
+        Register VReg = MF.addLiveIn(PhysReg, RC);
+        B.getMBB().addLiveIn(PhysReg);
+        MRI.setType(VReg, RegTy);
+        B.buildCopy(VReg, Register(PhysReg));
+        CCInfo.AllocateReg(PhysReg);
+        SrcRegs.push_back(VReg);
+      }
+
+      if (SrcRegs.size() > 1) {
+        B.buildMergeLikeInstr(VRegs[i][0], SrcRegs);
+      } else {
+        MRI.replaceRegWith(SrcRegs[0], VRegs[i][0]);
+      }
+    } else if (IsByRef) {
       unsigned ByRefAS = cast<PointerType>(Arg.getType())->getAddressSpace();
 
       assert(VRegs[i].size() == 1 &&
              "expected only one register for byval pointers");
+
       if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
         lowerParameterPtr(VRegs[i][0], B, ArgOffset);
       } else {
@@ -570,6 +607,7 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
   }
 
   TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
+
   TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
   return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 533ad349f7500..0d5c0a71ce7af 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1193,9 +1193,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
 /// for each individual part is i8.  We pass the memory type as LocVT to the
 /// calling convention analysis function and the register type (Ins[x].VT) as
 /// the ValVT.
-void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
-  CCState &State,
-  const SmallVectorImpl<ISD::InputArg> &Ins) const {
+void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State) const {
   const MachineFunction &MF = State.getMachineFunction();
   const Function &Fn = MF.getFunction();
   LLVMContext &Ctx = Fn.getParent()->getContext();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 6705f86e15fc2..2a69e8faab2a8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -172,13 +172,11 @@ class AMDGPUTargetLowering : public TargetLowering {
   void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &Results) const;
 
-  void analyzeFormalArgumentsCompute(
-    CCState &State,
-    const SmallVectorImpl<ISD::InputArg> &Ins) const;
-
 public:
   AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
 
+  void analyzeFormalArgumentsCompute(CCState &State) const;
+
   bool mayIgnoreSignedZero(SDValue Op) const;
 
   static inline SDValue stripBitcast(SDValue Val) {
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 157ca4b08020a..6c074ef8f396f 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1463,7 +1463,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
   if (AMDGPU::isShader(CallConv)) {
     CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
   } else {
-    analyzeFormalArgumentsCompute(CCInfo, Ins);
+    analyzeFormalArgumentsCompute(CCInfo);
   }
 
   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 356040da95672..a2288d136418f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2929,7 +2929,7 @@ SDValue SITargetLowering::LowerFormalArguments(
   }
 
   if (IsKernel)
-    analyzeFormalArgumentsCompute(CCInfo, Ins);
+    analyzeFormalArgumentsCompute(CCInfo);
 
   if (IsEntryFunc) {
     allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
index dd760c2a215ca..91fb3d0f84546 100644
--- a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
@@ -1,18 +1,32 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefix=OBJDUMP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefix=ASM %s
-
-; OBJDUMP: Contents of section .rodata:
-; OBJDUMP-NEXT: 0000 00000000 00000000 10010000 00000000  ................
-; OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000  ................
-; OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000  ................
-; OBJDUMP-NOT:  0030 0000af00 94130000 1a000400 00000000  ................
-; OBJDUMP-NEXT: 0030 8000af00 98130000 1e000400 00000000  ................
-
-; ASM-LABEL: amdhsa_kernarg_preload_4_implicit_6:
-; ASM: .amdhsa_user_sgpr_count 12
-; ASM: .amdhsa_next_free_sgpr 12
-; ASM: ; TotalNumSgprs: 18
-; ASM: ; NumSGPRsForWavesPerEU: 18
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefixes=OBJDUMP-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefixes=OBJDUMP-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefixes=ASM-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefixes=ASM-GISEL %s
+
+; OBJDUMP-SDAG: Contents of section .rodata:
+; OBJDUMP-SDAG-NEXT: 0000 00000000 00000000 10010000 00000000  ................
+; OBJDUMP-SDAG-NEXT: 0010 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-SDAG-NEXT: 0020 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-SDAG-NOT:  0030 0000af00 94130000 1a000400 00000000  ................
+; OBJDUMP-SDAG-NEXT: 0030 8000af00 98130000 1e000400 00000000  ................
+
+; OBJDUMP-GISEL: Contents of section .rodata:
+; OBJDUMP-GISEL-NEXT: 0000 00000000 00000000 10010000 00000000  ................
+; OBJDUMP-GISEL-NEXT: 0010 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-GISEL-NEXT: 0020 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-GISEL-NEXT: 0030 0000af00 90130000 1e000000 00000000  ................
+
+; ASM-SDAG-LABEL: amdhsa_kernarg_preload_4_implicit_6:
+; ASM-SDAG: .amdhsa_user_sgpr_count 12
+; ASM-SDAG: .amdhsa_next_free_sgpr 12
+; ASM-SDAG: ; TotalNumSgprs: 18
+; ASM-SDAG: ; NumSGPRsForWavesPerEU: 18
+
+; ASM-GISEL-LABEL: amdhsa_kernarg_preload_4_implicit_6:
+; ASM-GISEL: .amdhsa_user_sgpr_count 8
+; ASM-GISEL: .amdhsa_next_free_sgpr 0
+; ASM-GISEL: ; TotalNumSgprs: 6
+; ASM-GISEL: ; NumSGPRsForWavesPerEU: 6
 
 ; Test that we include preloaded SGPRs in the GRANULATED_WAVEFRONT_SGPR_COUNT
 ; field that are not explicitly referenced in the kernel. This test has 6 implicit
@@ -23,47 +37,80 @@
 
 define amdgpu_kernel void @amdhsa_kernarg_preload_4_implicit_6(i128 inreg) { ret void }
 
-; OBJDUMP-NEXT: 0040 00000000 00000000 20010000 00000000  ........ .......
-; OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000  ................
-; OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000  ................
-; OBJDUMP-NEXT: 0070 4000af00 94000000 08000800 00000000  @...............
+; OBJDUMP-SDAG-NEXT: 0040 00000000 00000000 20010000 00000000  ........ .......
+; OBJDUMP-SDAG-NEXT: 0050 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-SDAG-NEXT: 0060 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-SDAG-NEXT: 0070 4000af00 94000000 08000800 00000000  @...............
 
-; ASM-LABEL: amdhsa_kernarg_preload_8_implicit_2:
-; ASM: .amdhsa_user_sgpr_count 10
-; ASM: .amdhsa_next_free_sgpr 10
-; ASM: ; TotalNumSgprs: 16
-; ASM: ; NumSGPRsForWavesPerEU: 16
+; OBJDUMP-GISEL-NEXT: 0040 00000000 00000000 20010000 00000000  ........ .......
+; OBJDUMP-GISEL-NEXT: 0050 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-GISEL-NEXT: 0060 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-GISEL-NEXT: 0070 0000af00 84000000 08000000 00000000  ................
+
+; ASM-SDAG-LABEL: amdhsa_kernarg_preload_8_implicit_2:
+; ASM-SDAG: .amdhsa_user_sgpr_count 10
+; ASM-SDAG: .amdhsa_next_free_sgpr 10
+; ASM-SDAG: ; TotalNumSgprs: 16
+; ASM-SDAG: ; NumSGPRsForWavesPerEU: 16
+
+; ASM-GISEL-LABEL: amdhsa_kernarg_preload_8_implicit_2:
+; ASM-GISEL: .amdhsa_user_sgpr_count 2
+; ASM-GISEL: .amdhsa_next_free_sgpr 0
+; ASM-GISEL: ; TotalNumSgprs: 6
+; ASM-GISEL: ; NumSGPRsForWavesPerEU: 6
 
 ; Only the kernarg_ptr is enabled so we should have 8 preload kernarg SGPRs, 2
 ; implicit, and 6 extra.
 
 define amdgpu_kernel void @amdhsa_kernarg_preload_8_implicit_2(i256 inreg) #0 { ret void }
 
-; OBJDUMP-NEXT: 0080 00000000 00000000 08010000 00000000  ................
-; OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000  ................
-; OBJDUMP-NEXT: 00a0 00000000 00000000 00000000 00000000  ................
-; OBJDUMP-NEXT: 00b0 4000af00 86000000 08000100 00000000  @...............
+; OBJDUMP-SDAG-NEXT: 0080 00000000 00000000 08010000 00000000  ................
+; OBJDUMP-SDAG-NEXT: 0090 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-SDAG-NEXT: 00a0 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-SDAG-NEXT: 00b0 4000af00 86000000 08000100 00000000  @...............
+
+; OBJDUMP-GISEL-NEXT: 0080 00000000 00000000 08010000 00000000  ................
+; OBJDUMP-GISEL-NEXT: 0090 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-GISEL-NEXT: 00a0 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-GISEL-NEXT: 00b0 0000af00 84000000 08000000 00000000  ................
 
-; ASM-LABEL: amdhsa_kernarg_preload_1_implicit_2:
-; ASM: .amdhsa_user_sgpr_count 3
-; ASM: .amdhsa_next_free_sgpr 3
-; ASM: ; TotalNumSgprs: 9
-; ASM: ; NumSGPRsForWavesPerEU: 9
+; ASM-SDAG-LABEL: amdhsa_kernarg_preload_1_implicit_2:
+; ASM-SDAG: .amdhsa_user_sgpr_count 3
+; ASM-SDAG: .amdhsa_next_free_sgpr 3
+; ASM-SDAG: ; TotalNumSgprs: 9
+; ASM-SDAG: ; NumSGPRsForWavesPerEU: 9
+
+; ASM-GISEL-LABEL: amdhsa_kernarg_preload_1_implicit_2:
+; ASM-GISEL: .amdhsa_user_sgpr_count 2
+; ASM-GISEL: .amdhsa_next_free_sgpr 0
+; ASM-GISEL: ; TotalNumSgprs: 6
+; ASM-GISEL: ; NumSGPRsForWavesPerEU: 6
 
 ; 1 preload, 2 implicit, 6 extra. Rounds up to 16 SGPRs in the KD.
 
 define amdgpu_kernel void @amdhsa_kernarg_preload_1_implicit_2(i32 inreg) #0 { ret void }
 
-; OBJDUMP-NEXT: 00c0 00000000 00000000 08010000 00000000  ................
-; OBJDUMP-NEXT: 00d0 00000000 00000000 00000000 00000000  ................
-; OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000  ................
-; OBJDUMP-NEXT: 00f0 0000af00 84000000 08000000 00000000  ................
-
-; ASM-LABEL: amdhsa_kernarg_preload_0_implicit_2:
-; ASM: .amdhsa_user_sgpr_count 2
-; ASM: .amdhsa_next_free_sgpr 0
-; ASM: ; TotalNumSgprs: 6
-; ASM: ; NumSGPRsForWavesPerEU: 6
+; OBJDUMP-SDAG-NEXT: 00c0 00000000 00000000 08010000 00000000  ................
+; OBJDUMP-SDAG-NEXT: 00d0 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-SDAG-NEXT: 00e0 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-SDAG-NEXT: 00f0 0000af00 84000000 08000000 00000000  ................
+
+; OBJDUMP-GISEL-NEXT: 00c0 00000000 00000000 08010000 00000000  ................
+; OBJDUMP-GISEL-NEXT: 00d0 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-GISEL-NEXT: 00e0 00000000 00000000 00000000 00000000  ................
+; OBJDUMP-GISEL-NEXT: 00f0 0000af00 84000000 08000000 00000000  ................
+
+; ASM-SDAG-LABEL: amdhsa_kernarg_preload_0_implicit_2:
+; ASM-SDAG: .amdhsa_user_sgpr_count 2
+; ASM-SDAG: .amdhsa_next_free_sgpr 0
+; ASM-SDAG: ; TotalNumSgprs: 6
+; ASM-SDAG: ; NumSGPRsForWavesPerEU: 6
+
+; ASM-GISEL-LABEL: amdhsa_kernarg_preload_0_implicit_2:
+; ASM-GISEL: .amdhsa_user_sgpr_count 2
+; ASM-GISEL: .amdhsa_next_free_sgpr 0
+; ASM-GISEL: ; TotalNumSgprs: 6
+; ASM-GISEL: ; NumSGPRsForWavesPerEU: 6
 
 ; 0 preload kernarg SGPRs, 2 implicit, 6 extra. Rounds up to 8 SGPRs in the KD.
 ; Encoded like '00'.

tgymnich (Member, Author) commented Apr 7, 2025

@arsenm do we want to skip kernarg preloading if the argument is not used? That would make the tests in their current form a bit pointless.

In that case we might want to test with this func instead:

define amdgpu_kernel void @amdhsa_kernarg_preload_4_implicit_6(i128 inreg %x, ptr addrspace(1) %p) {
    %res = add i128 %x, %x
    store i128 %res, ptr addrspace(1) %p
    ret void
}

arsenm (Contributor) commented Apr 7, 2025

@arsenm do we want to skip kern arg preloading if the argument is not used? This kind of makes the tests in their current form a bit pointless.

I'm pretty sure you need to preload a consecutive block, so you need to preload unused arguments if they are in the middle of other useful arguments to preload.
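
In the current patch that requirement shows up as the PaddingSGPRs computation (lines copied from the diff above with comments added; BaseOffset is assumed here to be the kernarg offset already covered by earlier preloads):

// Preloading fills one consecutive block of user SGPRs starting at the
// kernarg base, so any gap before this argument still has to be covered.
unsigned Padding = ArgOffset - BaseOffset;
unsigned PaddingSGPRs = alignTo(Padding, 4) / 4; // gap in bytes -> whole SGPRs
// The padding SGPRs are allocated together with the argument's own SGPRs.
Info->addPreloadedKernArg(*TRI, RC, NumAllocSGPRs, i, PaddingSGPRs);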

arsenm requested a review from kerbowa on April 7, 2025
kerbowa (Member) left a comment

Can you add GISel run lines or copies of the test preload-kernargs.ll? This is crashing for me with your current changes.

unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;

// Preload this argument.
const TargetRegisterClass *RC =
kerbowa (Member):
Need to handle sub-dword arguments.

tgymnich (Member, Author):
Took a step back and tried to make allocatePreloadKernArgSGPRs work for both SDAG and GISel.

tgymnich (Member, Author) commented Apr 8, 2025

I'm pretty sure you you need to preload a consecutive block, so you need to preload unused arguments if it is in the middle of other useful arguments to preload

@arsenm Is there any documentation about the hardware and ABI besides the SDAG implementation?

kerbowa (Member) commented Apr 8, 2025

I'm pretty sure you you need to preload a consecutive block, so you need to preload unused arguments if it is in the middle of other useful arguments to preload

@arsenm Is there any documentation about the hardware and ABI besides the SDAG implementation?

https://llvm.org/docs/AMDGPUUsage.html#preloaded-kernel-arguments

tgymnich force-pushed the tim/kern-arg-preload branch from 5826b8c to 68bb4b3 on April 10, 2025
github-actions bot commented Apr 10, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

tgymnich force-pushed the tim/kern-arg-preload branch 2 times, most recently from a01f97a to 65b1328, on April 11, 2025
tgymnich requested a review from kerbowa on April 11, 2025
tgymnich force-pushed the tim/kern-arg-preload branch from b7ec96e to c2fac73 on April 16, 2025
tgymnich (Member, Author):
@kerbowa could you please have another look? Hidden arguments are now handled, and the code paths for SDAG and GISel are now mostly the same.

tgymnich force-pushed the tim/kern-arg-preload branch from f4a2d6a to d90c571 on April 16, 2025
kerbowa (Member) left a comment
This looks like it's in good shape, thanks! Appreciate the cleanup.

MemVT.getSimpleVT(),
CCValAssign::Full));
State.addLoc(CCValAssign::getCustomMem(
Arg.getArgNo(), RegisterVT, BasePartOffset + PartOffset,
kerbowa (Member):
Was ValNo just not used before and you co-opted it to mean the original argument index, or am I misunderstanding this change?

tgymnich (Member, Author):
Yes. ValNo is not used in the AMD backend and the docs in CCValAssign suggest it should be an argument number.

tgymnich force-pushed the tim/kern-arg-preload branch from d90c571 to 8650e2f on April 25, 2025
Register VReg = MRI.getLiveInVirtReg(PhysReg);
TypeSize RegSize = TRI->getRegSizeInBits(VReg, MRI);

if (!MRI.getVRegDef(VReg)) {
arsenm (Contributor):
I don't think this should ever succeed. I also thought we had a helper that does getLiveInVirtReg and then creates the copy if it doesn't already exist, but I can't seem to find it.

tgymnich (Member, Author) commented Apr 25, 2025:
It does succeed in the case where we pack multiple args in one register and a COPY from the physical register has already been generated in a previous iteration.
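
For context, the reuse-or-copy pattern this guards looks roughly like the following (a sketch only; getOrCreateLiveInCopy is a hypothetical name, not an existing helper):

static Register getOrCreateLiveInCopy(MachineFunction &MF, MachineIRBuilder &B,
                                      MCRegister PhysReg,
                                      const TargetRegisterClass *RC, LLT Ty) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  // Reuse the live-in virtual register if one was already created.
  Register VReg = MRI.getLiveInVirtReg(PhysReg);
  if (!VReg) {
    VReg = MF.addLiveIn(PhysReg, RC);
    MRI.setType(VReg, Ty);
  }
  // Only emit the COPY once; a later argument packed into the same SGPR
  // reuses the copy generated for the earlier one.
  if (!MRI.getVRegDef(VReg)) {
    B.getMBB().addLiveIn(PhysReg);
    B.buildCopy(VReg, Register(PhysReg));
  }
  return VReg;
}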


constexpr const unsigned SGPRSize = 4;
// Arg is preloaded into the previous SGPR.
if (DL.getTypeStoreSize(ArgTy) < SGPRSize && Alignment < SGPRSize) {
arsenm (Contributor):
This repeats the size check from above, but slightly differently. It really should be done in terms of getNumRegsForCallingConv rather than type sizes.

tgymnich (Member, Author):
Is there a calling convention that packs s16 args?

int64_t AlignDownOffset = alignDown(ArgOffset, SGPRSize);
int64_t OffsetDiff = ArgOffset - AlignDownOffset;
auto ShiftAmt = B.buildConstant(LLT::scalar(32), OffsetDiff * 8);
auto Shift = B.buildRotateLeft(LLT::scalar(RegSize), VReg, ShiftAmt);
arsenm (Contributor):
Does this really need the fancy shift? Can this just use a regular one? I don't see this in the DAG path

tgymnich (Member, Author):
I don't really see the need for SRL instead of SHL either; maybe @kerbowa knows. Here is the SDAG path: https://github.com/llvm/llvm-project/blob/tim/kern-arg-preload/llvm/lib/Target/AMDGPU/SIISelLowering.cpp#L3044
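
For reference, a plain-shift version of the extraction (a sketch; it reuses ArgOffset, SGPRSize, VReg, RegSize, and ScalarTy from the surrounding code and mirrors the SRL-then-truncate that the SDAG path performs):

int64_t AlignDownOffset = alignDown(ArgOffset, SGPRSize);
int64_t OffsetDiff = ArgOffset - AlignDownOffset;
// The preloaded SGPR holds the aligned-down dword; shift the argument's bytes
// down to bit 0, then truncate to the argument's scalar type.
auto ShiftAmt = B.buildConstant(LLT::scalar(32), OffsetDiff * 8);
auto Shifted = B.buildLShr(LLT::scalar(RegSize), VReg, ShiftAmt);
Register Res = B.buildTrunc(ScalarTy, Shifted).getReg(0);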

if (SrcRegs.size() > 1)
Res = B.buildMergeLikeInstr(MergeTy, SrcRegs).getReg(0);

if (DL.getTypeStoreSizeInBits(ArgTy) < MergeTy.getSizeInBits())
arsenm (Contributor):
Avoid doing another IR type size query, keep this all in terms of the LLTs
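
For reference, the LLT-only version of that check would look something like this (a sketch; it assumes ScalarTy was created with the same bit width the IR query returns, which is how the patch builds it):

if (SrcRegs.size() > 1)
  Res = B.buildMergeLikeInstr(MergeTy, SrcRegs).getReg(0);

// Compare the LLT sizes directly instead of re-querying the IR type size.
if (ScalarTy.getSizeInBits() < MergeTy.getSizeInBits())
  Res = B.buildTrunc(ScalarTy, Res).getReg(0);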

Comment on lines 550 to 557
if (SrcRegs.size() > 1)
Res = B.buildMergeLikeInstr(MergeTy, SrcRegs).getReg(0);

if (DL.getTypeStoreSizeInBits(ArgTy) < MergeTy.getSizeInBits())
Res = B.buildTrunc(ScalarTy, Res).getReg(0);

if (ResTy.isVector())
Res = B.buildBitcast(ResTy, Res).getReg(0);
arsenm (Contributor):
I thought MIRBuilder had nicer coercion helpers these days?

}

if (Arg.hasAttribute("amdgpu-hidden-argument"))
arsenm (Contributor):
Braces

tgymnich (Member, Author):
added

}

if (Arg.hasAttribute("amdgpu-hidden-argument"))
F.getContext().diagnose(DiagnosticInfoUnsupported(
arsenm (Contributor):
DiagnosticInfoUnsupported is probably the wrong error kind, but this matches the DAG

Comment on lines 512 to 513
LLT ResTy = getLLTForType(*ArgTy, DL);
LLT ScalarTy = LLT::scalar(DL.getTypeSizeInBits(ArgTy));
arsenm (Contributor):
This isn't the most reliable way to get the register size; it should go through the calling convention type legalization. I guess this works out if we only allow preloading of the most trivial types?

tgymnich (Member, Author) commented Apr 25, 2025:
Is there even a documented underlying calling convention? This just mirrors what happens when we allocate in allocatePreloadKernArgSGPRs. Does it make sense to diverge from what SDAG is doing at this point?
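
For reference, the calling-convention-aware query would look roughly like this (a sketch only; it is not what the patch currently does, and whether it is worth diverging from the SDAG allocation here is exactly the open question):

// Query the per-part register type/count through the calling convention
// instead of raw DataLayout sizes (TLI, F, ArgTy as in the surrounding code).
EVT OrigVT = EVT::getEVT(ArgTy);
MVT RegisterVT = TLI.getRegisterTypeForCallingConv(F.getContext(),
                                                   F.getCallingConv(), OrigVT);
unsigned NumRegs = TLI.getNumRegistersForCallingConv(F.getContext(),
                                                     F.getCallingConv(), OrigVT);
LLT PartTy = getLLTForMVT(RegisterVT); // per-part register LLT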

tgymnich force-pushed the tim/kern-arg-preload branch from 723e4eb to d6b7aa0 on May 5, 2025