[NVPTX] Add errors for incorrect CUDA addrpaces #138706

Open · wants to merge 1 commit into main

Conversation

LewisCrawford
Contributor

The CUDA API only accepts kernel params in the global and generic address spaces, so display an error message when attempting to emit pointers outside those address-spaces from CUDA (but still allow them for OpenCL).

@llvmbot
Member

llvmbot commented May 6, 2025

@llvm/pr-subscribers-backend-nvptx

Author: Lewis Crawford (LewisCrawford)

Changes



Full diff: https://github.com/llvm/llvm-project/pull/138706.diff

5 Files Affected:

  • (modified) llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp (+8)
  • (modified) llvm/test/CodeGen/NVPTX/kernel-param-align.ll (+3-2)
  • (added) llvm/test/CodeGen/NVPTX/lower-args-cuda.ll (+13)
  • (added) llvm/test/CodeGen/NVPTX/lower-args-nvcl.ll (+17)
  • (modified) llvm/test/CodeGen/NVPTX/lower-args.ll (-23)
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 2f4b109e8e9e9..9e17adb6ac1ae 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1399,6 +1399,8 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
       if (PTy) {
         O << "\t.param .u" << PTySizeInBits << " .ptr";
 
+        bool IsCUDA = static_cast<NVPTXTargetMachine &>(TM).getDrvInterface() ==
+                      NVPTX::CUDA;
         switch (PTy->getAddressSpace()) {
         default:
           break;
@@ -1406,12 +1408,18 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
           O << " .global";
           break;
         case ADDRESS_SPACE_SHARED:
+          if (IsCUDA)
+            report_fatal_error(".shared ptr kernel args unsupported in CUDA.");
           O << " .shared";
           break;
         case ADDRESS_SPACE_CONST:
+          if (IsCUDA)
+            report_fatal_error(".const ptr kernel args unsupported in CUDA.");
           O << " .const";
           break;
         case ADDRESS_SPACE_LOCAL:
+          if (IsCUDA)
+            report_fatal_error(".local ptr kernel args unsupported in CUDA.");
           O << " .local";
           break;
         }
diff --git a/llvm/test/CodeGen/NVPTX/kernel-param-align.ll b/llvm/test/CodeGen/NVPTX/kernel-param-align.ll
index a56b85de80143..e85ccf34bb6ac 100644
--- a/llvm/test/CodeGen/NVPTX/kernel-param-align.ll
+++ b/llvm/test/CodeGen/NVPTX/kernel-param-align.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_60 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas -arch=sm_60 - %}
+; RUN: llc < %s -mcpu=sm_60 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mcpu=sm_60 | %ptxas -arch=sm_60 - %}
+target triple = "nvptx64-nvidia-nvcl"
 
 %struct.Large = type { [16 x double] }
 
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-cuda.ll b/llvm/test/CodeGen/NVPTX/lower-args-cuda.ll
new file mode 100644
index 0000000000000..7361ab28badb9
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/lower-args-cuda.ll
@@ -0,0 +1,13 @@
+; RUN: not --crash llc < %s -mcpu=sm_75  -o /dev/null 2>&1 | FileCheck %s
+
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; Make sure we exit with an error message for this input, as pointers to the
+; shared address-space are only supported as kernel args in NVCL, not CUDA.
+; CHECK:  .shared ptr kernel args unsupported in CUDA.
+define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(3) %in) {
+  %v = load i32, ptr addrspace(3) %in, align 4
+  store i32 %v, ptr addrspace(1) %out, align 4
+  ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-nvcl.ll b/llvm/test/CodeGen/NVPTX/lower-args-nvcl.ll
new file mode 100644
index 0000000000000..44b44e0c17626
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/lower-args-nvcl.ll
@@ -0,0 +1,17 @@
+; RUN: opt < %s -S -nvptx-lower-args | FileCheck %s --check-prefixes COMMON,IR
+; RUN: llc < %s -mcpu=sm_20 | FileCheck %s --check-prefixes COMMON,PTX
+; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 | %ptxas-verify %}
+
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-nvcl"
+
+; COMMON-LABEL: ptr_nongeneric
+define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(3) %in) {
+; IR-NOT: addrspacecast
+; PTX-NOT: cvta.to.global
+; PTX:  ld.shared.u32
+; PTX:  st.global.u32
+  %v = load i32, ptr addrspace(3) %in, align 4
+  store i32 %v, ptr addrspace(1) %out, align 4
+  ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll
index 8e879871e295b..44445a17d1eb3 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args.ll
@@ -140,29 +140,6 @@ define ptx_kernel void @ptr_generic(ptr %out, ptr %in) {
   ret void
 }
 
-define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(3) %in) {
-; IR-LABEL: define ptx_kernel void @ptr_nongeneric(
-; IR-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(3) [[IN:%.*]]) {
-; IR-NEXT:    [[V:%.*]] = load i32, ptr addrspace(3) [[IN]], align 4
-; IR-NEXT:    store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
-; IR-NEXT:    ret void
-;
-; PTX-LABEL: ptr_nongeneric(
-; PTX:       {
-; PTX-NEXT:    .reg .b32 %r<2>;
-; PTX-NEXT:    .reg .b64 %rd<3>;
-; PTX-EMPTY:
-; PTX-NEXT:  // %bb.0:
-; PTX-NEXT:    ld.param.u64 %rd1, [ptr_nongeneric_param_0];
-; PTX-NEXT:    ld.param.u64 %rd2, [ptr_nongeneric_param_1];
-; PTX-NEXT:    ld.shared.u32 %r1, [%rd2];
-; PTX-NEXT:    st.global.u32 [%rd1], %r1;
-; PTX-NEXT:    ret;
-  %v = load i32, ptr addrspace(3) %in, align 4
-  store i32 %v, ptr addrspace(1) %out, align 4
-  ret void
-}
-
 define ptx_kernel void @ptr_as_int(i64 noundef %i, i32 noundef %v) {
 ; IRC-LABEL: define ptx_kernel void @ptr_as_int(
 ; IRC-SAME: i64 noundef [[I:%.*]], i32 noundef [[V:%.*]]) {

Copy link
Contributor

@gonzalobg gonzalobg left a comment


LGTM, thank you Lewis.

@AlexMaclean
Copy link
Member

One thing I wonder is whether this sort of logic really belongs in assembly printing. It seems like this verification could happen at any point during compilation, such as during an IR pass. Are there examples from other targets of target-specific IR verification? Do you have any thoughts about alternative locations for this verification?

Comment on lines +1416 to +1417
if (IsCUDA)
report_fatal_error(".const ptr kernel args unsupported in CUDA.");
Member


While local/shared pointers have no meaning outside of a thread/CTA, .const pointers do exist and can be passed around.

Contributor

@gonzalobg gonzalobg May 7, 2025

For kernel parameters?

We have .param::grid_constant for __grid_constant__ kernel parameters, but that's different than .const.

Member

On a second thought, I'm not quite convinced that we want this change.

The CUDA API only accepts kernel params in the global and generic address spaces, so display an error message when attempting to emit pointers outside those address-spaces from CUDA (but still allow them for OpenCL).

@LewisCrawford can you elaborate on the motivation and the use case for this change?

CUDA operates on generic pointers, as far as the front-end is concerned, so I'm not quite sure which CUDA APIs you have in mind here that may be affected if we generate .ptr .const?

Can you give me a few examples where these errors should be triggered when we compile CUDA sources? The included tests all operate on the IR level, so I can't tell what is the problem that this patch wants to address.

On the IR level, erroring out does not make a whole lot of sense to me. If the compilation of the CUDA sources can lead to such IR, it should be diagnosed by the front-end. If we made it this far in the compilation pipeline, I do not see why we would want to error out here instead of generating what appears to be perfectly valid PTX.
What can possibly go wrong here that it's LLVM's job to prevent?

Contributor

@gonzalobg gonzalobg May 8, 2025

IIRC this change was motivated as a quality-of-life improvement for developers of non-CUDA frontends (OpenMP/OpenACC offloading, C++ parallel algorithms, Fortran do-concurrent...) that support a variety of backends (CUDA PTX, OpenCL, ...). The set of state spaces each such backend supports for pointers passed as kernel arguments differs, leading to bugs in those frontends. IIRC, not all ptxas versions diagnose these sorts of issues correctly, making the life of those frontend developers unnecessarily miserable (see also EDIT2 below). The LLVM NVPTX backend does support a wide range of ptxas versions, so detecting these issues there and providing a clear and concise error message about what went wrong seemed valuable as a "best effort" improvement.

EDIT: I agree that a correct frontend should never trigger these.
EDIT2: IIRC, this also caused churn for those frontend devs as ptxas was fixed to detect these better: some of those frontends generate the PTX ahead of time, but the application then JITs the PTX at runtime, so code that compiled to PTX properly locally then failed at a customer site if that customer had a ptxas with a better detection mechanism. Detecting this in the NVPTX backend makes these bugs in those frontends easier to reliably detect and fix, independently of which ptxas version the frontend developer has.

Member

quality of life improvement for developers of non-cuda frontends (openmp/-acc offloading, c++ parallel algorithms, fortran do-concurrent...) that support a variety of backends (cuda ptx, opencl, ...).

Then it makes little sense to apply those error checks to the CUDA front-end only. They do not change anything for those non-CUDA front-ends, because they are not CUDA; or they should not be using the -cuda triple, because they are following some other kind of calling convention. The checks also do nothing for the CUDA front-end itself, because it never generates such pointer arguments. So, what are we fixing here? Real, specific examples would help. So far, it looks like a case of someone misusing a CUDA-specific triple, and now we're fixing corner cases in misuse scenarios instead of properly fixing the way those front-ends interact with LLVM.

EDIT2: IIRC, this also caused churn for those frontend devs as ptxas was fixed to detect these better, because some of those frontends generate the ptx ahead of time, but

If the front-ends generate PTX themselves, then this change is also irrelevant, as it applies to IR only. Whatever a user may have put into inline asm is opaque to us, and if it's just a text blob passed to ptxas, then LLVM does not touch it at all.

So, we're left with use cases where IR uses non-generic kernel pointer arguments. If we do want to diagnose those, it may have some merit, but it should be clearly documented what we're doing and why. E.g., if we're appealing to the CUDA front-end as the source of those restrictions, then we should ban all non-generic pointers. If it's ptxas that determines which pointer variants are acceptable, I'd like to see PTX documentation saying that. Right now the patch is neither here nor there.
