-
Notifications
You must be signed in to change notification settings - Fork 13.4k
[NVPTX] Add errors for incorrect CUDA addrspaces #138706
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
The CUDA API only accepts kernel params in the global and generic address spaces, so display an error message when attempting to emit pointers outside those address-spaces from CUDA (but still allow them for OpenCL).
@llvm/pr-subscribers-backend-nvptx Author: Lewis Crawford (LewisCrawford) Changes: The CUDA API only accepts kernel params in the global and generic address spaces, so display an error message when attempting to emit pointers outside those address-spaces from CUDA (but still allow them for OpenCL). Full diff: https://github.com/llvm/llvm-project/pull/138706.diff 5 Files Affected:
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 2f4b109e8e9e9..9e17adb6ac1ae 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1399,6 +1399,8 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
if (PTy) {
O << "\t.param .u" << PTySizeInBits << " .ptr";
+ bool IsCUDA = static_cast<NVPTXTargetMachine &>(TM).getDrvInterface() ==
+ NVPTX::CUDA;
switch (PTy->getAddressSpace()) {
default:
break;
@@ -1406,12 +1408,18 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
O << " .global";
break;
case ADDRESS_SPACE_SHARED:
+ if (IsCUDA)
+ report_fatal_error(".shared ptr kernel args unsupported in CUDA.");
O << " .shared";
break;
case ADDRESS_SPACE_CONST:
+ if (IsCUDA)
+ report_fatal_error(".const ptr kernel args unsupported in CUDA.");
O << " .const";
break;
case ADDRESS_SPACE_LOCAL:
+ if (IsCUDA)
+ report_fatal_error(".local ptr kernel args unsupported in CUDA.");
O << " .local";
break;
}
diff --git a/llvm/test/CodeGen/NVPTX/kernel-param-align.ll b/llvm/test/CodeGen/NVPTX/kernel-param-align.ll
index a56b85de80143..e85ccf34bb6ac 100644
--- a/llvm/test/CodeGen/NVPTX/kernel-param-align.ll
+++ b/llvm/test/CodeGen/NVPTX/kernel-param-align.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_60 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas -arch=sm_60 - %}
+; RUN: llc < %s -mcpu=sm_60 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mcpu=sm_60 | %ptxas -arch=sm_60 - %}
+target triple = "nvptx64-nvidia-nvcl"
%struct.Large = type { [16 x double] }
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-cuda.ll b/llvm/test/CodeGen/NVPTX/lower-args-cuda.ll
new file mode 100644
index 0000000000000..7361ab28badb9
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/lower-args-cuda.ll
@@ -0,0 +1,13 @@
+; RUN: not --crash llc < %s -mcpu=sm_75 -o /dev/null 2>&1 | FileCheck %s
+
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; Make sure we exit with an error message for this input, as pointers to the
+; shared address-space are only supported as kernel args in NVCL, not CUDA.
+; CHECK: .shared ptr kernel args unsupported in CUDA.
+define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(3) %in) {
+ %v = load i32, ptr addrspace(3) %in, align 4
+ store i32 %v, ptr addrspace(1) %out, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-nvcl.ll b/llvm/test/CodeGen/NVPTX/lower-args-nvcl.ll
new file mode 100644
index 0000000000000..44b44e0c17626
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/lower-args-nvcl.ll
@@ -0,0 +1,17 @@
+; RUN: opt < %s -S -nvptx-lower-args | FileCheck %s --check-prefixes COMMON,IR
+; RUN: llc < %s -mcpu=sm_20 | FileCheck %s --check-prefixes COMMON,PTX
+; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 | %ptxas-verify %}
+
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-nvcl"
+
+; COMMON-LABEL: ptr_nongeneric
+define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(3) %in) {
+; IR-NOT: addrspacecast
+; PTX-NOT: cvta.to.global
+; PTX: ld.shared.u32
+; PTX: st.global.u32
+ %v = load i32, ptr addrspace(3) %in, align 4
+ store i32 %v, ptr addrspace(1) %out, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll
index 8e879871e295b..44445a17d1eb3 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args.ll
@@ -140,29 +140,6 @@ define ptx_kernel void @ptr_generic(ptr %out, ptr %in) {
ret void
}
-define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(3) %in) {
-; IR-LABEL: define ptx_kernel void @ptr_nongeneric(
-; IR-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(3) [[IN:%.*]]) {
-; IR-NEXT: [[V:%.*]] = load i32, ptr addrspace(3) [[IN]], align 4
-; IR-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
-; IR-NEXT: ret void
-;
-; PTX-LABEL: ptr_nongeneric(
-; PTX: {
-; PTX-NEXT: .reg .b32 %r<2>;
-; PTX-NEXT: .reg .b64 %rd<3>;
-; PTX-EMPTY:
-; PTX-NEXT: // %bb.0:
-; PTX-NEXT: ld.param.u64 %rd1, [ptr_nongeneric_param_0];
-; PTX-NEXT: ld.param.u64 %rd2, [ptr_nongeneric_param_1];
-; PTX-NEXT: ld.shared.u32 %r1, [%rd2];
-; PTX-NEXT: st.global.u32 [%rd1], %r1;
-; PTX-NEXT: ret;
- %v = load i32, ptr addrspace(3) %in, align 4
- store i32 %v, ptr addrspace(1) %out, align 4
- ret void
-}
-
define ptx_kernel void @ptr_as_int(i64 noundef %i, i32 noundef %v) {
; IRC-LABEL: define ptx_kernel void @ptr_as_int(
; IRC-SAME: i64 noundef [[I:%.*]], i32 noundef [[V:%.*]]) {
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thank you Lewis.
One thing I wonder is whether this sort of logic really belongs in assembly printing. It seems like this verification could happen at any point during compilation, such as during an IR pass. Are there examples from other targets of target-specific IR verification? Do you have any thoughts about alternative locations for this verification?
if (IsCUDA)
  report_fatal_error(".const ptr kernel args unsupported in CUDA.");
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
While local/shared pointers have no meaning outside of a thread/CTA, .const pointers do exist and can be passed around.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For kernel parameters?
We have .param::grid_constant for __grid_constant__ kernel parameters, but that's different from .const.
The CUDA API only accepts kernel params in the global and generic address spaces, so display an error message when attempting to emit pointers outside those address-spaces from CUDA (but still allow them for OpenCL).