diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 2f4b109e8e9e9..9e17adb6ac1ae 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -1399,6 +1399,8 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { if (PTy) { O << "\t.param .u" << PTySizeInBits << " .ptr"; + bool IsCUDA = static_cast<NVPTXTargetMachine &>(TM).getDrvInterface() == + NVPTX::CUDA; switch (PTy->getAddressSpace()) { default: break; @@ -1406,12 +1408,18 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { O << " .global"; break; case ADDRESS_SPACE_SHARED: + if (IsCUDA) + report_fatal_error(".shared ptr kernel args unsupported in CUDA."); O << " .shared"; break; case ADDRESS_SPACE_CONST: + if (IsCUDA) + report_fatal_error(".const ptr kernel args unsupported in CUDA."); O << " .const"; break; case ADDRESS_SPACE_LOCAL: + if (IsCUDA) + report_fatal_error(".local ptr kernel args unsupported in CUDA."); O << " .local"; break; } diff --git a/llvm/test/CodeGen/NVPTX/kernel-param-align.ll b/llvm/test/CodeGen/NVPTX/kernel-param-align.ll index a56b85de80143..e85ccf34bb6ac 100644 --- a/llvm/test/CodeGen/NVPTX/kernel-param-align.ll +++ b/llvm/test/CodeGen/NVPTX/kernel-param-align.ll @@ -1,5 +1,6 @@ -; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_60 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas -arch=sm_60 - %} +; RUN: llc < %s -mcpu=sm_60 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -mcpu=sm_60 | %ptxas -arch=sm_60 - %} +target triple = "nvptx64-nvidia-nvcl" %struct.Large = type { [16 x double] } diff --git a/llvm/test/CodeGen/NVPTX/lower-args-cuda.ll b/llvm/test/CodeGen/NVPTX/lower-args-cuda.ll new file mode 100644 index 0000000000000..7361ab28badb9 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/lower-args-cuda.ll @@ -0,0 +1,13 @@ +; RUN: not --crash llc < %s -mcpu=sm_75 -o /dev/null 2>&1 | FileCheck %s + +target datalayout = 
"e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; Make sure we exit with an error message for this input, as pointers to the +; shared address-space are only supported as kernel args in NVCL, not CUDA. +; CHECK: .shared ptr kernel args unsupported in CUDA. +define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(3) %in) { + %v = load i32, ptr addrspace(3) %in, align 4 + store i32 %v, ptr addrspace(1) %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/lower-args-nvcl.ll b/llvm/test/CodeGen/NVPTX/lower-args-nvcl.ll new file mode 100644 index 0000000000000..44b44e0c17626 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/lower-args-nvcl.ll @@ -0,0 +1,17 @@ +; RUN: opt < %s -S -nvptx-lower-args | FileCheck %s --check-prefixes COMMON,IR +; RUN: llc < %s -mcpu=sm_20 | FileCheck %s --check-prefixes COMMON,PTX +; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 | %ptxas-verify %} + +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-nvcl" + +; COMMON-LABEL: ptr_nongeneric +define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(3) %in) { +; IR-NOT: addrspacecast +; PTX-NOT: cvta.to.global +; PTX: ld.shared.u32 +; PTX: st.global.u32 + %v = load i32, ptr addrspace(3) %in, align 4 + store i32 %v, ptr addrspace(1) %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll index 8e879871e295b..44445a17d1eb3 100644 --- a/llvm/test/CodeGen/NVPTX/lower-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-args.ll @@ -140,29 +140,6 @@ define ptx_kernel void @ptr_generic(ptr %out, ptr %in) { ret void } -define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(3) %in) { -; IR-LABEL: define ptx_kernel void @ptr_nongeneric( -; IR-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(3) [[IN:%.*]]) { -; IR-NEXT: [[V:%.*]] = load i32, ptr addrspace(3) [[IN]], align 4 -; IR-NEXT: store i32 [[V]], ptr 
addrspace(1) [[OUT]], align 4 -; IR-NEXT: ret void -; -; PTX-LABEL: ptr_nongeneric( -; PTX: { -; PTX-NEXT: .reg .b32 %r<2>; -; PTX-NEXT: .reg .b64 %rd<3>; -; PTX-EMPTY: -; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [ptr_nongeneric_param_0]; -; PTX-NEXT: ld.param.u64 %rd2, [ptr_nongeneric_param_1]; -; PTX-NEXT: ld.shared.u32 %r1, [%rd2]; -; PTX-NEXT: st.global.u32 [%rd1], %r1; -; PTX-NEXT: ret; - %v = load i32, ptr addrspace(3) %in, align 4 - store i32 %v, ptr addrspace(1) %out, align 4 - ret void -} - define ptx_kernel void @ptr_as_int(i64 noundef %i, i32 noundef %v) { ; IRC-LABEL: define ptx_kernel void @ptr_as_int( ; IRC-SAME: i64 noundef [[I:%.*]], i32 noundef [[V:%.*]]) {