diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 957cccc6268e6..4343c11139432 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -1618,6 +1618,98 @@ similar but the latter uses generic addressing (see `Generic Addressing `__. + +clusterlaunchcontrol Intrinsics +------------------------------- + +'``llvm.nvvm.clusterlaunchcontrol.try_cancel*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare void @llvm.nvvm.clusterlaunchcontrol.try_cancel.async.shared(ptr addrspace(3) %addr, ptr addrspace(3) %mbar) + declare void @llvm.nvvm.clusterlaunchcontrol.try_cancel.async.multicast.shared(ptr addrspace(3) %addr, ptr addrspace(3) %mbar) + +Overview: +""""""""" + +The ``clusterlaunchcontrol.try_cancel`` intrinsics request atomically cancelling +the launch of a cluster that has not started running yet. Each one asynchronously and non-atomically writes +a 16-byte opaque response to shared memory, pointed to by the 16-byte-aligned ``addr``, indicating whether the +operation succeeded or failed. ``addr`` and the 8-byte-aligned ``mbar`` must refer to ``shared::cta``; +otherwise, the behavior is undefined. The completion of the asynchronous operation +is tracked using the mbarrier completion mechanism at ``.cluster`` scope referenced +by the shared memory pointer, ``mbar``. On success, the opaque response contains +the CTA id of the first CTA of the canceled cluster; no other successful response +from other ``clusterlaunchcontrol.try_cancel`` operations from the same grid will +contain that id. + +The ``multicast`` variant specifies that the response is asynchronously and non-atomically written to +the corresponding shared memory location of each CTA in the requesting cluster. +The completion of the write of each local response is tracked by independent +mbarriers at the corresponding shared memory location of each CTA in the +cluster. + +For more information, refer to the `PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-clusterlaunchcontrol-try-cancel>`__. 
+ +'``llvm.nvvm.clusterlaunchcontrol.query_cancel.is_canceled``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare i1 @llvm.nvvm.clusterlaunchcontrol.query_cancel.is_canceled(i128 %try_cancel_response) + +Overview: +""""""""" + +The ``llvm.nvvm.clusterlaunchcontrol.query_cancel.is_canceled`` intrinsic decodes the opaque response written by the +``llvm.nvvm.clusterlaunchcontrol.try_cancel`` operation. + +The intrinsic returns ``0`` (false) if the request failed. If the request succeeded, +it returns ``1`` (true). A true result indicates that: + +- the thread block cluster whose first CTA id matches that of the response + handle will not run, and +- no other successful response of another ``try_cancel`` request in the grid will contain + the first CTA id of that cluster. + +For more information, refer to the `PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-clusterlaunchcontrol-query-cancel>`__. + + +'``llvm.nvvm.clusterlaunchcontrol.query_cancel.get_first_ctaid.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare i32 @llvm.nvvm.clusterlaunchcontrol.query_cancel.get_first_ctaid.x(i128 %try_cancel_response) + declare i32 @llvm.nvvm.clusterlaunchcontrol.query_cancel.get_first_ctaid.y(i128 %try_cancel_response) + declare i32 @llvm.nvvm.clusterlaunchcontrol.query_cancel.get_first_ctaid.z(i128 %try_cancel_response) + +Overview: +""""""""" + +The ``clusterlaunchcontrol.query_cancel.get_first_ctaid.*`` intrinsics can be +used to decode the successful opaque response written by the +``llvm.nvvm.clusterlaunchcontrol.try_cancel`` operation. + +If the request succeeded: + +- ``llvm.nvvm.clusterlaunchcontrol.query_cancel.get_first_ctaid.{x,y,z}`` returns + the x, y, or z coordinate, respectively, of the first CTA in the canceled cluster. + +If the request failed, the behavior of these intrinsics is undefined. + +For more information, refer to the `PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-clusterlaunchcontrol-query-cancel>`__. 
+ Other Intrinsics ---------------- diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 67c47095076d8..a6335115e0bb2 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -2268,4 +2268,33 @@ let IntrProperties = [IntrArgMemOnly, IntrWriteMem, WriteOnly>, def int_nvvm_st_bulk_shared_cta : DefaultAttrsIntrinsic<[], [llvm_shared_ptr_ty, llvm_i64_ty, llvm_i64_ty]>; } + +// +// clusterlaunchcontrol Intrinsics +// + +// clusterlaunchcontrol.try_cancel + +def int_nvvm_clusterlaunchcontrol_try_cancel_async_shared + : DefaultAttrsIntrinsic<[], [llvm_shared_ptr_ty, llvm_shared_ptr_ty], + [IntrHasSideEffects, IntrArgMemOnly], + "llvm.nvvm.clusterlaunchcontrol.try_cancel.async.shared">; + +def int_nvvm_clusterlaunchcontrol_try_cancel_async_multicast_shared + : DefaultAttrsIntrinsic<[], [llvm_shared_ptr_ty, llvm_shared_ptr_ty], + [IntrHasSideEffects, IntrArgMemOnly], + "llvm.nvvm.clusterlaunchcontrol.try_cancel.async.multicast.shared">; + +// clusterlaunchcontrol.query_cancel.is_canceled + +def int_nvvm_clusterlaunchcontrol_query_cancel_is_canceled + : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_i128_ty], [IntrNoMem, IntrSpeculatable], + "llvm.nvvm.clusterlaunchcontrol.query_cancel.is_canceled">; + +foreach dim = ["x", "y", "z"] in { +def int_nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_ # dim + : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i128_ty], [IntrNoMem, IntrSpeculatable], + "llvm.nvvm.clusterlaunchcontrol.query_cancel.get_first_ctaid." 
# dim>; +} + } // let TargetPrefix = "nvvm" diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 82d00ef8eccb9..d84d27d5e6e39 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -1041,6 +1041,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + // Enable custom lowering for the i128 operand of clusterlaunchcontrol + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i128, Custom); } const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { @@ -1119,6 +1121,10 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(NVPTXISD::BrxEnd) MAKE_CASE(NVPTXISD::BrxItem) MAKE_CASE(NVPTXISD::BrxStart) + MAKE_CASE(NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_IS_CANCELED) + MAKE_CASE(NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_X) + MAKE_CASE(NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Y) + MAKE_CASE(NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Z) } return nullptr; @@ -2795,12 +2801,57 @@ static SDValue LowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG) { return Op; } +// Lowers a clusterlaunchcontrol.query_cancel.* intrinsic by splitting its i128 +// response operand into two i64 halves feeding the matching NVPTXISD node. +static SDValue LowerClusterLaunchControlQueryCancel(SDValue Op, + SelectionDAG &DAG) { + SDNode *N = Op.getNode(); + if (N->getOperand(1).getValueType() != MVT::i128) { + // Bail out if the operand has already been lowered. + return SDValue(); + } + + const unsigned IID = N->getConstantOperandVal(0); + auto Opcode = [&]() { + switch (IID) { + case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled: + return NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_IS_CANCELED; + case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x: + return NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_X; + case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y: + 
return NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Y; + case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z: + return NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Z; + default: + llvm_unreachable("unsupported/unhandled intrinsic"); + } + }(); + + SDLoc DL(N); + SDValue TryCancelResponse = N->getOperand(1); + SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TryCancelResponse); + SDValue TryCancelResponse0 = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast, + DAG.getIntPtrConstant(0, DL)); + SDValue TryCancelResponse1 = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast, + DAG.getIntPtrConstant(1, DL)); + + return DAG.getNode(Opcode, DL, N->getVTList(), + {TryCancelResponse0, TryCancelResponse1}); +} + static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG) { switch (Op->getConstantOperandVal(0)) { default: return Op; case Intrinsic::nvvm_internal_addrspace_wrap: return Op.getOperand(1); + case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled: + case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x: + case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y: + case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z: + return LowerClusterLaunchControlQueryCancel(Op, DAG); } } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 3dff83d74538b..d08af2bb0039b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -79,6 +79,10 @@ enum NodeType : unsigned { BrxStart, BrxItem, BrxEnd, + CLUSTERLAUNCHCONTROL_QUERY_CANCEL_IS_CANCELED, + CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_X, + CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Y, + CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Z, Dummy, FIRST_MEMORY_OPCODE, diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 
4f8a798295b42..8612f14c68e0d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -7411,3 +7411,59 @@ def INT_NVVM_ST_BULK_SHARED_CTA: "st.bulk.shared::cta [$dest_addr], $size, 0;", [(int_nvvm_st_bulk_shared_cta addr:$dest_addr, i64:$size, (i64 0))]>, Requires<[hasSM<100>, hasPTX<86>]>; + +// +// clusterlaunchcontrol Instructions +// + +def CLUSTERLAUNCHCONTRL_TRY_CANCEL: + NVPTXInst<(outs), (ins ADDR:$addr, ADDR:$mbar), + "clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128 " # + "[$addr], [$mbar];", + [(int_nvvm_clusterlaunchcontrol_try_cancel_async_shared addr:$addr, addr:$mbar)]>, + Requires<[hasSM<100>, hasPTX<86>]>; + +def CLUSTERLAUNCHCONTRL_TRY_CANCEL_MULTICAST: + NVPTXInst<(outs), (ins ADDR:$addr, ADDR:$mbar), + "clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes" # + ".multicast::cluster::all.b128 " # + "[$addr], [$mbar];", + [(int_nvvm_clusterlaunchcontrol_try_cancel_async_multicast_shared addr:$addr, addr:$mbar)]>, + Requires<[hasSM<100>, hasArchAccelFeatures, hasPTX<86>]>; + +def SDTClusterLaunchControlQueryCancelIsCanceled: SDTypeProfile<1, 2, []>; +def clusterlaunchcontrol_query_cancel_is_canceled: + SDNode<"NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_IS_CANCELED", + SDTClusterLaunchControlQueryCancelIsCanceled, []>; + +def CLUSTERLAUNCHCONTROL_QUERY_CANCEL_IS_CANCELED: + NVPTXInst<(outs Int1Regs:$pred), (ins Int64Regs:$try_cancel_response0, Int64Regs:$try_cancel_response1), + "{{\n\t" # + ".reg .b128 %clc_handle;\n\t" # + "mov.b128 %clc_handle, {$try_cancel_response0, $try_cancel_response1};\n\t" # + "clusterlaunchcontrol.query_cancel.is_canceled.pred.b128 $pred, %clc_handle;\n\t" # + "}}", [(set i1:$pred, + (clusterlaunchcontrol_query_cancel_is_canceled i64:$try_cancel_response0, i64:$try_cancel_response1))]>, + Requires<[hasSM<100>, hasPTX<86>]>; + +class CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID<string Dim>: + NVPTXInst<(outs 
Int32Regs:$reg), (ins Int64Regs:$try_cancel_response0, Int64Regs:$try_cancel_response1), + "{{\n\t" # + ".reg .b128 %clc_handle;\n\t" # + "mov.b128 %clc_handle, {$try_cancel_response0, $try_cancel_response1};\n\t" # + "clusterlaunchcontrol.query_cancel.get_first_ctaid::" # Dim # ".b32.b128 $reg, %clc_handle;\n\t" # + "}}", [(set i32:$reg, + (!cast<SDNode>("clusterlaunchcontrol_query_cancel_first_cta_id_" # Dim) + i64:$try_cancel_response0, i64:$try_cancel_response1))]>, + Requires<[hasSM<100>, hasPTX<86>]>; + +foreach dim = ["x", "y", "z"] in { + def SDTClusterLaunchControlQueryCancelGetFirstCtaId # dim: SDTypeProfile<1, 2, []>; + + def clusterlaunchcontrol_query_cancel_first_cta_id_ # dim : + SDNode<"NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_" # !toupper(dim), + !cast<SDTypeProfile>("SDTClusterLaunchControlQueryCancelGetFirstCtaId" # dim), []>; + + def CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_ # dim: + CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID<dim>; +} diff --git a/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol-multicast.ll b/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol-multicast.ll new file mode 100644 index 0000000000000..c8b79dfae760a --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol-multicast.ll @@ -0,0 +1,42 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx86 %s | FileCheck %s --check-prefixes=CHECK,CHECK-PTX-SHARED64 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK,CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %} +; RUN: llc -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx86 %s | FileCheck %s --check-prefixes=CHECK,CHECK-PTX-SHARED64 +; RUN: llc < %s 
-mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK,CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 | %ptxas-verify -arch=sm_101a %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_101a %} +; RUN: llc -o - -mcpu=sm_120a -march=nvptx64 -mattr=+ptx86 %s | FileCheck %s --check-prefixes=CHECK,CHECK-PTX-SHARED64 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK,CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 | %ptxas-verify -arch=sm_120a %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_120a %} + +define void @nvvm_clusterlaunchcontrol_try_cancel_multicast( +; CHECK-PTX-SHARED64-LABEL: nvvm_clusterlaunchcontrol_try_cancel_multicast( +; CHECK-PTX-SHARED64: { +; CHECK-PTX-SHARED64-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED64-EMPTY: +; CHECK-PTX-SHARED64-NEXT: // %bb.0: +; CHECK-PTX-SHARED64-NEXT: ld.param.b64 %rd1, [nvvm_clusterlaunchcontrol_try_cancel_multicast_param_0]; +; CHECK-PTX-SHARED64-NEXT: ld.param.b64 %rd2, [nvvm_clusterlaunchcontrol_try_cancel_multicast_param_1]; +; CHECK-PTX-SHARED64-NEXT: clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.multicast::cluster::all.b128 [%rd1], [%rd2]; +; CHECK-PTX-SHARED64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: nvvm_clusterlaunchcontrol_try_cancel_multicast( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [nvvm_clusterlaunchcontrol_try_cancel_multicast_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [nvvm_clusterlaunchcontrol_try_cancel_multicast_param_1]; +; CHECK-PTX-SHARED32-NEXT: 
clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.multicast::cluster::all.b128 [%r1], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + ptr addrspace(3) %saddr, ptr addrspace(3) %smbar, + i128 %try_cancel_response) { + + tail call void @llvm.nvvm.clusterlaunchcontrol.try_cancel.async.multicast.shared(ptr addrspace(3) %saddr, ptr addrspace(3) %smbar) + ret void; +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol.ll b/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol.ll new file mode 100644 index 0000000000000..a8ccfc50fbe78 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol.ll @@ -0,0 +1,111 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | FileCheck %s --check-prefixes=CHECK,CHECK-PTX-SHARED64 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK,CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100 %} + +define void @nvvm_clusterlaunchcontrol_try_cancel( +; CHECK-PTX-SHARED64-LABEL: nvvm_clusterlaunchcontrol_try_cancel( +; CHECK-PTX-SHARED64: { +; CHECK-PTX-SHARED64-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED64-EMPTY: +; CHECK-PTX-SHARED64-NEXT: // %bb.0: +; CHECK-PTX-SHARED64-NEXT: ld.param.b64 %rd1, [nvvm_clusterlaunchcontrol_try_cancel_param_0]; +; CHECK-PTX-SHARED64-NEXT: ld.param.b64 %rd2, [nvvm_clusterlaunchcontrol_try_cancel_param_1]; +; CHECK-PTX-SHARED64-NEXT: clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128 [%rd1], [%rd2]; +; CHECK-PTX-SHARED64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: 
nvvm_clusterlaunchcontrol_try_cancel( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [nvvm_clusterlaunchcontrol_try_cancel_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [nvvm_clusterlaunchcontrol_try_cancel_param_1]; +; CHECK-PTX-SHARED32-NEXT: clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128 [%r1], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + ptr addrspace(3) %saddr, ptr addrspace(3) %smbar, + i128 %try_cancel_response) { + + tail call void @llvm.nvvm.clusterlaunchcontrol.try_cancel.async.shared(ptr addrspace(3) %saddr, ptr addrspace(3) %smbar) + ret void; +} + +define i32 @nvvm_clusterlaunchcontrol_query_cancel_is_canceled(i128 %try_cancel_response) local_unnamed_addr #0 { +; CHECK-LABEL: nvvm_clusterlaunchcontrol_query_cancel_is_canceled( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [nvvm_clusterlaunchcontrol_query_cancel_is_canceled_param_0]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 %clc_handle; +; CHECK-NEXT: mov.b128 %clc_handle, {%rd1, %rd2}; +; CHECK-NEXT: clusterlaunchcontrol.query_cancel.is_canceled.pred.b128 %p1, %clc_handle; +; CHECK-NEXT: } +; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %v0 = call i1 @llvm.nvvm.clusterlaunchcontrol.query_cancel.is_canceled(i128 %try_cancel_response) + %v2 = zext i1 %v0 to i32 + ret i32 %v2; +} + +define i32 @nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x(i128 %try_cancel_response) local_unnamed_addr #0 { +; CHECK-LABEL: nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: 
ld.param.v2.b64 {%rd1, %rd2}, [nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x_param_0]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 %clc_handle; +; CHECK-NEXT: mov.b128 %clc_handle, {%rd1, %rd2}; +; CHECK-NEXT: clusterlaunchcontrol.query_cancel.get_first_ctaid::x.b32.b128 %r1, %clc_handle; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %v0 = call i32 @llvm.nvvm.clusterlaunchcontrol.query_cancel.get_first_ctaid.x(i128 %try_cancel_response) + ret i32 %v0; +} + +define i32 @nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y(i128 %try_cancel_response) local_unnamed_addr #0 { +; CHECK-LABEL: nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y_param_0]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 %clc_handle; +; CHECK-NEXT: mov.b128 %clc_handle, {%rd1, %rd2}; +; CHECK-NEXT: clusterlaunchcontrol.query_cancel.get_first_ctaid::y.b32.b128 %r1, %clc_handle; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %v0 = call i32 @llvm.nvvm.clusterlaunchcontrol.query_cancel.get_first_ctaid.y(i128 %try_cancel_response) + ret i32 %v0; +} + +define i32 @nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z(i128 %try_cancel_response) local_unnamed_addr #0 { +; CHECK-LABEL: nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z_param_0]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 %clc_handle; +; CHECK-NEXT: mov.b128 %clc_handle, {%rd1, %rd2}; +; CHECK-NEXT: clusterlaunchcontrol.query_cancel.get_first_ctaid::z.b32.b128 %r1, %clc_handle; +; CHECK-NEXT: } 
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %v0 = call i32 @llvm.nvvm.clusterlaunchcontrol.query_cancel.get_first_ctaid.z(i128 %try_cancel_response) + ret i32 %v0; +}