diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index c1426823d87af..b6222300e4d4a 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -568,6 +568,36 @@ to left-shift the found bit into the most-significant bit position, otherwise the result is the shift amount needed to right-shift the found bit into the least-significant bit position. 0xffffffff is returned if no 1 bit is found. +'``llvm.nvvm.{zext,sext}.{wrap,clamp}``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare i32 @llvm.nvvm.zext.wrap(i32 %a, i32 %b) + declare i32 @llvm.nvvm.zext.clamp(i32 %a, i32 %b) + declare i32 @llvm.nvvm.sext.wrap(i32 %a, i32 %b) + declare i32 @llvm.nvvm.sext.clamp(i32 %a, i32 %b) + +Overview: +""""""""" + +The '``llvm.nvvm.{zext,sext}.{wrap,clamp}``' family of intrinsics extracts the +low bits of the input value, and zero- or sign-extends them back to the original +width. + +Semantics: +"""""""""" + +The '``llvm.nvvm.{zext,sext}.{wrap,clamp}``' family of intrinsics returns +extension of N lowest bits of operand %a. For the '``wrap``' variants, N is the +value of operand %b modulo 32. For the '``clamp``' variants, N is the value of +operand %b clamped to the range [0, 32]. The N lowest bits are then +zero-extended in the case of the '``zext``' variants, or sign-extended in the case of +the '``sext``' variants. If N is 0, the result is 0. 
+ TMA family of Intrinsics ------------------------ diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 8b87822d3fdda..2851206f2e84a 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1356,6 +1356,17 @@ let TargetPrefix = "nvvm" in { [llvm_anyint_ty, llvm_i1_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg>]>; + +// +// szext +// + foreach ext = ["sext", "zext"] in + foreach mode = ["wrap", "clamp"] in + def int_nvvm_ # ext # _ # mode : + DefaultAttrsIntrinsic<[llvm_i32_ty], + [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable]>; + // // Convert // diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 11d77599d4ac3..a384cb79d645a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -227,6 +227,7 @@ class RegTyInfo; def I16RT : RegTyInfo; def I32RT : RegTyInfo; def I64RT : RegTyInfo; @@ -240,26 +241,33 @@ def F16X2RT : RegTyInfo; def BF16X2RT : RegTyInfo; +multiclass I3Inst requires = []> { + defvar asmstr = op_str # " \t$dst, $a, $b;"; + + def rr : + NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.RC:$b), + asmstr, + [(set t.Ty:$dst, (op_node t.Ty:$a, t.Ty:$b))]>, + Requires; + def ri : + NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.Imm:$b), + asmstr, + [(set t.Ty:$dst, (op_node t.Ty:$a, (t.Ty imm:$b)))]>, + Requires; + if !not(commutative) then + def ir : + NVPTXInst<(outs t.RC:$dst), (ins t.Imm:$a, t.RC:$b), + asmstr, + [(set t.Ty:$dst, (op_node (t.Ty imm:$a), t.Ty:$b))]>, + Requires; +} + // Template for instructions which take three int64, int32, or int16 args. // The instructions are named "" (e.g. "add.s64"). 
-multiclass I3 { - foreach t = [I16RT, I32RT, I64RT] in { - defvar asmstr = OpcStr # t.Size # " \t$dst, $a, $b;"; - - def t.Ty # rr : - NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.RC:$b), - asmstr, - [(set t.Ty:$dst, (OpNode t.Ty:$a, t.Ty:$b))]>; - def t.Ty # ri : - NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.Imm:$b), - asmstr, - [(set t.Ty:$dst, (OpNode t.RC:$a, imm:$b))]>; - if !not(commutative) then - def t.Ty # ir : - NVPTXInst<(outs t.RC:$dst), (ins t.Imm:$a, t.RC:$b), - asmstr, - [(set t.Ty:$dst, (OpNode imm:$a, t.RC:$b))]>; - } +multiclass I3 { + foreach t = [I16RT, I32RT, I64RT] in + defm t.Ty# : I3Inst; } class I16x2 : @@ -270,26 +278,11 @@ class I16x2 : // Template for instructions which take 3 int args. The instructions are // named ".s32" (e.g. "addc.cc.s32"). -multiclass ADD_SUB_INT_CARRY { +multiclass ADD_SUB_INT_CARRY { let hasSideEffects = 1 in { - def i32rr : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), - !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"), - [(set i32:$dst, (OpNode i32:$a, i32:$b))]>; - def i32ri : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), - !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"), - [(set i32:$dst, (OpNode i32:$a, imm:$b))]>; - def i64rr : - NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), - !strconcat(OpcStr, ".s64 \t$dst, $a, $b;"), - [(set i64:$dst, (OpNode i64:$a, i64:$b))]>, - Requires<[hasPTX<43>]>; - def i64ri : - NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), - !strconcat(OpcStr, ".s64 \t$dst, $a, $b;"), - [(set i64:$dst, (OpNode i64:$a, imm:$b))]>, - Requires<[hasPTX<43>]>; + defm i32 : I3Inst; + defm i64 : I3Inst]>; } } @@ -841,31 +834,31 @@ defm SUB_i1 : ADD_SUB_i1; // int16, int32, and int64 signed addition. Since nvptx is 2's complement, we // also use these for unsigned arithmetic. 
-defm ADD : I3<"add.s", add, /*commutative=*/ true>; -defm SUB : I3<"sub.s", sub, /*commutative=*/ false>; +defm ADD : I3<"add.s", add, commutative = true>; +defm SUB : I3<"sub.s", sub, commutative = false>; def ADD16x2 : I16x2<"add.s", add>; // in32 and int64 addition and subtraction with carry-out. -defm ADDCC : ADD_SUB_INT_CARRY<"add.cc", addc>; -defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc>; +defm ADDCC : ADD_SUB_INT_CARRY<"add.cc", addc, commutative = true>; +defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc, commutative = false>; // int32 and int64 addition and subtraction with carry-in and carry-out. -defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde>; -defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube>; +defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde, commutative = true>; +defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube, commutative = false>; -defm MULT : I3<"mul.lo.s", mul, /*commutative=*/ true>; +defm MULT : I3<"mul.lo.s", mul, commutative = true>; -defm MULTHS : I3<"mul.hi.s", mulhs, /*commutative=*/ true>; -defm MULTHU : I3<"mul.hi.u", mulhu, /*commutative=*/ true>; +defm MULTHS : I3<"mul.hi.s", mulhs, commutative = true>; +defm MULTHU : I3<"mul.hi.u", mulhu, commutative = true>; -defm SDIV : I3<"div.s", sdiv, /*commutative=*/ false>; -defm UDIV : I3<"div.u", udiv, /*commutative=*/ false>; +defm SDIV : I3<"div.s", sdiv, commutative = false>; +defm UDIV : I3<"div.u", udiv, commutative = false>; // The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM // will lower it. -defm SREM : I3<"rem.s", srem, /*commutative=*/ false>; -defm UREM : I3<"rem.u", urem, /*commutative=*/ false>; +defm SREM : I3<"rem.s", srem, commutative = false>; +defm UREM : I3<"rem.u", urem, commutative = false>; // Integer absolute value. NumBits should be one minus the bit width of RC. // This idiom implements the algorithm at @@ -880,10 +873,10 @@ defm ABS_32 : ABS; defm ABS_64 : ABS; // Integer min/max. 
-defm SMAX : I3<"max.s", smax, /*commutative=*/ true>; -defm UMAX : I3<"max.u", umax, /*commutative=*/ true>; -defm SMIN : I3<"min.s", smin, /*commutative=*/ true>; -defm UMIN : I3<"min.u", umin, /*commutative=*/ true>; +defm SMAX : I3<"max.s", smax, commutative = true>; +defm UMAX : I3<"max.u", umax, commutative = true>; +defm SMIN : I3<"min.s", smin, commutative = true>; +defm UMIN : I3<"min.u", umin, commutative = true>; def SMAX16x2 : I16x2<"max.s", smax>; def UMAX16x2 : I16x2<"max.u", umax>; @@ -1393,38 +1386,10 @@ def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), // Template for three-arg bitwise operations. Takes three args, Creates .b16, // .b32, .b64, and .pred (predicate registers -- i.e., i1) versions of OpcStr. multiclass BITWISE { - def b1rr : - NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b), - !strconcat(OpcStr, ".pred \t$dst, $a, $b;"), - [(set i1:$dst, (OpNode i1:$a, i1:$b))]>; - def b1ri : - NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b), - !strconcat(OpcStr, ".pred \t$dst, $a, $b;"), - [(set i1:$dst, (OpNode i1:$a, imm:$b))]>; - def b16rr : - NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), - !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), - [(set i16:$dst, (OpNode i16:$a, i16:$b))]>; - def b16ri : - NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b), - !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), - [(set i16:$dst, (OpNode i16:$a, imm:$b))]>; - def b32rr : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), - !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"), - [(set i32:$dst, (OpNode i32:$a, i32:$b))]>; - def b32ri : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), - !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"), - [(set i32:$dst, (OpNode i32:$a, imm:$b))]>; - def b64rr : - NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), - !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"), - [(set i64:$dst, (OpNode i64:$a, i64:$b))]>; - def b64ri : - 
NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), - !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"), - [(set i64:$dst, (OpNode i64:$a, imm:$b))]>; + defm b1 : I3Inst; + defm b16 : I3Inst; + defm b32 : I3Inst; + defm b64 : I3Inst; } defm OR : BITWISE<"or", or>; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 3eedb43e4c81a..7b139d7b79e7d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1678,6 +1678,21 @@ foreach t = [I32RT, I64RT] in { } } +// +// szext +// + +foreach sign = ["s", "u"] in { + foreach mode = ["wrap", "clamp"] in { + defvar ext = !if(!eq(sign, "s"), "sext", "zext"); + defvar intrin = !cast("int_nvvm_" # ext # "_" # mode); + defm SZEXT_ # sign # _ # mode + : I3Inst<"szext." # mode # "." # sign # "32", + intrin, I32RT, commutative = false, + requires = [hasSM<70>, hasPTX<76>]>; + } +} + // // Convert // diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll index 64786e601c4b5..f1ca19b30ac2a 100644 --- a/llvm/test/CodeGen/NVPTX/i128.ll +++ b/llvm/test/CodeGen/NVPTX/i128.ll @@ -13,37 +13,37 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: ld.param.v2.u64 {%rd45, %rd46}, [srem_i128_param_0]; ; CHECK-NEXT: ld.param.v2.u64 {%rd49, %rd50}, [srem_i128_param_1]; ; CHECK-NEXT: shr.s64 %rd2, %rd46, 63; -; CHECK-NEXT: mov.b64 %rd117, 0; -; CHECK-NEXT: sub.cc.s64 %rd52, %rd117, %rd45; -; CHECK-NEXT: subc.cc.s64 %rd53, %rd117, %rd46; +; CHECK-NEXT: sub.cc.s64 %rd51, 0, %rd45; +; CHECK-NEXT: subc.cc.s64 %rd52, 0, %rd46; ; CHECK-NEXT: setp.lt.s64 %p1, %rd46, 0; -; CHECK-NEXT: selp.b64 %rd4, %rd53, %rd46, %p1; -; CHECK-NEXT: selp.b64 %rd3, %rd52, %rd45, %p1; -; CHECK-NEXT: sub.cc.s64 %rd54, %rd117, %rd49; -; CHECK-NEXT: subc.cc.s64 %rd55, %rd117, %rd50; +; CHECK-NEXT: selp.b64 %rd4, %rd52, %rd46, %p1; +; CHECK-NEXT: selp.b64 %rd3, %rd51, %rd45, %p1; +; CHECK-NEXT: sub.cc.s64 %rd53, 0, %rd49; +; CHECK-NEXT: 
subc.cc.s64 %rd54, 0, %rd50; ; CHECK-NEXT: setp.lt.s64 %p2, %rd50, 0; -; CHECK-NEXT: selp.b64 %rd6, %rd55, %rd50, %p2; -; CHECK-NEXT: selp.b64 %rd5, %rd54, %rd49, %p2; -; CHECK-NEXT: or.b64 %rd56, %rd5, %rd6; -; CHECK-NEXT: setp.eq.s64 %p3, %rd56, 0; -; CHECK-NEXT: or.b64 %rd57, %rd3, %rd4; -; CHECK-NEXT: setp.eq.s64 %p4, %rd57, 0; +; CHECK-NEXT: selp.b64 %rd6, %rd54, %rd50, %p2; +; CHECK-NEXT: selp.b64 %rd5, %rd53, %rd49, %p2; +; CHECK-NEXT: or.b64 %rd55, %rd5, %rd6; +; CHECK-NEXT: setp.eq.s64 %p3, %rd55, 0; +; CHECK-NEXT: or.b64 %rd56, %rd3, %rd4; +; CHECK-NEXT: setp.eq.s64 %p4, %rd56, 0; ; CHECK-NEXT: or.pred %p5, %p3, %p4; ; CHECK-NEXT: setp.ne.s64 %p6, %rd6, 0; ; CHECK-NEXT: clz.b64 %r1, %rd6; -; CHECK-NEXT: cvt.u64.u32 %rd58, %r1; +; CHECK-NEXT: cvt.u64.u32 %rd57, %r1; ; CHECK-NEXT: clz.b64 %r2, %rd5; -; CHECK-NEXT: cvt.u64.u32 %rd59, %r2; -; CHECK-NEXT: add.s64 %rd60, %rd59, 64; -; CHECK-NEXT: selp.b64 %rd61, %rd58, %rd60, %p6; +; CHECK-NEXT: cvt.u64.u32 %rd58, %r2; +; CHECK-NEXT: add.s64 %rd59, %rd58, 64; +; CHECK-NEXT: selp.b64 %rd60, %rd57, %rd59, %p6; ; CHECK-NEXT: setp.ne.s64 %p7, %rd4, 0; ; CHECK-NEXT: clz.b64 %r3, %rd4; -; CHECK-NEXT: cvt.u64.u32 %rd62, %r3; +; CHECK-NEXT: cvt.u64.u32 %rd61, %r3; ; CHECK-NEXT: clz.b64 %r4, %rd3; -; CHECK-NEXT: cvt.u64.u32 %rd63, %r4; -; CHECK-NEXT: add.s64 %rd64, %rd63, 64; -; CHECK-NEXT: selp.b64 %rd65, %rd62, %rd64, %p7; -; CHECK-NEXT: sub.cc.s64 %rd66, %rd61, %rd65; +; CHECK-NEXT: cvt.u64.u32 %rd62, %r4; +; CHECK-NEXT: add.s64 %rd63, %rd62, 64; +; CHECK-NEXT: selp.b64 %rd64, %rd61, %rd63, %p7; +; CHECK-NEXT: mov.b64 %rd117, 0; +; CHECK-NEXT: sub.cc.s64 %rd66, %rd60, %rd64; ; CHECK-NEXT: subc.cc.s64 %rd67, %rd117, 0; ; CHECK-NEXT: setp.gt.u64 %p8, %rd66, 127; ; CHECK-NEXT: setp.eq.s64 %p9, %rd67, 0; @@ -314,39 +314,39 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases ; CHECK-NEXT: ld.param.v2.u64 {%rd45, %rd46}, [sdiv_i128_param_0]; ; CHECK-NEXT: ld.param.v2.u64 
{%rd49, %rd50}, [sdiv_i128_param_1]; -; CHECK-NEXT: mov.b64 %rd112, 0; -; CHECK-NEXT: sub.cc.s64 %rd52, %rd112, %rd45; -; CHECK-NEXT: subc.cc.s64 %rd53, %rd112, %rd46; +; CHECK-NEXT: sub.cc.s64 %rd51, 0, %rd45; +; CHECK-NEXT: subc.cc.s64 %rd52, 0, %rd46; ; CHECK-NEXT: setp.lt.s64 %p1, %rd46, 0; -; CHECK-NEXT: selp.b64 %rd2, %rd53, %rd46, %p1; -; CHECK-NEXT: selp.b64 %rd1, %rd52, %rd45, %p1; -; CHECK-NEXT: sub.cc.s64 %rd54, %rd112, %rd49; -; CHECK-NEXT: subc.cc.s64 %rd55, %rd112, %rd50; +; CHECK-NEXT: selp.b64 %rd2, %rd52, %rd46, %p1; +; CHECK-NEXT: selp.b64 %rd1, %rd51, %rd45, %p1; +; CHECK-NEXT: sub.cc.s64 %rd53, 0, %rd49; +; CHECK-NEXT: subc.cc.s64 %rd54, 0, %rd50; ; CHECK-NEXT: setp.lt.s64 %p2, %rd50, 0; -; CHECK-NEXT: selp.b64 %rd4, %rd55, %rd50, %p2; -; CHECK-NEXT: selp.b64 %rd3, %rd54, %rd49, %p2; -; CHECK-NEXT: xor.b64 %rd56, %rd50, %rd46; -; CHECK-NEXT: shr.s64 %rd5, %rd56, 63; -; CHECK-NEXT: or.b64 %rd57, %rd3, %rd4; -; CHECK-NEXT: setp.eq.s64 %p3, %rd57, 0; -; CHECK-NEXT: or.b64 %rd58, %rd1, %rd2; -; CHECK-NEXT: setp.eq.s64 %p4, %rd58, 0; +; CHECK-NEXT: selp.b64 %rd4, %rd54, %rd50, %p2; +; CHECK-NEXT: selp.b64 %rd3, %rd53, %rd49, %p2; +; CHECK-NEXT: xor.b64 %rd55, %rd50, %rd46; +; CHECK-NEXT: shr.s64 %rd5, %rd55, 63; +; CHECK-NEXT: or.b64 %rd56, %rd3, %rd4; +; CHECK-NEXT: setp.eq.s64 %p3, %rd56, 0; +; CHECK-NEXT: or.b64 %rd57, %rd1, %rd2; +; CHECK-NEXT: setp.eq.s64 %p4, %rd57, 0; ; CHECK-NEXT: or.pred %p5, %p3, %p4; ; CHECK-NEXT: setp.ne.s64 %p6, %rd4, 0; ; CHECK-NEXT: clz.b64 %r1, %rd4; -; CHECK-NEXT: cvt.u64.u32 %rd59, %r1; +; CHECK-NEXT: cvt.u64.u32 %rd58, %r1; ; CHECK-NEXT: clz.b64 %r2, %rd3; -; CHECK-NEXT: cvt.u64.u32 %rd60, %r2; -; CHECK-NEXT: add.s64 %rd61, %rd60, 64; -; CHECK-NEXT: selp.b64 %rd62, %rd59, %rd61, %p6; +; CHECK-NEXT: cvt.u64.u32 %rd59, %r2; +; CHECK-NEXT: add.s64 %rd60, %rd59, 64; +; CHECK-NEXT: selp.b64 %rd61, %rd58, %rd60, %p6; ; CHECK-NEXT: setp.ne.s64 %p7, %rd2, 0; ; CHECK-NEXT: clz.b64 %r3, %rd2; -; CHECK-NEXT: cvt.u64.u32 
%rd63, %r3; +; CHECK-NEXT: cvt.u64.u32 %rd62, %r3; ; CHECK-NEXT: clz.b64 %r4, %rd1; -; CHECK-NEXT: cvt.u64.u32 %rd64, %r4; -; CHECK-NEXT: add.s64 %rd65, %rd64, 64; -; CHECK-NEXT: selp.b64 %rd66, %rd63, %rd65, %p7; -; CHECK-NEXT: sub.cc.s64 %rd67, %rd62, %rd66; +; CHECK-NEXT: cvt.u64.u32 %rd63, %r4; +; CHECK-NEXT: add.s64 %rd64, %rd63, 64; +; CHECK-NEXT: selp.b64 %rd65, %rd62, %rd64, %p7; +; CHECK-NEXT: mov.b64 %rd112, 0; +; CHECK-NEXT: sub.cc.s64 %rd67, %rd61, %rd65; ; CHECK-NEXT: subc.cc.s64 %rd68, %rd112, 0; ; CHECK-NEXT: setp.gt.u64 %p8, %rd67, 127; ; CHECK-NEXT: setp.eq.s64 %p9, %rd68, 0; diff --git a/llvm/test/CodeGen/NVPTX/szext.ll b/llvm/test/CodeGen/NVPTX/szext.ll new file mode 100644 index 0000000000000..f159156c6b80f --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/szext.ll @@ -0,0 +1,107 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -o - < %s -mcpu=sm_70 -mattr=+ptx76 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -mcpu=sm_70 -mattr=+ptx76 | %ptxas-verify -arch=sm_70 %} + +target triple = "nvptx64-unknown-cuda" + +define i32 @szext_wrap_u32(i32 %a, i32 %b) { +; CHECK-LABEL: szext_wrap_u32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [szext_wrap_u32_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [szext_wrap_u32_param_1]; +; CHECK-NEXT: szext.wrap.u32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %c = call i32 @llvm.nvvm.zext.wrap(i32 %a, i32 %b) + ret i32 %c +} + +define i32 @szext_clamp_u32(i32 %a, i32 %b) { +; CHECK-LABEL: szext_clamp_u32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [szext_clamp_u32_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [szext_clamp_u32_param_1]; +; CHECK-NEXT: szext.clamp.u32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %c = call i32 
@llvm.nvvm.zext.clamp(i32 %a, i32 %b) + ret i32 %c +} + +define i32 @szext_wrap_s32(i32 %a, i32 %b) { +; CHECK-LABEL: szext_wrap_s32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [szext_wrap_s32_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [szext_wrap_s32_param_1]; +; CHECK-NEXT: szext.wrap.s32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %c = call i32 @llvm.nvvm.sext.wrap(i32 %a, i32 %b) + ret i32 %c +} + +define i32 @szext_clamp_s32(i32 %a, i32 %b) { +; CHECK-LABEL: szext_clamp_s32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [szext_clamp_s32_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [szext_clamp_s32_param_1]; +; CHECK-NEXT: szext.clamp.s32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %c = call i32 @llvm.nvvm.sext.clamp(i32 %a, i32 %b) + ret i32 %c +} + +define i32 @szext_clamp_s32_ii() { +; CHECK-LABEL: szext_clamp_s32_ii( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.b32 %r1, 3; +; CHECK-NEXT: szext.clamp.s32 %r2, %r1, 4; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %c = call i32 @llvm.nvvm.sext.clamp(i32 3, i32 4) + ret i32 %c +} + +define i32 @szext_wrap_s32_ir(i32 %a) { +; CHECK-LABEL: szext_wrap_s32_ir( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [szext_wrap_s32_ir_param_0]; +; CHECK-NEXT: szext.wrap.s32 %r2, 5, %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %c = call i32 @llvm.nvvm.sext.wrap(i32 5, i32 %a) + ret i32 %c +} + +define i32 @szext_clamp_u32_ri(i32 %a) { +; CHECK-LABEL: szext_clamp_u32_ri( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [szext_clamp_u32_ri_param_0]; +; 
CHECK-NEXT: szext.clamp.u32 %r2, %r1, 7; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %c = call i32 @llvm.nvvm.zext.clamp(i32 %a, i32 7) + ret i32 %c +}