diff --git a/llvm/lib/Target/NVPTX/CMakeLists.txt b/llvm/lib/Target/NVPTX/CMakeLists.txt
index 1cffde138eab7..693f0d0b35edc 100644
--- a/llvm/lib/Target/NVPTX/CMakeLists.txt
+++ b/llvm/lib/Target/NVPTX/CMakeLists.txt
@@ -13,34 +13,35 @@ add_public_tablegen_target(NVPTXCommonTableGen)
 set(NVPTXCodeGen_sources
   NVPTXAliasAnalysis.cpp
   NVPTXAllocaHoisting.cpp
-  NVPTXAtomicLower.cpp
   NVPTXAsmPrinter.cpp
   NVPTXAssignValidGlobalNames.cpp
+  NVPTXAtomicLower.cpp
+  NVPTXCtorDtorLowering.cpp
   NVPTXForwardParams.cpp
   NVPTXFrameLowering.cpp
   NVPTXGenericToNVVM.cpp
-  NVPTXISelDAGToDAG.cpp
-  NVPTXISelLowering.cpp
   NVPTXImageOptimizer.cpp
   NVPTXInstrInfo.cpp
+  NVPTXISelDAGToDAG.cpp
+  NVPTXISelLowering.cpp
   NVPTXLowerAggrCopies.cpp
-  NVPTXLowerArgs.cpp
   NVPTXLowerAlloca.cpp
+  NVPTXLowerArgs.cpp
   NVPTXLowerUnreachable.cpp
-  NVPTXPeephole.cpp
   NVPTXMCExpr.cpp
+  NVPTXPeephole.cpp
   NVPTXPrologEpilogPass.cpp
+  NVPTXProxyRegErasure.cpp
   NVPTXRegisterInfo.cpp
   NVPTXReplaceImageHandles.cpp
   NVPTXSelectionDAGInfo.cpp
   NVPTXSubtarget.cpp
+  NVPTXTagInvariantLoads.cpp
   NVPTXTargetMachine.cpp
   NVPTXTargetTransformInfo.cpp
   NVPTXUtilities.cpp
   NVVMIntrRange.cpp
   NVVMReflect.cpp
-  NVPTXProxyRegErasure.cpp
-  NVPTXCtorDtorLowering.cpp
   )
 
 add_llvm_target(NVPTXCodeGen
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index cf21ad991ccdf..1da979d023b42 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -51,6 +51,7 @@ FunctionPass *createNVPTXLowerArgsPass();
 FunctionPass *createNVPTXLowerAllocaPass();
 FunctionPass *createNVPTXLowerUnreachablePass(bool TrapUnreachable,
                                               bool NoTrapAfterNoreturn);
+FunctionPass *createNVPTXTagInvariantLoadsPass();
 MachineFunctionPass *createNVPTXPeephole();
 MachineFunctionPass *createNVPTXProxyRegErasurePass();
 MachineFunctionPass *createNVPTXForwardParamsPass();
@@ -73,6 +74,7 @@ void initializeNVVMReflectPass(PassRegistry &);
 void initializeNVPTXAAWrapperPassPass(PassRegistry &);
 void initializeNVPTXExternalAAWrapperPass(PassRegistry &);
 void initializeNVPTXPeepholePass(PassRegistry &);
+void initializeNVPTXTagInvariantLoadLegacyPassPass(PassRegistry &);
 
 struct NVVMIntrRangePass : PassInfoMixin<NVVMIntrRangePass> {
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
@@ -104,6 +106,10 @@ struct NVPTXLowerArgsPass : PassInfoMixin<NVPTXLowerArgsPass> {
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 };
 
+struct NVPTXTagInvariantLoadsPass : PassInfoMixin<NVPTXTagInvariantLoadsPass> {
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
 namespace NVPTX {
 enum DrvInterface {
   NVCL,
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 295ed666a1902..7d83f8833075c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -766,46 +766,12 @@ NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N,
   llvm_unreachable("unhandled ordering");
 }
 
-static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
-                          unsigned CodeAddrSpace, MachineFunction *F) {
+static bool canLowerToLDG(const MemSDNode &N, const NVPTXSubtarget &Subtarget,
+                          unsigned CodeAddrSpace) {
   // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
   // space.
-  //
-  // We have two ways of identifying invariant loads: Loads may be explicitly
-  // marked as invariant, or we may infer them to be invariant.
-  //
-  // We currently infer invariance for loads from
-  //  - constant global variables, and
-  //  - kernel function pointer params that are noalias (i.e. __restrict) and
-  //    never written to.
-  //
-  // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
-  // not during the SelectionDAG phase).
-  //
-  // TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for
-  // explicitly invariant loads because these are how clang tells us to use ldg
-  // when the user uses a builtin.
-  if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::AddressSpace::Global)
-    return false;
-
-  if (N->isInvariant())
-    return true;
-
-  bool IsKernelFn = isKernelFunction(F->getFunction());
-
-  // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly
-  // because the former looks through phi nodes while the latter does not. We
-  // need to look through phi nodes to handle pointer induction variables.
-  SmallVector<const Value *, 4> Objs;
-  getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);
-
-  return all_of(Objs, [&](const Value *V) {
-    if (auto *A = dyn_cast<const Argument>(V))
-      return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
-    if (auto *GV = dyn_cast<const GlobalVariable>(V))
-      return GV->isConstant();
-    return false;
-  });
+  return Subtarget.hasLDG() && CodeAddrSpace == NVPTX::AddressSpace::Global &&
+         N.isInvariant();
 }
 
 static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
@@ -1106,10 +1072,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
     return false;
 
   // Address Space Setting
-  unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
-  if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
+  const unsigned CodeAddrSpace = getCodeAddrSpace(LD);
+  if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
     return tryLDGLDU(N);
-  }
 
   SDLoc DL(N);
   SDValue Chain = N->getOperand(0);
@@ -1192,10 +1157,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
   const MVT MemVT = MemEVT.getSimpleVT();
 
   // Address Space Setting
-  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
-  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
+  const unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
+  if (canLowerToLDG(*MemSD, *Subtarget, CodeAddrSpace))
     return tryLDGLDU(N);
-  }
 
   EVT EltVT = N->getValueType(0);
   SDLoc DL(N);
diff --git a/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def b/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
index 1c813c2c51f70..ee37c9826012c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
+++ b/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
@@ -38,5 +38,6 @@ FUNCTION_ALIAS_ANALYSIS("nvptx-aa", NVPTXAA())
 #endif
 FUNCTION_PASS("nvvm-intr-range", NVVMIntrRangePass())
 FUNCTION_PASS("nvptx-copy-byval-args", NVPTXCopyByValArgsPass())
-FUNCTION_PASS("nvptx-lower-args", NVPTXLowerArgsPass(*this));
+FUNCTION_PASS("nvptx-lower-args", NVPTXLowerArgsPass(*this))
+FUNCTION_PASS("nvptx-tag-invariant-loads", NVPTXTagInvariantLoadsPass())
 #undef FUNCTION_PASS
diff --git a/llvm/lib/Target/NVPTX/NVPTXTagInvariantLoads.cpp b/llvm/lib/Target/NVPTX/NVPTXTagInvariantLoads.cpp
new file mode 100644
index 0000000000000..a4aff44ac04f6
--- /dev/null
+++ b/llvm/lib/Target/NVPTX/NVPTXTagInvariantLoads.cpp
@@ -0,0 +1,104 @@
+//===------ NVPTXTagInvariantLoads.cpp - Tag invariant loads --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements invariant load tagging. It traverses load instructions
+// in a function, and determines if each load can be tagged as invariant.
+//
+// We currently infer invariance for loads from
+//  - constant global variables, and
+//  - kernel function pointer params that are noalias (i.e. __restrict) and
+//    never written to.
+//
+// TODO: Perform a more powerful invariance analysis (ideally IPO).
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXUtilities.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/NVPTXAddrSpace.h"
+
+using namespace llvm;
+
+static bool isInvariantLoad(const LoadInst *LI, const bool IsKernelFn) {
+  // Don't bother with non-global loads
+  if (LI->getPointerAddressSpace() != NVPTXAS::ADDRESS_SPACE_GLOBAL)
+    return false;
+
+  // If the load is already marked as invariant, we don't need to do anything
+  if (LI->getMetadata(LLVMContext::MD_invariant_load))
+    return false;
+
+  // We use getUnderlyingObjects() here instead of getUnderlyingObject()
+  // mainly because the former looks through phi nodes while the latter does
+  // not. We need to look through phi nodes to handle pointer induction
+  // variables.
+  SmallVector<const Value *, 4> Objs;
+  getUnderlyingObjects(LI->getPointerOperand(), Objs);
+
+  return all_of(Objs, [&](const Value *V) {
+    if (const auto *A = dyn_cast<const Argument>(V))
+      return IsKernelFn && ((A->onlyReadsMemory() && A->hasNoAliasAttr()) ||
+                            isParamGridConstant(*A));
+    if (const auto *GV = dyn_cast<const GlobalVariable>(V))
+      return GV->isConstant();
+    return false;
+  });
+}
+
+static void markLoadsAsInvariant(LoadInst *LI) {
+  LI->setMetadata(LLVMContext::MD_invariant_load,
+                  MDNode::get(LI->getContext(), {}));
+}
+
+static bool tagInvariantLoads(Function &F) {
+  const bool IsKernelFn = isKernelFunction(F);
+
+  bool Changed = false;
+  for (auto &I : instructions(F)) {
+    if (auto *LI = dyn_cast<LoadInst>(&I)) {
+      if (isInvariantLoad(LI, IsKernelFn)) {
+        markLoadsAsInvariant(LI);
+        Changed = true;
+      }
+    }
+  }
+  return Changed;
+}
+
+namespace {
+
+struct NVPTXTagInvariantLoadLegacyPass : public FunctionPass {
+  static char ID;
+
+  NVPTXTagInvariantLoadLegacyPass() : FunctionPass(ID) {}
+  bool runOnFunction(Function &F) override;
+};
+
+} // namespace
+
+INITIALIZE_PASS(NVPTXTagInvariantLoadLegacyPass, "nvptx-tag-invariant-loads",
+                "NVPTX Tag Invariant Loads", false, false)
+
+bool NVPTXTagInvariantLoadLegacyPass::runOnFunction(Function &F) {
+  return tagInvariantLoads(F);
+}
+
+char NVPTXTagInvariantLoadLegacyPass::ID = 0;
+
+FunctionPass *llvm::createNVPTXTagInvariantLoadsPass() {
+  return new NVPTXTagInvariantLoadLegacyPass();
+}
+
+PreservedAnalyses NVPTXTagInvariantLoadsPass::run(Function &F,
+                                                  FunctionAnalysisManager &) {
+  return tagInvariantLoads(F) ? PreservedAnalyses::none()
+                              : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index f78d4585bbe98..dc3afc1f4a17d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -112,6 +112,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
   initializeNVPTXAAWrapperPassPass(PR);
   initializeNVPTXExternalAAWrapperPass(PR);
   initializeNVPTXPeepholePass(PR);
+  initializeNVPTXTagInvariantLoadLegacyPassPass(PR);
 }
 
 static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
@@ -395,6 +396,7 @@ void NVPTXPassConfig::addIRPasses() {
     if (!DisableLoadStoreVectorizer)
       addPass(createLoadStoreVectorizerPass());
     addPass(createSROAPass());
+    addPass(createNVPTXTagInvariantLoadsPass());
   }
 
   if (ST.hasPTXASUnreachableBug()) {
diff --git a/llvm/test/CodeGen/NVPTX/byval-const-global.ll b/llvm/test/CodeGen/NVPTX/byval-const-global.ll
new file mode 100644
index 0000000000000..cce317a52299c
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/byval-const-global.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mcpu=sm_70 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mcpu=sm_70 | %ptxas-verify %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+%struct = type { [2 x i64] }
+@G = external constant %struct
+
+define void @foo() {
+; CHECK-LABEL: foo(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.global.u64 %rd1, [G];
+; CHECK-NEXT:    ld.global.u64 %rd2, [G+8];
+; CHECK-NEXT:    { // callseq 0, 0
+; CHECK-NEXT:    .param .align 8 .b8 param0[16];
+; CHECK-NEXT:    st.param.b64 [param0], %rd1;
+; CHECK-NEXT:    st.param.b64 [param0+8], %rd2;
+; CHECK-NEXT:    call.uni
+; CHECK-NEXT:    bar,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    } // callseq 0
+; CHECK-NEXT:    ret;
+  call void @bar(ptr byval(%struct) @G)
+  ret void
+}
+
+declare void @bar(ptr)
diff --git a/llvm/test/CodeGen/NVPTX/tag-invariant-loads.ll b/llvm/test/CodeGen/NVPTX/tag-invariant-loads.ll
new file mode 100644
index 0000000000000..26967faa01a1b
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/tag-invariant-loads.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=nvptx-tag-invariant-loads < %s -mcpu=sm_80 | FileCheck %s --check-prefix=OPT
+; RUN: llc -o - < %s -mcpu=sm_80 | FileCheck %s --check-prefix=PTX
+
+target triple = "nvptx-unknown-cuda"
+
+define ptx_kernel void @basic(ptr noalias readonly %a, ptr %out) {
+; OPT-LABEL: define ptx_kernel void @basic(
+; OPT-SAME: ptr noalias readonly [[A:%.*]], ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-NEXT:    [[A_GLOBAL:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1)
+; OPT-NEXT:    [[VAL:%.*]] = load float, ptr addrspace(1) [[A_GLOBAL]], align 4, !invariant.load [[META0:![0-9]+]]
+; OPT-NEXT:    store float [[VAL]], ptr [[OUT]], align 4
+; OPT-NEXT:    ret void
+;
+; PTX-LABEL: basic(
+; PTX:       {
+; PTX-NEXT:    .reg .b32 %r<5>;
+; PTX-NEXT:    .reg .b32 %f<2>;
+; PTX-EMPTY:
+; PTX-NEXT:  // %bb.0:
+; PTX-NEXT:    ld.param.u32 %r1, [basic_param_0];
+; PTX-NEXT:    cvta.to.global.u32 %r2, %r1;
+; PTX-NEXT:    ld.param.u32 %r3, [basic_param_1];
+; PTX-NEXT:    cvta.to.global.u32 %r4, %r3;
+; PTX-NEXT:    ld.global.nc.f32 %f1, [%r2];
+; PTX-NEXT:    st.global.f32 [%r4], %f1;
+; PTX-NEXT:    ret;
+  %a_global = addrspacecast ptr %a to ptr addrspace(1)
+  %val = load float, ptr addrspace(1) %a_global
+  store float %val, ptr %out
+  ret void
+}
+
+define ptx_kernel void @select(ptr noalias readonly %a, ptr noalias readonly %b, i1 %c, ptr %out) {
+; OPT-LABEL: define ptx_kernel void @select(
+; OPT-SAME: ptr noalias readonly [[A:%.*]], ptr noalias readonly [[B:%.*]], i1 [[C:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT:    [[SELECT:%.*]] = select i1 [[C]], ptr [[A]], ptr [[B]]
+; OPT-NEXT:    [[SELECT_GLOBAL:%.*]] = addrspacecast ptr [[SELECT]] to ptr addrspace(1)
+; OPT-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[SELECT_GLOBAL]], align 4, !invariant.load [[META0]]
+; OPT-NEXT:    store i32 [[VAL]], ptr [[OUT]], align 4
+; OPT-NEXT:    ret void
+;
+; PTX-LABEL: select(
+; PTX:       {
+; PTX-NEXT:    .reg .pred %p<2>;
+; PTX-NEXT:    .reg .b16 %rs<3>;
+; PTX-NEXT:    .reg .b32 %r<9>;
+; PTX-EMPTY:
+; PTX-NEXT:  // %bb.0:
+; PTX-NEXT:    ld.param.u8 %rs1, [select_param_2];
+; PTX-NEXT:    and.b16 %rs2, %rs1, 1;
+; PTX-NEXT:    setp.ne.b16 %p1, %rs2, 0;
+; PTX-NEXT:    ld.param.u32 %r1, [select_param_0];
+; PTX-NEXT:    cvta.to.global.u32 %r2, %r1;
+; PTX-NEXT:    ld.param.u32 %r3, [select_param_1];
+; PTX-NEXT:    cvta.to.global.u32 %r4, %r3;
+; PTX-NEXT:    ld.param.u32 %r5, [select_param_3];
+; PTX-NEXT:    cvta.to.global.u32 %r6, %r5;
+; PTX-NEXT:    selp.b32 %r7, %r2, %r4, %p1;
+; PTX-NEXT:    ld.global.nc.u32 %r8, [%r7];
+; PTX-NEXT:    st.global.u32 [%r6], %r8;
+; PTX-NEXT:    ret;
+  %select = select i1 %c, ptr %a, ptr %b
+  %select_global = addrspacecast ptr %select to ptr addrspace(1)
+  %val = load i32, ptr addrspace(1) %select_global
+  store i32 %val, ptr %out
+  ret void
+}
+
+define void @not_kernel(ptr noalias readonly %a, ptr %out) {
+; OPT-LABEL: define void @not_kernel(
+; OPT-SAME: ptr noalias readonly [[A:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT:    [[A_GLOBAL:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1)
+; OPT-NEXT:    [[VAL:%.*]] = load float, ptr addrspace(1) [[A_GLOBAL]], align 4
+; OPT-NEXT:    store float [[VAL]], ptr [[OUT]], align 4
+; OPT-NEXT:    ret void
+;
+; PTX-LABEL: not_kernel(
+; PTX:       {
+; PTX-NEXT:    .reg .b32 %r<4>;
+; PTX-NEXT:    .reg .b32 %f<2>;
+; PTX-EMPTY:
+; PTX-NEXT:  // %bb.0:
+; PTX-NEXT:    ld.param.u32 %r1, [not_kernel_param_0];
+; PTX-NEXT:    cvta.to.global.u32 %r2, %r1;
+; PTX-NEXT:    ld.param.u32 %r3, [not_kernel_param_1];
+; PTX-NEXT:    ld.global.f32 %f1, [%r2];
+; PTX-NEXT:    st.f32 [%r3], %f1;
+; PTX-NEXT:    ret;
+  %a_global = addrspacecast ptr %a to ptr addrspace(1)
+  %val = load float, ptr addrspace(1) %a_global
+  store float %val, ptr %out
+  ret void
+}
+
+%struct.S2 = type { i64, i64 }
+@G = private unnamed_addr constant %struct.S2 { i64 1, i64 1 }, align 8
+
+define ptx_kernel void @global_load(ptr noalias readonly %a, i1 %c, ptr %out) {
+; OPT-LABEL: define ptx_kernel void @global_load(
+; OPT-SAME: ptr noalias readonly [[A:%.*]], i1 [[C:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT:    [[G_GLOBAL:%.*]] = addrspacecast ptr @G to ptr addrspace(1)
+; OPT-NEXT:    [[A_GLOBAL:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1)
+; OPT-NEXT:    [[SELECT:%.*]] = select i1 [[C]], ptr addrspace(1) [[G_GLOBAL]], ptr addrspace(1) [[A_GLOBAL]]
+; OPT-NEXT:    [[VAL:%.*]] = load i64, ptr addrspace(1) [[SELECT]], align 8, !invariant.load [[META0]]
+; OPT-NEXT:    store i64 [[VAL]], ptr [[OUT]], align 8
+; OPT-NEXT:    ret void
+;
+; PTX-LABEL: global_load(
+; PTX:       {
+; PTX-NEXT:    .reg .pred %p<2>;
+; PTX-NEXT:    .reg .b16 %rs<3>;
+; PTX-NEXT:    .reg .b32 %r<7>;
+; PTX-NEXT:    .reg .b64 %rd<2>;
+; PTX-EMPTY:
+; PTX-NEXT:  // %bb.0:
+; PTX-NEXT:    ld.param.u8 %rs1, [global_load_param_1];
+; PTX-NEXT:    and.b16 %rs2, %rs1, 1;
+; PTX-NEXT:    setp.ne.b16 %p1, %rs2, 0;
+; PTX-NEXT:    ld.param.u32 %r1, [global_load_param_0];
+; PTX-NEXT:    cvta.to.global.u32 %r2, %r1;
+; PTX-NEXT:    ld.param.u32 %r3, [global_load_param_2];
+; PTX-NEXT:    cvta.to.global.u32 %r4, %r3;
+; PTX-NEXT:    mov.b32 %r5, G;
+; PTX-NEXT:    selp.b32 %r6, %r5, %r2, %p1;
+; PTX-NEXT:    ld.global.nc.u64 %rd1, [%r6];
+; PTX-NEXT:    st.global.u64 [%r4], %rd1;
+; PTX-NEXT:    ret;
+  %g_global = addrspacecast ptr @G to ptr addrspace(1)
+  %a_global = addrspacecast ptr %a to ptr addrspace(1)
+  %select = select i1 %c, ptr addrspace(1) %g_global, ptr addrspace(1) %a_global
+  %val = load i64, ptr addrspace(1) %select
+  store i64 %val, ptr %out
+  ret void
+}
+
+;.
+; OPT: [[META0]] = !{}
+;.
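
The comment in isInvariantLoad() justifies getUnderlyingObjects() over getUnderlyingObject() by pointer induction variables, a case the tests above do not exercise. The sketch below is illustrative only and not part of the patch (the function @phi_induction and all value names are made up): both incoming values of the address phi %p resolve, through the addrspacecast and the phi cycle, to the noalias readonly kernel parameter %a, so under the pass's rules the load should receive !invariant.load when run via opt -passes=nvptx-tag-invariant-loads.

; Sketch: a pointer induction variable walking a read-only kernel param.
define ptx_kernel void @phi_induction(ptr noalias readonly %a, ptr %out) {
entry:
  %a_global = addrspacecast ptr %a to ptr addrspace(1)
  br label %loop

loop:
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
  ; Both incoming values share the underlying object %a, so
  ; getUnderlyingObjects() (unlike getUnderlyingObject()) sees through it.
  %p = phi ptr addrspace(1) [ %a_global, %entry ], [ %p.next, %loop ]
  %v = load float, ptr addrspace(1) %p   ; expected to be tagged !invariant.load
  store float %v, ptr %out
  %p.next = getelementptr float, ptr addrspace(1) %p, i32 1
  %i.next = add i32 %i, 1
  %done = icmp eq i32 %i.next, 16
  br i1 %done, label %exit, label %loop

exit:
  ret void
}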