Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ option(BUILD_SHARED_LIBS "Build as shared library"
option(BUILD_TESTS "Build unit test programs" OFF)
option(COLLTRACE "Collective Trace Option" ON)
option(ENABLE_MSCCL_KERNEL "Enable MSCCL while compiling" ON)
option(ENABLE_IFC "Enable indirect function call" OFF)
option(ENABLE_IFC "Enable indirect function call" ON)
option(INSTALL_DEPENDENCIES "Force install dependencies" OFF)
option(PROFILE "Enable profiling" OFF)
option(TIMETRACE "Enable time-trace during compilation" OFF)
Expand Down
2 changes: 1 addition & 1 deletion src/collectives/device/all_gather.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

namespace {
template<typename T, typename RedOp, typename Proto>
#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void runRing(ncclWorkElem *args) {
#else
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
Expand Down
6 changes: 3 additions & 3 deletions src/collectives/device/all_reduce.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

namespace {
template<typename T, typename RedOp, typename Proto>
#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void runRing(ncclWorkElem *args) {
#else
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
Expand Down Expand Up @@ -223,7 +223,7 @@ namespace {
}

template<typename T, typename RedOp, typename Proto>
#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void runTreeUpDown(ncclWorkElem *args) {
#else
__device__ __attribute__((noinline)) void runTreeUpDown(ncclWorkElem *args) {
Expand Down Expand Up @@ -379,7 +379,7 @@ namespace {
}

template<typename T, typename RedOp, typename Proto>
#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void runTreeSplit(ncclWorkElem *args) {
#else
__device__ __attribute__((noinline)) void runTreeSplit(ncclWorkElem *args) {
Expand Down
2 changes: 1 addition & 1 deletion src/collectives/device/alltoall_pivot.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

namespace {
template<typename T, typename RedOp, typename Proto>
#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void runRing(ncclWorkElem *args) {
#else
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
Expand Down
2 changes: 1 addition & 1 deletion src/collectives/device/broadcast.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

namespace {
template<typename T, typename RedOp, typename Proto>
#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void runRing(ncclWorkElem *args) {
#else
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
Expand Down
8 changes: 4 additions & 4 deletions src/collectives/device/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ static const __device__ constexpr ncclKernelFunc_t ncclFuncs[]{
static_assert(FUNC_INDEX_P2P == 5410, "Wrong P2P function index");
static_assert(FUNC_INDEX_ALLTOALL_PIVOT == 5411, "Wrong AllToAllPivot function index");

#ifndef USE_INDIRECT_FUNCTION_CALL
#if !defined(USE_INDIRECT_FUNCTION_CALL) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
template<unsigned short f, unsigned short l, bool u>
struct Caller {
static __forceinline__ __device__ __host__
Expand Down Expand Up @@ -274,7 +274,7 @@ void NCCL_CALL_FUNCTIONS(unsigned short funcIndex) noexcept {
template <ncclFunc_t FUNCTION, int ALGO, int PROTO, class REDOP, typename T, int UNROLL>
class ncclFunction {
public:
#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
#else
__device__ void run(struct ncclWorkElem* args) {}
Expand Down Expand Up @@ -568,7 +568,7 @@ __forceinline__ __device__ void ncclKernel(
if (ncclShmem.work.header.funcIndex == FnIndex) {
RunWork<Fn, T, RedOp, Algo, Proto>().run(&ncclShmem.work);
} else {
#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
ncclFuncs[ncclShmem.work.header.funcIndex]();
#else
#ifdef ENABLE_LL128
Expand Down Expand Up @@ -627,7 +627,7 @@ __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDev
// Examples : AllReduce, RING, LL, Sum, uint8
/* Functions for aggregation case */

#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
#define IMPL_COLL_FUNC(func, algo, proto, devredop, type) \
__device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \
RunWork<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto>().run(&ncclShmem.work); \
Expand Down
4 changes: 2 additions & 2 deletions src/collectives/device/onerank_reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

namespace {
template<typename T, typename RedOp>
#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void oneRankReduce() {
#else
__device__ __attribute__((noinline)) void oneRankReduce() {
Expand Down Expand Up @@ -48,7 +48,7 @@ namespace {
}
}

#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
#define INSTANTIATE(devredop, type) \
__device__ void NCCL_ONERANK_REDUCE_NAME(devredop, type)() { \
oneRankReduce<type, Func##devredop<type>>(); \
Expand Down
2 changes: 1 addition & 1 deletion src/collectives/device/op128.h
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ union alignas(16) BytePack<16> {
uint32_t u32[4];
uint64_t u64[2];
ulong2 ul2, native;
#ifndef USE_INDIRECT_FUNCTION_CALL
#if !defined(USE_INDIRECT_FUNCTION_CALL) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
inline __device__ BytePack<16>& operator=(BytePack<16> other) {
u64[0] = other.u64[0];
u64[1] = other.u64[1];
Expand Down
2 changes: 1 addition & 1 deletion src/collectives/device/reduce.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

namespace {
template<typename T, typename RedOp, typename Proto>
#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void runRing(ncclWorkElem *args) {
#else
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
Expand Down
2 changes: 1 addition & 1 deletion src/collectives/device/reduce_scatter.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

namespace {
template<typename T, typename RedOp, typename Proto>
#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void runRing(ncclWorkElem *args) {
#else
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
Expand Down
2 changes: 1 addition & 1 deletion src/collectives/device/sendrecv.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
}
}

#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void run(ncclWork *work) {
#else
__device__ __attribute__((noinline)) void run(ncclWork *work) {
Expand Down
2 changes: 1 addition & 1 deletion src/include/collectives.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ struct ncclDevRedOpFull {
nccl##func##algo##proto

/* Declare all collective operations */
#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
#define DECL5(func, algo, proto, devredop, type) \
extern __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \
extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
Expand Down
4 changes: 3 additions & 1 deletion src/init.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1688,6 +1688,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
int* parentRanks = NULL;
int cudaArch;
int64_t stackSize = rcclParamStackSizeOverride() ? rcclParamStackSizeOverride() : maxLocalSizeBytes;
hipDeviceProp_t devProp;

CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail);
CUDACHECKGOTO(cudaDeviceGetAttribute(&archMajor, cudaDevAttrComputeCapabilityMajor, cudaDev), res, fail);
Expand All @@ -1698,7 +1699,8 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
// Set the maximum kernel stack size of all kernels to avoid
// a CUDA memory reconfig on load (c.f. NVSHMEM issue)
#ifdef USE_INDIRECT_FUNCTION_CALL
if (stackSize > 0 && ncclParamSetStackSize() == 1) {
CUDACHECK(hipGetDeviceProperties(&devProp, 0));
if (stackSize > 0 && ncclParamSetStackSize() == 1 && devProp.gcnArch != 940 && devProp.gcnArch != 941 && devProp.gcnArch != 942) {
INFO(NCCL_INIT, "Setting cudaLimitStackSize to %zi maxLocalSizeBytes %zi", stackSize, maxLocalSizeBytes);
CUDACHECKIGNORE(cudaDeviceSetLimit(cudaLimitStackSize, stackSize));
}
Expand Down