From 178b6b759074597777ce13438efb0e0ba625e429 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey
Date: Tue, 11 Jun 2024 01:28:01 -0700
Subject: [PATCH] 2.22.3-1

Rework core for NVIDIA Trusted Computing
 * Compress work structs so that they are shared between channels.
 * Utilize the full amount of kernel argument space permitted (4k) before resorting to work fifo.
 * Rework the task preprocessing phase.
 * Use a separate abortDevFlag which is kept in sync with abortFlag using cudaMemcpy operations.
 * Rename src/include/align.h to src/include/bitops.h.

Add lazy connection establishment for collective operations
 * Move buffer allocation and connection establishment to the first collective operation using that algorithm.
 * Accelerate init time and reduce memory usage.
 * Avoid allocating NVLS buffers if all calls are registered.
 * Compute algo/proto in ncclLaunchCollTasksInfo early on.
 * Connect peers in ncclCollPreconnectFunc if not connected already.
 * Also move shared buffer creation to the first send/recv call.

Accelerate intra-node NVLink detection
 * Make each rank only detect NVLinks attached to its GPU.
 * Fuse XMLs to reconstruct the full NVLink topology.

Add init profiling to report time spent in different init phases.
 * Report timings of bootstrap, allgather, search, connect, etc.
 * Add new "PROFILE" category for NCCL_DEBUG_SUBSYS.

Add support for PCI p2p on split PCI switches
 * Detect split PCI switches through a kernel module exposing switch information.
 * Update the topology XML and graph to add those inter-switch connections.

Add cost estimation API
 * Add a new ncclGroupEndSimulate primitive to return the estimated time a group would take.

Net/IB: Add separate traffic class for fifo messages
 * Add NCCL_IB_FIFO_TC to control the traffic class of fifo messages independently from NCCL_IB_TC.
   Merges PR #1194

Net/IB: Add support for IB router
 * Use flid instead of lid if subnets do not match.
 * Warn if flid is 0.

Optimizations and fixes for device network offload (unpack)
 * Double the default number of channels.
 * Cache netDeviceType.
 * Fix save/increment head logic to enable Tree support.

Support ncclGroupStart/End for ncclCommAbort/Destroy
 * Allow Abort/Destroy to be called within a group when managing multiple GPUs with a single process.

Improve Tuner API
 * Provide the plugin with the original cost table so that it can leave unknown or disabled algo/proto combinations untouched (a tuner sketch follows these notes).
 * Remove nvlsSupport and collnetSupport.

Do not print version to stdout when using a debug file
 * Also print version from all processes with INFO debug level.
   Fixes issue #1271

Fix clang warnings in NVTX headers
 * Update NVTX headers to the latest version.
   Fixes issue #1270

Disable port fusion in heterogeneous systems
 * Do not fuse ports if a mix of multi-port and single-port NICs is detected.

Fix NVLS graph search for dual NICs
 * Fix NVLS graph search when we have more than one NIC per GPU.

Fix crash with collnetDirect
 * Add separate graph search for collnetDirect, testing alltoall paths and working similarly to the NVLS search.

Fix hang when nodes have different CPU types
 * Add the CPU type to the rank peer info.
 * Align all ranks on the CPU type after the first allgather.
 * Only use the aligned CPU type for all tuning operations.
   Fixes issue #1136
   Fixes issue #1184

Fix performance of registered send/recv operations
 * Allow for single full size operations.
 * Add INFO to confirm the registration of send/recv buffers.

Move all sync ops to finalize stage
 * Ensure ncclCommDestroy is non-blocking if ncclCommFinalize has been called.

Improve error reporting during SHM segment creation

Improve support for various compilers
  Merges PR #1177
  Merges PR #1228

Allow net and tuner plugins to be statically linked
 * Search for ncclNet or ncclTuner symbols in the main binary.
   Merges PR #979

Clean up includes in the plugin examples
 * Harmonize err.h and common.h usage (a logger-usage sketch follows these notes).
 * Add a mixed plugin with both net and tuner.
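
The tuner interface bump to ncclTuner_v3_t replaces the collNetSupport/nvlsSupport inputs with the algo/proto cost table itself, which the plugin edits in place. Below is a minimal sketch of a v3 tuner built against the ext-tuner/example headers shipped in this patch. The file name, the sketch* function names, the "CostTableSketch" plugin name, the 64KiB threshold and the RING+LL128 preference are illustrative assumptions, not NCCL defaults; the struct, symbol name and getCollInfo signature come from the patch.

    /* tuner_sketch.c - sketch of an external tuner against the v3 interface. */
    #include <stddef.h>
    #include "tuner.h"   /* ext-tuner/example/nccl/tuner.h from this patch */

    static ncclResult_t sketchInit(size_t nRanks, size_t nNodes,
                                   ncclDebugLogger_t logFunction, void** context) {
      *context = NULL;
      return ncclSuccess;
    }

    static ncclResult_t sketchGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
                                          int numPipeOps, float** collCostTable,
                                          int numAlgo, int numProto, int* nChannels) {
      /* The cost table is pre-filled by NCCL core and indexed [algo][proto], as in
       * the example plugin; entries NCCL disabled are set to NCCL_ALGO_PROTO_IGNORE
       * and must be left untouched. */
      if (collType == ncclFuncAllReduce && nBytes <= 65536 &&
          NCCL_ALGO_RING < numAlgo && NCCL_PROTO_LL128 < numProto &&
          collCostTable[NCCL_ALGO_RING][NCCL_PROTO_LL128] != NCCL_ALGO_PROTO_IGNORE) {
        /* Make RING+LL128 the cheapest entry so NCCL core picks it. */
        collCostTable[NCCL_ALGO_RING][NCCL_PROTO_LL128] = 0.0;
      }
      /* Leaving *nChannels unset lets NCCL choose the channel count. */
      return ncclSuccess;
    }

    static ncclResult_t sketchDestroy(void* context) { return ncclSuccess; }

    const ncclTuner_v3_t ncclTunerPlugin_v3 = {
      .name = "CostTableSketch",
      .init = sketchInit,
      .getCollInfo = sketchGetCollInfo,
      .destroy = sketchDestroy,
    };

NCCL locates the plugin through the exported ncclTunerPlugin_v3 symbol (NCCL_TUNER_PLUGIN_SYMBOL); with the static-linking support added in this patch, that symbol may also be compiled directly into the main binary instead of a separate shared library.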
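
The common.h headers added under ext-net/example/nccl/ and ext-tuner/example/nccl/ now carry the shared ncclDebugLogger_t typedef and subsystem flags. A small sketch, with hypothetical names (logger_sketch.c, pluginSetupLogging, pluginLog, gLogger), of how a plugin might keep the logger handed to its init() and emit an INFO line tagged with the TUNING subsystem:

    /* logger_sketch.c - illustrative use of the shared ncclDebugLogger_t. */
    #include <stddef.h>
    #include "common.h"  /* ncclDebugLogger_t, ncclDebugLogLevel, ncclDebugLogSubSys */
    #include "err.h"     /* ncclResult_t */

    static ncclDebugLogger_t gLogger = NULL;

    /* Called from the plugin's init(), which receives the logger from NCCL core. */
    static ncclResult_t pluginSetupLogging(ncclDebugLogger_t logFunction) {
      gLogger = logFunction;
      return ncclSuccess;
    }

    /* Emit an INFO-level line tagged with the NCCL_TUNING subsystem. */
    static void pluginLog(const char* what, size_t bytes) {
      if (gLogger) gLogger(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
                           "tuner sketch: %s, nBytes=%zu", what, bytes);
    }

Such lines appear alongside NCCL's own output when NCCL_DEBUG=INFO is set and the corresponding subsystem is enabled via NCCL_DEBUG_SUBSYS.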

---
 ext-net/example/nccl/common.h | 15 +
 ext-net/example/nccl/err.h | 1 +
 ext-net/example/nccl/net.h | 6 +-
 ext-net/example/nccl/types.h | 4 +-
 ext-tuner/example/nccl/common.h | 15 +
 ext-tuner/example/nccl/err.h | 17 +
 ext-tuner/example/nccl/tuner.h | 43 +-
 ext-tuner/example/plugin.c | 13 +-
 makefiles/version.mk | 4 +-
 src/bootstrap.cc | 5 +-
 src/channel.cc | 8 +-
 src/collectives.cc | 63 +
 src/debug.cc | 47 +-
 src/device/all_gather.h | 117 +-
 src/device/all_reduce.h | 290 +-
 src/device/broadcast.h | 37 +-
 src/device/common.cu | 6 +-
 src/device/common.h | 404 ++-
 src/device/generate.py | 4 +-
 src/device/network/unpack/unpack.h | 14 +-
 src/device/network/unpack/unpack_defs.h | 2 +-
 src/device/prims_ll.h | 10 +-
 src/device/prims_ll128.h | 5 +-
 src/device/prims_simple.h | 58 +-
 src/device/reduce.h | 33 +-
 src/device/reduce_kernel.h | 43 +-
 src/device/reduce_scatter.h | 117 +-
 src/device/sendrecv.h | 210 +-
 src/enqueue.cc | 2619 +++++++++--------
 src/graph/connect.cc | 23 +-
 src/graph/paths.cc | 25 +-
 src/graph/search.cc | 72 +-
 src/graph/topo.cc | 140 +-
 src/graph/topo.h | 2 +-
 src/graph/tuning.cc | 32 +-
 src/graph/xml.cc | 138 +-
 src/graph/xml.h | 25 +-
 src/group.cc | 261 +-
 src/include/align.h | 47 -
 src/include/alloc.h | 125 +-
 src/include/bitops.h | 277 ++
 src/include/channel.h | 41 +-
 src/include/checks.h | 8 +-
 src/include/collectives.h | 8 +
 src/include/comm.h | 282 +-
 src/include/cudawrap.h | 4 +
 src/include/debug.h | 9 +-
 src/include/device.h | 273 +-
 src/include/enqueue.h | 1 +
 src/include/gdrwrap.h | 9 +-
 src/include/graph.h | 7 +-
 src/include/group.h | 15 +-
 src/include/info.h | 123 -
 src/include/nccl_common.h | 30 +-
 src/include/nccl_tuner.h | 56 +-
 src/include/net.h | 4 +-
 src/include/nvmlwrap.h | 38 +
 src/include/nvtx.h | 2 +-
 src/include/nvtx3/nvToolsExt.h | 96 +-
 src/include/nvtx3/nvToolsExtCounters.h | 335 +++
 src/include/nvtx3/nvToolsExtCuda.h | 4 +-
 src/include/nvtx3/nvToolsExtCudaRt.h | 4 +-
 src/include/nvtx3/nvToolsExtMem.h | 694 +++++
 src/include/nvtx3/nvToolsExtMemCudaRt.h | 150 +
 src/include/nvtx3/nvToolsExtOpenCL.h | 6 +-
 src/include/nvtx3/nvToolsExtPayload.h | 977 ++++--
 src/include/nvtx3/nvToolsExtPayloadHelper.h | 170 ++
 .../nvtx3/nvToolsExtSemanticsCounters.h | 88 +
 src/include/nvtx3/nvToolsExtSemanticsScope.h | 30 +
 src/include/nvtx3/nvToolsExtSync.h | 26 +-
 src/include/nvtx3/nvtx3.hpp | 21 +-
 .../nvtx3/nvtxDetail/nvtxExtHelperMacros.h | 31 +
 .../nvtxExtImpl.h | 50 +-
 .../nvtx3/nvtxDetail/nvtxExtImplCounters_v1.h | 148 +
 .../nvtxDetail/nvtxExtImplMemCudaRt_v1.h | 74 +
 .../nvtx3/nvtxDetail/nvtxExtImplMem_v1.h | 133 +
 .../nvtx3/nvtxDetail/nvtxExtImplPayload_v1.h | 155 +
 .../nvtxExtInit.h | 161 +-
 .../nvtxDetail/nvtxExtPayloadHelperInternal.h | 272 ++
 .../nvtxExtPayloadTypeInfo.h | 20 +-
 .../nvtxExtTypes.h | 0
 src/include/nvtx3/nvtxDetail/nvtxImpl.h | 21 +-
 src/include/nvtx3/nvtxDetail/nvtxInit.h | 6 +-
 src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h | 2 +-
 .../nvtxExtDetail/nvtxExtImplPayload_v1.h | 86 -
 src/include/p2p.h | 3 +
 src/include/proxy.h | 26 +-
 src/include/register.h | 5 +
 src/include/transport.h | 36 +-
 src/include/tuner.h | 5 +-
 src/include/utils.h | 87 +-
 src/init.cc | 905 +++---
 src/init_nvtx.cc | 13 +-
 src/misc/argcheck.cc | 2 -
 src/misc/cudawrap.cc | 8 +
 src/misc/gdrwrap.cc | 10 +-
 src/misc/ipcsocket.cc | 4 +-
 src/misc/nvmlwrap.cc | 41 +
 src/misc/param.cc | 2 +-
 src/misc/shmutils.cc | 34 +-
 src/misc/socket.cc | 10 +-
 src/misc/tuner.cc | 179 +-
 src/misc/utils.cc | 79 +-
 src/nccl.h.in | 26 +
 src/net.cc | 169 +-
 src/proxy.cc | 90 +-
 src/register.cc | 2 +
 src/transport.cc | 14 +-
 src/transport/coll_net.cc | 301 +-
 src/transport/generic.cc | 36 +
 src/transport/net.cc | 20 +-
 src/transport/net_ib.cc | 159 +-
 src/transport/nvls.cc | 596 ++--
 src/transport/p2p.cc | 11 +-
 src/transport/shm.cc | 1 +
 115 files changed, 8595 insertions(+), 4326 deletions(-)
 create mode 100644 ext-net/example/nccl/common.h
 create mode 100644 ext-tuner/example/nccl/common.h
 create mode 100644 ext-tuner/example/nccl/err.h
 delete mode 100644 src/include/align.h
 create mode 100644 src/include/bitops.h
 create mode 100644 src/include/nvtx3/nvToolsExtCounters.h
 create mode 100644 src/include/nvtx3/nvToolsExtMem.h
 create mode 100644 src/include/nvtx3/nvToolsExtMemCudaRt.h
 create mode 100644 src/include/nvtx3/nvToolsExtPayloadHelper.h
 create mode 100644 src/include/nvtx3/nvToolsExtSemanticsCounters.h
 create mode 100644 src/include/nvtx3/nvToolsExtSemanticsScope.h
 create mode 100644 src/include/nvtx3/nvtxDetail/nvtxExtHelperMacros.h
 rename src/include/nvtx3/{nvtxExtDetail => nvtxDetail}/nvtxExtImpl.h (79%)
 create mode 100644 src/include/nvtx3/nvtxDetail/nvtxExtImplCounters_v1.h
 create mode 100644 src/include/nvtx3/nvtxDetail/nvtxExtImplMemCudaRt_v1.h
 create mode 100644 src/include/nvtx3/nvtxDetail/nvtxExtImplMem_v1.h
 create mode 100644 src/include/nvtx3/nvtxDetail/nvtxExtImplPayload_v1.h
 rename src/include/nvtx3/{nvtxExtDetail => nvtxDetail}/nvtxExtInit.h (71%)
 create mode 100644 src/include/nvtx3/nvtxDetail/nvtxExtPayloadHelperInternal.h
 rename src/include/nvtx3/{nvtxExtDetail => nvtxDetail}/nvtxExtPayloadTypeInfo.h (90%)
 rename src/include/nvtx3/{nvtxExtDetail => nvtxDetail}/nvtxExtTypes.h (100%)
 delete mode 100644 src/include/nvtx3/nvtxExtDetail/nvtxExtImplPayload_v1.h
 create mode 100644 src/transport/generic.cc
diff --git a/ext-net/example/nccl/common.h b/ext-net/example/nccl/common.h new file mode 100644 index 000000000..912925225 --- /dev/null +++ b/ext-net/example/nccl/common.h @@ -0,0 +1,15 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef COMMON_H_ +#define COMMON_H_ + +typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; +typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; + +typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); + +#endif diff --git a/ext-net/example/nccl/err.h b/ext-net/example/nccl/err.h index 0a2267719..bb92e8354 100644 --- a/ext-net/example/nccl/err.h +++ b/ext-net/example/nccl/err.h @@ -11,6 +11,7 @@ typedef enum { ncclSuccess = 0, ncclSystemError = 2, ncclInternalError = 3, ncclInvalidArgument = 4, + ncclInvalidUsage = 5, ncclRemoteError = 6 } ncclResult_t; #endif diff --git a/ext-net/example/nccl/net.h b/ext-net/example/nccl/net.h index 2f455c60f..2aea8c439 100644 --- a/ext-net/example/nccl/net.h +++ b/ext-net/example/nccl/net.h @@ -8,6 +8,7 @@ #include #include +#include "common.h" #include "err.h" #define NCCL_NET_HANDLE_MAXSIZE 128 @@ -19,11 +20,6 @@ // Maximum number of requests per comm object #define NCCL_NET_MAX_REQUESTS 32 -typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; -typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys; - -typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); - #include "net_v8.h" #include "net_v7.h" #include "net_v6.h" diff --git a/ext-net/example/nccl/types.h b/ext-net/example/nccl/types.h index 0a5d83788..f43fdc163 100644 --- a/ext-net/example/nccl/types.h +++ b/ext-net/example/nccl/types.h @@ -2,8 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_ERR_H_ -#define NCCL_ERR_H_ +#ifndef NCCL_TYPES_H_ +#define NCCL_TYPES_H_ /* Data types */ typedef enum { ncclInt8 = 0, ncclChar = 0, diff --git a/ext-tuner/example/nccl/common.h b/ext-tuner/example/nccl/common.h new file mode 100644 index 000000000..912925225 --- /dev/null +++ b/ext-tuner/example/nccl/common.h @@ -0,0 +1,15 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef COMMON_H_ +#define COMMON_H_ + +typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; +typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; + +typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); + +#endif diff --git a/ext-tuner/example/nccl/err.h b/ext-tuner/example/nccl/err.h new file mode 100644 index 000000000..bb92e8354 --- /dev/null +++ b/ext-tuner/example/nccl/err.h @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NCCL_ERR_H_ +#define NCCL_ERR_H_ + +/* Error type for plugins */ +typedef enum { ncclSuccess = 0, + ncclUnhandledCudaError = 1, + ncclSystemError = 2, + ncclInternalError = 3, + ncclInvalidArgument = 4, + ncclInvalidUsage = 5, + ncclRemoteError = 6 } ncclResult_t; + +#endif diff --git a/ext-tuner/example/nccl/tuner.h b/ext-tuner/example/nccl/tuner.h index 57825b99c..a1f18d393 100644 --- a/ext-tuner/example/nccl/tuner.h +++ b/ext-tuner/example/nccl/tuner.h @@ -8,15 +8,24 @@ #ifndef NCCL_TUNER_H_ #define NCCL_TUNER_H_ -#include "nccl.h" +#include +#include -typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; -typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys; - -typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); +#include "common.h" +#include "err.h" #define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now -typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t; +typedef enum { + ncclFuncBroadcast = 0, + ncclFuncReduce = 1, + ncclFuncAllGather = 2, + ncclFuncReduceScatter = 3, + ncclFuncAllReduce = 4, + ncclFuncSendRecv = 5, + ncclFuncSend = 6, + ncclFuncRecv = 7, + ncclNumFuncs = 8 +} ncclFunc_t; #define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet* #define NCCL_ALGO_UNDEF -1 @@ -33,6 +42,8 @@ typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncRed #define NCCL_PROTO_LL128 1 #define NCCL_PROTO_SIMPLE 2 +#define NCCL_ALGO_PROTO_IGNORE -1.0 + // API to be implemented by external tuner typedef struct { // Name of the tuner @@ -52,31 +63,33 @@ typedef struct { // - context: tuner context object // - collType: collective type , e.g., allreduce, allgather… // - nBytes: collective size in bytes - // - collNetSupport: whether collnet supports this type - // - nvlsSupport: whether nvlink sharp supports this time // - numPipeOps: number of operations in the group + // - numAlgo: number of algorithms in collCostTable + // - numProto: number of protocols in collCostTable // // Outputs: - // - algorithm: selected algorithm to be used for the given collective - // - protocol: selected protocol to be used for the given collective // - nChannels: number of 
channels (hence SMs) to be used. // + // InOut: + // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType. + // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE). + // // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the // default tuning for the given collective. // Also, the plugin is allowed to not set any output, or set only the // algorithm and protocol, but not only the algorithm or only the protocol. // Unset fields will be set automatically by NCCL. ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, - int collNetSupport, int nvlsSupport, int numPipeOps, - int *algorithm, int *protocol, int* nChannels); + int numPipeOps, float** collCostTable, int numAlgo, int numProto, + int* nChannels); // Terminates the plugin and cleans up any resources that the plugin allocated. // context: tuner context object ncclResult_t (*destroy)(void* context); -} ncclTuner_v2_t; +} ncclTuner_v3_t; -typedef ncclTuner_v2_t ncclTuner_t; +typedef ncclTuner_v3_t ncclTuner_t; -#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v2" +#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3" #endif diff --git a/ext-tuner/example/plugin.c b/ext-tuner/example/plugin.c index 3c669433a..c3cf00dfd 100644 --- a/ext-tuner/example/plugin.c +++ b/ext-tuner/example/plugin.c @@ -11,14 +11,21 @@ __hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { return ncclSuccess; } __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes, - int collNetSupport, int nvlsSupport, int numPipeOps, - int *algorithm, int *protocol, int* nChannels) { *algorithm = NCCL_ALGO_RING; *protocol = NCCL_PROTO_SIMPLE; return ncclSuccess; } + int numPipeOps, float** collCostTable, int numAlgo, int numProto, + int* nChannels) { + // Update NCCL core generated cost table. 
Updated table will be evaluated by NCCL to pick the best algo/proto combo + if (collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) { + collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0; + } + *nChannels = 1; + return ncclSuccess; +} __hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; } #define PLUGIN_NAME "Example" -const ncclTuner_v2_t ncclTunerPlugin_v2 = { +const ncclTuner_v3_t ncclTunerPlugin_v3 = { .name = PLUGIN_NAME, .init = pluginInit, .getCollInfo = pluginGetCollInfo, diff --git a/makefiles/version.mk b/makefiles/version.mk index d4da30daf..9039cb7dd 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 21 -NCCL_PATCH := 5 +NCCL_MINOR := 22 +NCCL_PATCH := 3 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/bootstrap.cc b/src/bootstrap.cc index cff2df50d..a7d775440 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -201,7 +201,6 @@ ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFrom ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) { memset(handle, 0, sizeof(ncclBootstrapHandle)); - NCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic))); const char* env = ncclGetEnv("NCCL_COMM_ID"); if (env) { @@ -210,7 +209,9 @@ ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) { WARN("Invalid NCCL_COMM_ID, please use format: : or []: or :"); return ncclInvalidArgument; } + handle->magic = NCCL_MAGIC; } else { + NCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic))); memcpy(&handle->addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress)); NCCLCHECK(bootstrapCreateRoot(handle, false)); } @@ -626,7 +627,7 @@ ncclResult_t bootstrapClose(void* commState) { struct bootstrapState* state = (struct bootstrapState*)commState; if (state->unexpectedConnections != NULL) { unexpectedFree(state); - if (__atomic_load_n(state->abortFlag, __ATOMIC_RELAXED) == 0) { + if (__atomic_load_n(state->abortFlag, __ATOMIC_ACQUIRE) == 0) { WARN("Unexpected connections are not empty"); return ncclInternalError; } diff --git a/src/channel.cc b/src/channel.cc index 52591e0e7..b3a8f29b5 100644 --- a/src/channel.cc +++ b/src/channel.cc @@ -7,16 +7,17 @@ #include "channel.h" #include "param.h" #include "gdrwrap.h" +#include "transport.h" ncclResult_t initChannel(struct ncclComm* comm, int channelId) { struct ncclChannel* channel = &comm->channels[channelId]; if (channel->id != -1) return ncclSuccess; int nRanks = comm->nRanks; - int nvlsRanks = comm->MNNVL ? comm->clique.size : comm->localRanks; + int nvlsRanks = comm->localRanks; int nPeers = nRanks + 1 /* Collnet */ + nvlsRanks /* NVLS */; channel->id = channelId; - channel->workFifoSent = 0; + channel->workFifoProduced = 0; struct ncclSharedResources* sharedRes = comm->sharedRes; @@ -74,7 +75,8 @@ ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclCo NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream)); - int nvlsRanks = comm->MNNVL ? 
comm->clique.size : comm->localRanks; + int nvlsRanks = comm->localRanks; + if (share) { channel->nvlsPeers = parent->channels[channelId].nvlsPeers; channel->nvlsDevPeers = parent->channels[channelId].nvlsDevPeers; diff --git a/src/collectives.cc b/src/collectives.cc index 571134fca..e21807e04 100644 --- a/src/collectives.cc +++ b/src/collectives.cc @@ -9,6 +9,69 @@ #include "enqueue.h" #include "nccl.h" +const char* ncclFuncToString(ncclFunc_t fn) { + switch (fn) { + case ncclFuncAllGather: return "AllGather"; + case ncclFuncAllReduce: return "AllReduce"; + case ncclFuncBroadcast: return "Broadcast"; + case ncclFuncRecv: return "Recv"; + case ncclFuncReduce: return "Reduce"; + case ncclFuncReduceScatter: return "ReduceScatter"; + case ncclFuncSendRecv: return "SendRecv"; + case ncclFuncSend: return "Send"; + default: return "Invalid"; + } +} + +const char* ncclDevRedOpToString(ncclDevRedOp_t op) { + switch (op) { + case ncclDevSum: return "Sum"; + case ncclDevProd: return "Prod"; + case ncclDevMinMax: return "MinMax"; + case ncclDevPreMulSum: return "PreMulSum"; + case ncclDevSumPostDiv: return "SumPostDiv"; + default: return "Unknown"; + } +} + +const char* ncclDatatypeToString(ncclDataType_t type) { + switch (type) { + case ncclInt8: return "ncclInt8"; + case ncclInt32: return "ncclInt32"; + case ncclUint32: return "ncclUint32"; + case ncclInt64: return "ncclInt64"; + case ncclUint64: return "ncclUint64"; + case ncclFloat16: return "ncclFloat16"; + case ncclFloat32: return "ncclFloat32"; + case ncclFloat64: return "ncclFloat64"; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: return "ncclBfloat16"; +#endif + default: return "Unknown"; + } +} + +const char* ncclAlgoToString(int algo) { + switch (algo) { + case NCCL_ALGO_TREE: return "TREE"; + case NCCL_ALGO_RING: return "RING"; + case NCCL_ALGO_COLLNET_DIRECT: return "COLLNET_DIRECT"; + case NCCL_ALGO_COLLNET_CHAIN: return "COLLNET_CHAIN"; + case NCCL_ALGO_NVLS: return "NVLS"; + case NCCL_ALGO_NVLS_TREE: return "NVLS_TREE"; + default: return "Unknown"; + } +} + +const char* ncclProtoToString(int proto) { + switch (proto) { + case NCCL_PROTO_LL: return "LL"; + case NCCL_PROTO_LL128: return "LL128"; + case NCCL_PROTO_SIMPLE: return "SIMPLE"; + default: return "Unknown"; + } +} + NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, diff --git a/src/debug.cc b/src/debug.cc index 522999b44..dde8e8fcb 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -8,7 +8,10 @@ #include "nccl_net.h" #include #include +#include +#include #include +#include #include "param.h" int ncclDebugLevel = -1; @@ -16,14 +19,15 @@ static int pid = -1; static char hostname[1024]; thread_local int ncclDebugNoWarn = 0; char ncclLastError[1024] = ""; // Global string for the last error in human readable form -uint64_t ncclDebugMask = NCCL_INIT|NCCL_ENV; // Default debug sub-system mask is INIT and ENV +static uint64_t ncclDebugMask = NCCL_INIT|NCCL_ENV; // Default debug sub-system mask is INIT and ENV FILE *ncclDebugFile = stdout; -pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER; -std::chrono::steady_clock::time_point ncclEpoch; +static pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER; +static std::chrono::steady_clock::time_point ncclEpoch; +static bool ncclWarnSetDebugInfo = false; static __thread int tid = -1; -void ncclDebugInit() { +static void 
ncclDebugInit() { pthread_mutex_lock(&ncclDebugLock); if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; } const char* nccl_debug = ncclGetEnv("NCCL_DEBUG"); @@ -83,6 +87,8 @@ void ncclDebugInit() { mask = NCCL_BOOTSTRAP; } else if (strcasecmp(subsys, "REG") == 0) { mask = NCCL_REG; + } else if (strcasecmp(subsys, "PROFILE") == 0) { + mask = NCCL_PROFILE; } else if (strcasecmp(subsys, "ALL") == 0) { mask = NCCL_ALL; } @@ -94,6 +100,15 @@ void ncclDebugInit() { free(ncclDebugSubsys); } + const char* ncclWarnSetDebugInfoEnv = ncclGetEnv("NCCL_WARN_ENABLE_DEBUG_INFO"); + if (ncclWarnSetDebugInfoEnv != NULL && strlen(ncclWarnSetDebugInfoEnv) > 0) { + int64_t value; + errno = 0; + value = strtoll(ncclWarnSetDebugInfoEnv, NULL, 0); + if (!errno) + ncclWarnSetDebugInfo = value; + } + // Cache pid and hostname getHostName(hostname, 1024, '.'); pid = getpid(); @@ -143,8 +158,6 @@ void ncclDebugInit() { pthread_mutex_unlock(&ncclDebugLock); } -NCCL_PARAM(WarnSetDebugInfo, "WARN_ENABLE_DEBUG_INFO", 0); - /* Common logging function used by the INFO, WARN and TRACE macros * Also exported to the dynamically loadable Net transport modules so * they can share the debugging mechanisms and output files @@ -178,7 +191,7 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file if (level == NCCL_LOG_WARN) { len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, pid, tid, cudaDev, filefunc, line); - if (ncclParamWarnSetDebugInfo()) ncclDebugLevel = NCCL_LOG_INFO; + if (ncclWarnSetDebugInfo) ncclDebugLevel = NCCL_LOG_INFO; } else if (level == NCCL_LOG_INFO) { len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev); } else if (level == NCCL_LOG_TRACE && flags == NCCL_CALL) { @@ -190,17 +203,15 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file hostname, pid, tid, cudaDev, timestamp, filefunc, line); } - if (len) { - va_list vargs; - va_start(vargs, fmt); - len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs); - va_end(vargs); - // vsnprintf may return len > sizeof(buffer) in the case of a truncated output. - // Rewind len so that we can replace the final \0 by \n - if (len > sizeof(buffer)) len = sizeof(buffer)-1; - buffer[len++] = '\n'; - fwrite(buffer, 1, len, ncclDebugFile); - } + va_list vargs; + va_start(vargs, fmt); + len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs); + va_end(vargs); + // vsnprintf may return len > sizeof(buffer) in the case of a truncated output. 
+ // Rewind len so that we can replace the final \0 by \n + if (len > sizeof(buffer)) len = sizeof(buffer)-1; + buffer[len++] = '\n'; + if (len) fwrite(buffer, 1, len, ncclDebugFile); } NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0); diff --git a/src/device/all_gather.h b/src/device/all_gather.h index 809e8ae12..8fe224848 100644 --- a/src/device/all_gather.h +++ b/src/device/all_gather.h @@ -10,30 +10,26 @@ namespace { template - __device__ __forceinline__ void runRing(ncclWorkElem *args) { - const int tid = threadIdx.x; - const int nthreads = (int)args->nWarps * WARP_SIZE; + __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { ncclRing *ring = &ncclShmem.channel.ring; const int *ringRanks = ring->userRanks; const int nranks = ncclShmem.comm.nRanks; - const size_t chunkCount = args->chunkCount; - const size_t channelCount = args->workCount; - const size_t gridOffset = args->workOffset; - const size_t count = args->count; + size_t count, partOffset, partCount, chunkCount; + ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &partOffset, &partCount, &chunkCount); size_t offset; size_t dataOffset; int nelem; int rankDest; - T *inputBuf = (T*)args->sendbuff; - T *outputBuf = (T*)args->recvbuff; + T *inputBuf = (T*)work->sendbuff; + T *outputBuf = (T*)work->recvbuff; Primitives, 1, Proto, 0> prims - (tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg); + (tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg); - for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { + for (size_t elemOffset = 0; elemOffset < partCount; elemOffset += chunkCount) { /////////////// begin AllGather steps /////////////// - nelem = min(chunkCount, channelCount - elemOffset); - dataOffset = gridOffset + elemOffset; + nelem = min(chunkCount, partCount - elemOffset); + dataOffset = partOffset + elemOffset; // step 0: push data to next GPU rankDest = ringRanks[0]; @@ -64,52 +60,50 @@ namespace { } template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { using Proto = ProtoSimple; - runRing(args); + runRing(tid, nthreads, work); } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - runRing(args); +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { + runRing(tid, nthreads, work); } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - runRing(args); +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { + runRing(tid, nthreads, work); } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - const int tid = threadIdx.x; +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) { struct ncclNvls* nvls = &ncclShmem.channel.nvls; - const ssize_t count = args->count; const ssize_t rank = ncclShmem.comm.rank; - const size_t chunkCount = args->chunkCount; - size_t gridOffset = args->workOffset; - size_t channelCount = args->workCount; + size_t count, gridOffset, channelCount; + size_t chunkCount; + ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount); size_t 
offset; int nelem; - const int nThreadsBcast = args->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : 4 * WARP_SIZE; - const int nThreadsGather = args->regUsed ? WARP_SIZE : NCCL_MAX_NTHREADS - nThreadsBcast; + const int nThreadsBcast = work->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : 4 * WARP_SIZE; + const int nThreadsGather = work->regUsed ? WARP_SIZE : NCCL_MAX_NTHREADS - nThreadsBcast; const int tidEndGather = nThreadsGather; const int tidEndBcast = tidEndGather + nThreadsBcast; - if (!args->regUsed) { + if (!work->regUsed) { if (tid < tidEndGather) { // Gather using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> - prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff, - args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); + prims(tid, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff, + work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); @@ -119,8 +113,8 @@ struct RunWorkElement; Primitives, /*Direct=*/0, Proto, 0> - prims(tid - tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL, - args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0); + prims(tid - tidEndGather, nThreadsBcast, NULL, &nvls->down, work->sendbuff, NULL, + work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); @@ -133,7 +127,7 @@ struct RunWorkElement; Primitives, /*Direct=*/0, Proto, 0> prims(tid, nThreadsGather, nvls->up, nvls->up, NULL, NULL, - args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); + work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); /* used as sync */ prims.scatter(0, 0, 0, 0, -1, 0); @@ -144,8 +138,8 @@ struct RunWorkElement; Primitives, /*Direct=*/1, Proto, 0> - prims(tid - tidEndGather, nThreadsBcast, &nvls->down, &nvls->down, args->sendbuff, NULL, - args->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0, args); + prims(tid - tidEndGather, nThreadsBcast, &nvls->down, &nvls->down, work->sendbuff, NULL, + work->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0, work); /* used as sync */ prims.recv(0, 0); @@ -161,10 +155,10 @@ struct RunWorkElement -struct RunWorkElement { +struct RunWorkColl { template struct Scatterer { - struct ncclWorkElem* args; + struct ncclDevWorkColl* work; ssize_t chunkSize; ssize_t railGridOffset; @@ -179,13 +173,13 @@ struct RunWorkElementnHeads; - int bid = args->bid; - char* inbuf = (char*)args->sendbuff; - char* outbuf = (char*)args->recvbuff; - ssize_t sizePerRank = args->count*sizeof(T); + int part = ncclShmem.channelId - work->channelLo; + char* inbuf = (char*)work->sendbuff; + char* outbuf = (char*)work->recvbuff; + ssize_t sizePerRank = work->collnet.count*sizeof(T); bool inPlace = (inbuf == outbuf + ncclShmem.comm.rank*sizePerRank); - ssize_t railAllBeg = min(railGridOffset + bid*chunkSize, nNodes*sizePerRank); + ssize_t railAllBeg = min(railGridOffset + part*chunkSize, nNodes*sizePerRank); ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank); int railAllSize = railAllEnd - railAllBeg; if (tid < nDsts) dstSizes[tid] = railAllSize; @@ -232,28 +226,27 @@ struct RunWorkElementnChannels; + __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) { + const int part = ncclShmem.channelId - work->channelLo; + const int nChannels = work->channelHi - work->channelLo + 1; struct 
ncclDirect* direct = &ncclShmem.channel.collnetDirect; int const &nNodes = ncclShmem.comm.nNodes; - ssize_t chunkSize = int(args->chunkCount); - ssize_t const &sizePerRank = args->count; - + ssize_t sizePerRank = work->collnet.count*sizeof(T); + size_t chunkSize = work->collnet.chunkCount; bool isMultiRail = (direct->nHeads > 1); int nWarps1 = 1; int nWarps2 = (isMultiRail ? 2 : 1); int nWarps3 = (isMultiRail ? 2 : 0); - float denom = float(args->nWarps)/float(nWarps1+nWarps2+nWarps3); + float denom = float(work->nWarps)/float(nWarps1+nWarps2+nWarps3); nWarps3 = int(denom*nWarps3); nWarps2 = int(denom*nWarps2); - nWarps1 = args->nWarps - (nWarps2+nWarps3); + nWarps1 = work->nWarps - (nWarps2+nWarps3); using Proto = ProtoSimple<1, 1>; int tn = nWarps1*WARP_SIZE; if (tid < tn) { - if (args->regUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { if (tid == 0) { int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE); Primitives, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps); @@ -262,10 +255,10 @@ struct RunWorkElement, /*Direct=*/0, Proto, 0> - prims(tid, tn, nullptr, &direct->out, args->sendbuff, nullptr, + prims(tid, tn, nullptr, &direct->out, work->sendbuff, nullptr, /*redOpArg=*/0, 0 * Proto::MaxGroupWidth, 1, 1); for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) { - ssize_t railAllBeg = railGridOffset + args->bid * chunkSize; + ssize_t railAllBeg = railGridOffset + part * chunkSize; ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * sizePerRank); ssize_t railOneBeg = ncclShmem.comm.node * sizePerRank; ssize_t railOneEnd = railOneBeg + sizePerRank; @@ -280,7 +273,7 @@ struct RunWorkElementregUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { if (tid == 0) { int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE); Primitives, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps); @@ -293,10 +286,10 @@ struct RunWorkElement scat; - scat.args = args; + scat.work = work; scat.chunkSize = chunkSize; scat.railGridOffset = railGridOffset; - prims.process(scat); + prims.template process(scat); } } return; @@ -311,10 +304,10 @@ struct RunWorkElement scat; - scat.args = args; + scat.work = work; scat.chunkSize = chunkSize; scat.railGridOffset = railGridOffset; - prims.process(scat); + prims.template process(scat); } return; } diff --git a/src/device/all_reduce.h b/src/device/all_reduce.h index 49f8dc65b..293138f4d 100644 --- a/src/device/all_reduce.h +++ b/src/device/all_reduce.h @@ -10,28 +10,27 @@ namespace { template - __device__ __forceinline__ void runRing(ncclWorkElem *args) { - const int tid = threadIdx.x; - const int nthreads = (int)args->nWarps * WARP_SIZE; + __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { ncclRing *ring = &ncclShmem.channel.ring; int ringIx = ring->index; - ssize_t chunkCount = args->chunkCount; const int nranks = ncclShmem.comm.nRanks; + ssize_t gridOffset; + ssize_t channelCount; + ssize_t chunkCount; + ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (ssize_t*)nullptr, &gridOffset, &channelCount, &chunkCount); const ssize_t loopCount = nranks * chunkCount; ssize_t offset; - ssize_t gridOffset = args->workOffset; - ssize_t channelCount = args->workCount; int nelem; int chunk; Primitives, 1, Proto, 0> prims - (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg); + 
(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { ssize_t remCount = channelCount - elemOffset; ssize_t chunkOffset; - if (remCount < loopCount) chunkCount = args->lastChunkCount; + if (remCount < loopCount) chunkCount = alignUp(divUp(remCount, nranks), 16/sizeof(T)); auto modRanks = [&]__device__(int r)->int { return r - (r >= nranks ? nranks : 0); @@ -75,24 +74,24 @@ namespace { chunkOffset = chunk * chunkCount; offset = gridOffset + elemOffset + chunkOffset; nelem = (int)min(chunkCount, remCount - chunkOffset); + prims.directRecv(offset, nelem); } } template - __device__ __forceinline__ void runTreeUpDown(ncclWorkElem *args) { - const int tid = threadIdx.x; - const int nthreads = (int)args->nWarps * WARP_SIZE; + __device__ __forceinline__ void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) { ncclTree *tree = &ncclShmem.channel.tree; - const size_t channelCount = args->workCount; - const size_t gridOffset = args->workOffset; - const size_t chunkCount = args->chunkCount; + size_t gridOffset; + size_t channelCount; + size_t chunkCount; + ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount); size_t offset; int nelem; { // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local) Primitives, /*Direct=*/0, Proto, 0> prims - (tid, nthreads, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg); + (tid, nthreads, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg); if (tree->up == -1) { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; @@ -118,7 +117,7 @@ namespace { { // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local) Primitives, /*Direct=*/1, Proto, 0> prims - (tid, nthreads, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->redOpArg); + (tid, nthreads, &tree->up, tree->down, work->sendbuff, work->recvbuff, work->redOpArg); if (tree->up == -1) { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; @@ -144,16 +143,14 @@ namespace { } template - __device__ __forceinline__ void runTreeSplit(ncclWorkElem *args) { - const int tid = threadIdx.x; - const int nthreads = (int)args->nWarps * WARP_SIZE; + __device__ __forceinline__ void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) { ncclTree *tree = &ncclShmem.channel.tree; - const size_t chunkCount = args->chunkCount; - const size_t gridOffset = args->workOffset; - const size_t channelCount = args->workCount; + size_t gridOffset; + size_t channelCount; + size_t chunkCount; + ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount); size_t offset; int nelem; - int nthreadsSplit; if (Proto::Id == NCCL_PROTO_SIMPLE) { nthreadsSplit = nthreads/2; @@ -167,7 +164,7 @@ namespace { if (tree->up == -1) { // Reduce and broadcast. 
Max number of recv is 2, max number of send is 2 Primitives, /*Direct=*/1, Proto, 0> - prims(tid, nthreads, tree->down, tree->down, args->sendbuff, args->recvbuff, args->redOpArg); + prims(tid, nthreads, tree->down, tree->down, work->sendbuff, work->recvbuff, work->redOpArg); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); @@ -184,7 +181,7 @@ namespace { * but the ctor above for tree roots would be DirectRecv=0 DirectSend=1. */ Primitives, /*Direct=*/1, Proto, 0> - prims(tid, nthreadsSplit, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg, 0*Proto::MaxGroupWidth); + prims(tid, nthreadsSplit, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg, 0*Proto::MaxGroupWidth); if (tree->down[0] == -1) { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; @@ -203,8 +200,8 @@ namespace { else { // Broadcast down. Max number of recv is 1, max number of send is 3 (binary tree + local) Primitives, /*Direct=*/1, Proto, 0> - prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff, - args->redOpArg, 1*Proto::MaxGroupWidth); + prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, work->sendbuff, work->recvbuff, + work->redOpArg, 1*Proto::MaxGroupWidth); if (tree->down[0] == -1) { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; @@ -224,34 +221,33 @@ namespace { } template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { using Proto = ProtoSimple; - runRing(args); + runRing(tid, nthreads, work); } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { #if CUDART_VERSION >= 11020 && CUDART_VERSION < 11040 && __CUDA_ARCH__ >= 800 - runTreeUpDown>(args); + runTreeUpDown>(tid, nthreads, work); #else - runTreeSplit>(args); + runTreeSplit>(tid, nthreads, work); #endif } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) { static constexpr int COLLNET_COPY_THREADS = 96; - const int tid = threadIdx.x; - const int bid = args->bid; - const int nChannels = args->nChannels; + const int bid = ncclShmem.channelId - work->channelLo; + const int nChannels = work->channelHi - work->channelLo + 1; struct ncclDirect* direct = &ncclShmem.channel.collnetDirect; - const ssize_t chunkSize = args->chunkCount; - const ssize_t size = args->count; + const ssize_t chunkSize = work->collnet.chunkCount; + const ssize_t size = work->collnet.count; const ssize_t loopSize = nChannels*direct->nHeads*chunkSize; const int hasUp = (direct->up[0] >= 0) ? 
1 : 0; @@ -259,7 +255,7 @@ struct RunWorkElementnWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast; + const int nThreadsReduce = work->nWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast; const int tidStartBcast = nThreadsGather; const int tidStartScatter = tidStartBcast + nThreadsBcast; const int tidStartReduce = tidStartScatter + nThreadsScatter; @@ -269,12 +265,12 @@ struct RunWorkElement= tidStartScatter && tid < tidStartReduce && hasUp) { // Scatter Primitives, /*Direct=*/1, Proto, 0> - prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, args->sendbuff, args->recvbuff, - args->redOpArg, 2*Proto::MaxGroupWidth, 1, 1, args); + prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, work->sendbuff, work->recvbuff, + work->redOpArg, 2*Proto::MaxGroupWidth, 1, 1, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize; int nelem = min(direct->nHeads*chunkSize, size-offset); - if (args->regUsed) { + if (work->regUsed) { prims.directScatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift); } else { prims.scatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift); @@ -284,12 +280,12 @@ struct RunWorkElement, /*Direct=*/1, Proto, 0> - prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, args->sendbuff, args->recvbuff, - args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1, args); + prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, work->sendbuff, work->recvbuff, + work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); - if (args->regUsed) { + if (work->regUsed) { prims.directRecvReduceSend(offset, nelem); } else { prims.recvReduceSend(offset, nelem); @@ -297,7 +293,7 @@ struct RunWorkElementregUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { if (tid == tidStartReduce) { int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE); Primitives, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps); @@ -305,8 +301,8 @@ struct RunWorkElement, /*Direct=*/0, Proto, 0> - prims(tid-tidStartReduce, nThreadsReduce, nullptr, &direct->out, args->sendbuff, args->recvbuff, - args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1); + prims(tid-tidStartReduce, nThreadsReduce, nullptr, &direct->out, work->sendbuff, work->recvbuff, + work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); @@ -317,8 +313,8 @@ struct RunWorkElement, /*Direct=*/1, Proto, 0> - prims(tid, nThreadsGather, direct->up, NULL, args->sendbuff, args->recvbuff, - args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, args); + prims(tid, nThreadsGather, direct->up, NULL, work->sendbuff, work->recvbuff, + work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize; int nelem = min(direct->nHeads*chunkSize, size-offset); @@ -328,15 +324,15 @@ struct RunWorkElement, /*Direct=*/1, Proto, 0> - prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, args->sendbuff, args->recvbuff, - args->redOpArg, 
1*Proto::MaxGroupWidth, 0, 0, args); + prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, work->sendbuff, work->recvbuff, + work->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); prims.recvCopyDirectSend(offset, nelem, /*postOp=*/true); } } else { - if (args->regUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { if (tid == tidStartBcast) { int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE); Primitives, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps); @@ -345,8 +341,8 @@ struct RunWorkElement, /*Direct=*/0, Proto, 0> - prims(tid - tidStartBcast, nThreadsBcast, &direct->out, nullptr, args->sendbuff, args->recvbuff, - args->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0); + prims(tid - tidStartBcast, nThreadsBcast, &direct->out, nullptr, work->sendbuff, work->recvbuff, + work->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid * direct->nHeads + direct->headRank) * chunkSize; int nelem = min(chunkSize, size - offset); @@ -359,18 +355,16 @@ struct RunWorkElement -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - const int tid = threadIdx.x; +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) { struct ncclNvls* nvls = &ncclShmem.channel.nvls; - ssize_t chunkSize = args->chunkCount; const bool hasOut = nvls->out != -1; const int nranks = ncclShmem.comm.nRanks; const int totalWarps = NCCL_MAX_NTHREADS/WARP_SIZE; - const int bcastWarps = hasOut ? (args->regUsed ? ((totalWarps - 2) >> 1) - 1 : 2) : 0; - const int reduceWarps = args->regUsed ? (totalWarps - bcastWarps - 2) : (hasOut ? 3 : nranks <= 6 ? 7 : 5); - const int scatterWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1; - const int gatherWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps) >> 1; + const int bcastWarps = hasOut ? (work->regUsed ? ((totalWarps - 2) >> 1) - 1 : 2) : 0; + const int reduceWarps = work->regUsed ? (totalWarps - bcastWarps - 2) : (hasOut ? 3 : nranks <= 6 ? 7 : 5); + const int scatterWarps = work->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1; + const int gatherWarps = work->regUsed ? 
1 : (totalWarps - reduceWarps - bcastWarps) >> 1; const int nThreadsScatter = scatterWarps*WARP_SIZE; const int nThreadsGather = gatherWarps*WARP_SIZE; @@ -381,35 +375,37 @@ struct RunWorkElementoneNode) { + if (work->oneNode) { + ssize_t gridOffset, channelCount, chunkSize; + ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), (ssize_t*)nullptr, &gridOffset, &channelCount, &chunkSize); const ssize_t loopCount = nvls->nHeads * chunkSize; - const ssize_t channelCount = args->workCount; - const ssize_t gridOffset = args->workOffset; ssize_t offset; int nelem; + int remCount = channelCount%(nvls->nHeads*chunkSize); + int lastChunkSize = alignUp(divUp(remCount, nvls->nHeads), 16/sizeof(T)); if (tid < tidEndScatter) { // Scatter using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> - prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL, - args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); + prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL, + work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { - if (channelCount - elemOffset < loopCount) chunkSize = args->lastChunkCount; + if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize; offset = gridOffset + elemOffset; - nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset); + nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset); prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0); } } else if (tid < tidEndGather) { // Gather using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> - prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff, - args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1); + prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff, + work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { - if (channelCount - elemOffset < loopCount) chunkSize = args->lastChunkCount; + if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize; offset = gridOffset + elemOffset; - nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset); + nelem = work->regUsed ? 
0 : min(loopCount, channelCount - elemOffset); prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0); } } else if (tid < tidEndReduce) { @@ -417,10 +413,10 @@ struct RunWorkElement; Primitives, /*Direct=*/1, Proto, 0> prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL, - args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args); + work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { ssize_t chunkOffset; - if (channelCount - elemOffset < loopCount) chunkSize = args->lastChunkCount; + if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize; chunkOffset = elemOffset + nvls->headRank * chunkSize; offset = gridOffset + chunkOffset; nelem = min(chunkSize, channelCount - chunkOffset); @@ -428,30 +424,32 @@ struct RunWorkElementbid; - const ssize_t loopSize = args->nChannels * nvls->nHeads * chunkSize; - const ssize_t size = args->count; + const int bid = ncclShmem.channelId - work->channelLo; + const int nChannels = work->channelHi - work->channelLo + 1; + const ssize_t chunkSize = work->collnet.chunkCount; + const ssize_t loopSize = nChannels * nvls->nHeads * chunkSize; + const ssize_t size = work->collnet.count; if (tid < tidEndScatter) { // Scatter using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> - prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL, - args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); + prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL, + work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize; - int nelem = args->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset); + int nelem = work->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset); prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0); } } else if (tid < tidEndGather) { // Gather using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> - prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff, - args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1); + prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff, + work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize; - int nelem = args->regUsed ? 0 :min(nvls->nHeads * chunkSize, size - offset); + int nelem = work->regUsed ? 
0 :min(nvls->nHeads * chunkSize, size - offset); prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0); } } else if (tid < tidEndReduce && nvls->headRank != -1) { @@ -460,7 +458,7 @@ struct RunWorkElement; Primitives, /*Direct=*/1, Proto, 0> prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL, - args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args); + work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize; int nelem = min(chunkSize, size - offset); @@ -471,7 +469,7 @@ struct RunWorkElement; Primitives, /*Direct=*/1, Proto, 0> prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL, - args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, args); + work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize; int nelem = min(chunkSize, size - offset); @@ -483,7 +481,7 @@ struct RunWorkElement; Primitives, /*Direct=*/1, Proto, 0> prims(tid - tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL, - args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args); + work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize; int nelem = min(chunkSize, size - offset); @@ -495,25 +493,25 @@ struct RunWorkElement -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - const int tid = threadIdx.x; +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) { struct ncclNvls* nvls = &ncclShmem.channel.nvls; const int treeUp = nvls->treeUp; const int* treeDown = nvls->treeDown; - ssize_t chunkCount = args->chunkCount; + ssize_t gridOffset, channelCount, chunkCount; + ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), (ssize_t*)nullptr, &gridOffset, &channelCount, &chunkCount); const ssize_t loopCount = nvls->nHeads * chunkCount; - const ssize_t channelCount = args->workCount; - const ssize_t gridOffset = args->workOffset; const int nranks = ncclShmem.comm.nRanks; const bool hasUp = treeUp != -1; const int totalWarps = NCCL_MAX_NTHREADS/WARP_SIZE; - const int bcastWarps = hasUp ? (args->regUsed ? ((totalWarps - 2) >> 1) - 1 : 4) : 0; - const int reduceWarps = args->regUsed ? (totalWarps - bcastWarps - 2) : (hasUp ? 5 : nranks <= 6 ? 7 : 5); - const int scatterWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1; - const int gatherWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps) >> 1; + const int bcastWarps = hasUp ? (work->regUsed ? ((totalWarps - 2) >> 1) - 1 : 4) : 0; + const int reduceWarps = work->regUsed ? (totalWarps - bcastWarps - 2) : (hasUp ? 5 : nranks <= 6 ? 7 : 5); + const int scatterWarps = work->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1; + const int gatherWarps = work->regUsed ? 
1 : (totalWarps - reduceWarps - bcastWarps) >> 1; ssize_t offset; int nelem; + int remCount = channelCount%(nvls->nHeads*chunkCount); + int lastChunkCount = alignUp(divUp(remCount, nvls->nHeads), 16/sizeof(T)); const int nThreadsScatter = scatterWarps*WARP_SIZE; const int nThreadsGather = gatherWarps*WARP_SIZE; @@ -528,24 +526,24 @@ struct RunWorkElement; Primitives, /*Direct=*/0, Proto, 0> - prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL, - args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); + prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL, + work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { - if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount; + if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount; offset = gridOffset + elemOffset; - nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset); + nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset); prims.scatter(offset, nelem, chunkCount, chunkCount, -1, 0); } } else if (tid < tidEndGather) { // Gather using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> - prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff, - args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1); + prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff, + work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { - if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount; + if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount; offset = gridOffset + elemOffset; - nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset); + nelem = work->regUsed ? 
0 : min(loopCount, channelCount - elemOffset); prims.gather(offset, nelem, chunkCount, chunkCount, -1, 0); } } else if (tid < tidEndReduce && nvls->headRank != -1) { @@ -554,10 +552,10 @@ struct RunWorkElement; Primitives, /*Direct=*/1, Proto, 0> prims(tid - tidEndGather, nThreadsReduce, treeDown, treeDown, NULL, NULL, - args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args); + work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { ssize_t chunkOffset; - if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount; + if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount; chunkOffset = elemOffset + nvls->headRank * chunkCount; offset = gridOffset + chunkOffset; nelem = min(chunkCount, channelCount - chunkOffset); @@ -568,10 +566,10 @@ struct RunWorkElement; Primitives, /*Direct=*/1, Proto, 0> prims(tid - tidEndGather, nThreadsReduce, treeDown, &treeUp, NULL, NULL, - args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args); + work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { ssize_t chunkOffset; - if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount; + if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount; chunkOffset = elemOffset + nvls->headRank * chunkCount; offset = gridOffset + chunkOffset; nelem = min(chunkCount, channelCount - chunkOffset); @@ -583,10 +581,10 @@ struct RunWorkElement; Primitives, /*Direct=*/1, Proto, 0> prims(tid - tidEndReduce, nThreadsBcast, &treeUp, treeDown, NULL, NULL, - args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args); + work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { ssize_t chunkOffset; - if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount; + if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount; chunkOffset = elemOffset + nvls->headRank * chunkCount; offset = gridOffset + chunkOffset; nelem = min(chunkCount, channelCount - chunkOffset); @@ -597,17 +595,15 @@ struct RunWorkElement -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - const int tid = threadIdx.x; - const int nthreads = args->nWarps*WARP_SIZE; - const int bid = args->bid; - const int nChannels = args->nChannels; +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { + const int bid = ncclShmem.channelId - work->channelLo; + const int nChannels = work->channelHi - work->channelLo + 1; ncclTree *tree = &ncclShmem.channel.collnetChain; - ssize_t chunkSize = args->chunkCount; + ssize_t chunkSize = work->collnet.chunkCount; const ssize_t loopSize = int(nChannels*chunkSize); const int nranks = ncclShmem.comm.nRanks; - const ssize_t size = args->count; + const ssize_t size = work->collnet.count; int nthreadsSplit = nthreads/2; if (nthreadsSplit >= 256) nthreadsSplit += 64; @@ -634,7 +630,7 @@ struct RunWorkElementregUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { if (groupTid == 0) { int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE); Primitives, /*Direct=*/1, Proto, 0>::sendPeerNotify(send, connIndex, steps); @@ -642,8 +638,8 @@ struct RunWorkElement, /*Direct=*/1, Proto, 0> - prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff, - args->redOpArg, group * 
Proto::MaxGroupWidth, connIndex, connIndex); + prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff, + work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid * int(chunkSize); int nelem = min(chunkSize, size - offset); @@ -652,8 +648,8 @@ struct RunWorkElement, /*Direct=*/1, Proto, 0> - prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff, - args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex); + prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff, + work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid * int(chunkSize); int nelem = min(chunkSize, size - offset); @@ -665,7 +661,7 @@ struct RunWorkElementregUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { if (groupTid == 0) { int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE); Primitives, /*Direct=*/1, Proto, 0>::recvPeerNotify(recv, connIndex, steps); @@ -673,8 +669,8 @@ struct RunWorkElement, /*Direct=*/1, Proto, 0> - prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff, - args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex); + prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff, + work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid * int(chunkSize); int nelem = min(chunkSize, size - offset); @@ -683,8 +679,8 @@ struct RunWorkElement, /*Direct=*/1, Proto, 0> - prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff, - args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex); + prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff, + work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid * int(chunkSize); int nelem = min(chunkSize, size - offset); @@ -693,8 +689,8 @@ struct RunWorkElement, /*Direct=*/1, Proto, 0> - prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff, - args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex); + prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff, + work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex); if (send == -1) { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); @@ -714,29 +710,29 @@ struct RunWorkElement -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - runRing(args); +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { + runRing(tid, nthreads, work); } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - runTreeSplit(args); +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { + runTreeSplit(tid, nthreads, work); } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - runRing(args); +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct 
ncclDevWorkColl* work) { + runRing(tid, nthreads, work); } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - runTreeSplit(args); +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { + runTreeSplit(tid, nthreads, work); } }; diff --git a/src/device/broadcast.h b/src/device/broadcast.h index 86d45e77e..7026adc3d 100644 --- a/src/device/broadcast.h +++ b/src/device/broadcast.h @@ -10,23 +10,22 @@ namespace { template - __device__ __forceinline__ void runRing(ncclWorkElem *args) { - const int tid = threadIdx.x; - const int nthreads = (int)args->nWarps * WARP_SIZE; + __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { ncclRing *ring = &ncclShmem.channel.ring; const int rank = ring->userRanks[0]; const int nextRank = ring->userRanks[1]; - const int root = args->root; - const size_t chunkCount = args->chunkCount; - const size_t channelCount = args->workCount; - const size_t gridOffset = args->workOffset; + const int root = work->root; + size_t chunkCount; + size_t channelCount; + size_t gridOffset; + ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount); size_t offset; int nelem; - T *inputBuf = (T*)args->sendbuff; - T *outputBuf = (T*)args->recvbuff; + T *inputBuf = (T*)work->sendbuff; + T *outputBuf = (T*)work->recvbuff; Primitives, 0, Proto, 0> - prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg); + prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; @@ -48,23 +47,23 @@ namespace { } template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { using Proto = ProtoSimple; - runRing(args); + runRing(tid, nthreads, work); } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - runRing(args); +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { + runRing(tid, nthreads, work); } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - runRing(args); +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { + runRing(tid, nthreads, work); } }; diff --git a/src/device/common.cu b/src/device/common.cu index d1b6acd1b..a8b5ed571 100644 --- a/src/device/common.cu +++ b/src/device/common.cu @@ -14,11 +14,11 @@ __shared__ ncclShmemData ncclShmem; #endif struct RunWorkNop { - __device__ void run(ncclWork *w) {} + __device__ void run() {} }; -__global__ void ncclDevKernel_Generic(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { - ncclKernelMain<-1, RunWorkNop>(comm, channelMask, workHead); +__global__ void ncclDevKernel_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { + ncclKernelMain<-1, RunWorkNop>(&args4K.args); } __device__ void ncclDevFunc_Nop() {} diff --git a/src/device/common.h b/src/device/common.h index d8581d3f4..5fa7be9ce 100644 --- a/src/device/common.h +++ b/src/device/common.h @@ -10,10 +10,19 @@ #include "collectives.h" #include "device.h" #include "op128.h" +#include "reduce_kernel.h" #include 
"network/unpack/unpack_defs.h" #define COLL_UNROLL (ncclCollUnroll()) +#if __CUDA_ARCH__ >= 700 +// __grid_constant__ appears to break cuda-gdb +//#define NCCL_GRID_CONSTANT __grid_constant__ +#define NCCL_GRID_CONSTANT +#else +#define NCCL_GRID_CONSTANT +#endif + typedef void(*ncclDevFuncPtr_t)(); extern __device__ ncclDevFuncPtr_t const ncclDevFuncTable[]; @@ -31,18 +40,28 @@ struct ncclShmemGroup { }; struct ncclShmemData { - struct ncclShmemGroup groups[NCCL_MAX_GROUPS]; - uint64_t redOpArgs[NCCL_MAX_ARITY+1]; + struct ncclDevKernelArgs args; int channelId; int aborted; alignas(16) struct ncclDevComm comm; alignas(16) struct ncclDevChannel channel; - alignas(16) struct ncclWork work; + + int batchIx, nextBatchIx; + enum ncclDevWorkType workType; + uint8_t directMode; + uint16_t funcId; + int nWorks; + int workSize; + uint32_t workConsumed; + struct ncclShmemGroup groups[NCCL_MAX_GROUPS]; + uint64_t redOpArgs[NCCL_MAX_NVLS_ARITY+1]; + + alignas(16) char workStorage[1024]; + alignas(16) union { unpackShmem unpack; } devicePlugin; }; -static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "shmem.work needs to be 16B aligned"); extern __shared__ ncclShmemData ncclShmem; #if __CUDA_ARCH__ >= 700 @@ -55,14 +74,62 @@ __device__ inline void* ncclScratchForWarp(int warp) { return (char*)ncclShmemPerWarp + warp*ncclShmemScratchWarpSize(); } -__device__ inline bool barrierReduceAny(int bit) { - uint32_t popc; - asm ("{" - ".reg .pred barr_pred;" - "setp.eq.u32 barr_pred, %1, 1;" - "bar.red.popc.u32 %0, 2, barr_pred;" - "}" : "=r"(popc) : "r"(bit)); - return popc != 0; +__device__ inline void barrier_sync(int name) { + #if 0 + asm volatile("barrier.sync %0;" :: "r"(name) : "memory"); + #else + asm volatile("barrier.sync.aligned %0;" :: "r"(name) : "memory"); + #endif +} +__device__ inline void barrier_sync(int name, int nThreads) { + #if 0 + asm volatile("barrier.sync %0, %1;" :: "r"(name), "r"(nThreads) : "memory"); + #else + asm volatile("barrier.sync.aligned %0, %1;" :: "r"(name), "r"(nThreads) : "memory"); + #endif +} +__device__ inline void barrier_sync_aligned(int name) { + asm volatile("barrier.sync.aligned %0;" :: "r"(name) : "memory"); +} +__device__ inline void barrier_sync_aligned(int name, int nThreads) { + asm volatile("barrier.sync.aligned %0, %1;" :: "r"(name), "r"(nThreads) : "memory"); +} + +__device__ inline bool barrier_red_or(bool vote, int name) { + int ans; + asm("{ .reg .pred p;" + " setp.ne.s32 p, %1, 0;" + " barrier.red.or.pred p, %2, p; " + " selp.s32 %0, 1, 0, p; }" + : "=r"(ans) : "r"((int)vote), "r"(name) : "memory"); + return bool(ans); +} +__device__ inline bool barrier_red_or(bool vote, int name, int nThreads) { + int ans; + asm("{ .reg .pred p;" + " setp.ne.s32 p, %1, 0;" + " barrier.red.or.pred p, %2, %3, p; " + " selp.s32 %0, 1, 0, p; }" + : "=r"(ans) : "r"((int)vote), "r"(name), "r"(nThreads) : "memory"); + return bool(ans); +} +__device__ inline bool barrier_red_or_aligned(bool vote, int name) { + int ans; + asm("{ .reg .pred p;" + " setp.ne.s32 p, %1, 0;" + " barrier.red.or.pred.aligned p, %2, p; " + " selp.s32 %0, 1, 0, p; }" + : "=r"(ans) : "r"((int)vote), "r"(name) : "memory"); + return bool(ans); +} +__device__ inline bool barrier_red_or_aligned(bool vote, int name, int nThreads) { + int ans; + asm("{ .reg .pred p;" + " setp.ne.s32 p, %1, 0;" + " barrier.red.or.pred.aligned p, %2, %3, p; " + " selp.s32 %0, 1, 0, p; }" + : "=r"(ans) : "r"((int)vote), "r"(name), "r"(nThreads) : "memory"); + return bool(ans); } // Copy 16-byte aligned data. 
You must call with at least `(bytes+15)/16` threads. @@ -71,158 +138,261 @@ inline __device__ void copyToShmem16(int tid, void* dst, void const* src, int by if (offset < bytes) { uint64_t a=0, b=0; asm("ld.v2.u64 {%0,%1},[%2];" : "=l"(a),"=l"(b) : "l"((char const*)src + offset)); - asm volatile("st.v2.u64 [%0],{%1,%2};" :: "l"((char*)dst + offset), "l"(a), "l"(b)); + uint32_t udst = (uint32_t)__cvta_generic_to_shared(dst); + asm volatile("st.shared.v2.u64 [%0],{%1,%2};" :: "r"(udst + offset), "l"(a), "l"(b)); + } +} + +// Must run with at least 64 threads +__device__ __forceinline__ void loadWorkBatchToShmem( + int tid, int tn, struct ncclDevKernelArgs const* args, int batchIx + ) { + int lane = tid%WARP_SIZE; + int workCursor = 0; // num works written in previous loop iterations. + while (true) { + struct ncclDevWorkBatch batch = ((struct ncclDevWorkBatch*)(args+1))[batchIx]; + + // fnsOfBitset[n] = index of n'th set bit in batch.offsetBitset. + // PTX has instruction "fns" (find n-th set) but it expands to a lot of SASS, + // since we know all lanes will be querying the same bitmask we can compute + // much faster using shared memory. + uint8_t* fnsOfBitset = (uint8_t*)ncclScratchForWarp(threadIdx.x/WARP_SIZE); + __syncwarp(); + if (uint32_t(batch.offsetBitset) & (1u<>32) & (1u<>32) & ((1u<>32)); // add high 32 bits + __syncwarp(); + + int workSize; + int nPacks; // total number of packs loaded, each pack is 16 bytes + int packInWork; // my pack index within work struct + int dstWork; // my work index in contiguous destination shmem + switch (batch.workType) { + case (int)ncclDevWorkTypeP2p: + workSize = sizeof(struct ncclDevWorkP2p); + nPacks = nWorks*(workSize/16); + packInWork = tid%(workSize/16); + dstWork = tid/(workSize/16); + break; + case (int)ncclDevWorkTypeColl: + workSize = sizeof(struct ncclDevWorkColl); + nPacks = nWorks*(workSize/16); + packInWork = tid%(workSize/16); + dstWork = tid/(workSize/16); + break; + case (int)ncclDevWorkTypeCollReg: + default: + workSize = sizeof(struct ncclDevWorkCollReg); + nPacks = nWorks*(workSize/16); + packInWork = tid%(workSize/16); + dstWork = tid/(workSize/16); + break; + } + if (tid == 0) { + ncclShmem.workSize = workSize; + ncclShmem.workConsumed = batch.offsetBase + (64-__clzll(batch.offsetBitset))*workSize; + } + // We deliberately replicate these div and mod calculations into the case + // blocks above so that they get constant divisor optimizations by the compiler. + // packInWork = tid%(workSize/16); + // dstWork = tid/(workSize/16); + + // We can only assume we have 64 threads, which means we can read at most 1024 bytes + // here which is the per batch maximum. + if (tid < nPacks) { + int srcWork = fnsOfBitset[dstWork]; // find n'th set bit in batch.offsetBitset + ulong2 tmp; + // The loads done in these two cases must be kept separate since we are + // relying on the compiler to use "ld.param" in the first one. The parameter + // space is not generically addressable, so any attempt to load through + // a pointer that *might* be parameter space backed will cause the + // compiler to spill the parameter struct (4K!) to each thread's local space + // before creating a pointer (to the spill) and decimate perf. + // + // An example of what not to do would be the following: + // + // if (condition) { + // // The compiler could spill parameter_variable to local space and take + // // the address of that, since when src is loaded below it could also + // // be global space. 
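
The "find n-th set bit" table built above by loadWorkBatchToShmem can be exercised in isolation. Below is a minimal sketch of the same trick, assuming a single warp and a fixed 64-bit mask: every lane tests its own bit and, when set, stores its bit position at index popc(bits below it). The kernel, mask value and array names are illustrative, not NCCL's.

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void fnsKernel(unsigned long long mask, unsigned char* fnsOfBitset, int* nSet) {
      int lane = threadIdx.x;                            // single warp: lanes 0..31
      unsigned int lo = (unsigned int)mask;
      unsigned int hi = (unsigned int)(mask >> 32);
      if (lo & (1u << lane))                             // position among low 32 bits
        fnsOfBitset[__popc(lo & ((1u << lane) - 1))] = lane;
      if (hi & (1u << lane))                             // offset by count of low set bits
        fnsOfBitset[__popc(lo) + __popc(hi & ((1u << lane) - 1))] = 32 + lane;
      if (lane == 0) *nSet = __popc(lo) + __popc(hi);
    }

    int main() {
      unsigned long long mask = 0x8000000100000005ull;   // bits 0, 2, 32, 63 set
      unsigned char* fns; int* n;
      cudaMallocManaged(&fns, 64);
      cudaMallocManaged(&n, sizeof(int));
      fnsKernel<<<1, 32>>>(mask, fns, n);
      cudaDeviceSynchronize();
      for (int i = 0; i < *n; i++) printf("fns[%d] = %d\n", i, (int)fns[i]);
      return 0;                                          // prints 0, 2, 32, 63
    }
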
+ // src = ¶meter_variable; + // } else { + // src = &global_variable; + // } + // memcpy(dst, src, n); + if (ncclShmem.args.workStorageType == ncclDevWorkStorageTypeArgs) { + char* src = (char*)args + (batch.offsetBase + srcWork*workSize + packInWork*16); + tmp = *(ulong2*)src; // becomes ld.param.v2.u64 + } else { + char* src = (char*)ncclShmem.args.workBuf + ((batch.offsetBase + srcWork*workSize + packInWork*16) & ncclShmem.args.workMask); + tmp = *(ulong2*)src; // becomes ld.v2.u64 + } + char* dst = ncclShmem.workStorage; + dst += (workCursor + dstWork)*workSize + packInWork*16; + *(ulong2*)dst = tmp; + } + workCursor += nWorks; + + if (batch.nextExtends) { + batchIx += batch.nextJump; + tid -= 64; // Rotate threads so we use the next two warps for next batch struct. + if (tid < 0) tid += tn; + } else { + if (tid == 0) { + ncclShmem.batchIx = batchIx; + ncclShmem.nextBatchIx = (batch.nextJump == 0) ? -1 : batchIx + batch.nextJump; + ncclShmem.workType = (enum ncclDevWorkType)batch.workType; + ncclShmem.nWorks = workCursor; + ncclShmem.funcId = batch.funcId; + } + break; + } } } template -struct RunWorkElement { - __device__ void run(ncclWorkElem*) { +struct RunWorkColl { + __device__ void run(int tid, int tn, struct ncclDevWorkColl* work) { // Put NOT IMPLEMENTED behavior here. } }; template -struct RunWork { +struct RunWorkBatch; + +// Specialized for P2p in sendrecv.h +template +struct RunWorkBatch; + +// Specialized here for non-P2p (Coll and CollReg) +template +struct RunWorkBatch { // This __forceinline__ is necessary. The compiler was inserting a function call // here from the LL ncclKernel. - __device__ __forceinline__ void run(ncclWork *w) { - int wid = threadIdx.x / WARP_SIZE; - ncclWorkElem* we = w->header.type == ncclWorkTypeRegColl ? &w->regElems[0].elem : &w->elems[0]; - int stride = w->header.type == ncclWorkTypeRegColl ? sizeof(ncclWorkElemReg) : sizeof(ncclWorkElem); + __device__ __forceinline__ void run() { + int tid = threadIdx.x; + int tn = blockDim.x; + + if (RedOpArg::ArgUsed) { + int nWorks = ncclShmem.nWorks; + for (int w=tid; w < nWorks; w += tn) { + struct ncclDevWorkColl* work = (ncclDevWorkColl*)(ncclShmem.workStorage + w*ncclShmem.workSize); + if (work->redOpArgIsPtr) { + work->redOpArg = RedOpArg::loadArg(reinterpret_cast(work->redOpArg)); + } + } + __syncthreads(); + } + #pragma unroll 1 - while ((char*)we + stride <= (char*)(w+1) && we->isUsed) { - if (wid < we->nWarps) { - RunWorkElement().run(we); + for (int w=0; w < ncclShmem.nWorks; w++) { + struct ncclDevWorkColl* work = (struct ncclDevWorkColl*)(ncclShmem.workStorage + w*ncclShmem.workSize); + if (w != 0) { + struct ncclDevWorkColl* workPrev = (struct ncclDevWorkColl*)(ncclShmem.workStorage + (w-1)*ncclShmem.workSize); + if (work->nWarps != workPrev->nWarps) __syncthreads(); } - we = (ncclWorkElem*)((char*)we + stride); + int subtn = work->nWarps*WARP_SIZE; + if (tid < subtn) RunWorkColl().run(tid, subtn, work); } } }; -static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) { - if (we->isUsed && we->redOpArgIsPtr) { - /* redOpArg is a pointer to the scalar value, so we'll dereference it - * here so that redOpArg holds the bits of the scalar going forward. - * The tricky thing is we don't know its type T since that's encoded in - * the funcIndex. Because it would be difficult to get sizeof(T) from - * funcIndex, we'll cheat and just dereference the largest possible size - * given the alignment of the pointer. We might be reading in more bytes - * than we need but that's harmless. 
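
The parameter-space constraint described in the comment above can be illustrated with a self-contained kernel. This is only a sketch under the assumption that a large argument struct is passed by value; BigArgs, its size and the test values are made up. The point is solely where the loads sit relative to the branch.

    #include <cstdio>
    #include <cuda_runtime.h>

    struct BigArgs { unsigned long long vals[64]; };     // passed by value (512 bytes)

    __global__ void pickOne(BigArgs args, const unsigned long long* gvals,
                            int fromParams, int ix, unsigned long long* out) {
      unsigned long long v;
      if (fromParams) {
        v = args.vals[ix];   // load can stay in kernel-parameter space (ld.param)
      } else {
        v = gvals[ix];       // ordinary global-memory load
      }
      // Anti-pattern to avoid, per the comment above: selecting a pointer first,
      //   const unsigned long long* p = fromParams ? &args.vals[ix] : &gvals[ix];
      // can make the compiler spill the whole argument struct to local memory.
      if (threadIdx.x == 0) *out = v;
    }

    int main() {
      BigArgs a = {}; a.vals[3] = 42;
      unsigned long long *g, *out;
      cudaMallocManaged(&g, 64 * sizeof(unsigned long long));
      cudaMallocManaged(&out, 2 * sizeof(unsigned long long));
      g[3] = 7;
      pickOne<<<1, 32>>>(a, g, /*fromParams=*/1, 3, &out[0]);
      pickOne<<<1, 32>>>(a, g, /*fromParams=*/0, 3, &out[1]);
      cudaDeviceSynchronize();
      printf("%llu %llu\n", out[0], out[1]);             // 42 7
      return 0;
    }
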
- */ - if (we->redOpArg%2 != 0) - we->redOpArg = *reinterpret_cast(we->redOpArg); - else if (we->redOpArg%4 != 0) - we->redOpArg = *reinterpret_cast(we->redOpArg); - else if (we->redOpArg%8 != 0) - we->redOpArg = *reinterpret_cast(we->redOpArg); - else - we->redOpArg = *reinterpret_cast(we->redOpArg); - } -} - -template -__device__ void ncclKernelMain(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { +template +__device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* args) { int tid = threadIdx.x; + int tn = blockDim.x; + + // Copy kernel args to shmem and then only read those. Otherwise the compiler + // will end up putting the args into thread local stack which is very wasteful. + if (tid < sizeof(ncclDevKernelArgs)/sizeof(uint32_t)) { + ((uint32_t*)&ncclShmem.args)[tid] = ((uint32_t*)args)[tid]; + } // To map blockId to channelId, we need the n'th set bit of channelMask which // is the inverse of counting the number of set bits among the the first n. - if (tid < WARP_SIZE) { - int x = tid; - if (channelMask & (1ull<channelMask & (1ull<channelMask & ((1ull<channels[channelId]; - bytes = sizeof(ncclDevChannel); + copyToShmem16(tid, dst, src, bytes); + } break; + case 1: + { // Get address of channel without incurring indirect load from ncclDevComm::channels + void* dst = &ncclShmem.channel; + void* src = &((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId]; + int bytes = sizeof(ncclDevChannel); static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn."); - break; - case 2: - dst = &ncclShmem.work; - src = workHead + blockIdx.x; - bytes = sizeof(ncclWork); - static_assert(sizeof(ncclWork) <= 16*WARP_SIZE, "ncclWork cannot be loaded by a single warp in one insn."); - break; - default: - bytes = 0; - break; - } - if (bytes) copyToShmem16(tid%WARP_SIZE, dst, src, bytes); + copyToShmem16(tid-WARP_SIZE, dst, src, bytes); + } break; + default: + { int subtid = tid - 2*WARP_SIZE; + int subtn = tn - 2*WARP_SIZE; + loadWorkBatchToShmem(subtid, subtn, args, /*batchIx=*/blockIdx.x); + } break; } __syncthreads(); // publish ncclShmem - while (true) { - // Notify host that all fifo reads are complete. 
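
The warp-specialized staging used by ncclKernelMain (one warp copies the comm, another the channel, the remaining warps the first work batch, then a single __syncthreads publishes shared memory) can be sketched on its own. Everything below is illustrative: CommLike and ChannelLike are stand-ins for the real structs, and copy16 only hints at what copyToShmem16 does.

    #include <cstdio>
    #include <cuda_runtime.h>

    struct alignas(16) CommLike    { int rank, nRanks; char pad[56]; };   // 64 B stand-in
    struct alignas(16) ChannelLike { int ring[16]; };                     // 64 B stand-in

    // Each warp copies one structure in 16-byte packs, in the spirit of copyToShmem16.
    __device__ void copy16(int lane, void* dst, const void* src, int bytes) {
      for (int off = lane * 16; off < bytes; off += 32 * 16)
        *(ulong2*)((char*)dst + off) = *(const ulong2*)((const char*)src + off);
    }

    __global__ void stage(const CommLike* comm, const ChannelLike* channels,
                          int channelId, int* out) {
      __shared__ CommLike    shComm;
      __shared__ ChannelLike shChannel;
      int tid = threadIdx.x;
      switch (tid / 32) {
      case 0:  copy16(tid % 32, &shComm,    comm,                 sizeof(CommLike));    break;
      case 1:  copy16(tid % 32, &shChannel, &channels[channelId], sizeof(ChannelLike)); break;
      default: /* further warps would stage the first work batch here */                break;
      }
      __syncthreads();   // publish the shared structures to every warp
      if (tid == 0) *out = shComm.nRanks + shChannel.ring[0];
    }

    int main() {
      CommLike* comm; ChannelLike* chans; int* out;
      cudaMallocManaged(&comm,  sizeof(CommLike));
      cudaMallocManaged(&chans, 4 * sizeof(ChannelLike));
      cudaMallocManaged(&out,   sizeof(int));
      comm->nRanks = 8; chans[2].ring[0] = 100;
      stage<<<1, 96>>>(comm, chans, /*channelId=*/2, out);
      cudaDeviceSynchronize();
      printf("%d\n", *out);    // 108
      return 0;
    }
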
- if (tid == 0 && ncclShmem.work.header.isLast && ncclShmem.work.header.inFifo) { - *ncclShmem.channel.workFifoDone = ncclShmem.work.header.doneAcks; - } - - __syncwarp(); - if (ncclShmem.work.header.type == ncclWorkTypeColl) { - if (tid < NCCL_MAX_WORK_ELEMENTS) ncclRedopPtrDeref(&ncclShmem.work.elems[tid]); - } else if (ncclShmem.work.header.type == ncclWorkTypeRegColl) { - if (tid < NCCL_MAX_WORK_ELEMENTS_REG) ncclRedopPtrDeref(&ncclShmem.work.regElems[tid].elem); - } - __syncthreads(); + if (tid == 0 && ncclShmem.args.workStorageType == ncclDevWorkStorageTypeFifo) { + // ncclShmem.workConsumed written by loadWorkBatchToShmem before __syncthreads() + ncclShmem.comm.workConsumed[ncclShmem.channelId] = ncclShmem.workConsumed; + } - if (0 <= SpecializedFnId && ncclShmem.work.header.funcIndex == (unsigned)SpecializedFnId) { - SpecializedRunWork().run(&ncclShmem.work); + while (true) { + if (0 <= SpecializedFnId && ncclShmem.funcId == (unsigned)SpecializedFnId) { + SpecializedRunWorkBatch().run(); } else { - ncclDevFuncTable[ncclShmem.work.header.funcIndex](); + ncclDevFuncTable[ncclShmem.funcId](); } - int workIxNext = ncclShmem.work.header.workNext; + if (ncclShmem.nextBatchIx == -1) break; + int batchIx = ncclShmem.nextBatchIx; __syncthreads(); - if (ncclShmem.work.header.isLast) break; - - copyToShmem16(tid, &ncclShmem.work, workHead + workIxNext, sizeof(ncclWork)); + loadWorkBatchToShmem(tid, tn, args, batchIx); - { // Check whether the last operation was aborted and make sure all threads exit - int aborted = tid == 0 ? *comm->abortFlag : 0; - if (barrierReduceAny(aborted)) // publish ncclShmem.work - break; + // Check whether the last operation was aborted and make sure all threads exit + bool aborted = false; + if (tid == 0) aborted = *ncclShmem.comm.abortFlag; + aborted = barrier_red_or_aligned(aborted, 0); // publish ncclShmem.work + if (tid == 0 && ncclShmem.args.workStorageType == ncclDevWorkStorageTypeFifo) { + // ncclShmem.workConsumed written by loadWorkBatchToShmem before barrier_red_or() + ncclShmem.comm.workConsumed[ncclShmem.channelId] = ncclShmem.workConsumed; } + if (aborted) break; } } -__global__ void ncclDevKernel_Generic(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); +__global__ void ncclDevKernel_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K); __device__ void ncclDevFunc_Nop(); #define DEFINE_ncclDevKernel(suffix, coll, redop, ty, algo, proto, specializedFnId) \ - __global__ void ncclDevKernel_##suffix(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \ - ncclKernelMain, algo, proto>>(comm, channelMask, workHead); \ + __global__ void ncclDevKernel_##suffix(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { \ + ncclKernelMain, algo, proto>>(&args4K.args); \ } #define DEFINE_ncclDevFunc(suffix, coll, redop, ty, algo, proto) \ __device__ void ncclDevFunc_##suffix() { \ - RunWork, algo, proto>().run(&ncclShmem.work); \ + RunWorkBatch, algo, proto>().run(); \ } #endif diff --git a/src/device/generate.py b/src/device/generate.py index 43de85d61..d0feee10f 100755 --- a/src/device/generate.py +++ b/src/device/generate.py @@ -233,6 +233,8 @@ def validate(coll, redop, ty, algo, proto): out('#include "device.h"\n') out("\n") + out("extern int const ncclDevFuncIdCount = %d;\n" % len(primary_funcs)) + # The mapping from function rows to valid primary function ids. 
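
A toy version of the funcId dispatch shape used above: a __device__ table of function pointers serves the generic kernel, while a compile-time SpecializedFnId lets a per-function kernel call its one function directly so it can be inlined. funcA, funcB and the table are invented; only the dispatch structure mirrors the code.

    #include <cstdio>
    #include <cuda_runtime.h>

    typedef void (*FuncPtr)();

    __device__ void funcA() { if (threadIdx.x == 0) printf("funcA\n"); }
    __device__ void funcB() { if (threadIdx.x == 0) printf("funcB\n"); }
    __device__ FuncPtr const funcTable[] = { funcA, funcB };

    template<int SpecializedFnId>
    __global__ void dispatch(int funcId) {
      if (0 <= SpecializedFnId && funcId == SpecializedFnId) {
        // Direct, inlinable call for the specialized kernel.
        if (SpecializedFnId == 0) funcA(); else funcB();
      } else {
        funcTable[funcId]();   // generic indirect dispatch through the table
      }
    }

    int main() {
      dispatch<-1><<<1, 32>>>(1);   // generic kernel: indirect call -> funcB
      dispatch<0><<<1, 32>>>(0);    // specialized kernel: direct call -> funcA
      cudaDeviceSynchronize();
      return 0;
    }
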
out("extern int const ncclDevFuncRowToId[] = {\n") index = 0 @@ -251,7 +253,7 @@ def validate(coll, redop, ty, algo, proto): cudart, _ = required_cuda(*kfn) sym = paste("_", "ncclDevKernel", *kfn) if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart) - out("__global__ void %s(struct ncclDevComm*, uint64_t, struct ncclWork*);\n" % sym) + out("__global__ void %s(ncclDevKernelArgs4K const);\n" % sym) if cudart != 0: out("#endif\n") out("\n") diff --git a/src/device/network/unpack/unpack.h b/src/device/network/unpack/unpack.h index 3bc910047..b213fbe39 100644 --- a/src/device/network/unpack/unpack.h +++ b/src/device/network/unpack/unpack.h @@ -10,7 +10,7 @@ #include "unpack_defs.h" #include "op128.h" -#include "align.h" +#include "bitops.h" #include "device.h" #include "common.h" @@ -35,16 +35,16 @@ inline __device__ void ncclNetDeviceUnpackSetup(void* ohandle, const int group, struct unpackNetDeviceHandle* handle = (struct unpackNetDeviceHandle*) ohandle; ncclShmem.groups[group].devicePlugin.unpack.g_meta[index] = handle->meta; ncclShmem.devicePlugin.unpack.bounce_buf = handle->bounce_buf; - ncclShmem.groups[group].devicePlugin.unpack.head = handle->head; + ncclShmem.groups[group].devicePlugin.unpack.head[index] = handle->head; } -inline __device__ void ncclNetDeviceIncrementHead(const int group) { - ncclShmem.groups[group].devicePlugin.unpack.head++; +inline __device__ void ncclNetDeviceIncrementHead(const int group, const int index) { + ncclShmem.groups[group].devicePlugin.unpack.head[index]++; } -inline __device__ void ncclNetDeviceSaveHead(void* ohandle, const int group) { +inline __device__ void ncclNetDeviceSaveHead(void* ohandle, const int group, const int index) { struct unpackNetDeviceHandle* handle = (struct unpackNetDeviceHandle*) ohandle; - handle->head = ncclShmem.groups[group].devicePlugin.unpack.head; + handle->head = ncclShmem.groups[group].devicePlugin.unpack.head[index]; } template @@ -183,7 +183,7 @@ inline __device__ void ncclNetDeviceUnpack( // Pack data from the internal iovec to the supplied flat srcs buffer using all the threads // + Src is necessary in the case of accessing the user buffer directly ncclNetDeviceUnpackInner(tid, tidInBlock, nworkers, group /* in case they need to use split warps shared memory partitioning*/, - ix, ncclShmem.groups[group].srcs[ix + Src], workSize, ncclShmem.groups[group].devicePlugin.unpack.head); + ix, ncclShmem.groups[group].srcs[ix + Src], workSize, ncclShmem.groups[group].devicePlugin.unpack.head[ix]); } } diff --git a/src/device/network/unpack/unpack_defs.h b/src/device/network/unpack/unpack_defs.h index 9be1c5e42..ecbed01fe 100644 --- a/src/device/network/unpack/unpack_defs.h +++ b/src/device/network/unpack/unpack_defs.h @@ -54,7 +54,7 @@ struct unpackShmem { struct unpackGroupShmem { int unpackNetDeviceIndexMask; // We store a single unpackNetDeviceIndex because only one peer can be network recv - uint64_t head; + uint64_t head[NET_UNPACK_MAX_NPEERS]; struct netUnpackMeta* g_meta[NET_UNPACK_MAX_NPEERS]; // head of handle to index into meta for meta copy }; diff --git a/src/device/prims_ll.h b/src/device/prims_ll.h index 5f5969099..4a6f9e267 100644 --- a/src/device/prims_ll.h +++ b/src/device/prims_ll.h @@ -44,10 +44,11 @@ class Primitives: inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); } inline __device__ void barrier() { - if (nthreads == WARP_SIZE) + if (nthreads == WARP_SIZE) { __syncwarp(); - else - asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group)); + } else { + 
barrier_sync(15-group, nthreads); + } } uint32_t abort = 0; @@ -323,7 +324,8 @@ class Primitives: __device__ Primitives( const int tid, const int nthreads, int const *recvPeers, int const *sendPeers, void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0, - uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr, struct ncclWorkElemP2p* p2p = nullptr, int stepSize_=0 + uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr, + bool userBufReg=false, int stepSize_=0 ): redOp(redOpArg), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group), diff --git a/src/device/prims_ll128.h b/src/device/prims_ll128.h index 698eea68e..9c7169545 100644 --- a/src/device/prims_ll128.h +++ b/src/device/prims_ll128.h @@ -50,7 +50,7 @@ class Primitives: inline __device__ uint64_t sendFlag(int i) { return sendStep[i]+1; } inline __device__ void barrier() { - asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group)); + barrier_sync(15-group, nthreads); } uint32_t abort = 0; @@ -364,7 +364,8 @@ class Primitives: __device__ Primitives( const int tid, const int nthreads, int const *recvPeers, int const *sendPeers, void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0, - uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr, int stepSize_=0 + uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclDevWorkColl* e = nullptr, + bool userBufReg=false, int stepSize_=0 ): redOp(redOpArg), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), diff --git a/src/device/prims_simple.h b/src/device/prims_simple.h index 2431c2fdd..c02657038 100644 --- a/src/device/prims_simple.h +++ b/src/device/prims_simple.h @@ -23,7 +23,7 @@ class Primitives< ConnFifoEnabled = 0x100, DirectWrite = 0x200, DirectRead = 0x400, - ThreadsSynced = 0x800, + // 0x800 is free to use NvlsMinPolling = 0x1000, NetDeviceUnpack = 0x2000, AnyNetDeviceUnpack = 0x4000, @@ -44,53 +44,38 @@ class Primitives< uint64_t *connStepPtr; uint64_t connStepCache; // Cache last seen value of (*connStepPtr) int connStepSize; // Connection step size - void* mhandle; void* netDeviceHandle; // Don't use barrier 0 as it's used by the final sync __device__ void barrier() { - flags |= ThreadsSynced; if (nthreads == WARP_SIZE) __syncwarp(); else { int bar = 15-group; - asm volatile("bar.sync %0, %1;" :: "r"(bar), "r"(nthreads) : "memory"); + barrier_sync(bar, nthreads); } } __device__ void subBarrier() { if (nworkers == WARP_SIZE) __syncwarp(); else { - int bar = (nworkers==nthreads ? 15 : 8) - group; - asm volatile("bar.sync %0, %1;" :: "r"(bar), "r"(nworkers) : "memory"); + int bar = 15-group - (nworkers!=nthreads ? 1 : 0); + barrier_sync(bar, nworkers); } } __device__ bool barrierAny(int vote) { - flags |= ThreadsSynced; if (nthreads == WARP_SIZE) { return __any_sync(~0u, vote); } else { - int ans, bar = 15-group; - asm volatile( - "{ .reg .pred p;" - " setp.ne.s32 p, %1, 0;" - " bar.red.or.pred p, %2, %3, p; " - " selp.s32 %0, 1, 0, p; }" - : "=r"(ans) : "r"(vote), "r"(bar), "r"(nthreads) : "memory"); - return ans != 0; + int name = 15-group; + return barrier_red_or(vote, name, nthreads); } } __device__ bool subBarrierAny(int vote) { if (nworkers == WARP_SIZE) { return __any_sync(~0u, vote); } else { - int ans, bar = (nworkers==nthreads ? 
15 : 8) - group; - asm volatile( - "{ .reg .pred p;" - " setp.ne.s32 p, %1, 0;" - " bar.red.or.pred p, %2, %3, p; " - " selp.s32 %0, 1, 0, p; }" - : "=r"(ans) : "r"(vote), "r"(bar), "r"(nworkers) : "memory"); - return ans != 0; + int name = 15-group - (nworkers!=nthreads ? 1 : 0); + return barrier_red_or(vote, name, nworkers); } } @@ -164,8 +149,8 @@ class Primitives< else { ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize; } - if ((flags & (AnyNetDeviceUnpack)) && (flags & (Recv*RoleWaitRecv))) { - ncclNetDeviceIncrementHead(group); + if (flags & NetDeviceUnpack) { + ncclNetDeviceIncrementHead(group, index); } step += StepPerSlice; } @@ -436,7 +421,7 @@ class Primitives< } } - __device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) { + __device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, struct ncclDevWorkColl* e) { if (flags & (RoleWaitRecv|RolePostRecv)) { auto *conn = &peer->recv[connIndex]; if (conn->netDeviceHandle.netDeviceType == NCCL_NET_DEVICE_UNPACK) { @@ -488,7 +473,7 @@ class Primitives< } } - __device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) { + __device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, struct ncclDevWorkColl* e) { if (flags & (RoleWaitSend|RolePostSend)) { auto *conn = &peer->send[connIndex]; step = conn->step; @@ -538,13 +523,13 @@ class Primitives< __device__ Primitives( int tid, int nthreads, int const *recvPeers, int const *sendPeers, void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0, - uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclWorkElem* e = nullptr, struct ncclWorkElemP2p* p2p = nullptr, int stepSize_=0 + uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclDevWorkColl* e = nullptr,bool userBufReg=false, int stepSize_=0 ): tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group), stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) { // For send operations, we need an extra warp to overlap the threadfence and the copy - this->nworkers = nthreads - (MaxSend > 0 && nthreads-WARP_SIZE >= 64 ? WARP_SIZE : 0); + this->nworkers = nthreads - (MaxSend > 0 && nthreads >= NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE ? WARP_SIZE : 0); int nrecv=0, nsend=0; while (nrecv < MaxRecv && recvPeers[nrecv] != -1) nrecv++; @@ -572,7 +557,7 @@ class Primitives< loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e); loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e); - if (p2p && p2p->reg) flags |= UserBufferMode; + if (userBufReg) flags |= UserBufferMode; if (barrierAny(flags & NetDeviceUnpack)) { flags |= AnyNetDeviceUnpack; @@ -584,13 +569,12 @@ class Primitives< } } - setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e); + setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)e); } __device__ ~Primitives() { // Ensure ncclShmem.groups[].send/recvConns are available - if (!(flags & ThreadsSynced)) - barrier(); + barrier(); // Save steps for the next operation if (flags & (RolePostSend|RolePostRecv)) { auto *conns = (flags & RolePostSend) ? 
ncclShmem.groups[group].sendConns : ncclShmem.groups[group].recvConns; @@ -606,8 +590,8 @@ class Primitives< while (*ptr != -1) if (checkAbort(spins)) break; } - if ((flags & (AnyNetDeviceUnpack)) && (flags & (RoleWaitRecv))) { - ncclNetDeviceSaveHead(netDeviceHandle, group); + if (flags & NetDeviceUnpack) { + ncclNetDeviceSaveHead(netDeviceHandle, group, index); } // Make sure all threads are done writing back conn->step and done using @@ -615,7 +599,7 @@ class Primitives< barrier(); } - __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclWorkElemReg* e) { + __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclDevWorkCollReg* e) { if (tid==0) { ncclShmem.groups[group].userInput = (void*)inputBuf; ncclShmem.groups[group].userOutput = (void*)outputBuf; @@ -625,7 +609,7 @@ class Primitives< bool sendAcceptor = (flags == (flags|RoleWaitSend|DirectWrite)) || (flags == (flags|RoleWaitSend|NvlsDirectWrite)); bool sendProvider = flags == (flags|RoleWaitSend|DirectRead); // sender provides direct buffer (to be fetched) bool recvAcceptor = flags == (flags|RoleWaitRecv|DirectRead) || (flags == (flags|RoleWaitRecv|NvlsDirectRead)); // receiver accepts direct buffer - int regUsed = e != nullptr ? e->elem.regUsed : 0; + int regUsed = e != nullptr ? e->coll.regUsed : 0; if (Direct && recvProvider) { int spins = 0; diff --git a/src/device/reduce.h b/src/device/reduce.h index 43cae213b..91cdaeb25 100644 --- a/src/device/reduce.h +++ b/src/device/reduce.h @@ -10,22 +10,21 @@ namespace { template - __device__ __forceinline__ void runRing(ncclWorkElem *args) { - const int tid = threadIdx.x; - const int nthreads = (int)args->nWarps * WARP_SIZE; + __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { ncclRing *ring = &ncclShmem.channel.ring; const int nranks = ncclShmem.comm.nRanks; const int rank = ncclShmem.comm.rank; const int prevRank = ring->userRanks[nranks-1]; - const int root = args->root; - const size_t chunkCount = args->chunkCount; - const size_t channelCount = args->workCount; - const size_t gridOffset = args->workOffset; + const int root = work->root; + size_t chunkCount; + size_t channelCount; + size_t gridOffset; + ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount); size_t offset; int nelem; Primitives, 0, Proto, 0> - prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg); + prims(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg); if (prevRank == root) { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { @@ -52,23 +51,23 @@ namespace { } template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { using Proto = ProtoSimple; - runRing(args); + runRing(tid, nthreads, work); } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - runRing(args); +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { + runRing(tid, nthreads, work); } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - runRing(args); +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct 
ncclDevWorkColl* work) { + runRing(tid, nthreads, work); } }; diff --git a/src/device/reduce_kernel.h b/src/device/reduce_kernel.h index cbf774338..9e78da98a 100644 --- a/src/device/reduce_kernel.h +++ b/src/device/reduce_kernel.h @@ -37,6 +37,7 @@ template struct FuncSum { using EltType = T; __device__ FuncSum(uint64_t opArg=0) {}; }; template struct FuncProd { using EltType = T; __device__ FuncProd(uint64_t opArg=0) {}; }; + template struct FuncMinMax { using EltType = T; @@ -47,9 +48,30 @@ struct FuncMinMax { isMinNotMax = (opArg&1)==0; } }; + template struct FuncPreMulSum; template struct FuncSumPostDiv; +//////////////////////////////////////////////////////////////////////////////// +// Trait class for handling the reduction argument. + +template +struct RedOpArg { // default case: no argument + static constexpr bool ArgUsed = false; + __device__ static uint64_t loadArg(void *ptr) { return 0; } +}; + +template +struct RedOpArg> { + static constexpr bool ArgUsed = true; + __device__ static uint64_t loadArg(void *ptr) { + union { uint64_t u64; T val; }; + u64 = 0; + val = *(T*)ptr; + return u64; + } +}; + //////////////////////////////////////////////////////////////////////////////// // Trait classes for reduction functions. Given a function (FuncSum, etc.) // and a number of elements in a pack, will reduce, preOp, or postOp a pack @@ -356,6 +378,17 @@ struct Apply_PostOp { //////////////////////////////////////////////////////////////////////////////// // FuncPreMulSum +template +struct RedOpArg> { + static constexpr bool ArgUsed = true; + __device__ static uint64_t loadArg(void *ptr) { + union { uint64_t u64; T val; }; + u64 = 0; + val = *(T*)ptr; + return u64; + } +}; + // General definition for all integral types, float, and double. template struct FuncPreMulSum { @@ -486,6 +519,14 @@ struct Apply_PreOp, /*EltPerPack=*/1> { //////////////////////////////////////////////////////////////////////////////// // FuncSumPostDiv +template +struct RedOpArg> { + static constexpr bool ArgUsed = true; + __device__ static uint64_t loadArg(void *ptr) { + return *(uint64_t*)ptr; + } +}; + template::value> struct FuncSumPostDiv_IntOnly; @@ -658,7 +699,7 @@ struct Apply_LoadMultimem { static constexpr bool IsFloat = IsFloatingPoint::value; static constexpr int BigPackSize = IsFloat && IsSum && sizeof(T) < 8 ? 16 : - IsFloat && IsSum ? 8 : + IsFloat && IsSum ? sizeof(T) : IsFloat && IsMinMax && sizeof(T)==2 ? 16 : !IsFloat && (IsSum||IsMinMax) && sizeof(T)>=4 ? 
sizeof(T) : /*multimem.ld_reduce not supported:*/ 0; diff --git a/src/device/reduce_scatter.h b/src/device/reduce_scatter.h index d0b52494e..cf068ff55 100644 --- a/src/device/reduce_scatter.h +++ b/src/device/reduce_scatter.h @@ -10,23 +10,22 @@ namespace { template - __device__ __forceinline__ void runRing(ncclWorkElem *args) { - const int tid = threadIdx.x; - const uint32_t nthreads = (uint32_t)args->nWarps * WARP_SIZE; + __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { ncclRing *ring = &ncclShmem.channel.ring; int const *ringRanks = ring->userRanks; - const size_t chunkCount = args->chunkCount; const int nranks = ncclShmem.comm.nRanks; - size_t channelCount = args->workCount; - size_t gridOffset = args->workOffset; + size_t count; + size_t gridOffset; + size_t channelCount; + size_t chunkCount; + ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount); size_t offset; size_t dataOffset; - size_t count = args->count; uint32_t nelem; int rankDest; Primitives, 0, Proto, 0> - prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg); + prims(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { nelem = min(chunkCount, channelCount - elemOffset); @@ -54,56 +53,56 @@ namespace { } template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { using Proto = ProtoSimple; - runRing(args); + runRing(tid, nthreads, work); } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - runRing(args); +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { + runRing(tid, nthreads, work); } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - runRing(args); +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { + runRing(tid, nthreads, work); } }; template -struct RunWorkElement { - __device__ __forceinline__ void run(ncclWorkElem *args) { - const int tid = threadIdx.x; +struct RunWorkColl { + __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) { struct ncclNvls* nvls = &ncclShmem.channel.nvls; - const size_t chunkCount = args->chunkCount; - const size_t count = args->count; + size_t count; + size_t gridOffset; + size_t channelCount; + size_t chunkCount; + ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount); const int rank = ncclShmem.comm.rank; const int nranks = ncclShmem.comm.nRanks; - size_t gridOffset = args->workOffset; - size_t channelCount = args->workCount; size_t offset; int nelem; /* if we are direct NVLS, we only need to allocate 1 warp to scatter for sync; * if not, based on #ranks, we allocate 7 or 5 warps to reduce to saturate bandwidth * and the rest are allocated to scatter. */ - const int nThreadsReduce = args->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : (nranks <= 6 ? 7 * WARP_SIZE : 5 * WARP_SIZE); - const int nThreadsScatter = args->regUsed ? WARP_SIZE : (NCCL_MAX_NTHREADS - nThreadsReduce); + const int nThreadsReduce = work->regUsed ? 
(NCCL_MAX_NTHREADS - WARP_SIZE) : (nranks <= 6 ? 7 * WARP_SIZE : 5 * WARP_SIZE); + const int nThreadsScatter = work->regUsed ? WARP_SIZE : (NCCL_MAX_NTHREADS - nThreadsReduce); const int tidEndScatter = nThreadsScatter; const int tidEndReduce = tidEndScatter + nThreadsReduce; - if (!args->regUsed) { + if (!work->regUsed) { if (tid < tidEndScatter) { // Scatter using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> - prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL, - args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); + prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL, + work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); @@ -113,8 +112,8 @@ struct RunWorkElement; Primitives, /*Direct=*/0, Proto, 0> - prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff, - args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0); + prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, work->recvbuff, + work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); @@ -127,7 +126,7 @@ struct RunWorkElement; Primitives, /*Direct=*/0, Proto, 0> prims(tid, nThreadsScatter, nvls->up, nvls->up, NULL, NULL, - args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); + work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { prims.scatter(0, 0, 0, 0, -1, 0); } @@ -138,8 +137,8 @@ struct RunWorkElement; Primitives, /*Direct=*/1, Proto, 0> - prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, &nvls->down, NULL, args->recvbuff, - args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args); + prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, &nvls->down, NULL, work->recvbuff, + work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work); for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { size_t outOffset = gridOffset + elemOffset; size_t inpOffset = outOffset + rank * count; @@ -155,10 +154,10 @@ struct RunWorkElement -struct RunWorkElement { +struct RunWorkColl { template struct Scatterer { - struct ncclWorkElem* args; + struct ncclDevWorkColl* work; int chunkSize; ssize_t railGridOffset; @@ -173,11 +172,11 @@ struct RunWorkElementnHeads; - int bid = args->bid; - void* inbuf = (void*)args->sendbuff; - ssize_t sizePerRank = args->count; + int part = ncclShmem.channelId - work->channelLo; + void* inbuf = (void*)work->sendbuff; + ssize_t sizePerRank = work->collnet.count; - ssize_t railAllBeg = min(railGridOffset + bid*chunkSize, nNodes*sizePerRank); + ssize_t railAllBeg = min(railGridOffset + part*chunkSize, nNodes*sizePerRank); ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank); int railAllSize = railAllEnd - railAllBeg; if (tid < nDsts) dstSizes[tid] = railAllSize; @@ -204,7 +203,7 @@ struct RunWorkElement - (tid, tn, args->redOpArg, &args->redOpArg, false, + (tid, tn, work->redOpArg, &work->redOpArg, false, /*nSrcs=*/1+nSrcs, [=]__device__(int s) { return s==0 ? 
(T*)inbuf + userOneBeg : (T*)srcPtrs[s-1] + railAllOffset; @@ -223,23 +222,23 @@ struct RunWorkElementnChannels; + __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { + const int part = ncclShmem.channelId - work->channelLo; + const int nChannels = work->channelHi - work->channelLo + 1; struct ncclDirect* direct = &ncclShmem.channel.collnetDirect; int const &nNodes = ncclShmem.comm.nNodes; - ssize_t chunkSize = int(args->chunkCount); - ssize_t sizePerRank = args->count; + ssize_t chunkSize = int(work->collnet.chunkCount); + ssize_t sizePerRank = work->collnet.count; if (direct->out == -1) __trap(); bool isMultiRail = (direct->nHeads > 1); int nWarps1 = (isMultiRail ? 2 : 0); int nWarps2 = (isMultiRail ? 2 : 1); int nWarps3 = 1; - float denom = float(args->nWarps)/float(nWarps1+nWarps2+nWarps3); + float denom = float(work->nWarps)/float(nWarps1+nWarps2+nWarps3); nWarps3 = int(denom*nWarps3); nWarps2 = int(denom*nWarps2); - nWarps1 = args->nWarps - (nWarps2+nWarps3); + nWarps1 = work->nWarps - (nWarps2+nWarps3); using Proto = ProtoSimple<1, 1>; @@ -248,13 +247,13 @@ struct RunWorkElement, /*Direct=*/0, Proto, 0> prims(tid, tn, nullptr, direct->heads+1, nullptr, nullptr, - args->redOpArg, 0*Proto::MaxGroupWidth, 1, 1); + work->redOpArg, 0*Proto::MaxGroupWidth, 1, 1); for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) { Scatterer scat; - scat.args = args; + scat.work = work; scat.chunkSize = chunkSize; scat.railGridOffset = railGridOffset; - prims.process(scat); + prims.template process(scat); } return; } @@ -262,7 +261,7 @@ struct RunWorkElementregUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { if (tid == 0) { int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE); Primitives, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps); @@ -272,13 +271,13 @@ struct RunWorkElement send to network Primitives, /*Direct=*/0, Proto, 0> prims(tid, tn, direct->heads + 1, &direct->out, nullptr, nullptr, - args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1); + work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1); for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) { Scatterer scat; - scat.args = args; + scat.work = work; scat.chunkSize = chunkSize; scat.railGridOffset = railGridOffset; - prims.process(scat); + prims.template process(scat); } } return; @@ -287,7 +286,7 @@ struct RunWorkElementregUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { if (tid == 0) { int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE); Primitives, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps); @@ -296,10 +295,10 @@ struct RunWorkElement, /*Direct=*/0, Proto, 0> - prims(tid, tn, &direct->out, nullptr, nullptr, args->recvbuff, - args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0); + prims(tid, tn, &direct->out, nullptr, nullptr, work->recvbuff, + work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0); for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) { - ssize_t railAllBeg = railGridOffset + args->bid * chunkSize; + ssize_t railAllBeg = railGridOffset + part * chunkSize; ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * sizePerRank); ssize_t railOneBeg = ncclShmem.comm.node * sizePerRank; ssize_t railOneEnd = railOneBeg + sizePerRank; diff --git a/src/device/sendrecv.h 
b/src/device/sendrecv.h index 347ac78c5..7774202a1 100644 --- a/src/device/sendrecv.h +++ b/src/device/sendrecv.h @@ -9,83 +9,159 @@ #include "primitives.h" template -struct RunWork { +struct RunWorkBatch { + static_assert(sizeof(T)==1, "SendRecv only works on single byte types T."); + template - __device__ void runSend(const int tid, const int nthreads, const uint8_t group, struct ncclWorkElemP2p* args) { - void* buff = reinterpret_cast(uintptr_t(args->buffHi32)<<32 | args->buffLo32); - ssize_t count = reinterpret_cast(size_t(args->countHi32)<<32 | args->countLo32); - if (args->peer == ncclShmem.comm.rank) { - struct ncclWorkElemP2p* recvArgs = args-1; - void* recvBuff = reinterpret_cast(uintptr_t(recvArgs->buffHi32)<<32 | recvArgs->buffLo32); - if (buff != recvBuff) { - reduceCopy - (tid, nthreads, 0, nullptr, false, 1, &buff, 1, &recvBuff, count); - } - } else { - int chunkSize = args->chunkSize/sizeof(T); - if (args->proto == NCCL_PROTO_LL) chunkSize /= 2; - int const peer = args->peer; - Primitives, 1, Proto, 1> prims - (tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, args, ncclShmem.comm.p2pChunkSize/sizeof(T)); - size_t offset = 0; - do { - int nelem = min(size_t(chunkSize), count-offset); - prims.directSend(offset, offset, nelem); - offset += nelem; - } while(offset < count && args->reg == 0); - } + __device__ void runSend(int tid, int tn, int group, struct ncclDevWorkP2p* work) { + size_t bytes = work->sendBytes; + int chunkSize = u32fp8Decode(work->sendChunkSize_u32fp8); + Primitives, 1, Proto, 1> + prims(tid, tn, nullptr, &work->sendRank, work->sendAddr, nullptr, + /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, + /*userBufferMode=*/work->sendRegistered, ncclShmem.comm.p2pChunkSize); + size_t cursor = 0; + do { + int n = min(size_t(chunkSize), bytes-cursor); + prims.directSend(cursor, cursor, n); + cursor += n; + } while (cursor < bytes && work->sendRegistered == 0); } template - __device__ void runRecv(const int tid, const int nthreads, const uint8_t group, struct ncclWorkElemP2p* args) { - if (args->peer != ncclShmem.comm.rank) { - void* buff = reinterpret_cast(uintptr_t(args->buffHi32)<<32 | args->buffLo32); - ssize_t count = reinterpret_cast(size_t(args->countHi32)<<32 | args->countLo32); - int chunkSize = args->chunkSize/sizeof(T); - if (args->proto == NCCL_PROTO_LL) chunkSize /= 2; // This is to account for chunkEffectiveSize - int const peer = args->peer; - Primitives, 1, Proto, 1> prims - (tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, args, ncclShmem.comm.p2pChunkSize/sizeof(T)); - size_t offset = 0; - do { - int nelem = min(size_t(chunkSize), count-offset); - prims.directRecv(offset, nelem); - offset += nelem; - } while(offset < count && args->reg == 0); - } + __device__ void runRecv(int tid, int tn, int group, struct ncclDevWorkP2p* work) { + size_t bytes = work->recvBytes; + int chunkSize = u32fp8Decode(work->recvChunkSize_u32fp8); + Primitives, 1, Proto, 1> + prims(tid, tn, &work->recvRank, nullptr, nullptr, work->recvAddr, + /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, + /*userBufferMode=*/work->recvRegistered, ncclShmem.comm.p2pChunkSize); + size_t cursor = 0; + do { + int n = min(size_t(chunkSize), bytes-cursor); + prims.directRecv(cursor, n); + cursor += n; + } while (cursor < bytes && work->recvRegistered == 0); } - __device__ __forceinline__ void run(ncclWork *work) { - struct ncclWorkElemP2p* args = work->p2pElems; - int ngroups = args->ngroups; - int tid = 
threadIdx.x; - int wid = tid / WARP_SIZE; - // This has to work even for groups of 2.5 warps (which is 8 groups, and means 3 - // warps for send, 2 warps for recv). - // warpStarts were rounded thanks to int division, but for group number we need to round the other way around - // So we mirror wid then mirror again the group. - #define NWARPS (NCCL_MAX_NTHREADS/WARP_SIZE) - uint8_t group = ngroups-1- (NWARPS-1-wid) * ngroups / NWARPS; - args += group; - tid -= args->warpStart * WARP_SIZE; - int nthreads = args->nWarps * WARP_SIZE; - - if (args->p2pType == ncclWorkP2pTypeUnused) return; - if (tid >= nthreads || args->peer == -1) return; - - // Select Proto here - // This is to allow the same kernel to run multiple primitives on different warps (thread groups) - if ((group%2) == 0) { - if (args->proto == NCCL_PROTO_LL) { - runRecv(tid, nthreads, group, args); + __device__ __forceinline__ void run() { + const int tid = threadIdx.x; + const int tn = blockDim.x; + const int wid = tid/WARP_SIZE; + const int nWarps = tn/WARP_SIZE; + const int lane = tid%WARP_SIZE; + + struct Shared { + uint32_t workSendMask; // bitmasks of which work indices have send/recv + uint32_t workRecvMask; + }; + Shared* shared = (Shared*)ncclScratchForWarp(0); + + struct ncclDevWorkP2p* works = (ncclDevWorkP2p*)ncclShmem.workStorage; + int nWorks = ncclShmem.nWorks; + + if (wid == 0) { + // Modify the memory range of each work[] to reflect this channel's + // partition of the work. Since integer divides are very heavy it's + // best to do them all in one warp. + int workIx = lane%16; + int isSend = lane < 16 ? 0 : 1; + bool hasWork = false; + if (workIx < nWorks) { + struct ncclDevWorkP2p* work = &works[workIx]; + size_t bytes = isSend ? work->sendBytes : work->recvBytes; + int nParts = isSend ? work->nSendChannels : work->nRecvChannels; + int part = ncclP2pChannelToPart(work->nP2pChannels, work->channelBase, ncclShmem.channelId); + hasWork = (part < nParts); + if (nParts != 0) { + size_t partBeg, partEnd; + ncclP2pPartBounds(nParts, part, bytes, &partBeg, &partEnd); + (isSend ? work->sendAddr : work->recvAddr) = (char*)(isSend ? work->sendAddr : work->recvAddr) + partBeg; + (isSend ? work->sendBytes : work->recvBytes) = partEnd - partBeg; + } + } + uint32_t mask = __ballot_sync(~0u, hasWork); + if (lane == 0) { + shared->workSendMask = mask>>16; + shared->workRecvMask = mask & 0xffff; + } + } + + // The fastest way to compute a warp uniform division x/y in [0,32) is to + // use each lane to guess a solution and count the ones that don't exceed + // the numerator: + // __popc(__ballot_sync(~0u, y*(lane+1) <= x)) + // That takes 1/3 the time of standard division and about 3/4 the time of + // approximate floating point division: + // __float2int_rd(__fdividef(float(x),float(y))). + + // nWarpPerWork = nWarps/nWorks + int nWarpPerWork = __popc(__ballot_sync(~0u, nWorks*(lane+1) <= nWarps)); + int nRecvWarpPerWork = nWarpPerWork<=4 ? nWarpPerWork/2 : (nWarpPerWork-1)/2; + int nSendWarpPerWork = nWarpPerWork<=4 ? nRecvWarpPerWork : nRecvWarpPerWork+1; + // This might reduce nWarpPerWork which is probably desirable. It is better + // to have a balanced number of reading and writing threads even if that + // leaves warps unused. 
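The ballot trick above is worth a concrete illustration. A minimal standalone sketch (editorial, not from the patch), assuming all 32 lanes are active, x and y are warp-uniform and non-negative, and the quotient is known to be below 32:

__device__ __forceinline__ int warpUniformDivLt32(int x, int y) {
  int lane = threadIdx.x % 32;
  // Lane i tests candidate quotient i+1; the number of lanes that pass is x/y.
  return __popc(__ballot_sync(~0u, y*(lane+1) <= x));
}
// Example: x=10, y=3 -> lanes 0..2 pass (3, 6, 9 <= 10), lane 3 fails (12 > 10), result 3.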
+ nWarpPerWork = nSendWarpPerWork + nRecvWarpPerWork; + // The work index this warp belongs to: workIx = wid/nWarpPerWork + int workIx = __popc(__ballot_sync(~0u, (lane+1)*nWarpPerWork <= wid)); + + __syncthreads(); // Wait for works[] and shared->* to be updated by warp=0 + + uint32_t workSendMask = shared->workSendMask; + uint32_t workRecvMask = shared->workRecvMask; + + __syncthreads(); // release scratch space used by shared->* + if (nWorks <= workIx) return; + + // Thread range for whole work (send & recv combined) + int subtid = tid - workIx*nWarpPerWork*WARP_SIZE; + int subtn = nWarpPerWork*WARP_SIZE; + + // A send primtive of sufficient size requires 2 cuda barrier ids. + constexpr int nSendWarpsForExtraGroup = NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE/WARP_SIZE; + // Count up all group ids used below this workIx: + int group, extra; + // Each recv gets one group id: + group = __popc(workRecvMask & ((1<= nSendWarpsForExtraGroup) ? 1 : 0; + group += __popc((workSendMask & workRecvMask) & ((1<= nSendWarpsForExtraGroup) ? 1 : 0; + group += __popc((workSendMask & ~workRecvMask) & ((1<>workIx); + bool hasRecv = 1 & (workRecvMask>>workIx); + bool isCopy = work->sendRank == ncclShmem.comm.rank; + bool isSend = !hasRecv || (hasSend && subtid < nSendWarpPerWork*WARP_SIZE); + + if (!isCopy && hasSend && hasRecv) { + // Translate thread ids to reflect just this send or recv as opposed to whole work. + if (isSend) { + subtn = nSendWarpPerWork*WARP_SIZE; + } else { + subtid -= nSendWarpPerWork*WARP_SIZE; + subtn = nRecvWarpPerWork*WARP_SIZE; + group += 1 + (nSendWarpPerWork >= nSendWarpsForExtraGroup ? 1 : 0); + } + } + + if (isCopy) { + reduceCopy + (subtid, subtn, 0, nullptr, false, 1, &work->sendAddr, 1, &work->recvAddr, (ssize_t)work->sendBytes); + } else if (isSend) { + if (work->sendProtoLL) { + runSend(subtid, subtn, group, work); } else { - runRecv>(tid, nthreads, group, args); + runSend>(subtid, subtn, group, work); } } else { - if (args->proto == NCCL_PROTO_LL) { - runSend(tid, nthreads, group, args); + if (work->recvProtoLL) { + runRecv(subtid, subtn, group, work); } else { - runSend>(tid, nthreads, group, args); + runRecv>(subtid, subtn, group, work); } } } diff --git a/src/enqueue.cc b/src/enqueue.cc index af57f1be4..0e07e3f25 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -12,25 +12,12 @@ #include "channel.h" #include "cudawrap.h" #include "transport.h" -#include + #include // std::memcpy #include // PRIx64 NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0); -static ncclResult_t initCollWorkElem(struct ncclInfo* collInfo, struct ncclWorkElem* work); -static ncclResult_t setCollWorkElem(uint64_t workCount, uint64_t workOffset, size_t lastChunkCount, struct ncclWorkElem* work); -static ncclResult_t initCollWorkElemReg(struct ncclComm* comm, struct ncclWorkElem* work, struct ncclChannel* channel, ncclRegBufferType regBufType, void* regBufSend[], void* regBufRecv[], struct ncclWorkElemReg* workElemReg); -static ncclResult_t computeCollChunkInfo(struct ncclInfo* collInfo, size_t nBytes, int nChannels); -static ncclResult_t initCollProxyOp(struct ncclInfo* collInfo, int channelId, uint64_t opCount, uint32_t nsteps, struct ncclProxyOp* proxyOp); -static ncclResult_t getTunerInfo(struct ncclInfo* collInfo, int collNetSupport, int nvlsSupport, int numPipeOps); -static ncclResult_t topoGetAlgoInfo(struct ncclInfo* collInfo, int collNetSupport, int nvlsSupport, int numPipeOps); -static ncclResult_t getChannnelThreadInfo(struct ncclInfo* collInfo); -static ncclResult_t 
computeCollWorkFunc(struct ncclInfo* collInfo); -static ncclResult_t getPatternInfo(struct ncclInfo* collInfo); -static ncclResult_t getLoopInfo(struct ncclInfo* collInfo); -static ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetSupport); - // Returns maximum kernel stack size of all CUDA kernels ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) { ncclResult_t result = ncclSuccess; @@ -64,114 +51,30 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) { return result; } -/*****************************************************************************/ -/* Launch system : synchronization and CUDA kernel launch */ -/*****************************************************************************/ +//////////////////////////////////////////////////////////////////////////////// +// Data movement metrics. -static void appendWorkElemColl( - struct ncclComm* comm, struct ncclKernelPlan* plan, int channelId, - int funcIndex, struct ncclWorkElem const *elem) { - struct ncclKernelPlan::Channel* chan = &plan->channels[channelId]; - struct ncclWorkList* q = ncclIntruQueueTail(&chan->workQueue); - if (q && funcIndex == q->work.header.funcIndex - && elem->nWarps == q->work.elems[0].nWarps - && chan->nWorkElem < NCCL_MAX_WORK_ELEMENTS - && ncclWorkTypeColl == q->work.header.type) { - int e = chan->nWorkElem++; - q->work.elems[e] = *elem; // C++ struct assignment - return; - } - q = ncclMemoryStackAlloc(&comm->memScoped); - q->work.header.type = ncclWorkTypeColl; - q->work.header.funcIndex = funcIndex; - q->work.elems[0] = *elem; // C++ struct assignment - chan->nWorkElem = 1; - chan->nWork += 1; - ncclIntruQueueEnqueue(&chan->workQueue, q); +static inline int ncclFuncTrafficPerByte(ncclFunc_t func, int nRanks) { + switch (func) { + case ncclFuncAllReduce: return 2; + case ncclFuncAllGather: return nRanks; + case ncclFuncReduceScatter: return nRanks; + default: return 1; + } } - -static void appendWorkElemColl( - struct ncclComm* comm, struct ncclKernelPlan* plan, int channelId, - int funcIndex, struct ncclWorkElemReg const *elem) { - struct ncclKernelPlan::Channel* chan = &plan->channels[channelId]; - struct ncclWorkList* q = ncclIntruQueueTail(&chan->workQueue); - if (q && funcIndex == q->work.header.funcIndex - && elem->elem.nWarps == q->work.regElems[0].elem.nWarps - && chan->nWorkElem < NCCL_MAX_WORK_ELEMENTS_REG - && ncclWorkTypeRegColl == q->work.header.type) { - int e = chan->nWorkElem++; - q->work.regElems[e] = *elem; // C++ struct assignment - q->work.regElems[e].elem.isUsed = 1; - return; - } - q = ncclMemoryStackAlloc(&comm->memScoped); - q->work.header.type = ncclWorkTypeRegColl; - q->work.header.funcIndex = funcIndex; - q->work.regElems[0] = *elem; // C++ struct assignment - q->work.regElems[0].elem.isUsed = 1; - chan->nWorkElem = 1; - chan->nWork += 1; - ncclIntruQueueEnqueue(&chan->workQueue, q); +static inline size_t ncclFuncSendCount(ncclFunc_t func, int nRanks, size_t count) { + return func == ncclFuncReduceScatter ? 
nRanks*count : count; } - -static void finishWorkP2p(struct ncclWork* work) { - int nElem = 0; - for (int e=0; e < NCCL_MAX_WORK_ELEMENTS_P2P; e++) { - if (work->p2pElems[e].p2pType != ncclWorkP2pTypeUnused) - nElem = e+1; - } - int nGroup = 1; - while (nGroup < nElem) nGroup *= 2; - int nWarp = 1; - while (nWarp*nGroup <= (NCCL_MAX_NTHREADS/WARP_SIZE)/2) nWarp *= 2; - for (int i=0; i < nGroup; i++) { - work->p2pElems[i].ngroups = nGroup; - work->p2pElems[i].warpStart = i*(NCCL_MAX_NTHREADS/WARP_SIZE)/nGroup; - int extraWarp = nWarp >= 2 ? i%2 : 0; - work->p2pElems[i].nWarps = nWarp + extraWarp; - } +static inline size_t ncclFuncRecvCount(ncclFunc_t func, int nRanks, size_t count) { + return func == ncclFuncAllGather ? nRanks*count : count; } - -static void finishWork(struct ncclWork* work) { - if (work->header.type == ncclWorkTypeP2p) { - finishWorkP2p(work); - } +static inline size_t ncclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count) { + return func == ncclFuncAllGather || func == ncclFuncReduceScatter ? nRanks*count : count; } -static void appendWorkElemP2p( - struct ncclComm* comm, struct ncclKernelPlan* plan, int channelId, - struct ncclWorkElemP2p const *elem, bool fuseOk - ) { - int funcIndex = ncclDevFuncId_P2p(); - struct ncclKernelPlan::Channel* chan = &plan->channels[channelId]; - struct ncclWorkList* q = ncclIntruQueueTail(&chan->workQueue); - if (q && funcIndex == q->work.header.funcIndex) { - if (!fuseOk) goto NewWork; - if (chan->p2pTailElem[elem->p2pType-1] < NCCL_MAX_WORK_ELEMENTS_P2P) { - for (int e = -2 + chan->p2pTailElem[elem->p2pType-1]; e >= 0; e -= 2) { - // Can't have multiple elements of the same ncclWork communicate with the - // same peer otherwise they would attempt to use that connection concurrently. - if (q->work.p2pElems[e].peer == elem->peer) - goto NewWork; - } - int e = chan->p2pTailElem[elem->p2pType-1]; - q->work.p2pElems[e] = *elem; // C++ struct assignment - chan->p2pTailElem[elem->p2pType-1] += 2; - return; - } - NewWork: - finishWorkP2p(&q->work); - } - q = ncclMemoryStackAlloc(&comm->memScoped); - q->work.header.type = ncclWorkTypeP2p; - q->work.header.funcIndex = ncclDevFuncId_P2p(); - chan->p2pTailElem[ncclWorkP2pTypeRecv-1] = 0; - chan->p2pTailElem[ncclWorkP2pTypeSend-1] = 1; - q->work.p2pElems[chan->p2pTailElem[elem->p2pType-1]] = *elem; // C++ struct assignment - chan->p2pTailElem[elem->p2pType-1] += 2; - chan->nWork += 1; - ncclIntruQueueEnqueue(&chan->workQueue, q); -} +/*****************************************************************************/ +/* Launch system : synchronization and CUDA kernel launch */ +/*****************************************************************************/ static ncclResult_t addProxyOpIfNeeded(struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclProxyOp* op) { bool needed = true; @@ -179,459 +82,212 @@ static ncclResult_t addProxyOpIfNeeded(struct ncclComm* comm, struct ncclKernelP if (needed) { struct ncclProxyOp* q = ncclMemoryPoolAlloc(&comm->memPool_ncclProxyOp, &comm->memPermanent); *q = *op; // C++ struct assignment - ncclIntruQueueEnqueue(&plan->channels[op->channelId].proxyOpQueue, q); + ncclIntruQueueEnqueue(&comm->planner.wipPlan.channels[op->channelId].proxyOpQueue, q); } return ncclSuccess; } -static ncclResult_t computeCollSteps(struct ncclInfo* collInfo, size_t workCount, uint32_t* steps) { - struct ncclComm* comm = collInfo->comm; - if (collInfo->coll == ncclFuncAllReduce) { - if (collInfo->algorithm == NCCL_ALGO_RING) - *steps = DIVUP(workCount, comm->nRanks * 
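To make the data-movement helpers above concrete, here is a small editorial example (not from the patch; datatype, count and nRanks stand for the obvious inputs) for an AllGather where count is the per-rank element count:

size_t elementSize = ncclTypeSize(datatype);                                            // e.g. 4 for ncclFloat
size_t sendBytes = elementSize*ncclFuncSendCount(ncclFuncAllGather, nRanks, count);     // count elements sent
size_t recvBytes = elementSize*ncclFuncRecvCount(ncclFuncAllGather, nRanks, count);     // nRanks*count elements received
size_t traffic   = elementSize*count*ncclFuncTrafficPerByte(ncclFuncAllGather, nRanks); // nRanks bytes of traffic per input byte
// For ncclFuncAllReduce the same computation yields traffic == 2*count*elementSize.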
collInfo->chunkCount) * (comm->nRanks - 1) * 2 * collInfo->chunkSteps; - else if (collInfo->algorithm == NCCL_ALGO_COLLNET_DIRECT) - *steps = DIVUP(workCount, comm->channels[0].collnetDirect.nHeads * collInfo->chunkCount) * collInfo->chunkSteps; - else if (collInfo->algorithm == NCCL_ALGO_NVLS || collInfo->algorithm == NCCL_ALGO_NVLS_TREE) - *steps = DIVUP(workCount, comm->channels[0].nvls.nHeads * collInfo->chunkCount) * collInfo->chunkSteps; - else - *steps = DIVUP(workCount, collInfo->chunkCount) * collInfo->chunkSteps; - } else if (collInfo->coll == ncclFuncReduceScatter) { - if (collInfo->algorithm == NCCL_ALGO_RING) - *steps = DIVUP(workCount, collInfo->chunkCount) * (comm->nRanks - 1) * collInfo->chunkSteps; - else - *steps = DIVUP(workCount, collInfo->chunkCount) * collInfo->chunkSteps; - } else if (collInfo->coll == ncclFuncAllGather) { - if (collInfo->algorithm == NCCL_ALGO_RING) - *steps = DIVUP(workCount, collInfo->chunkCount) * (comm->nRanks - 1) * collInfo->chunkSteps; - else - *steps = DIVUP(workCount, collInfo->chunkCount) * collInfo->chunkSteps; - } else { - *steps = DIVUP(workCount, collInfo->chunkCount) * collInfo->chunkSteps; - } - return ncclSuccess; -} - -static ncclResult_t computeCollAlignCount(struct ncclInfo* collInfo, size_t* alignCount) { - if (collInfo->protocol == NCCL_PROTO_SIMPLE) { - *alignCount = NCCL_SIMPLE_ALIGNMENT / ncclTypeSize(collInfo->datatype); - } else if (collInfo->protocol == NCCL_PROTO_LL128) { - *alignCount = NCCL_LL128_ALIGNMENT_PER_WARP / ncclTypeSize(collInfo->datatype) * (collInfo->nThreads / WARP_SIZE); - } else { - *alignCount = NCCL_LL_ALIGNMENT_PER_THREAD / ncclTypeSize(collInfo->datatype) * collInfo->nThreads; - } - return ncclSuccess; -} - -static ncclResult_t computeCollLastChunkInfo(struct ncclInfo* collInfo, size_t workCount, size_t alignCount, size_t* lastChunkCount) { - struct ncclComm* comm = collInfo->comm; - - if (collInfo->coll == ncclFuncAllReduce) { - if (collInfo->algorithm == NCCL_ALGO_RING) { - size_t remCount = workCount % (comm->nRanks * collInfo->chunkCount); - *lastChunkCount = DIVUP(DIVUP(remCount, comm->nRanks), alignCount) * alignCount; - } else if (collInfo->algorithm == NCCL_ALGO_NVLS || collInfo->algorithm == NCCL_ALGO_NVLS_TREE) { - size_t remCount = workCount % (comm->channels[0].nvls.nHeads * collInfo->chunkCount); - *lastChunkCount = DIVUP(DIVUP(remCount, comm->channels[0].nvls.nHeads), alignCount) * alignCount; - } else if (collInfo->algorithm == NCCL_ALGO_COLLNET_DIRECT) { - size_t remCount = workCount % (comm->channels[0].collnetDirect.nHeads * collInfo->chunkCount); - *lastChunkCount = DIVUP(DIVUP(remCount, comm->channels[0].collnetDirect.nHeads), alignCount) * alignCount; - } else { - *lastChunkCount = collInfo->chunkCount; - } - } else { - *lastChunkCount = collInfo->chunkCount; - } - return ncclSuccess; -} - -static ncclResult_t getCollnetLoopInfo(struct ncclInfo* collInfo, int* nstepsPerLoop, int* nchunksPerLoop) { - switch (collInfo->pattern) { - case ncclPatternCollnetChain: - *nstepsPerLoop = *nchunksPerLoop = 1; break; - case ncclPatternNvls: - *nstepsPerLoop = 1; *nchunksPerLoop = collInfo->comm->channels[0].nvls.nHeads; break; - case ncclPatternCollnetDirect: - *nstepsPerLoop = 1; *nchunksPerLoop = collInfo->comm->channels[0].collnetDirect.nHeads; break; - default: - WARN("Unknown collnet pattern %d", collInfo->pattern); - return ncclInternalError; - } - return ncclSuccess; -} - -static ncclResult_t addCollnetCollToPlan( - struct ncclComm* comm, struct ncclKernelPlan* plan, int 
usableChannels, - struct ncclInfo* collInfo, int* nWorkBudget - ) { - ncclResult_t ret = ncclSuccess; - struct ncclKernelPlan::Channel *chans = plan->channels; - struct ncclWorkElem workElem; - uint64_t opCount = uint64_t(plan->collOpCount++) << 1 | 0; - ncclRegBufferType regBufType = collInfo->regBufType; - int nChannels = std::min(collInfo->nChannels, usableChannels); - size_t countPerChannel = DIVUP(collInfo->count, nChannels); - uint32_t typeSize = ncclTypeSize(collInfo->datatype); - int steps, nchunksPerLoop, nstepsPerLoop, nLoop; - - NCCLCHECK(computeCollChunkInfo(collInfo, collInfo->nBytes, collInfo->nChannels)); - NCCLCHECKGOTO(initCollWorkElem(collInfo, &workElem), ret, fail); - workElem.nChannels = nChannels; - - NCCLCHECKGOTO(getCollnetLoopInfo(collInfo, &nstepsPerLoop, &nchunksPerLoop), ret, fail); - nLoop = (int)DIVUP(collInfo->nBytes, (size_t)nChannels * nchunksPerLoop * collInfo->chunkSize); - steps = nstepsPerLoop * nLoop * collInfo->chunkSteps; - - for (int bid = 0; bid < nChannels; bid++) { - workElem.bid = bid; - // Add work elem - *nWorkBudget += chans[bid].nWork; - if (regBufType == NCCL_REGULAR_BUFFER) { - appendWorkElemColl(comm, plan, bid, collInfo->workFuncIndex, &workElem); - } else { - struct ncclWorkElemReg workElemReg; - NCCLCHECKGOTO(initCollWorkElemReg(comm, &workElem, &comm->channels[bid], regBufType, collInfo->regBufSend, collInfo->regBufRecv, &workElemReg), ret, fail); - appendWorkElemColl(comm, plan, bid, collInfo->workFuncIndex, &workElemReg); - } - *nWorkBudget -= chans[bid].nWork; // subtract delta of chans[c].nWork - - // Add proxy task. Empty collectives do not make it to the proxy thread - // since they don't imply synchronization for the user like p2p. - if (collInfo->nBytes != 0) { - struct ncclProxyOp proxyOp; - NCCLCHECKGOTO(initCollProxyOp(collInfo, bid, opCount, steps, &proxyOp), ret, fail); - NCCLCHECKGOTO(addProxyOpIfNeeded(comm, plan, &proxyOp), ret, fail); - } - - chans[bid].collBytes += countPerChannel * typeSize; - } - - plan->threadPerBlock = std::max(plan->threadPerBlock, collInfo->nThreads); - if (!plan->kernelSpecialized) { - plan->kernelFn = ncclDevKernelForFunc[collInfo->workFuncIndex]; - plan->kernelSpecialized = ncclDevKernelForFuncIsSpecialized[collInfo->workFuncIndex]; - } - - if (comm->rank == 0) { - TRACE(NCCL_COLL, "collnetColl enqueue coll %s(%s, %s, %s, %s), nChannels %d, count %ld (nbytes %ld), usableChannel %d, chunkCount %d, funcIndex %d, nThreads %d", collInfo->opName, ncclOpToString(collInfo->op), ncclDatatypeToString(collInfo->datatype), ncclAlgoToString(collInfo->algorithm), ncclProtoToString(collInfo->protocol), collInfo->nChannels, collInfo->count, collInfo->workBytes, usableChannels, collInfo->chunkCount, collInfo->workFuncIndex, collInfo->nThreads); - } - -exit: - return ret; -fail: - goto exit; -} - -static ncclResult_t addTunedCollToPlan( - struct ncclComm* comm, struct ncclKernelPlan* plan, int usableChannels, - struct ncclInfo* collInfo, int* nWorkBudget +static void addWorkBatchToPlan( + struct ncclComm* comm, struct ncclKernelPlan* plan, int channelId, + enum ncclDevWorkType workType, int devFuncId, uint32_t workOffset, + int p2pRound = -1 ) { - ncclResult_t ret = ncclSuccess; - struct ncclKernelPlan::Channel *chans = plan->channels; - struct ncclWorkElem workElem; - uint64_t opCount = uint64_t(plan->collOpCount++) << 1 | 0; - uint64_t workCount; - uint64_t workOffset = 0; - uint32_t typeSize = ncclTypeSize(collInfo->datatype); - ncclRegBufferType regBufType = collInfo->regBufType; - size_t alignCount, 
lastChunkCount; - int least[/*nBid*/MAXCHANNELS]; - int maxIndexInLeast; - size_t maxBytesInLeast; - int nChannels = std::min(collInfo->nChannels, usableChannels); - int rnChannels = 0; - size_t countPerChannels; - size_t remCount = collInfo->count; - - NCCLCHECKGOTO(computeCollAlignCount(collInfo, &alignCount), ret, fail); - countPerChannels = DIVUP(DIVUP(collInfo->count, nChannels), alignCount) * alignCount; - nChannels = DIVUP(collInfo->count, countPerChannels); - NCCLCHECKGOTO(computeCollChunkInfo(collInfo, collInfo->nBytes, nChannels), ret, fail); - NCCLCHECKGOTO(initCollWorkElem(collInfo, &workElem), ret, fail); - - // Choose the `nBid` least loaded channels to do the work. This ensures - // all bids go to different channels in case they need to synchronize. - least[0] = 0; - maxIndexInLeast = 0; - maxBytesInLeast = chans[0].collBytes; - // Initialize least[] such that the first nBid channels are accounted for. - for (int b = 1; b < nChannels; b++) { - least[b] = b; - if (maxBytesInLeast < chans[b].collBytes) { - maxIndexInLeast = b; - maxBytesInLeast = chans[b].collBytes; - } - } - // Sort in the rest of the channels. If a channel has less work than the max - // member of least[], replace that member and compute the new max. We only - // sort channels when coll algo is not collnet. - for (int c = nChannels; c < usableChannels; c++) { - if (chans[c].collBytes < maxBytesInLeast) { - least[maxIndexInLeast] = c; - maxBytesInLeast = chans[least[0]].collBytes; - maxIndexInLeast = 0; - for (int b = 1; b < nChannels; b++) { - if (maxBytesInLeast < chans[least[b]].collBytes) { - maxIndexInLeast = b; - maxBytesInLeast = chans[least[b]].collBytes; - } + ncclKernelPlanner::WipPlan::Channel* chan = &comm->planner.wipPlan.channels[channelId]; + size_t workSize = ncclDevWorkSize(workType); + // Conditions causing us to create a new blank batch. + bool newBatch = (chan->workBatchQueue.tail == nullptr); + struct ncclDevWorkBatch* batch = nullptr; + if (!newBatch) { + batch = &chan->workBatchQueue.tail->batch; + // All of the conditions that prevent us from appending to current batch. + newBatch |= batch->workType != (uint8_t)workType; + newBatch |= batch->funcId != devFuncId; + // The following ensure the device can handle a batch this large. They have to + // account for all extension batches being fused together which is why + // wipBatch.workBytes and wipBatch.nP2ps aren't reset to 0 for a new extension + // batch further down. 
+ newBatch |= NCCL_MAX_DEV_WORK_BATCH_BYTES < chan->wipBatch.workBytes + workSize; + if (workType == ncclDevWorkTypeP2p) { + newBatch |= chan->wipBatch.nP2ps == NCCL_MAX_DEV_WORK_P2P_PER_BATCH; + for (int i=0; i < chan->wipBatch.nP2ps; i++) { + newBatch |= p2pRound == chan->wipBatch.p2pRounds[i]; } } } - - for (int bid = 0; bid < nChannels && remCount > 0; bid++) { - int c = least[bid]; - - workCount = std::min(countPerChannels, remCount); - NCCLCHECKGOTO(computeCollLastChunkInfo(collInfo, workCount, alignCount, &lastChunkCount), ret, fail); - NCCLCHECKGOTO(setCollWorkElem(workCount, workOffset, lastChunkCount, &workElem), ret, fail); - - // Add work elem - *nWorkBudget += chans[c].nWork; - if (regBufType == NCCL_REGULAR_BUFFER) { - appendWorkElemColl(comm, plan, c, collInfo->workFuncIndex, &workElem); - } else { - struct ncclWorkElemReg workElemReg; - NCCLCHECKGOTO(initCollWorkElemReg(comm, &workElem, &comm->channels[c], regBufType, collInfo->regBufSend, collInfo->regBufRecv, &workElemReg), ret, fail); - appendWorkElemColl(comm, plan, c, collInfo->workFuncIndex, &workElemReg); + // Conditions causing us to create an extension batch (prev->nextExtends=1) + uint32_t offset = newBatch ? 0 : (workOffset - batch->offsetBase); + bool extendBatch = 63*workSize < offset; + extendBatch |= 0 != offset%workSize; + if (newBatch || extendBatch) { + if (!newBatch) batch->nextExtends = extendBatch; // Extending the previous batch. + struct ncclWorkBatchList* batchNode = ncclMemoryStackAlloc(&comm->memScoped); + ncclIntruQueueEnqueue(&chan->workBatchQueue, batchNode); + batch = &batchNode->batch; + batch->nextExtends = 0; + batch->workType = (uint32_t)workType; + batch->funcId = devFuncId; + batch->offsetBase = workOffset; + batch->offsetBitset = 0; + offset = 0; + if (newBatch) { + // Since extension batches are fused together on the device, and these values + // account for constraints on the fused batch, we only reset the values on + // a new batch + chan->wipBatch.workBytes = 0; + chan->wipBatch.nP2ps = 0; + // We don't count extension batches since this is used to derive a proxyOpCount, + // and we want all ops which are fused together to have the same value. + chan->nWorkBatchesP2p += (workType == ncclDevWorkTypeP2p ? 1 : 0); }
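For readers following the batch encoding being assembled here, an editorial sketch (not from the patch) of what the (offsetBase, offsetBitset) pair means once the batch is complete: bit k set indicates that one work struct of this batch sits k*workSize bytes past offsetBase in the plan's work storage.

size_t workSize = ncclDevWorkSize(ncclDevWorkTypeColl); // every work in a batch has the same type/size
uint64_t bits = batch->offsetBitset;
while (bits != 0) {
  int k = __builtin_ctzll(bits);                        // index of the lowest set bit
  bits &= bits - 1;                                     // clear it
  uint32_t byteOffset = batch->offsetBase + k*workSize; // one work struct of the batch lives here
}
// Only 64 slots fit in the bitset, which is why an offset beyond 63*workSize
// (or one that is not a multiple of workSize) forces a new or extension batch.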
- if (collInfo->nBytes != 0) { - uint32_t steps; - struct ncclProxyOp proxyOp; - NCCLCHECKGOTO(computeCollSteps(collInfo, workCount, &steps), ret, fail); - NCCLCHECKGOTO(initCollProxyOp(collInfo, c, opCount, steps, &proxyOp), ret, fail); - NCCLCHECKGOTO(addProxyOpIfNeeded(comm, plan, &proxyOp), ret, fail); - } - - remCount -= workCount; - chans[c].collBytes += workCount * typeSize; - workOffset += workCount; - rnChannels++; - } - - plan->threadPerBlock = std::max(plan->threadPerBlock, collInfo->nThreads); - if (!plan->kernelSpecialized) { - plan->kernelFn = ncclDevKernelForFunc[collInfo->workFuncIndex]; - plan->kernelSpecialized = ncclDevKernelForFuncIsSpecialized[collInfo->workFuncIndex]; + plan->nWorkBatches += 1; } - - if (comm->rank == 0) { - TRACE(NCCL_COLL, "tunedColl enqueue coll %s(%s, %s, %s, %s), nChannels %d, count %ld (nbytes %ld), usableChannel %d, chunkCount %d, lastChunkCount %ld, funcIndex %d, nThreads %d", collInfo->opName, ncclOpToString(collInfo->op), ncclDatatypeToString(collInfo->datatype), ncclAlgoToString(collInfo->algorithm), ncclProtoToString(collInfo->protocol), rnChannels, collInfo->count, collInfo->workBytes, usableChannels, collInfo->chunkCount, lastChunkCount, collInfo->workFuncIndex, collInfo->nThreads); + batch->offsetBitset |= 1ull<<(offset/workSize); + chan->wipBatch.workBytes += workSize; + if (workType == ncclDevWorkTypeP2p) { + // We need to ensure that a single batch doesn't have multiple p2p's + // of the same round since they would use the same connections. + chan->wipBatch.p2pRounds[chan->wipBatch.nP2ps++] = p2pRound; } - -exit: - return ret; -fail: - goto exit; } -static ncclResult_t addCBDCollToPlan( - struct ncclComm* comm, struct ncclKernelPlan* plan, int usableChannels, - struct ncclInfo* collInfo, int* nWorkBudget - ) { - ncclResult_t ret = ncclSuccess; - struct ncclKernelPlan::Channel *chans = plan->channels; - size_t enqBytes; - uint64_t opCount = uint64_t(plan->collOpCount++) << 1 | 0; - size_t typeSize = ncclTypeSize(collInfo->datatype); - size_t workBytesTotal = collInfo->count * typeSize; - size_t workCountTotal = collInfo->count; - struct ncclWorkElem workElem; - size_t workOffset = 0; - size_t workCount; - ncclRegBufferType regBufType = collInfo->regBufType; - size_t alignCount; - size_t lastChunkCount; - int rnChannel = 0; - - NCCLCHECKGOTO(computeCollChunkInfo(collInfo, collInfo->aggnBytes, collInfo->nChannels), ret, fail); - NCCLCHECKGOTO(computeCollAlignCount(collInfo, &alignCount), ret, fail); - NCCLCHECKGOTO(initCollWorkElem(collInfo, &workElem), ret, fail); - for (int c = 0; c < usableChannels; c++) { - if (plan->maxBytesPerChannel <= chans[c].collBytes) continue; - if (workBytesTotal == 0) break; - enqBytes = std::min(plan->maxBytesPerChannel - chans[c].collBytes, workBytesTotal); - workCount = std::min(DIVUP(DIVUP(enqBytes, typeSize), alignCount) * alignCount, workCountTotal); - enqBytes = workCount * typeSize; - - NCCLCHECKGOTO(computeCollLastChunkInfo(collInfo, workCount, alignCount, &lastChunkCount), ret, fail); - NCCLCHECKGOTO(setCollWorkElem(workCount, workOffset, lastChunkCount, &workElem), ret, fail); - - // Add work elem - *nWorkBudget += chans[c].nWork; - if (regBufType == NCCL_REGULAR_BUFFER) { - appendWorkElemColl(comm, plan, c, collInfo->workFuncIndex, &workElem); - } else { - struct ncclWorkElemReg workElemReg; - NCCLCHECKGOTO(initCollWorkElemReg(comm, &workElem, &comm->channels[c], regBufType, collInfo->regBufSend, collInfo->regBufRecv, &workElemReg), ret, fail); - appendWorkElemColl(comm, plan, c, 
collInfo->workFuncIndex, &workElemReg); - } - *nWorkBudget -= chans[c].nWork; // subtract delta of chans[c].nWork - - // Add proxy task. Empty collectives do not make it to the proxy thread - // since they don't imply synchronization for the user like p2p. - if (collInfo->nBytes != 0) { - uint32_t steps; - struct ncclProxyOp proxyOp; - NCCLCHECKGOTO(computeCollSteps(collInfo, workCount, &steps), ret, fail); - NCCLCHECKGOTO(initCollProxyOp(collInfo, c, opCount, steps, &proxyOp), ret, fail); - NCCLCHECKGOTO(addProxyOpIfNeeded(comm, plan, &proxyOp), ret, fail); - } - - workBytesTotal -= enqBytes; - workCountTotal -= workCount; - chans[c].collBytes += enqBytes; - workOffset += workCount; - rnChannel++; - } - - plan->threadPerBlock = std::max(plan->threadPerBlock, collInfo->nThreads); - if (!plan->kernelSpecialized) { - plan->kernelFn = ncclDevKernelForFunc[collInfo->workFuncIndex]; - plan->kernelSpecialized = ncclDevKernelForFuncIsSpecialized[collInfo->workFuncIndex]; - } - - if (comm->rank == 0) { - TRACE(NCCL_COLL, "CBDColl enqueue coll %s(%s, %s, %s, %s), nChannels %d, count %ld (nbytes %ld), usableChannel %d, maxBytesPerChannel %ld, chunkCount %d, lastChunkCount %ld, funcIndex %d, nThreads %d", collInfo->opName, ncclOpToString(collInfo->op), ncclDatatypeToString(collInfo->datatype), ncclAlgoToString(collInfo->algorithm), ncclProtoToString(collInfo->protocol), rnChannel, collInfo->count, collInfo->workBytes, usableChannels, plan->maxBytesPerChannel, collInfo->chunkCount, lastChunkCount, collInfo->workFuncIndex, collInfo->nThreads); +static void finishPlan(struct ncclComm* comm, struct ncclKernelPlan* plan) { + ncclKernelPlanner::WipPlan::Channel* wipChannels = comm->planner.wipPlan.channels; + size_t workBytes = plan->workBytes; + size_t batchBytes = plan->nWorkBatches*sizeof(struct ncclDevWorkBatch); + + plan->threadPerBlock = std::max(plan->threadPerBlock, NCCL_MIN_NTHREADS); + + // If we can fit everything into the kernel args we do so. + if (sizeof(ncclDevKernelArgs) + batchBytes + workBytes <= comm->workArgsBytes) { + plan->workStorageType = ncclDevWorkStorageTypeArgs; + } + plan->kernelArgsSize = sizeof(struct ncclDevKernelArgs) + batchBytes; + plan->kernelArgsSize += (plan->workStorageType == ncclDevWorkStorageTypeArgs) ? workBytes : 0; + plan->kernelArgsSize = alignUp(plan->kernelArgsSize, 16); + plan->kernelArgs = (struct ncclDevKernelArgs*)ncclMemoryStackAlloc(&comm->memScoped, plan->kernelArgsSize, /*align=*/16); + plan->kernelArgs->comm = comm->devComm; + plan->kernelArgs->channelMask = plan->channelMask; + plan->kernelArgs->workStorageType = plan->workStorageType; + + // Put batches into the kernel arguments. The first batch for each channel + // must be located at batchZero[blockIdx.x]. To achieve this we round robin + // over the channels in ascending order until they're exhausted. + uint64_t hasBatchMask = plan->channelMask; + struct ncclDevWorkBatch* batchPrev[MAXCHANNELS] = {}; // {0...} + struct ncclDevWorkBatch* batchZero = (struct ncclDevWorkBatch*)(plan->kernelArgs+1); + int batchIx = 0; + while (hasBatchMask != 0) { + uint64_t tmpMask = hasBatchMask; // channels with a batch for this round. 
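A small editorial illustration of the round-robin layout described above (not from the patch): suppose only channels 2 and 5 have batches, with channel 2 contributing {A, B} and channel 5 contributing {C}.

// Round robin over ascending channels: round 1 places A (channel 2) then C (channel 5),
// round 2 places B (channel 2). Resulting layout:
//   batchZero[] = { A, C, B }
// Block 0 (channel 2) starts at batchZero[0], block 1 (channel 5) at batchZero[1],
// as required. A.nextJump is set to 2 because B sits two slots after A, while C is
// never given a successor and so remains channel 5's last batch.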
+ do { + int c = popFirstOneBit(&tmpMask); + if (!ncclIntruQueueEmpty(&wipChannels[c].workBatchQueue)) { + struct ncclWorkBatchList* batchNode = ncclIntruQueueDequeue(&wipChannels[c].workBatchQueue); + if (batchPrev[c] != nullptr) { + batchPrev[c]->nextJump = int(&batchZero[batchIx] - batchPrev[c]); + } + batchPrev[c] = &batchZero[batchIx]; + batchZero[batchIx++] = batchNode->batch; + } + if (ncclIntruQueueEmpty(&wipChannels[c].workBatchQueue)) { + hasBatchMask ^= 1ull<= 1 upon entry. -static ncclResult_t addP2pToPlan( - struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget, - bool isSendNotRecv, int peer, int chunk, void *addr, size_t bytes, bool fuseOk - ) { - struct ncclInfo info = { - isSendNotRecv ? ncclFuncSend : ncclFuncRecv, - isSendNotRecv ? "Send" : "Recv", - nullptr, addr, bytes, ncclInt8, ncclSum, peer, comm, (cudaStream_t)0, - /*Args*/1, 1 - }; - - int channelId; - NCCLCHECK(ncclChannelCompute(comm, peer, chunk%comm->p2pnChannelsPerPeer, info.coll, &channelId)); - info.channelId = channelId; - - // 1 is connIndex - struct ncclConnInfo* conn = isSendNotRecv ? - &comm->channels[channelId].peers[peer]->send[1].conn : &comm->channels[channelId].peers[peer]->recv[1].conn; - info.protocol = ((conn->buffs[NCCL_PROTO_LL] != nullptr) && bytes <= ncclParamP2pLLThreshold()) ? NCCL_PROTO_LL : NCCL_PROTO_SIMPLE; - - int reg = 0; - if (info.protocol == NCCL_PROTO_SIMPLE) { - struct ncclReg* regRecord; - NCCLCHECK(ncclRegFind(comm, addr, bytes, ®Record)); - reg = regRecord && regRecord->nDevs ? 1 : 0; - } - - struct ncclProxyOp proxyOp = {}; - // May tune chunksize and set proxyOp.reg=0 if not using the network. - NCCLCHECK(ncclProxyComputeP2p(&info, &proxyOp, reg)); - - struct ncclWorkElemP2p elem = {0}; - elem.proto = info.protocol; - elem.peer = peer; - elem.nWarps = NCCL_MAX_NTHREADS/WARP_SIZE; - elem.reg = proxyOp.reg; - elem.p2pType = isSendNotRecv ? ncclWorkP2pTypeSend : ncclWorkP2pTypeRecv; - elem.buffLo32 = uint32_t(reinterpret_cast(addr)); - elem.buffHi32 = reinterpret_cast(addr)>>32; - elem.countLo32 = uint32_t(bytes); - elem.countHi32 = bytes>>32; - elem.chunkSize = info.chunkSize; // computed by ncclProxyComputeP2p - - *nWorkBudget += plan->channels[channelId].nWork; - appendWorkElemP2p(comm, plan, channelId, &elem, fuseOk); - *nWorkBudget -= plan->channels[channelId].nWork; - - // Calculate the opCount after appendWorkElemP2p since it will always return - // with channel->nWork equal to one plus the work index this p2p settled in. - proxyOp.opCount = uint64_t(plan->channels[channelId].nWork)<<1 | 1; - NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOp)); - return ncclSuccess; -} - -static void finishPlan(struct ncclKernelPlan* plan) { + // Merge-sort per-channel proxy-op lists by opCount when merging them into plan->proxyOpQueue + // Phase 1: scan first op of each channel, store opCount in headIds[c]. + uint64_t headIds[MAXCHANNELS]; + int nHeads = 0; int channelUbound = 0; - int channelCount = 0; - uint64_t channelMask = 0; - bool hasProxyOps = false; for (int c=0; c < MAXCHANNELS; c++) { - struct ncclWorkList* tail = ncclIntruQueueTail(&plan->channels[c].workQueue); - if (tail != nullptr) { - channelUbound = c+1; - channelCount += 1; - channelMask |= 1ull<work.header.isLast = 1; - finishWork(&tail->work); + struct ncclProxyOp* op = ncclIntruQueueHead(&wipChannels[c].proxyOpQueue); + headIds[c] = op ? 
op->opCount : uint64_t(-1); + if (op) nHeads += 1; + if (op) plan->hasProxyOps = true; + if (op) channelUbound = c+1; + } + // Phase 2: Dequeue from planner->channels[c], enqueue in merged order to plan + while (nHeads != 0) { + int c = -1; + uint64_t minId = uint64_t(-1); + // Find channel with least proxy-op id. We store the heads[c]->opCount in + // headIds[c] to remove indirect loads from this loop. + for (int c1=0; c1 < channelUbound; c1++) { + uint64_t id = headIds[c1]; + id = (id>>1 | id<<63); // Move tag bit to order collectives before p2p's + if (id < minId) { c = c1; minId = id; } } - hasProxyOps |= !ncclIntruQueueEmpty(&plan->channels[c].proxyOpQueue); + struct ncclProxyOp* op = ncclIntruQueueDequeue(&wipChannels[c].proxyOpQueue); + struct ncclProxyOp* opNext = ncclIntruQueueHead(&wipChannels[c].proxyOpQueue); + headIds[c] = opNext ? opNext->opCount : uint64_t(-1); + nHeads -= opNext ? 0 : 1; + ncclIntruQueueEnqueue(&plan->proxyOpQueue, op); } - plan->channelUbound = channelUbound; - plan->channelCount = channelCount; - plan->channelMask = channelMask; - plan->hasProxyOps = hasProxyOps; - plan->threadPerBlock = std::max(plan->threadPerBlock, 3*WARP_SIZE); } int64_t ncclParamLocalRegister(); NCCL_PARAM(GraphRegister, "GRAPH_REGISTER", 1); +struct ncclIpcCleanupCallback { + struct ncclCommCallback base; + void* ptr; +}; +static ncclResult_t cleanupIpc(struct ncclComm* comm, struct ncclCommCallback* cb) { + struct ncclIpcCleanupCallback* me = (struct ncclIpcCleanupCallback*)cb; + CUDACHECKIGNORE(cudaIpcCloseMemHandle(me->ptr)); + free(me); + return ncclSuccess; +} + static ncclResult_t registerIntraNodeBuffers( - struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclInfo* info + struct ncclComm* comm, struct ncclTaskColl* info, + void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], + void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], + struct ncclIntruQueue* cleanupQueue, + bool* regNeedConnect ) { ncclResult_t result = ncclSuccess; info->regBufType = NCCL_REGULAR_BUFFER; + *regNeedConnect = true; #if CUDART_VERSION >= 11030 if ((info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) && comm->nvlsRegSupport) { bool regBufUsed = false; const void *sendbuff = info->sendbuff; void *recvbuff = info->recvbuff; - - if (info->coll == ncclFuncAllGather) - sendbuff = NULL; - else if (info->coll == ncclFuncReduceScatter) - recvbuff = NULL; + if (info->func == ncclFuncAllGather) sendbuff = NULL; + if (info->func == ncclFuncReduceScatter) recvbuff = NULL; + size_t elementSize = ncclTypeSize(info->datatype); + size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count); + size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count); /* first try local registration. 
*/ if (ncclParamLocalRegister()) { - ncclNvlsLocalRegisterBuffer(comm, sendbuff, recvbuff, info->sendbuffSize, info->recvbuffSize, ®BufUsed, info->regBufSend, info->regBufRecv); + ncclNvlsLocalRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, ®BufUsed, outRegBufSend, outRegBufRecv); } - if (regBufUsed == false && plan->persistent && ncclParamGraphRegister()) { - ncclNvlsGraphRegisterBuffer(comm, plan, sendbuff, recvbuff, info->sendbuffSize, info->recvbuffSize, ®BufUsed, info->regBufSend, info->regBufRecv); + if (regBufUsed == false && comm->planner.persistent && ncclParamGraphRegister()) { + ncclNvlsGraphRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, ®BufUsed, outRegBufSend, outRegBufRecv, cleanupQueue, &info->nCleanupQueueElts); } if (regBufUsed) { + *regNeedConnect = false; /* tweak NVLS channels usage; for registered NVLS buffer, we only need 4/5 channels to * saturate bandwidth. */ if (comm->nNodes == 1) { - if (info->coll == ncclFuncReduceScatter) - info->nChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 5)); + if (info->func == ncclFuncReduceScatter) + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 5)); else - info->nChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 4)); + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 4)); } else { - info->nChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 6)); + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 6)); } - info->regBufType = NCCL_NVLS_REG_BUFFER; } } else if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT && // limited to CollNetDirect for now comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other comm->intraRanks < comm->localRanks && // only with inter-process & intra-node peers - plan->persistent && 0) { + comm->planner.persistent && 0) { /* Disable CollnetDirect registration since it does not support cuMem* allocated memory. */ int localRank = comm->localRank; cudaPointerAttributes sattr, rattr; @@ -663,51 +319,56 @@ static ncclResult_t registerIntraNodeBuffers( // Open handles locally for (int i=0; i < comm->localRanks; i++) { if (i == localRank) { // Skip self - info->regBufSend[i] = nullptr; - info->regBufRecv[i] = nullptr; + outRegBufSend[i] = nullptr; + outRegBufRecv[i] = nullptr; } else { for (int sr=0; sr < 2; sr++) { // Get base address of mapping void* base; CUDACHECK(cudaIpcOpenMemHandle(&base, handles[i].ipc[sr], cudaIpcMemLazyEnablePeerAccess)); // Get real buffer address by adding offset in the mapping - (sr == 0 ? info->regBufSend : info->regBufRecv)[i] = (char*)base + handles[i].offset[sr]; + (sr == 0 ? 
outRegBufSend : outRegBufRecv)[i] = (char*)base + handles[i].offset[sr]; // Enqueue reminder to close memory handle - struct ncclPointerList* q = ncclMemoryPoolAlloc(&comm->memPool_ncclPointerList, &comm->memPermanent); - q->ptr = base; - ncclIntruQueueEnqueue(&plan->ipcMemQueue, q); + struct ncclIpcCleanupCallback* cb = (struct ncclIpcCleanupCallback*)malloc(sizeof(struct ncclIpcCleanupCallback)); + cb->base.fn = cleanupIpc; + cb->ptr = base; + ncclIntruQueueEnqueue(cleanupQueue, &cb->base); + info->nCleanupQueueElts += 1; } } } info->regBufType = NCCL_IPC_REG_BUFFER; - } else if ((info->algorithm == NCCL_ALGO_COLLNET_DIRECT || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) && comm->collNetRegSupport && info->opFull.op != ncclDevPreMulSum && info->opFull.op != ncclDevSumPostDiv) { + } else if ((info->algorithm == NCCL_ALGO_COLLNET_DIRECT || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) && comm->collNetRegSupport && info->opDev.op != ncclDevPreMulSum && info->opDev.op != ncclDevSumPostDiv) { + size_t elementSize = ncclTypeSize(info->datatype); + size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count); + size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count); int sendRegBufFlag = 0; int recvRegBufFlag = 0; void *sendHandle, *recvHandle; if (ncclParamLocalRegister()) { - ncclCollnetLocalRegisterBuffer(comm, info->sendbuff, info->sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle); + ncclCollnetLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle); info->sendMhandle = sendHandle; if (sendRegBufFlag) { - ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, info->recvbuffSize, collNetRecv, &recvRegBufFlag, &recvHandle); + ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &recvRegBufFlag, &recvHandle); info->recvMhandle = recvHandle; } } - if ((sendRegBufFlag == 0 || recvRegBufFlag == 0) && plan->persistent && ncclParamGraphRegister()) { - ncclCollnetGraphRegisterBuffer(comm, plan, info->sendbuff, info->sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle); + if ((sendRegBufFlag == 0 || recvRegBufFlag == 0) && comm->planner.persistent && ncclParamGraphRegister()) { + ncclCollnetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle, cleanupQueue, &info->nCleanupQueueElts); info->sendMhandle = sendHandle; if (sendRegBufFlag) { - ncclCollnetGraphRegisterBuffer(comm, plan, info->recvbuff, info->recvbuffSize, collNetRecv, &recvRegBufFlag, &recvHandle); + ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &recvRegBufFlag, &recvHandle, cleanupQueue, &info->nCleanupQueueElts); info->recvMhandle = recvHandle; } } if (sendRegBufFlag && recvRegBufFlag) { - info->nChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 1)); + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 1)); info->regBufType = NCCL_COLLNET_REG_BUFFER; if (sendRegBufFlag == 1 && recvRegBufFlag == 1) { - INFO(NCCL_REG, "rank %d successfully registered collNet sendbuff %p (handle %p), sendbuff size %ld, recvbuff %p (handle %p), recvbuff size %ld", comm->rank, info->sendbuff, sendHandle, info->sendbuffSize, info->recvbuff, recvHandle, info->recvbuffSize); + INFO(NCCL_REG, "rank %d successfully registered collNet sendbuff %p (handle %p), sendbuff size %ld, recvbuff %p (handle %p), recvbuff size %ld", comm->rank, info->sendbuff, sendHandle, sendbuffSize, info->recvbuff, 
recvHandle, recvbuffSize); } } } @@ -716,177 +377,623 @@ static ncclResult_t registerIntraNodeBuffers( return result; } -static ncclResult_t getCBDCollnChannel(struct ncclKernelPlan* plan, struct ncclInfo* collInfo, int usableChannels) { - size_t firstEnqBytes; - size_t workBytesTotal = collInfo->workBytes; - struct ncclKernelPlan::Channel *chans = plan->channels; - int typeSize = ncclTypeSize(collInfo->datatype); - size_t maxCount = DIVUP(plan->maxBytesPerChannel, typeSize); +static ncclResult_t getCollNetSupport(struct ncclComm* comm, struct ncclTaskColl* task, int* collNetSupport); +static ncclResult_t getAlgoInfo( + struct ncclComm* comm, struct ncclTaskColl* task, + int collNetSupport, int nvlsSupport, int numPipeOps, ncclSimInfo_t* simInfo = NULL +); +static ncclResult_t calcCollChunking( + struct ncclComm* comm, struct ncclTaskColl* task, int nChannels, size_t nBytes, + /*outputs*/uint32_t* outChunkSize, uint32_t* outDirectFlags, struct ncclProxyOp* proxyOp +); + +struct ncclKernelPlanBudget { + ssize_t inArgsBytes; // Space available within kernel args struct + ssize_t outArgsBytes; // Space available outside of args struct (fifo or persistent buf) +}; + +static bool testBudget( + struct ncclKernelPlanBudget* budget, int nWorkBatches, ssize_t workBytes + ) { + ssize_t batchBytes = nWorkBatches*sizeof(struct ncclDevWorkBatch); + bool ok = false; + ok |= (batchBytes + workBytes <= budget->inArgsBytes); + ok |= (batchBytes <= budget->inArgsBytes) && (workBytes <= budget->outArgsBytes); + return ok; +} + +// Called once per ncclGroup to organize the user submitted tasks in +// comm->planner so that they can be peeled off into plans. +ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool* needConnect, ncclSimInfo_t* simInfo) { + struct ncclKernelPlanner* planner = &comm->planner; + // Tasks from the sorter come out ordered size descending. + struct ncclTaskColl* task = ncclTaskCollSorterDequeueAll(&planner->collSorter); + // Tasks are assembled by (fn,op,ty) size ascending. + struct ncclTaskColl* tasksByFnOpTy[ncclNumFuncs*ncclNumDevRedOps*ncclNumTypes]; + memset(tasksByFnOpTy, 0, sizeof(tasksByFnOpTy)); + int fnOpTyIndices[ncclNumFuncs*ncclNumDevRedOps*ncclNumTypes]; + int fnOpTyCount = 0; + + // Walk the size sorted tasks, binning them by (fn,op,ty). + while (task != nullptr) { + struct ncclTaskColl* next = task->next; + int index = ((int)task->func*ncclNumDevRedOps + (int)task->opDev.op)*ncclNumTypes + (int)task->datatype; + // Add to set of (fn,op,ty) indices on first occurrence + if (tasksByFnOpTy[index] == nullptr) fnOpTyIndices[fnOpTyCount++] = index; + // Add to LIFO for this (fn,op,ty) + task->next = tasksByFnOpTy[index]; + tasksByFnOpTy[index] = task; + // Next task + task = next; + } + + // Walk (fn,op,ty) bins, compute algo and proto etc. Then bin them by their + // scheduling constraints (collnet x nvls). + struct ncclIntruQueue collBins[2][2] = {}; + for (int cursor=0; cursor < fnOpTyCount; cursor++) { + struct ncclTaskColl* aggBeg = tasksByFnOpTy[fnOpTyIndices[cursor]]; + int collNetSupport = 0; + NCCLCHECK(getCollNetSupport(comm, aggBeg, &collNetSupport)); + int nvlsSupport = comm->nvlsSupport && (ncclNvlsSupported(aggBeg->opDev.op, aggBeg->datatype) || aggBeg->func == ncclFuncAllGather); + // Crudely estimate number of tasks per channel. This is using the wrong number + // of channels for NVLS algos, but knowing the algo requires having this value, + // so either be crude our iterate until fixed point, we chose the former. 
+ int nTasksPerChannel = divUp(comm->planner.nTasksColl, comm->nChannels); + do { + struct ncclTaskColl* aggEnd = aggBeg->next; + struct ncclTaskColl agg = *aggBeg; + // We aggregate operations that are within 4X size of each other. + while (aggEnd != nullptr && aggEnd->trafficBytes < 4*aggBeg->trafficBytes) { + agg.count += aggEnd->count; + agg.trafficBytes += aggEnd->trafficBytes; + aggEnd = aggEnd->next; + } - if (workBytesTotal == 0) { - collInfo->nChannels = 1; - goto exit; + NCCLCHECK(getAlgoInfo(comm, &agg, collNetSupport, nvlsSupport, nTasksPerChannel, simInfo)); + agg.devFuncId = ncclDevFuncId(agg.func, agg.opDev.op, agg.datatype, agg.algorithm, agg.protocol); + + int isCollnet=0, isNvls=0; + switch (agg.algorithm) { + case NCCL_ALGO_NVLS: + case NCCL_ALGO_NVLS_TREE: + isNvls = 1; + isCollnet = agg.algorithm == NCCL_ALGO_NVLS && comm->nNodes > 1; + break; + case NCCL_ALGO_COLLNET_CHAIN: + case NCCL_ALGO_COLLNET_DIRECT: + isCollnet = 1; + break; + } + // Update the aggregated tasks with the computed values. + do { + struct ncclTaskColl* next = aggBeg->next; + aggBeg->algorithm = agg.algorithm; + aggBeg->protocol = agg.protocol; + aggBeg->nMaxChannels = agg.nMaxChannels; + aggBeg->nWarps = agg.nWarps; + aggBeg->devFuncId = agg.devFuncId; + aggBeg->isCollnet = isCollnet; + aggBeg->isNvls = isNvls; + ncclIntruQueueEnqueue(&collBins[isCollnet][isNvls], aggBeg); + aggBeg = next; + } while (aggBeg != aggEnd); + } while (aggBeg != nullptr); + } + + // Concatenate `collBins[*][*]` together into final list `planner->collTaskQueue`. + // Collnet is the outer dimension since that affects how we divide over the + // channels. + for (int isCollnet=0; isCollnet <= 1; isCollnet++) { + for (int isNvls=0; isNvls <= 1; isNvls++) { + ncclIntruQueueTransfer(&planner->collTaskQueue, &collBins[isCollnet][isNvls]); + } } - for (int c = 0; c < usableChannels; c++) { - if (plan->maxBytesPerChannel <= chans[c].collBytes) continue; - firstEnqBytes = std::min(plan->maxBytesPerChannel - chans[c].collBytes, workBytesTotal); - firstEnqBytes = DIVUP(firstEnqBytes, typeSize) * typeSize; - collInfo->nChannels = 1 + DIVUP((workBytesTotal - firstEnqBytes) / typeSize, maxCount); - break; + // Walk tasks again to: + // 1. Possibly register buffers. + // 2. Build ncclDevWorkColl structs. + // 3. Bin the work structs according to the number of valid channels they + // may be assigned to {collnet, nvls, standard} + task = ncclIntruQueueHead(&planner->collTaskQueue); + while (task != nullptr) { + // Build a ncclDevWorkColl[Reg?] struct for each task. 
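As an editorial example of the 4X aggregation rule above (not from the patch): if a (fn,op,ty) bin holds tasks with traffic 1 KB, 2 KB, 3 KB and 16 KB in ascending order, aggregation starts at the 1 KB task and keeps absorbing tasks whose traffic is below 4 x 1 KB, so {1 KB, 2 KB, 3 KB} share a single getAlgoInfo() decision (and hence algorithm, protocol and devFuncId), while the 16 KB task begins a new aggregate.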
+ void* regBufSend[NCCL_MAX_LOCAL_RANKS]; + void* regBufRecv[NCCL_MAX_LOCAL_RANKS]; + bool regNeedConnect = true; + registerIntraNodeBuffers(comm, task, regBufSend, regBufRecv, &planner->collCleanupQueue, ®NeedConnect); + + if (comm->runtimeConn && comm->initAlgoChannels[task->algorithm] == false) { + if (task->algorithm == NCCL_ALGO_NVLS_TREE && comm->initAlgoChannels[NCCL_ALGO_NVLS] == false && regNeedConnect == true) { + comm->initAlgoChannels[NCCL_ALGO_NVLS] = true; + algoNeedConnect[NCCL_ALGO_NVLS] = true; + } + if (task->algorithm != NCCL_ALGO_NVLS || regNeedConnect == true) { + comm->initAlgoChannels[task->algorithm] = true; + algoNeedConnect[task->algorithm] = true; + *needConnect = true; + } + } + + struct ncclDevWorkColl devWork = {}; + devWork.sendbuff = (void*)task->sendbuff; + devWork.recvbuff = (void*)task->recvbuff; + devWork.root = task->root; + devWork.nWarps = task->nWarps; + devWork.redOpArg = task->opDev.scalarArg; + devWork.redOpArgIsPtr = task->opDev.scalarArgIsPtr; + devWork.oneNode = (comm->nNodes == 1); + devWork.regUsed = task->regBufType; + + struct ncclWorkList* workNode; + switch (task->regBufType) { + case NCCL_REGULAR_BUFFER: + case NCCL_COLLNET_REG_BUFFER: + { workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); + workNode->workType = ncclDevWorkTypeColl; + workNode->size = sizeof(struct ncclDevWorkColl); + memcpy((void*)(workNode+1), (void*)&devWork, workNode->size); + } break; + case NCCL_IPC_REG_BUFFER: + { struct ncclDevWorkCollReg workReg = {}; + workReg.coll = devWork; + struct ncclChannel *channel0 = &comm->channels[0]; + for (int i=0; i < NCCL_MAX_DIRECT_ARITY; i++) { + int peer = channel0->collnetDirect.down[i]; + if (peer == -1) break; + int j = comm->rankToLocalRank[peer]; // Get intra-node slot + workReg.dnInputs[i] = regBufSend[j]; // Input buffer of leaf peer + workReg.dnOutputs[i] = regBufRecv[j]; // Output buffer of leaf peer + } + for (int i=0; i < NCCL_MAX_DIRECT_ARITY; i++) { + int peer = channel0->collnetDirect.up[i]; + if (peer == -1) break; + int j = comm->rankToLocalRank[peer]; + // Output buffer of root peer + workReg.upOutputs[i] = regBufRecv[j]; + } + workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); + workNode->workType = ncclDevWorkTypeCollReg; + workNode->size = sizeof(struct ncclDevWorkCollReg); + memcpy((void*)(workNode+1), (void*)&workReg, workNode->size); + } break; + case NCCL_NVLS_REG_BUFFER: + { struct ncclDevWorkCollReg workReg = {}; + workReg.coll = devWork; // C++ struct assignment + /* NVLS only has one send and recv buffer registered */ + workReg.dnInputs[0] = regBufSend[0]; + workReg.dnOutputs[0] = regBufRecv[0]; + workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); + workNode->workType = ncclDevWorkTypeCollReg; + workNode->size = sizeof(struct ncclDevWorkCollReg); + memcpy((void*)(workNode+1), (void*)&workReg, workNode->size); + } break; + default: + /* impossible value */ + WARN("Invalid regBufType %d", task->regBufType); + return ncclInvalidArgument; + } + + ncclIntruQueueEnqueue(&planner->collWorkQueue, workNode); + task = task->next; } -exit: return ncclSuccess; } static ncclResult_t scheduleCollTasksToPlan( - struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget + struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclKernelPlanBudget* budget ) { - struct ncclTasks* tasks = &comm->tasks; - size_t totalCBDBytes = tasks->workBytesTotal; - struct ncclInfo* collInfo; - - if (!ncclIntruQueueEmpty(&tasks->collQueue)) { - int usableChannels = 0, 
accChannels = 0; - - tasks->usableChannels = 1; - while (!ncclIntruQueueEmpty(&tasks->collQueue)) { - collInfo = ncclIntruQueueDequeue(&tasks->collQueue); - if (collInfo->count == 0) continue; - if (collInfo->algorithm == NCCL_ALGO_UNDEF) { - struct ncclInfo* aggInfo = ncclMemoryStackAlloc(&comm->memScoped); - struct ncclInfo* nextInfo = collInfo->next; - int nvlsSupport; - int collNetSupport; - - memcpy(aggInfo, collInfo, sizeof(struct ncclInfo)); - while (nextInfo) { - if (nextInfo->coll == aggInfo->coll && nextInfo->opFull.op == aggInfo->opFull.op && nextInfo->datatype == aggInfo->datatype) { - aggInfo->count += nextInfo->count; - nextInfo = nextInfo->next; - } else { - break; - } - } + struct ncclKernelPlanner* planner = &comm->planner; + // Estimate number of tasks that will fit in this plan. + int nPlanColls = 0; + size_t trafficBytes[2*2] = {0, 0, 0, 0}; // [collnet][nvls] + int nChannels[2*2] = {0, 0, 0, 0}; // [collnet][nvls] + int const nMaxChannels[2*2] = {comm->nChannels, comm->nvlsChannels, // [collnet][nvls] + comm->nChannels, comm->nvlsChannels}; + do { + size_t workBytes = 0; + struct ncclTaskColl* task = ncclIntruQueueHead(&planner->collTaskQueue); + struct ncclWorkList* workNode = ncclIntruQueueHead(&planner->collWorkQueue); + while (task != nullptr) { + int nBatches = divUp(nPlanColls, 4); // Rough guess: 4 colls per batch. + if (!testBudget(budget, nBatches, workBytes + workNode->size)) goto plan_full; + + nPlanColls += 1; + workBytes += workNode->size; + int kind = 2*task->isCollnet + task->isNvls; + trafficBytes[kind] += task->trafficBytes; + nChannels[kind] += task->nMaxChannels; + nChannels[kind] = std::min(nChannels[kind], nMaxChannels[kind]); + task = task->next; + workNode = workNode->next; + } + plan_full:; + } while (0); + + int kindPrev = -1; + constexpr size_t MinTrafficPerChannel = 512; + size_t trafficPerChannel = 0; + int channelId = 0; + size_t currentTraffic = 0; + while (nPlanColls!=0 && !ncclIntruQueueEmpty(&planner->collTaskQueue)) { + struct ncclTaskColl* task = ncclIntruQueueHead(&planner->collTaskQueue); + struct ncclWorkList* workNode = ncclIntruQueueHead(&planner->collWorkQueue); + struct ncclDevWorkColl* devWork = (struct ncclDevWorkColl*)(workNode+1); + size_t elementSize = ncclTypeSize(task->datatype); + + int kind = 2*task->isCollnet + task->isNvls; + if (kind != kindPrev) { + trafficPerChannel = std::max(MinTrafficPerChannel, trafficBytes[kind]/nChannels[kind]); + kindPrev = kind; + channelId = 0; + currentTraffic = 0; + } - nvlsSupport = comm->nvlsSupport && ncclNvlsSupported(aggInfo->opFull.op, aggInfo->datatype); - NCCLCHECK(getCollNetSupport(aggInfo, &collNetSupport)); - NCCLCHECK(ncclInfoSetDerived(aggInfo, comm->nRanks)); - NCCLCHECK(getTunerInfo(aggInfo, collNetSupport, nvlsSupport, 1)); - NCCLCHECK(topoGetAlgoInfo(aggInfo, collNetSupport, nvlsSupport, 1)); - NCCLCHECK(getChannnelThreadInfo(aggInfo)); - NCCLCHECK(computeCollWorkFunc(aggInfo)); - NCCLCHECK(getPatternInfo(aggInfo)); - - // Try to assign algo and proto to all possible collectives - nextInfo = collInfo; - while (nextInfo) { - if (nextInfo->coll == aggInfo->coll && nextInfo->opFull.op == aggInfo->opFull.op && nextInfo->datatype == aggInfo->datatype) { - NCCLCHECK(ncclInfoSetDerived(nextInfo, comm->nRanks)); - NCCLCHECK(getTunerInfo(nextInfo, collNetSupport, nvlsSupport, 1)); - nextInfo->algorithm = aggInfo->algorithm; - nextInfo->protocol = aggInfo->protocol; - nextInfo->nThreads = aggInfo->nThreads; - nextInfo->pattern = aggInfo->pattern; - nextInfo->workFuncIndex = 
aggInfo->workFuncIndex; - nextInfo->aggnBytes = aggInfo->nBytes; - - NCCLCHECK(getChannnelThreadInfo(nextInfo)); - // if possible, start registration - registerIntraNodeBuffers(comm, plan, nextInfo); - // accumulate channels - accChannels += nextInfo->nChannels; - nextInfo = nextInfo->next; - } else { - break; - } - } - } // end of aggInfo + if (task->isCollnet) { + int nChannels = task->nMaxChannels; + // Ensure room for worst case of one new batch per channel + if (!testBudget(budget, plan->nWorkBatches + nChannels, plan->workBytes + workNode->size)) { + return ncclSuccess; + } + + size_t globalBytesPerElement = elementSize*ncclFuncMaxSendRecvCount(task->func, comm->nRanks, 1); + struct ncclProxyOp proxyOp; + uint32_t chunkSize, directFlags=0; + NCCLCHECK(calcCollChunking(comm, task, nChannels, globalBytesPerElement*task->count, &chunkSize, &directFlags, &proxyOp)); + devWork->channelLo = 0; + devWork->channelHi = nChannels-1; + devWork->collnet.count = task->count; + devWork->collnet.chunkCount = chunkSize/ncclTypeSize(task->datatype); + devWork->direct = directFlags; + + uint64_t proxyOpId = uint64_t(plan->collOpCount++)<<1 | 0; + for (int c=devWork->channelLo; c <= (int)devWork->channelHi; c++) { + proxyOp.channelId = c; + proxyOp.opCount = proxyOpId; + addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes); + NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOp)); + } + } else { // not task->isCollnet + constexpr size_t cellSize = 16; + int elementsPerCell = cellSize/elementSize; + size_t cells = divUp(task->count*elementSize, cellSize); + int trafficPerByte = ncclFuncTrafficPerByte(task->func, comm->nRanks); + size_t trafficPerElement = elementSize*trafficPerByte; + size_t trafficPerCell = cellSize*trafficPerByte; + size_t cellsPerChannel = std::min(cells, divUp(trafficPerChannel, trafficPerCell)); + size_t cellsLo; + if (channelId+1 == nMaxChannels[kind]) { // On last channel everything goes to "lo" + cellsLo = cells; + } else { + cellsLo = std::min(cells, (trafficPerChannel-currentTraffic)/trafficPerCell); + } + int nMidChannels = (cells-cellsLo)/cellsPerChannel; + size_t cellsHi = (cells-cellsLo)%cellsPerChannel; + int nChannels = (cellsLo!=0 ? 1 : 0) + nMidChannels + (cellsHi!=0 ? 1 : 0); + if (nMaxChannels[kind] < channelId + nChannels) { // Overflowed available channels + nMidChannels = nMaxChannels[kind] - channelId - 2; + cellsPerChannel = (cells-cellsLo)/(nMidChannels+1); + cellsHi = cellsPerChannel + (cells-cellsLo)%(nMidChannels+1); + } + if (cellsHi == 0 && nMidChannels != 0) { + cellsHi = cellsPerChannel; + nMidChannels -= 1; + } + if (cellsLo == 0) { // Least channel skipped. Make the next channel the new least. + channelId += 1; + if (nMidChannels == 0) { cellsLo = cellsHi; cellsHi = 0; } + else { cellsLo = cellsPerChannel; nMidChannels -= 1; } + } + size_t countMid = nMidChannels!=0 ? cellsPerChannel*elementsPerCell : 0; + size_t countLo = cellsLo*elementsPerCell; + size_t countHi = cellsHi*elementsPerCell; + (countHi != 0 ? countHi : countLo) -= cells*elementsPerCell - task->count; + + nChannels = (countLo!=0 ? 1 : 0) + nMidChannels + (cellsHi!=0 ? 
1 : 0); + // Ensure room for worst case of one new batch per channel + if (!testBudget(budget, plan->nWorkBatches + nChannels, plan->workBytes + workNode->size)) { + return ncclSuccess; + } - if (collInfo->algorithm == NCCL_ALGO_NVLS || collInfo->algorithm == NCCL_ALGO_NVLS_TREE) { - usableChannels = std::max(usableChannels, comm->nvlsChannels); + devWork->channelLo = channelId; + devWork->channelHi = channelId + nChannels-1; + devWork->cbd.countLo = countLo; + devWork->cbd.countMid = countMid; + devWork->cbd.countHi = countHi; + + // calcCollChunking() uses global bytes instead of traffic which differs + // in that allreduce isn't multiplied by 2. + size_t globalBytesPerElement = elementSize*ncclFuncMaxSendRecvCount(task->func, comm->nRanks, 1); + struct ncclProxyOp proxyOpLo, proxyOpMid, proxyOpHi; + + uint32_t chunkSize, directFlags=0; + size_t grainSize = ncclProtoGrainSize(task->protocol); + if (countLo != 0) { + NCCLCHECK(calcCollChunking(comm, task, /*nChannels=*/1, globalBytesPerElement*countLo, &chunkSize, &directFlags, &proxyOpLo)); + devWork->cbd.chunkGrainsLo = chunkSize/grainSize; + } + if (countHi != 0) { + NCCLCHECK(calcCollChunking(comm, task, /*nChannels=*/1, globalBytesPerElement*countHi, &chunkSize, &directFlags, &proxyOpHi)); + devWork->cbd.chunkGrainsHi = chunkSize/grainSize; + } + if (nMidChannels != 0) { + NCCLCHECK(calcCollChunking(comm, task, /*nChannels=*/1, globalBytesPerElement*countMid, &chunkSize, &directFlags, &proxyOpMid)); + devWork->cbd.chunkGrainsMid = chunkSize/grainSize; + } + devWork->direct = directFlags; + + // Update the current channel and vacant traffic budget. + if (countHi != 0) { + channelId += nChannels-1; + currentTraffic = countHi*trafficPerElement; + } else if (nMidChannels != 0) { + channelId += nChannels; + currentTraffic = 0; } else { - usableChannels = std::max(usableChannels, comm->collChannels); + currentTraffic += countLo*trafficPerElement; + } + + if (currentTraffic >= trafficPerChannel && channelId+1 != nMaxChannels[kind]) { + channelId += 1; + currentTraffic = 0; + } + + uint64_t proxyOpId = uint64_t(plan->collOpCount++)<<1 | 0; + for (int c=devWork->channelLo; c <= (int)devWork->channelHi; c++) { + struct ncclProxyOp* proxyOp; + if (c == (int)devWork->channelLo) { + proxyOp = &proxyOpLo; + } else if (c == (int)devWork->channelHi) { + proxyOp = &proxyOpHi; + } else { + proxyOp = &proxyOpMid; + } + proxyOp->channelId = c; + proxyOp->opCount = proxyOpId; + addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes); + NCCLCHECK(addProxyOpIfNeeded(comm, plan, proxyOp)); } + } + + plan->channelMask |= (2ull<channelHi) - (1ull<channelLo); + plan->threadPerBlock = std::max(plan->threadPerBlock, task->nWarps*WARP_SIZE); + if (!plan->kernelSpecialized) { + plan->kernelFn = ncclDevKernelForFunc[task->devFuncId]; + plan->kernelSpecialized = ncclDevKernelForFuncIsSpecialized[task->devFuncId]; + } - if (collInfo->algorithm == NCCL_ALGO_COLLNET_DIRECT || collInfo->algorithm == NCCL_ALGO_COLLNET_CHAIN || (collInfo->algorithm == NCCL_ALGO_NVLS && comm->nNodes > 1)) { - // substract collective which needs to be executed separately - totalCBDBytes -= collInfo->workBytes; - tasks->workBytesTotal -= collInfo->workBytes; - ncclIntruQueueEnqueue(&tasks->collnetQueue, collInfo); - } else if (collInfo->userTuned) { - // substract collective which needs to be executed separately - totalCBDBytes -= collInfo->workBytes; - tasks->workBytesTotal -= collInfo->workBytes; - ncclIntruQueueEnqueue(&tasks->collTunedQueue, collInfo); + 
if (comm->rank == 0) { + if (task->isCollnet) { + TRACE(NCCL_COLL, "Collective %s(%s, %s, %s, %s) count=%ld devFuncId=%d channel{Lo..Hi}={%d..%d} count=%ld chunkCount=%d", + ncclFuncToString(task->func), ncclDevRedOpToString(task->opDev.op), + ncclDatatypeToString(task->datatype), ncclAlgoToString(task->algorithm), + ncclProtoToString(task->protocol), + (long)task->count, task->devFuncId, devWork->channelLo, devWork->channelHi, + (long)devWork->collnet.count, devWork->collnet.chunkCount); } else { - ncclIntruQueueEnqueue(&tasks->collCBDQueue, collInfo); + TRACE(NCCL_COLL, "Collective %s(%s, %s, %s, %s) count=%ld devFuncId=%d channel{Lo..Hi}={%d..%d} count{Lo,Mid,Hi}={%ld,%ld,%ld} chunkBytes{Lo,Mid,Hi}={%d,%d,%d}", + ncclFuncToString(task->func), ncclDevRedOpToString(task->opDev.op), + ncclDatatypeToString(task->datatype), ncclAlgoToString(task->algorithm), + ncclProtoToString(task->protocol), + (long)task->count, task->devFuncId, devWork->channelLo, devWork->channelHi, + (long)devWork->cbd.countLo, (long)devWork->cbd.countMid, (long)devWork->cbd.countHi, + int(devWork->cbd.chunkGrainsLo*ncclProtoGrainSize(task->protocol)), + int(devWork->cbd.chunkGrainsMid*ncclProtoGrainSize(task->protocol)), + int(devWork->cbd.chunkGrainsHi*ncclProtoGrainSize(task->protocol))); } } - tasks->usableChannels = std::min(usableChannels, accChannels); + for (int i=0; i < task->nCleanupQueueElts; i++) { + ncclIntruQueueEnqueue(&plan->cleanupQueue, ncclIntruQueueDequeue(&planner->collCleanupQueue)); + } + ncclIntruQueueDequeue(&planner->collTaskQueue); + ncclIntruQueueDequeue(&planner->collWorkQueue); + nPlanColls -= 1; + planner->nTasksColl -= 1; + ncclIntruQueueEnqueue(&plan->workQueue, workNode); + plan->workBytes += workNode->size; } + return ncclSuccess; +} - /* Calculate maxBytesPerChannel for CBD colls and it should be 16 bytes aligned - * Note: it it not hard upper bound for maxBytes, we can relax it if any optimization - * is needed */ - plan->maxBytesPerChannel = DIVUP(DIVUP(totalCBDBytes, tasks->usableChannels), NCCL_BYTES_ALIGNMENT) * NCCL_BYTES_ALIGNMENT; - // First enqueue CBD colls - while (!ncclIntruQueueEmpty(&tasks->collCBDQueue)) { - // Get nChannels and peek whether the budget allows before we enqueue - collInfo = ncclIntruQueueHead(&tasks->collCBDQueue); - collInfo->nChannels = DIVUP(collInfo->workBytes * tasks->usableChannels, totalCBDBytes); - // Haven't got nChannels info yet, relax the budget boundary a bit. - if (*nWorkBudget < collInfo->nChannels) return ncclSuccess; +NCCL_PARAM(P2pLLThreshold, "P2P_LL_THRESHOLD", 16384); +NCCL_PARAM(ChunkSize, "CHUNK_SIZE", 0); - collInfo = ncclIntruQueueDequeue(&tasks->collCBDQueue); - NCCLCHECK(addCBDCollToPlan(comm, plan, tasks->usableChannels, collInfo, nWorkBudget)); - tasks->nTasksColl -= 1; - tasks->workBytesTotal -= collInfo->count * ncclTypeSize(collInfo->datatype); +// Put p2p op in plan assuming there is sizeof(ncclDevWorkBatch) in batch budget +// and sizeof(ncclDevWorkP2p) in work budget. "sendRank" and "recvRank" must +// match the corresponding values for this round of the p2p schedule (no -1's). +// No-op's are encoded with a -1 size. 
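The -1 convention above is what lets a single work struct describe a round where only one direction is active: a missing direction becomes bytes = -1 (later turned into zero channels), while a genuine zero-byte send/recv keeps bytes = 0 and still costs a one-step synchronization. A minimal sketch of that normalization, using a hypothetical helper name (the scheduler below open-codes the same logic):

// Illustrative only: normalize an optional p2p task into the (addr, bytes)
// form addP2pToPlan() expects. 'task == nullptr' means "no op this round".
static inline void normalizeP2pTask(struct ncclTaskP2p* task, void** addr, ssize_t* bytes) {
  *addr  = task ? task->buff : nullptr;
  *bytes = task ? (ssize_t)task->bytes : -1;  // -1: skip this direction, 0: zero-byte sync
}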
+static ncclResult_t addP2pToPlan( + struct ncclComm* comm, struct ncclKernelPlan* plan, + int nChannelsMin, int nChannelsMax, int p2pRound, + int sendRank, void* sendAddr, ssize_t sendBytes, + int recvRank, void* recvAddr, ssize_t recvBytes + ) { + constexpr int connIndex = 1; + bool selfSend = (sendRank == comm->rank); + // recv: dir=0, send: dir=1 + void* addrs[2] = {recvAddr, sendAddr}; + ssize_t bytes[2] = {recvBytes, sendBytes}; + bool protoLL[2] = {!selfSend, !selfSend}; + bool network[2] = {false, false}; + bool proxySameProcess[2] = {true, true}; + uint8_t base = ncclP2pChannelBaseForRound(comm, p2pRound); + if (!selfSend) { + for (int part=0; part < nChannelsMax; part++) { + int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, part); + struct ncclChannelPeer** channelPeers = comm->channels[channelId].peers; + for (int dir=0; dir <= 1; dir++) { + int peerRank = dir ? sendRank : recvRank; + struct ncclConnector* conn = dir ? &channelPeers[peerRank]->send[connIndex] + : &channelPeers[peerRank]->recv[connIndex]; + protoLL[dir] &= conn->conn.buffs[NCCL_PROTO_LL] != nullptr; + network[dir] |= conn->transportComm == (dir ? &netTransport.send : &netTransport.recv); + proxySameProcess[dir] &= conn->proxyConn.sameProcess; + } + } } - // Then enqueue collnet colls - while (!ncclIntruQueueEmpty(&tasks->collnetQueue)) { - collInfo = ncclIntruQueueHead(&tasks->collnetQueue); - if (*nWorkBudget < collInfo->nChannels) return ncclSuccess; + ssize_t thresholdLL = nChannelsMax*ncclParamP2pLLThreshold(); + ssize_t paramChunkSize = ncclParamChunkSize(); + // Arrays indexed by dir where recv=0, send=1: + int nChannels[2]; + int protocol[2]; + int stepSize[2]; + int chunkSize[2]; + int chunkDataSize[2]; + int chunkDataSize_u32fp8[2]; + bool registered[2]; + + for (int dir=0; dir < 2; dir++) { // 0=recv, 1=send + if (bytes[dir] != -1) protoLL[dir] &= bytes[dir] <= thresholdLL; + protocol[dir] = protoLL[dir] ? NCCL_PROTO_LL : NCCL_PROTO_SIMPLE; + + stepSize[dir] = comm->buffSizes[protocol[dir]]/NCCL_STEPS; + if (protocol[dir] == NCCL_PROTO_SIMPLE) stepSize[dir] = comm->p2pChunkSize; + chunkSize[dir] = stepSize[dir]; + if (paramChunkSize != 0) { + chunkSize[dir] = paramChunkSize; + } else if (network[dir]) { + // Tune chunk size for the network + if (protocol[dir] == NCCL_PROTO_SIMPLE && bytes[dir] < stepSize[dir]) chunkSize[dir] /= 4; + else if (bytes[dir] < 8*stepSize[dir]) chunkSize[dir] /= 2; + } - collInfo = ncclIntruQueueDequeue(&tasks->collnetQueue); - NCCLCHECK(addCollnetCollToPlan(comm, plan, tasks->usableChannels, collInfo, nWorkBudget)); - tasks->nTasksColl -= 1; + chunkDataSize[dir] = chunkSize[dir]; + if (protocol[dir] == NCCL_PROTO_LL) chunkDataSize[dir] /= 2; + chunkDataSize_u32fp8[dir] = u32fp8Encode(chunkDataSize[dir]); + chunkDataSize[dir] = u32fp8Decode(chunkDataSize_u32fp8[dir]); + chunkSize[dir] = chunkDataSize[dir]; + if (protocol[dir] == NCCL_PROTO_LL) chunkSize[dir] *= 2; + + registered[dir] = false; + if (bytes[dir] > 0 && network[dir] && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE) { + struct ncclReg* regRecord; + NCCLCHECK(ncclRegFind(comm, addrs[dir], bytes[dir], ®Record)); + registered[dir] = (regRecord && regRecord->nDevs); + } + + if (bytes[dir] == -1) nChannels[dir] = 0; + else if (bytes[dir] == 0) nChannels[dir] = 1; + else { + ssize_t minPartSize = comm->nNodes > 1 ? stepSize[dir]/2 : stepSize[dir]/8; + ssize_t maxPartSize = comm->nNodes > 1 ? 
stepSize[dir] : stepSize[dir]*32; + nChannels[dir] = std::min(nChannelsMin, divUp(bytes[dir], minPartSize)); + size_t partSize = std::max(minPartSize, divUp(bytes[dir], nChannels[dir])); + while (partSize > maxPartSize && nChannels[dir] <= nChannelsMax/2) { + nChannels[dir] *= 2; + partSize = divUp(bytes[dir], nChannels[dir]); + } + } } - // Finally enqueue user-tuned colls - while (!ncclIntruQueueEmpty(&tasks->collTunedQueue)) { - collInfo = ncclIntruQueueHead(&tasks->collTunedQueue); - if (*nWorkBudget < collInfo->nChannels) return ncclSuccess; + struct ncclWorkList* workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); + workNode->workType = ncclDevWorkTypeP2p; + workNode->size = sizeof(struct ncclDevWorkP2p); + ncclIntruQueueEnqueue(&plan->workQueue, workNode); + uint32_t workOffset = plan->workBytes; + plan->workBytes += sizeof(struct ncclDevWorkP2p); + + struct ncclDevWorkP2p* work = (struct ncclDevWorkP2p*)(workNode+1); + work->nP2pChannels = comm->p2pnChannels; + work->channelBase = base; + work->nSendChannels = nChannels[1]; + work->sendProtoLL = protoLL[1]; + work->sendRegistered = registered[1]; + work->sendChunkSize_u32fp8 = chunkDataSize_u32fp8[1]; + work->sendRank = sendRank; + work->sendAddr = sendAddr; + work->sendBytes = sendBytes==-1 ? 0 : sendBytes; + work->nRecvChannels = nChannels[0]; + work->recvProtoLL = protoLL[0]; + work->recvRegistered = registered[0]; + work->recvChunkSize_u32fp8 = chunkDataSize_u32fp8[0]; + work->recvRank = recvRank; + work->recvAddr = recvAddr; + work->recvBytes = recvBytes==-1 ? 0 : recvBytes; + + struct ncclProxyOp proxyOps[2] = {}; + int nProxyOps = selfSend ? 0 : 2; + for (int dir=0; dir < nProxyOps; dir++) { + struct ncclProxyOp* op = &proxyOps[dir]; + op->root = dir ? sendRank : recvRank; + op->sliceSteps = 1; + op->chunkSteps = 1; + op->dtype = ncclInt8; + op->redOp = ncclSum; + op->protocol = protocol[dir]; + op->pattern = dir ? ncclPatternSend : ncclPatternRecv; + op->chunkSize = chunkSize[dir]; + op->reg = registered[dir]; + // The following are modified per channel part in addWorkToChannels(): + // op->buffer, op->nbytes, op->nsteps = ...; + } + + nChannelsMax = std::max(nChannels[0], nChannels[1]); + for (int part=0; part < nChannelsMax; part++) { + int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, part); + plan->channelMask |= uint64_t(1)<nSendChannels : work->nRecvChannels; + void* addr = dir ? work->sendAddr : work->recvAddr; + size_t bytes = dir ? work->sendBytes : work->recvBytes; + + proxyOps[dir].recvbuff = nullptr; + if (nParts <= part) { + proxyOps[dir].nsteps = 0; + } else if (bytes == 0) { + proxyOps[dir].nsteps = 1; + proxyOps[dir].nbytes = 0; + } else { + size_t chunkDataSize = u32fp8Decode(dir ? 
work->sendChunkSize_u32fp8 : work->recvChunkSize_u32fp8); + size_t partBeg, partEnd; + ncclP2pPartBounds(nParts, part, bytes, &partBeg, &partEnd); + if (proxyOps[dir].reg) { + proxyOps[dir].nsteps = 1; + proxyOps[dir].recvbuff = (uint8_t*)addr+partBeg; + proxyOps[dir].nbytes = partEnd-partBeg; + } else { + proxyOps[dir].nsteps = divUp(partEnd-partBeg, chunkDataSize); + proxyOps[dir].nbytes = std::min(partEnd-partBeg, chunkDataSize); + } + if (proxyOps[dir].protocol == NCCL_PROTO_LL) { + proxyOps[dir].nbytes *= 2; + proxyOps[dir].nbytes = roundUp(proxyOps[dir].nbytes, sizeof(union ncclLLFifoLine)); + } + } - collInfo = ncclIntruQueueDequeue(&tasks->collTunedQueue); - NCCLCHECK(addTunedCollToPlan(comm, plan, tasks->usableChannels, collInfo, nWorkBudget)); - tasks->nTasksColl -= 1; + if (proxyOps[dir].nsteps != 0) { + // Calculate the opCount after adding batch since then the batch count will + // equal one plus the batch index this p2p settled in. + proxyOps[dir].channelId = channelId; + proxyOps[dir].opCount = uint64_t(comm->planner.wipPlan.channels[channelId].nWorkBatchesP2p)<<1 | 1; + NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOps[dir])); + } + } } return ncclSuccess; } -static size_t calcP2pChunkSize(size_t totalSize, int minChannels, int maxChannels, size_t minSize, size_t maxSize) { +static int calcP2pChannelCount(size_t totalSize, int minChannels, int maxChannels, size_t minSize, size_t maxSize) { size_t size = std::max(minSize, divUp(totalSize, minChannels)); int nChannels = minChannels; while (size > maxSize && nChannels <= maxChannels/2) { nChannels *= 2; size = divUp(totalSize, nChannels); } - return alignUp(size, minSize); + return nChannels; } static ncclResult_t scheduleP2pTasksToPlan( - struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget + struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclKernelPlanBudget* budget ) { - struct ncclTasks* tasks = &comm->tasks; int nRanks = comm->nRanks; - struct ncclTasks::Peer* peers = tasks->peers; - int const *sendOrder = tasks->p2pSendOrder; - int const *recvOrder = tasks->p2pRecvOrder; + struct ncclKernelPlanner::Peer* peers = comm->planner.peers; plan->threadPerBlock = std::max(plan->threadPerBlock, NCCL_MAX_NTHREADS); if (!plan->kernelSpecialized) { @@ -895,264 +1002,213 @@ static ncclResult_t scheduleP2pTasksToPlan( } // Compute how much to split operations - // Natural step size matching buffer steps. - ssize_t stepSize = comm->p2pChunkSize; // Try to use all channels int nChannelsMax = comm->p2pnChannelsPerPeer; int nChannelsMin = nChannelsMax; // Try to use all channels, but one channel per operation. while (nChannelsMin*nRanks > comm->p2pnChannels && nChannelsMin > 1) nChannelsMin /= 2; - bool fuseOk = false; - // We can perform 8 send/recv per round per CTA. Make sure we jump between fused blocks at node boundaries. - while (tasks->nTasksP2p != 0) { - for (int i=0; i < tasks->p2pOrderSteps; i++) { - int sendPeer = sendOrder[i]; - int recvPeer = recvOrder[i]; - struct ncclTaskP2p* send = sendPeer != -1 ? ncclIntruQueueHead(&peers[sendPeer].sendQueue) : NULL; - struct ncclTaskP2p* recv = recvPeer != -1 ? 
ncclIntruQueueHead(&peers[recvPeer].recvQueue) : NULL; - if (sendPeer == comm->rank) { - if (recvPeer != comm->rank) { - WARN("Sendrecv plan not aligned for self"); - return ncclInternalError; - } - if (send && recv == nullptr) { + while (comm->planner.nTasksP2p != 0) { + for (int round=0; round < nRanks; round++) { + int sendRank = comm->p2pSchedule[round].sendRank; + int recvRank = comm->p2pSchedule[round].recvRank; + struct ncclTaskP2p* send = ncclIntruQueueHead(&peers[sendRank].sendQueue); + struct ncclTaskP2p* recv = ncclIntruQueueHead(&peers[recvRank].recvQueue); + if (send == nullptr && recv == nullptr) continue; + + if (sendRank == comm->rank) { + if (send != nullptr && recv == nullptr) { WARN("Trying to send to self without a matching recv"); return ncclInvalidUsage; } - if (send == nullptr && recv) { + if (send == nullptr && recv != nullptr) { WARN("Trying to recv to self without a matching send"); return ncclInvalidUsage; } } - if (send != nullptr || recv != nullptr) { - char* recvPtr = recv ? (char*)recv->buff : nullptr; - char* sendPtr = send ? (char*)send->buff : nullptr; - ssize_t recvBytes = recv ? recv->bytes : 0; - ssize_t sendBytes = send ? send->bytes : 0; - ssize_t minSize = comm->nNodes > 1 ? stepSize/2 : stepSize/8; - ssize_t maxSize = comm->nNodes > 1 ? stepSize : stepSize*32; - ssize_t recvChunkBytesMax = calcP2pChunkSize(recvBytes, nChannelsMin, nChannelsMax, minSize, maxSize); - ssize_t sendChunkBytesMax = calcP2pChunkSize(sendBytes, nChannelsMin, nChannelsMax, minSize, maxSize); - // Zero size send/recv are syncs, encode here with -1. - recvBytes = recv && recvBytes == 0 ? -1 : recvBytes; - sendBytes = send && sendBytes == 0 ? -1 : sendBytes; - // Advance to current chunk. Syncs will always have chunk=0 so no effect on the -1. - if (recv) recvPtr += recv->chunk*recvChunkBytesMax; - if (recv) recvBytes -= recv->chunk*recvChunkBytesMax; - if (send) sendPtr += send->chunk*sendChunkBytesMax; - if (send) sendBytes -= send->chunk*sendChunkBytesMax; - - do { - if ((i % (NCCL_MAX_WORK_ELEMENTS_P2P/2)) == 0) fuseOk = false; - ssize_t recvChunkBytes = std::min(recvBytes, recvChunkBytesMax); // -1 preserved - ssize_t sendChunkBytes = std::min(sendBytes, sendChunkBytesMax); - if (recvChunkBytes != 0) { - if (recvChunkBytes == -1) recvChunkBytes = 0; - if (*nWorkBudget < 1) return ncclSuccess; // ensure room in budget - NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/false, recvPeer, recv->chunk, recvPtr, recvChunkBytes, fuseOk)); - fuseOk = true; - recvPtr += recvChunkBytes; - recvBytes -= recvChunkBytes; - recv->chunk += 1; - if (recvBytes <= 0) { - recvBytes = 0; // in case still -1 - ncclIntruQueueDequeue(&peers[recvPeer].recvQueue); - tasks->nTasksP2p -= 1; - } - } - if (sendChunkBytes != 0) { - if (sendChunkBytes == -1) sendChunkBytes = 0; - if (*nWorkBudget < 1) return ncclSuccess; // ensure room in budget - NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/true, sendPeer, send->chunk, sendPtr, sendChunkBytes, fuseOk)); - fuseOk = true; - sendPtr += sendChunkBytes; - sendBytes -= sendChunkBytes; - send->chunk += 1; - if (sendBytes <= 0) { - sendBytes = 0; // in case still -1 - ncclIntruQueueDequeue(&peers[sendPeer].sendQueue); - tasks->nTasksP2p -= 1; - } - } - } while (sendBytes != 0 || recvBytes != 0); + ssize_t sendBytes = send ? send->bytes : -1; + ssize_t recvBytes = recv ? recv->bytes : -1; + void* sendBuff = send ? send->buff : nullptr; + void* recvBuff = recv ? 
recv->buff : nullptr; + + if (sendRank == comm->rank && send->buff == recv->buff) { + // Skip send to self in-place (we don't need to support this). + ncclIntruQueueDequeue(&peers[sendRank].sendQueue); + ncclIntruQueueDequeue(&peers[recvRank].recvQueue); + comm->planner.nTasksP2p -= 2; + } else { + // Ensure room for worst case of one new batch per channel. + if (!testBudget(budget, plan->nWorkBatches+nChannelsMax, plan->workBytes + sizeof(struct ncclDevWorkP2p))) { + return ncclSuccess; + } + NCCLCHECK(addP2pToPlan(comm, plan, nChannelsMin, nChannelsMax, round, sendRank, sendBuff, sendBytes, recvRank, recvBuff, recvBytes)); + if (send != nullptr) { + ncclIntruQueueDequeue(&peers[sendRank].sendQueue); + comm->planner.nTasksP2p -= 1; + } + if (recv != nullptr) { + ncclIntruQueueDequeue(&peers[recvRank].recvQueue); + comm->planner.nTasksP2p -= 1; + } } } } return ncclSuccess; } -// Comparison of monotonic rolling counters. -static inline bool rollingLess32(uint32_t a, uint32_t b) { - constexpr uint32_t PositiveMax = uint32_t(-1)>>1; - return a-b > PositiveMax; -} -static inline uint32_t rollingMin32(uint32_t a, uint32_t b) { - constexpr uint32_t PositiveMax = uint32_t(-1)>>1; - return (b-a <= PositiveMax) ? a : b; -} - -// Spin until its safe to increase comm->workFifoSent to desiredSent. -static void waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredSent) { - if (__builtin_expect(rollingLess32(comm->workFifoAckdMin + comm->workFifoDepth, desiredSent), false)) { - while (1) { - // We have to poll for notifications from device. - uint32_t* doneLive = comm->workFifoDone; - uint32_t ackd[MAXCHANNELS]; - for (int c=0; c < MAXCHANNELS; c++) { - ackd[c] = __atomic_load_n(&doneLive[c], __ATOMIC_RELAXED); - } - // Compiler-only fence to prevent fusion of loops to encourage dense loads. - __atomic_signal_fence(__ATOMIC_SEQ_CST); - - uint32_t ackdAll = comm->workFifoSent; - for (int c=0; c < MAXCHANNELS; c++) { - // ackdAll is min over all non-quiesced channels - if (ackd[c] != comm->channels[c].workFifoSent) - ackdAll = rollingMin32(ackdAll, ackd[c]); +// Spin until its safe to increase comm->workFifoProduced to desiredProduced. +static void waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredProduced) { + bool hasRoom = (desiredProduced - comm->workFifoConsumedLeast) <= comm->workFifoBytes; + if (hasRoom) return; + while (true) { + // We have to poll for notifications from device. + uint32_t* consumedLive = comm->workFifoConsumed; + uint32_t consumed[MAXCHANNELS]; + for (int c=0; c < MAXCHANNELS; c++) { + consumed[c] = __atomic_load_n(&consumedLive[c], __ATOMIC_RELAXED); + } + // Compiler-only fence to prevent fusion of loops to encourage dense loads. + __atomic_signal_fence(__ATOMIC_SEQ_CST); + + uint32_t produced = comm->workFifoProduced; + uint32_t consumedLeast = produced; + for (int c=0; c < MAXCHANNELS; c++) { + // consumedLeast is min over all non-quiesced channels + if (consumed[c] != comm->channels[c].workFifoProduced) { + if ((produced - consumedLeast) < (produced - consumed[c])) { + consumedLeast = consumed[c]; + } } + } - // Compiler only fence to prevent fusion of loops to encourage dense stores. - __atomic_signal_fence(__ATOMIC_SEQ_CST); + // Compiler only fence to prevent fusion of loops to encourage dense stores. + __atomic_signal_fence(__ATOMIC_SEQ_CST); - for (int c=0; c < MAXCHANNELS; c++) { - // Advance counter on quiesced channels so they don't lag behind - // too far where they could get lost in 32-bit wraparound. 
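The reworked waitWorkFifoAvailable() in this hunk drops the old rollingLess32/rollingMin32 helpers and relies directly on unsigned 32-bit subtraction, which stays correct across cursor wraparound as long as producer and consumer never drift apart by more than the fifo size. A self-contained sketch of that invariant; the names here are illustrative, not the comm fields:

#include <stdint.h>
// Room check with monotonically increasing 32-bit byte cursors. The unsigned
// difference is the number of bytes still in flight, computed modulo 2^32,
// which is exact whenever (produced - consumed) never exceeds fifoBytes.
static inline bool fifoHasRoom(uint32_t produced, uint32_t consumedLeast,
                               uint32_t fifoBytes, uint32_t wantBytes) {
  return (produced + wantBytes) - consumedLeast <= fifoBytes;
}
// Example across wraparound: produced = 0xFFFFFF00, consumedLeast = 0xFFFFFE00,
// fifoBytes = 0x1000 -> in-flight = 0x100 <= 0x1000, so there is room even
// though both cursors are about to wrap past zero.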
- if (ackd[c] == comm->channels[c].workFifoSent) { - comm->channels[c].workFifoSent = ackdAll; - __atomic_store_n(&doneLive[c], ackdAll, __ATOMIC_RELAXED); - } + for (int c=0; c < MAXCHANNELS; c++) { + // Advance counter on quiesced channels so they don't lag behind + // too far where they could get lost in 32-bit wraparound. + if (consumed[c] == comm->channels[c].workFifoProduced) { + comm->channels[c].workFifoProduced = consumedLeast; + __atomic_store_n(&consumedLive[c], consumedLeast, __ATOMIC_RELAXED); } - comm->workFifoAckdMin = ackdAll; - - // See if that was enough. - if (!rollingLess32(comm->workFifoAckdMin + comm->workFifoDepth, desiredSent)) break; - sched_yield(); } + comm->workFifoConsumedLeast = consumedLeast; + + hasRoom = (desiredProduced - comm->workFifoConsumedLeast) <= comm->workFifoBytes; + if (hasRoom) break; + sched_yield(); } } static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* plan) { - bool persistent = plan->persistent; - int channelUbound = plan->channelUbound; - int nWork = 0; - for (int c=0; c < channelUbound; c++) nWork += plan->channels[c].nWork; - - struct ncclWork* workHeap; - if (!persistent) { - workHeap = comm->workFifoHeap; - } else { - workHeap = ncclMemoryStackAlloc(&comm->memScoped, nWork); - } - uint32_t ixMask = persistent ? ~uint32_t(0) : comm->workFifoDepth-1; - uint32_t ixSent; - if (persistent) { - ixSent = 0; - } else { - ixSent = comm->workFifoSent; - // First work for a channel has to be at workHeap+blockIdx.x which means - // we cannot tolerate fifo wraparound. So round up to the wrap boundary - // if not doing so would incur crossing it. - if (((ixSent + plan->channelCount-1) & ixMask) < (ixSent & ixMask)) { - ixSent = (ixSent + ixMask) & ~ixMask; - // Need to update workFifoSent so waitWorkFifoAvailable() knows we've - // skipped those elements. Consider if all the channels report quiesced, - // this way the skipped slots will be considered consumed as well. - comm->workFifoSent = ixSent; - } - waitWorkFifoAvailable(comm, ixSent + nWork); - } - uint32_t ixHead = ixSent; - ixSent += plan->channelCount; - int channelsWithWork = 0; // number of channels below `c` with work structs. - for (int c=0; c < channelUbound; c++) { - struct ncclWorkList* q = ncclIntruQueueHead(&plan->channels[c].workQueue); - // Offset of first work equals number of channels below with work. - uint32_t ix = ixHead + channelsWithWork; - channelsWithWork += q != nullptr ? 1 : 0; - while (q != nullptr) { - if (q->next != nullptr) { - q->work.header.workNext = int32_t(ixSent & ixMask) - int32_t(ixHead & ixMask); - } else { - q->work.header.inFifo = !persistent ? 1 : 0; - // Tell channel to ack us back ix+1 indicating that all slots up to and - // including ix have been consumed. 
- q->work.header.doneAcks = ix+1; - comm->channels[c].workFifoSent = ix+1; - } - workHeap[ix & ixMask] = q->work; // C++ struct assignment - q = q->next; - if (q != nullptr) ix = ixSent++; + size_t workBytes = plan->workBytes; + size_t batchBytes = plan->nWorkBatches*sizeof(struct ncclDevWorkBatch); + void* fifoBuf; + uint32_t fifoCursor, fifoMask; + + switch (plan->workStorageType) { + case ncclDevWorkStorageTypeArgs: + plan->kernelArgs->workBuf = nullptr; + fifoBuf = (void*)plan->kernelArgs; + fifoCursor = sizeof(ncclDevKernelArgs) + batchBytes; + fifoMask = ~0u; + break; + case ncclDevWorkStorageTypeFifo: + fifoBuf = comm->workFifoBuf; + fifoCursor = comm->workFifoProduced; + fifoMask = comm->workFifoBytes-1; + waitWorkFifoAvailable(comm, fifoCursor + workBytes); + plan->kernelArgs->workBuf = comm->workFifoBufDev; + break; + case ncclDevWorkStorageTypePersistent: + ncclMemoryStackPush(&comm->memScoped); + fifoBuf = ncclMemoryStackAlloc(&comm->memScoped, workBytes, /*align=*/16); + fifoCursor = 0; + fifoMask = ~0u; + break; + default: + return ncclInternalError; + } + plan->kernelArgs->workMask = fifoMask; + + // Batches were placed after kernelArgs by finishPlan(). Only thing left to + // do is translate the work offset from zero based (in plan) to: + // ncclDevWorkStorageTypeArgs: offset from beginning of kernel args + // ncclDevWorkStorageTypeFifo: offset from base of fifo + // ncclDevWorkStorageTypePersistent: no translation since our dedicated buffer will also begin at zero. + struct ncclDevWorkBatch* batchZero = (struct ncclDevWorkBatch*)(plan->kernelArgs+1); + for (int b=0; b < plan->nWorkBatches; b++) { + batchZero[b].offsetBase += fifoCursor; + } + + // Write the channel-shared work structs. + struct ncclWorkList* workNode = ncclIntruQueueHead(&plan->workQueue); + while (workNode != nullptr) { + char* dst = (char*)fifoBuf; + char* src = (char*)(workNode+1); + for (int n = workNode->size; n != 0; n -= 16) { + memcpy( + __builtin_assume_aligned(dst + (fifoCursor & fifoMask), 16), + __builtin_assume_aligned(src, 16), + 16 + ); + fifoCursor += 16; + src += 16; } + workNode = workNode->next; } - if (!persistent) { - comm->workFifoSent = ixSent; - if (comm->workFifoHeapGdrHandle != nullptr) wc_store_fence(); - plan->workHead = &comm->devWorkFifoHeap[ixHead & ixMask]; - } else { - NCCLCHECK(ncclCudaMalloc(&plan->workHead, nWork)); - NCCLCHECK(ncclCudaMemcpy(plan->workHead, workHeap, nWork)); + switch (plan->workStorageType) { + case ncclDevWorkStorageTypeFifo: + comm->workFifoProduced = fifoCursor; + if (comm->workFifoBufGdrHandle != nullptr) wc_store_fence(); + break; + case ncclDevWorkStorageTypePersistent: + NCCLCHECK(ncclCudaMalloc(&plan->workBufPersistent, workBytes)); + plan->kernelArgs->workBuf = plan->workBufPersistent; + NCCLCHECK(ncclCudaMemcpy(plan->workBufPersistent, fifoBuf, workBytes)); + ncclMemoryStackPop(&comm->memScoped); + break; + default: break; } return ncclSuccess; } static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan* plan) { uint64_t collOpCount = comm->sharedRes->collOpCount; + uint64_t p2pOpBump[MAXCHANNELS] = {/*0...*/}; // Advance comm's collOpCount by number of colls in this plan. comm->sharedRes->collOpCount += plan->collOpCount; - uint64_t p2pOpBump[MAXCHANNELS]; - struct ncclProxyOp* heads[MAXCHANNELS]; - uint64_t headIds[MAXCHANNELS]; - int nHeads = 0; - for (int c=0; c < plan->channelUbound; c++) { - p2pOpBump[c] = 0; - heads[c] = ncclIntruQueueHead(&plan->channels[c].proxyOpQueue); - nHeads += (heads[c] != nullptr) ? 
1 : 0; - headIds[c] = (heads[c] != nullptr) ? heads[c]->opCount : uint64_t(-1); - } - - while (nHeads != 0) { - int minChan = -1; - uint64_t minId = uint64_t(-1); - // We store the heads[c]->opCount in headIds[c] specifically to remove indirect - // loads from this loop which speeds it up considerably. - for (int c=0; c < plan->channelUbound; c++) { - uint64_t id = headIds[c]; - id = (id>>1 | id<<63); // Move tag bit to order collectives before p2p's - if (id < minId) { minChan = c; minId = id; } - } - - struct ncclProxyOp* q = heads[minChan]; - uint64_t oldId = headIds[minChan]; // same as q->opCount - // Advance heads[c] - heads[minChan] = q->enqNext; - if (q->enqNext == nullptr) nHeads -= 1; - headIds[minChan] = (q->enqNext != nullptr) ? q->enqNext->opCount : uint64_t(-1); - + struct ncclProxyOp* op = ncclIntruQueueHead(&plan->proxyOpQueue); + while (op != nullptr) { + uint64_t oldId = op->opCount; // Ignoring the bottom tag bit, opCount's are zero-based within plan so // translate them to the tip of the comm's history. if (oldId & 1) { // p2p // opCount is monotonic increasing within a plan's channel so just // remember last value to compute max. - p2pOpBump[minChan] = (oldId>>1) + 1; // +1 to ensure next plan doesn't collide - q->opCount = (comm->sharedRes->p2pOpCount[minChan]<<1) + oldId; + p2pOpBump[op->channelId] = (oldId>>1) + 1; // +1 to ensure next plan doesn't collide + op->opCount = (comm->sharedRes->p2pOpCount[op->channelId]<<1) + oldId; } else { // coll - q->opCount = (collOpCount<<1) + oldId; + op->opCount = (collOpCount<<1) + oldId; } - NCCLCHECK(ncclProxySaveOp(comm, q, nullptr)); - q->opCount = oldId; // Restore for next uploadProxyOps() + NCCLCHECK(ncclProxySaveOp(comm, op, nullptr)); + op->opCount = oldId; // Restore for next uploadProxyOps() + + struct ncclProxyOp* opNext = op->enqNext; if (!plan->persistent) { // Non-persistent kernels upload ops only once so can be free'd here. - ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, q); + ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, op); } + op = opNext; } - for (int c=0; c < plan->channelUbound; c++) { - // Erase proxyOpQueue since all ops were free'd back to mempool. - if (!plan->persistent) ncclIntruQueueConstruct(&plan->channels[c].proxyOpQueue); + // Erase proxyOpQueue since all ops were free'd back to mempool. + if (!plan->persistent) ncclIntruQueueConstruct(&plan->proxyOpQueue); + + for (int c=0; c < MAXCHANNELS; c++) { // Advance channel's p2pOpCount by number of p2p's in this plan channel. 
comm->sharedRes->p2pOpCount[c] += p2pOpBump[c]; } @@ -1182,33 +1238,20 @@ static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback* struct ncclKernelPlan* plan = (struct ncclKernelPlan*)me; // cast from first member `reclaim` if (plan->persistent) { comm->persistentRefs -= 1; - NCCLCHECK(ncclCudaFree(plan->workHead)); - for (int c=0; c < plan->channelUbound; c++) { - struct ncclProxyOp* q = ncclIntruQueueHead(&plan->channels[c].proxyOpQueue); - while (q != nullptr) { - struct ncclProxyOp* q1 = q->enqNext; - ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, q); - q = q1; - } - } - while (!ncclIntruQueueEmpty(&plan->ipcMemQueue)) { - struct ncclPointerList* q = ncclIntruQueueDequeue(&plan->ipcMemQueue); - CUDACHECKIGNORE(cudaIpcCloseMemHandle(q->ptr)); - ncclMemoryPoolFree(&comm->memPool_ncclPointerList, q); - } - /* free mcHandle */ - while (!ncclIntruQueueEmpty(&plan->nvlsMcHandleQueue)) { - struct ncclNvlsMcHandleList* obj = ncclIntruQueueDequeue(&plan->nvlsMcHandleQueue); - NCCLCHECK(ncclNvlsDeregBuffer(&obj->mcHandle, obj->ptr, obj->dev, obj->size)); - INFO(NCCL_NVLS, "rank %d - deregistered buffer %p on device %d, size %ld", comm->rank, (void*)obj->ptr, obj->dev, obj->size); - ncclMemoryPoolFree(&comm->memPool_ncclNvlsHandleList, obj); + NCCLCHECK(ncclCudaFree(plan->workBufPersistent)); + struct ncclProxyOp* q = ncclIntruQueueHead(&plan->proxyOpQueue); + while (q != nullptr) { + struct ncclProxyOp* q1 = q->enqNext; + ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, q); + q = q1; } - while (!ncclIntruQueueEmpty(&plan->collnetHandleQueue)) { - struct ncclCollnetHandleList* obj = ncclIntruQueueDequeue(&plan->collnetHandleQueue); - NCCLCHECK(ncclCollnetDeregBuffer(comm, obj->proxyconn, obj->collnetHandle)); - INFO(NCCL_REG, "rank %d - deregistered collnet buffer handle %p, size %ld, buff %p", comm->rank, obj->collnetHandle, obj->size, obj->buffer); - ncclMemoryPoolFree(&comm->memPool_ncclCollnetHandleList, obj); + ncclResult_t result = ncclSuccess; + while (!ncclIntruQueueEmpty(&plan->cleanupQueue)) { + struct ncclCommCallback* cb = ncclIntruQueueDequeue(&plan->cleanupQueue); + ncclResult_t res1 = cb->fn(comm, cb); // Expect to reclaim memory of cb + if (res1 != ncclSuccess) result = res1; } + NCCLCHECK(result); } ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan); return ncclSuccess; @@ -1226,57 +1269,54 @@ static void persistentDestructor(void* plans_) { ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { ncclResult_t result = ncclSuccess; - struct ncclTasks* tasks = &comm->tasks; - bool persistent = ncclCudaGraphValid(tasks->capturingGraph); + struct ncclKernelPlanner* planner = &comm->planner; + bool persistent = ncclCudaGraphValid(planner->capturingGraph); + planner->persistent = persistent; int nPlans = 0; // Poll for callbacks sent to us from other threads. Typically these free // resources from to our memory pools. NCCLCHECK(ncclCommPollCallbacks(comm, /*waitSome=*/false)); - // We already have one frame present which holds all of our tasks (which we - // are about to schedule). Now push an additional frame for allocating - // work structs (see appendWorkElem() variants all use scoped allocation). 
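reclaimPlan() above now drains a generic cleanupQueue of ncclCommCallback nodes instead of hard-coding IPC, NVLS and collnet deregistration; each node is expected to release its resource and reclaim its own storage when invoked. A sketch of what one such callback could look like, with a hypothetical node type and malloc/free lifetime that are not taken from this patch:

// Illustrative cleanup node: closes a cudaIpc mapping when the owning plan is
// reclaimed. The ncclCommCallback must be the first member because the queue
// hands back plain ncclCommCallback pointers.
struct ipcCleanupNode {
  struct ncclCommCallback base;
  void* ipcPtr;   // pointer previously returned by cudaIpcOpenMemHandle()
};
static ncclResult_t ipcCleanupFn(struct ncclComm* comm, struct ncclCommCallback* cb) {
  struct ipcCleanupNode* node = (struct ipcCleanupNode*)cb;
  CUDACHECKIGNORE(cudaIpcCloseMemHandle(node->ipcPtr));
  free(node);  // "Expect to reclaim memory of cb": the callback frees its own node
  return ncclSuccess;
}
// Registration side (when the buffer was set up), assuming a malloc'd node:
//   node->base.fn = ipcCleanupFn;
//   ncclIntruQueueEnqueue(&plan->cleanupQueue, &node->base);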
- ncclMemoryStackPush(&comm->memScoped); - - if (tasks->nTasksColl + tasks->nTasksP2p != 0) { + if (planner->nTasksColl + planner->nTasksP2p != 0) { do { + memset(&planner->wipPlan, 0, sizeof(planner->wipPlan)); + struct ncclKernelPlan* plan = ncclMemoryPoolAlloc(&comm->memPool_ncclKernelPlan, &comm->memPermanent); - ncclIntruQueueEnqueue(&comm->planQueue, plan); - nPlans += 1; plan->comm = comm; plan->reclaimer.fn = reclaimPlan; plan->persistent = persistent; + // uploadWork() promotes ncclDevWorkStorageType[Fifo|Buf]->Args if the work can fit. + plan->workStorageType = persistent ? ncclDevWorkStorageTypePersistent + : ncclDevWorkStorageTypeFifo; + struct ncclKernelPlanBudget budget; + budget.inArgsBytes = comm->workArgsBytes - sizeof(struct ncclDevKernelArgs); // Non-persistent kernels fill up at most half of our fifo per kernel. - int nWorkBudget = plan->persistent ? INT_MAX : comm->workFifoDepth/2; - int nWorkBudgetOld = nWorkBudget; + budget.outArgsBytes = plan->persistent ? (1<<30) : comm->workFifoBytes/2; // Drain coll tasks first. This is essential since we partition tasks based // on the work budget and p2p work isn't collective. If we were to drain p2p // first, the place where we cut the kernel could vary by rank which would // cause the "shortest channel first" channel picker to have divergent results. - if (tasks->nTasksColl != 0) { - NCCLCHECKGOTO(scheduleCollTasksToPlan(comm, plan, &nWorkBudget), result, failure); + if (planner->nTasksColl != 0) { + NCCLCHECKGOTO(scheduleCollTasksToPlan(comm, plan, &budget), result, failure); } // And only drain p2p tasks once colls are depleted. - if (tasks->nTasksColl == 0 && tasks->nTasksP2p != 0) { - NCCLCHECKGOTO(scheduleP2pTasksToPlan(comm, plan, &nWorkBudget), result, failure); + if (planner->nTasksColl == 0 && planner->nTasksP2p != 0) { + NCCLCHECKGOTO(scheduleP2pTasksToPlan(comm, plan, &budget), result, failure); } - if (nWorkBudget == nWorkBudgetOld) { - // We weren't able to fit any tasks into our budget which means now we're - // stuck in an infinite loop. We defer this check until here, instead of - // doing it in comm init, to permit testing with insanely shallow queues - // for cases where that's expected to still work (e.g. few channels). - WARN("'NCCL_WORK_FIFO_DEPTH=%d' is too small. Minimum value is %d", comm->workFifoDepth, 2*MAXCHANNELS); - result = ncclInvalidUsage; - goto failure; + finishPlan(comm, plan); + if (plan->workBytes != 0) { + ncclIntruQueueEnqueue(&planner->planQueue, plan); + nPlans += 1; } - finishPlan(plan); - } while (tasks->nTasksColl + tasks->nTasksP2p != 0); + } while (planner->nTasksColl + planner->nTasksP2p != 0); + + struct ncclKernelPlan* planHead = ncclIntruQueueHead(&planner->planQueue); + planner->unlaunchedPlansHead = planHead; - struct ncclKernelPlan* planHead = ncclIntruQueueHead(&comm->planQueue); - comm->unlaunchedPlansHead = planHead; + if (nPlans == 0) return ncclSuccess; // Semantically we want these dependencies for the kernels launched: // 1. Launch host task on hostStream. @@ -1292,15 +1332,15 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { // 7. userStream[1...] each waits on deviceStream // The two-level fan-in fan-out is because ncclStrongStreamWaitStream() requires // at least one of the two streams to be strong-stream. 
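The numbered dependency list above is easiest to picture with plain CUDA events; NCCL's ncclStrongStream helpers implement the same shape while also handling graph capture, so the following is only an approximation of the idea, not the actual mechanism:

// Rough sketch of the fan-in half using ordinary CUDA events.
// userStreams[0] is the launch stream; deviceStream stands in for
// comm->sharedRes->deviceStream.
static ncclResult_t fanInExample(cudaStream_t* userStreams, int nUserStreams, cudaStream_t deviceStream) {
  cudaEvent_t ev;
  CUDACHECK(cudaEventCreateWithFlags(&ev, cudaEventDisableTiming));
  for (int i = 1; i < nUserStreams; i++) {        // extra user streams -> deviceStream
    CUDACHECK(cudaEventRecord(ev, userStreams[i]));
    CUDACHECK(cudaStreamWaitEvent(deviceStream, ev, 0));
  }
  CUDACHECK(cudaEventRecord(ev, deviceStream));   // deviceStream -> launch stream
  CUDACHECK(cudaStreamWaitEvent(userStreams[0], ev, 0));
  CUDACHECK(cudaEventDestroy(ev));                // safe: destruction is deferred past pending waits
  // Kernels then launch on userStreams[0]; ncclLaunchFinish() reverses the
  // pattern so the remaining user streams wait on the launched work (fan-out).
  return ncclSuccess;
}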
- cudaStream_t launchStream = tasks->streams->stream; - NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->sharedRes->deviceStream), result, failure); + cudaStream_t launchStream = planner->streams->stream; + NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->deviceStream), result, failure); // Create dependency for device stream on user streams. First from extra user // streams to deviceStream. Then deviceStream to first user stream. - for (struct ncclCudaStreamList* l=tasks->streams->next; l != nullptr; l = l->next) { - NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->sharedRes->deviceStream, l->stream), result, failure); + for (struct ncclCudaStreamList* l=planner->streams->next; l != nullptr; l = l->next) { + NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, &comm->sharedRes->deviceStream, l->stream), result, failure); } - NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->sharedRes->deviceStream), result, failure); + NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, launchStream, &comm->sharedRes->deviceStream), result, failure); if (persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking) { // We have to launch host tasks to push proxy args. We are careful to only @@ -1310,28 +1350,24 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { if (plan->hasProxyOps) { if (!acquired) { acquired = true; - NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->sharedRes->hostStream), result, failure); + NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->hostStream), result, failure); } - NCCLCHECKGOTO(ncclStrongStreamLaunchHost(tasks->capturingGraph, &comm->sharedRes->hostStream, hostStreamPlanCallback, plan), result, failure); + NCCLCHECKGOTO(ncclStrongStreamLaunchHost(planner->capturingGraph, &comm->sharedRes->hostStream, hostStreamPlanCallback, plan), result, failure); } } if (acquired) { // Make to-be-launched kernels dependent on just-launched host stream tasks. 
- NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->sharedRes->hostStream), result, failure); - NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->sharedRes->hostStream), result, failure); + NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, launchStream, &comm->sharedRes->hostStream), result, failure); + NCCLCHECKGOTO(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->hostStream), result, failure); } } if (persistent) { comm->persistentRefs += nPlans; - NCCLCHECKGOTO(ncclCudaGraphAddDestructor(tasks->capturingGraph, persistentDestructor, (void*)planHead), result, failure); + NCCLCHECKGOTO(ncclCudaGraphAddDestructor(planner->capturingGraph, persistentDestructor, (void*)planHead), result, failure); } } - - if (false) { - failure: - ncclMemoryStackPop(&comm->memScoped); // deallocate ncclWork's - } +failure: return result; } @@ -1349,13 +1385,21 @@ NCCL_PARAM(MemSyncDomain, "MEM_SYNC_DOMAIN", cudaLaunchMemSyncDomainRemote); #endif ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan) { - struct ncclTasks* tasks = &comm->tasks; - void *fn = plan->kernelFn; - cudaStream_t launchStream = tasks->streams->stream; - dim3 grid = {(unsigned)plan->channelCount, 1, 1}; + struct ncclKernelPlanner* planner = &comm->planner; + int nChannels = countOneBits(plan->channelMask); + void* sym = plan->kernelFn; + dim3 grid = {(unsigned)nChannels, 1, 1}; dim3 block = {(unsigned)plan->threadPerBlock, 1, 1}; - size_t smem = ncclShmemDynamicSize(comm->cudaArch); - void *args[3] = {&comm->devComm, &plan->channelMask, &plan->workHead}; + int smem = ncclShmemDynamicSize(comm->cudaArch); + cudaStream_t launchStream = planner->streams->stream; + void* extra[] = { + CU_LAUNCH_PARAM_BUFFER_POINTER, plan->kernelArgs, + CU_LAUNCH_PARAM_BUFFER_SIZE, &plan->kernelArgsSize, + CU_LAUNCH_PARAM_END + }; + + CUfunction fn; + CUDACHECK(cudaGetFuncBySymbol(&fn, sym)); #if CUDART_VERSION >= 11080 int driverVersion; @@ -1364,8 +1408,8 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan int compCap = comm->compCap; unsigned int clusterSize = (compCap == 90) ? 
comm->config.cgaClusterSize : 0; - cudaLaunchConfig_t launchConfig = {0}; - cudaLaunchAttribute launchAttrs[3]; + CUlaunchConfig launchConfig = {0}; + CUlaunchAttribute launchAttrs[3]; int attrs = 0; /* Cooperative Group Array (CGA) * On sm90 and later we have an extra level of hierarchy where we @@ -1380,31 +1424,37 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan if (clusterSize) { // Grid dimension must be divisible by clusterSize if (grid.x % clusterSize) clusterSize = 1; - launchAttrs[attrs].id = cudaLaunchAttributeClusterDimension; - launchAttrs[attrs++].val.clusterDim = {clusterSize, 1, 1}; - launchAttrs[attrs].id = cudaLaunchAttributeClusterSchedulingPolicyPreference; - launchAttrs[attrs++].val.clusterSchedulingPolicyPreference = cudaClusterSchedulingPolicySpread; + launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION; + launchAttrs[attrs++].value.clusterDim = {clusterSize, 1, 1}; + launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE; + launchAttrs[attrs++].value.clusterSchedulingPolicyPreference = CU_CLUSTER_SCHEDULING_POLICY_SPREAD; } #if CUDART_VERSION >= 12000 if (compCap >= 90 && driverVersion >= 12000) { // Set the NCCL Mem Sync domain on CUDA 12.0 and later (sm90) - launchAttrs[attrs].id = cudaLaunchAttributeMemSyncDomain; - launchAttrs[attrs++].val.memSyncDomain = (cudaLaunchMemSyncDomain) ncclParamMemSyncDomain(); + launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN; + launchAttrs[attrs++].value.memSyncDomain = (CUlaunchMemSyncDomain) ncclParamMemSyncDomain(); } #endif - launchConfig.gridDim = grid; - launchConfig.blockDim = block; - launchConfig.dynamicSmemBytes = smem; + launchConfig.gridDimX = grid.x; + launchConfig.gridDimY = grid.y; + launchConfig.gridDimZ = grid.z; + launchConfig.blockDimX = block.x; + launchConfig.blockDimY = block.y; + launchConfig.blockDimZ = block.z; + launchConfig.sharedMemBytes = smem; launchConfig.attrs = launchAttrs; launchConfig.numAttrs = attrs; - launchConfig.stream = launchStream; + launchConfig.hStream = launchStream; - CUDACHECK(cudaLaunchKernelExC(&launchConfig, fn, args)); + //CUDACHECK(cudaLaunchKernelExC(&launchConfig, fnAddr, args)); + CUCHECK(cuLaunchKernelEx(&launchConfig, fn, nullptr, extra)); return ncclSuccess; } #endif // Standard kernel launch - CUDACHECK(cudaLaunchKernel(fn, grid, block, args, smem, launchStream)); + CUCHECK(cuLaunchKernel(fn, grid.x, grid.y, grid.z, block.x, block.y, block.z, smem, launchStream, nullptr, extra)); + //CUDACHECK(cudaLaunchKernel(fnAddr, grid, block, args, smem, launchStream)); return ncclSuccess; } @@ -1426,35 +1476,30 @@ ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKern ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { ncclResult_t result = ncclSuccess; - struct ncclTasks* tasks = &comm->tasks; - tasks->workBytesTotal = 0; // Just in case subtraction during scheduleCollTasksToPlan() doesn't get to 0 - - // Deallocate ncclWork's. This frame exists so long as ncclLaunchPrepare - // succeeded, and if it ncclLaunchPrepare didn't succeed we wouldn't be here. - ncclMemoryStackPop(&comm->memScoped); + struct ncclKernelPlanner* planner = &comm->planner; - if (!ncclIntruQueueEmpty(&comm->planQueue)) { + if (!ncclIntruQueueEmpty(&planner->planQueue)) { // Reset queue to empty without destroying plans since those will be sent // back to us for reclaiming via callbackQueue. 
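The launch path earlier in this hunk switches to the CUDA driver API so the whole ncclDevKernelArgs block can be handed over through the 'extra' parameter buffer rather than as individual kernel parameters. A standalone sketch of that mechanism with a hypothetical argument block, not NCCL's:

// Launch a kernel whose parameters are packed into one contiguous block and
// passed via CU_LAUNCH_PARAM_BUFFER_POINTER / _SIZE. kernelParams must be NULL
// whenever 'extra' is used.
static ncclResult_t launchWithArgBlock(CUfunction fn, unsigned gridX, unsigned blockX,
                                       unsigned smemBytes, cudaStream_t stream,
                                       void* argBlock, size_t argBytes) {
  void* extra[] = {
    CU_LAUNCH_PARAM_BUFFER_POINTER, argBlock,
    CU_LAUNCH_PARAM_BUFFER_SIZE,    &argBytes,
    CU_LAUNCH_PARAM_END
  };
  CUCHECK(cuLaunchKernel(fn, gridX, 1, 1, blockX, 1, 1, smemBytes, stream,
                         /*kernelParams=*/NULL, extra));
  return ncclSuccess;
}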
- ncclIntruQueueConstruct(&comm->planQueue); - cudaStream_t launchStream = tasks->streams->stream; // First user stream gets launch + ncclIntruQueueConstruct(&planner->planQueue); + cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch // Create dependency for deviceStream on launchStream. We know that deviceStream // hasn't been modified since launchStream waited on it (in ncclLaunchPrepare), // so we can say that launchStream subsumes it. - NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->sharedRes->deviceStream, launchStream, /*b_subsumes_a=*/true), result, resume1); + NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, &comm->sharedRes->deviceStream, launchStream, /*b_subsumes_a=*/true), result, resume1); resume1: // Create dependency for other user streams (skip launch stream) on deviceStream. // Again, the user streams haven't been touched since deviceStream waited on them // so we can say they are subsumed by deviceStream. - struct ncclCudaStreamList* sl = tasks->streams->next; - tasks->streams = nullptr; // Reset comm->tasks.streams to empty. + struct ncclCudaStreamList* sl = planner->streams->next; + planner->streams = nullptr; // Reset comm->planner.streams to empty. while (sl != nullptr) { - NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, sl->stream, &comm->sharedRes->deviceStream, /*b_subsumes_a=*/true), result, resume2); + NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, sl->stream, &comm->sharedRes->deviceStream, /*b_subsumes_a=*/true), result, resume2); resume2: sl = sl->next; } // Release device stream as acquired in ncclLaunchPrepare() - NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->sharedRes->deviceStream), result, resume3); + NCCLCHECKGOTO(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->deviceStream), result, resume3); resume3:; } return result; @@ -1464,15 +1509,20 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { /* Enqueueing system : computation of kernel and proxy operations parameters */ /*****************************************************************************/ -static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetSupport) { +static inline ncclResult_t getCollNetSupport( + struct ncclComm* comm, struct ncclTaskColl* info, int* collNetSupport + ) { // Translate ncclAvg and PreMulSum - ncclRedOp_t netOp = info->op == ncclAvg || info->op >= ncclNumOps ? ncclSum : info->op; - *collNetSupport = info->comm->collNetSupport; - switch (info->coll) { + ncclRedOp_t netOp = info->opHost; + if (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv) { + netOp = ncclSum; + } + *collNetSupport = comm->collNetSupport; + switch (info->func) { case ncclFuncAllReduce: case ncclFuncReduce: case ncclFuncReduceScatter: - *collNetSupport &= info->comm->collNetSupportMatrix[netOp][info->datatype]; + *collNetSupport &= comm->collNetSupportMatrix[netOp][info->datatype]; break; default: break; @@ -1480,339 +1530,329 @@ static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNet return ncclSuccess; } +static void initCollCostTable(float** collCostTable) { + float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; + for (int a = 0; a < NCCL_NUM_ALGORITHMS; a++) { + for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++) { + table[a][p] = NCCL_ALGO_PROTO_IGNORE; + } + } +} + // numPipeOps: number of pipelined ops. Can be greater than 1 in aggregation mode. 
Used to adjust latency. -static ncclResult_t topoGetAlgoInfo(struct ncclInfo* collInfo, int collNetSupport, int nvlsSupport, int numPipeOps) { - struct ncclComm* comm = collInfo->comm; +static ncclResult_t updateCollCostTable( + struct ncclComm* comm, struct ncclTaskColl* info, size_t nBytes, + int collNetSupport, int nvlsSupport, int numPipeOps, + float** collCostTable, int* backupAlgo, int* backupProto, float* backupTime + ) { + float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; + if (comm->nRanks == 1) { - collInfo->algorithm = NCCL_ALGO_RING; - collInfo->protocol = NCCL_PROTO_SIMPLE; - } - else if (collInfo->algorithm == NCCL_ALGO_UNDEF || collInfo->protocol == NCCL_PROTO_UNDEF) { - float minTime = 3600000000.0; // Hopefully no operation will take an hour to complete. - float backupMinTime = 3600000000.0; - bool backup = false; - int backupAlgo = NCCL_ALGO_UNDEF; // back up algo and proto if no algo/proto is picked up. - int backupProto = NCCL_PROTO_UNDEF; - // Find algorithm / protocol. - collInfo->algorithm = -1; - collInfo->protocol = -1; - int nAlgos = NCCL_NUM_ALGORITHMS; - for (int a=0; anNodes > 1) continue; - /* now we only support single-node NVLS allgather and reducescatter */ - if (a == NCCL_ALGO_NVLS && (collInfo->coll == ncclFuncAllGather || collInfo->coll == ncclFuncReduceScatter) && comm->nNodes > 1) continue; - - for (int p=0; p= 0 && time < minTime) { - collInfo->algorithm = a; - collInfo->protocol = p; - minTime = time; - } - } else { - if (time >= 0 && time < backupMinTime) { - backupAlgo = a; - backupProto = p; - backupMinTime = time; - } - } - } - } + table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0; + return ncclSuccess; + } - if (collInfo->algorithm == NCCL_ALGO_UNDEF || collInfo->protocol == NCCL_PROTO_UNDEF) { - if (backupAlgo == NCCL_ALGO_UNDEF || backupProto == NCCL_PROTO_UNDEF) { - WARN("Error : no algorithm/protocol available"); - return ncclInternalError; + for (int a=0; afunc != ncclFuncAllGather) continue; + if (a == NCCL_ALGO_NVLS && collNetSupport != 1 && comm->nNodes > 1) continue; + /* now we only support single-node NVLS allgather and reducescatter */ + if (a == NCCL_ALGO_NVLS && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) && comm->nNodes > 1) continue; + for (int p=0; pfunc, a, p, nBytes, numPipeOps, &time, &backup)); + if (!backup) { + table[a][p] = time; + } else { + if (time >= 0.0 && time < *backupTime) { + *backupAlgo = a; + *backupProto = p; + *backupTime = time; + } } - collInfo->algorithm = backupAlgo; - collInfo->protocol = backupProto; } - if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", collInfo->nBytes, collInfo->algorithm, collInfo->protocol, minTime); - TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", collInfo->nBytes, collInfo->algorithm, collInfo->protocol, minTime); } return ncclSuccess; } -// Use the default topo-based tuner if tuner plugin is not successful. -// Call the plugin first. Let it set algo+proto, and/or nChannels. -// Then, topoGetAlgoInfo will set algo/proto if not set, then nChannels and nThreads based on algo/proto. -// Finally, nChannels will be overriden by the plugin setting. 
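For reference, the collCostTable handed to the tuner plugin is a dense [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] array of predicted times: updateCollCostTable() above fills the combinations it considers valid, topoGetAlgoInfo() below scans for the minimum, and entries left at the NCCL_ALGO_PROTO_IGNORE sentinel never win the comparison, which is how a plugin can veto combinations without touching the rest. A simplified sketch of the selection step, not the exact code:

// Pick the cheapest (algorithm, protocol) pair from the cost table. Entries
// never filled, or disabled by the tuner, fail the bounds test and are skipped.
static void pickCheapest(float table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS],
                         int* algo, int* proto) {
  float best = 3600000000.0f;  // same "one hour" sentinel used by topoGetAlgoInfo()
  *algo = NCCL_ALGO_UNDEF;
  *proto = NCCL_PROTO_UNDEF;
  for (int a = 0; a < NCCL_NUM_ALGORITHMS; a++) {
    for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++) {
      float t = table[a][p];
      if (t >= 0.0f && t < best) { best = t; *algo = a; *proto = p; }
    }
  }
}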
-static ncclResult_t getTunerInfo(struct ncclInfo* collInfo, int collNetSupport, int nvlsSupport, int numPipeOps) { - collInfo->algorithm = NCCL_ALGO_UNDEF; - collInfo->protocol = NCCL_PROTO_UNDEF; - collInfo->nChannels = 0; - if (collInfo->comm->tuner != NULL) { - NCCLCHECK(collInfo->comm->tuner->getCollInfo( - collInfo->comm->tunerContext, collInfo->coll, collInfo->nBytes, - collNetSupport, nvlsSupport, numPipeOps, - &collInfo->algorithm, &collInfo->protocol, &collInfo->nChannels)); - } - - /* We only honor nChannels decision when user sets the nChannels by tuner plugin or the coll picks - * collnet algorithm. For other cases, we need to decide nChannels based on the maxBytesPerChannel */ - if (collInfo->nChannels != 0) - collInfo->userTuned = true; - else - collInfo->userTuned = false; - return ncclSuccess; -} - -/* Compute nChannels and nThreads. */ -static ncclResult_t getChannnelThreadInfo(struct ncclInfo* collInfo) { - struct ncclComm *comm = collInfo->comm; - int nc = comm->collChannels; - int nt = comm->maxThreads[collInfo->algorithm][collInfo->protocol]; - int threadThreshold = comm->threadThresholds[collInfo->algorithm][collInfo->protocol]; - - if (collInfo->nChannels == 0) { - /* not preset by users */ - if (collInfo->algorithm == NCCL_ALGO_COLLNET_DIRECT) { - // CollNet channel tuning - int ncSwitch = 16; - bool flag = true; - while (ncSwitch >= 1 && flag) { - while ((flag = collInfo->nBytes < nc * nt * collInfo->comm->channels[0].collnetDirect.nHeads * threadThreshold) && nc > ncSwitch) { - if (nc == ncSwitch + ncSwitch / 2) threadThreshold /= 2; - nc--; - } - ncSwitch /= 2; - } - } else if (collInfo->algorithm == NCCL_ALGO_NVLS || collInfo->algorithm == NCCL_ALGO_NVLS_TREE) { - // NVLS should not need more than 16 channels to get peak BW. 
- nc = comm->nvlsChannels; - } else { - // Ring/Tree channel tuning - while (collInfo->nBytes < nc * nt * threadThreshold) { - if (nc >= 2) nc--; - else break; +static ncclResult_t topoGetAlgoInfo( + struct ncclComm* comm, struct ncclTaskColl* info, size_t nBytes, + float** collCostTable, int backupAlgo, int backupProto, float backupTime, ncclSimInfo_t* simInfo + ) { + float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; + + float minTime = 3600000000.0; + int algorithm = info->algorithm = NCCL_ALGO_UNDEF; + int protocol = info->protocol = NCCL_PROTO_UNDEF; + for (int a=0; a= 0.0 && table[a][p] < minTime) { + algorithm = a; + protocol = p; + minTime = table[a][p]; } } - collInfo->nChannels = nc; - } else { - nc = collInfo->nChannels; } - if (collInfo->nThreads == 0) { - if (collInfo->algorithm != NCCL_ALGO_NVLS && collInfo->algorithm != NCCL_ALGO_NVLS_TREE && - collInfo->algorithm != NCCL_ALGO_COLLNET_DIRECT) { - while (collInfo->nBytes < nc * nt * threadThreshold) { - if (nt % 128 == 0) nt /= 2; - else break; + info->algorithm = algorithm; + info->protocol = protocol; + float time = minTime; + + if (info->algorithm == NCCL_ALGO_UNDEF || info->protocol == NCCL_PROTO_UNDEF) { + if (backupAlgo == NCCL_ALGO_UNDEF || backupProto == NCCL_PROTO_UNDEF) { + WARN("Error : no algorithm/protocol available"); + return ncclInternalError; + } + info->algorithm = backupAlgo; + info->protocol = backupProto; + time = backupTime; + } + if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", nBytes, info->algorithm, info->protocol, time); + if (simInfo) simInfo->estimatedTime = time; + TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", nBytes, info->algorithm, info->protocol, time); + + int nc = comm->nChannels; + int nt = comm->maxThreads[info->algorithm][info->protocol]; + int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol]; + if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) { + // CollNet channel tuning + int ncSwitch = 16; + bool flag = true; + while (ncSwitch >= 1 && flag) { + while ((flag = nBytes < nc*nt*comm->channels[0].collnetDirect.nHeads*threadThreshold) && nc > ncSwitch) { + if (nc == ncSwitch+ncSwitch/2) threadThreshold /= 2; + nc--; } + ncSwitch /= 2; } - - if (collInfo->protocol == NCCL_PROTO_SIMPLE) { - if (collInfo->algorithm == NCCL_ALGO_RING) nt += WARP_SIZE; // Extra warp for sync - // More threads or sync warps needed due to split thread model - if (collInfo->algorithm == NCCL_ALGO_TREE) nt += 4*WARP_SIZE; + } else if (info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) { + // NVLS should not need more than 16 channels to get peak BW. + nc = comm->nvlsChannels; + } else { + // Ring/Tree channel tuning + while (nBytes < nc * nt * threadThreshold) { + if (nc >= 2) nc--; + else break; } - nt = nt / WARP_SIZE < 3 ? 3 * WARP_SIZE : nt; - collInfo->nThreads = nt; } - return ncclSuccess; -} - -static ncclResult_t getPatternInfo(struct ncclInfo* collInfo) { - switch (collInfo->coll) { - case ncclFuncBroadcast: - collInfo->pattern = collInfo->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeDown : ncclPatternPipelineFrom; break; - case ncclFuncReduce: - collInfo->pattern = collInfo->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUp : ncclPatternPipelineTo; break; - case ncclFuncReduceScatter: - case ncclFuncAllGather: - collInfo->pattern = - collInfo->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls : - collInfo->algorithm == NCCL_ALGO_COLLNET_DIRECT ? 
ncclPatternCollnetDirect : - ncclPatternRing; break; - case ncclFuncAllReduce: - collInfo->pattern = - collInfo->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls : - collInfo->algorithm == NCCL_ALGO_NVLS_TREE ? ncclPatternNvlsTree : - collInfo->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect : - collInfo->algorithm == NCCL_ALGO_COLLNET_CHAIN ? ncclPatternCollnetChain : - collInfo->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : - ncclPatternRingTwice; break; - default: - WARN("Unknown pattern for collective %d algorithm %d", collInfo->coll, collInfo->algorithm); - return ncclInternalError; + if (info->algorithm != NCCL_ALGO_NVLS && info->algorithm != NCCL_ALGO_NVLS_TREE && + info->algorithm != NCCL_ALGO_COLLNET_DIRECT) { + while (nBytes < nc * nt * threadThreshold) { + if (nt % 128 == 0) nt /= 2; + else break; + } } - return ncclSuccess; -} - -static ncclResult_t computeCollWorkFunc(struct ncclInfo* collInfo) { - collInfo->workFuncIndex = ncclDevFuncId(collInfo->coll, collInfo->opFull.op, collInfo->datatype, collInfo->algorithm, collInfo->protocol); - return ncclSuccess; -} - -static ncclResult_t initCollWorkElem(struct ncclInfo* collInfo, struct ncclWorkElem* work) { - work->sendbuff = collInfo->sendbuff; - work->recvbuff = collInfo->recvbuff; - work->root = collInfo->root; - work->count = collInfo->count; - work->nWarps = collInfo->nThreads / WARP_SIZE; - work->redOpArg = collInfo->opFull.scalarArg; - work->redOpArgIsPtr = collInfo->opFull.scalarArgIsPtr; - work->chunkCount = collInfo->chunkCount; - work->regUsed = 0; - work->isUsed = 1; - - if (collInfo->comm->nNodes == 1) - work->oneNode = 1; - else - work->oneNode = 0; - if (collInfo->algorithm == NCCL_ALGO_COLLNET_DIRECT) { - // Set direct direction for broadcast-gather (read or write) - work->direct = (collInfo->nBytes / collInfo->nChannels <= 1024 * 1024) ? NCCL_DIRECT_WRITE : NCCL_DIRECT_READ; - } else { - work->direct = 0; + if (info->protocol == NCCL_PROTO_SIMPLE) { + if (info->algorithm == NCCL_ALGO_RING) nt += WARP_SIZE; // Extra warp for sync + // More threads or sync warps needed due to split thread model + if (info->algorithm == NCCL_ALGO_TREE) nt += 4*WARP_SIZE; } + nt = nt/WARP_SIZE < 3 ? 3*WARP_SIZE : nt; + if (info->algorithm == NCCL_ALGO_TREE) nt = NCCL_MAX_NTHREADS; // Tree now uses all threads always. 
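/* Editor's note (illustrative numbers, not part of the patch): for the
 * ring/tree branch above, assume nc = 32 channels, nt = 512 threads and
 * threadThreshold = 64 for a 256 KiB collective. The channel loop decrements
 * nc until nBytes >= nc*nt*threadThreshold, i.e. down to nc = 8, so every
 * remaining channel keeps at least nt*threadThreshold = 32 KiB of work; the
 * thread loop then leaves nt at 512 because 8*512*64 equals the 256 KiB
 * payload. */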
+ info->nMaxChannels = nc; + info->nWarps = nt/WARP_SIZE; return ncclSuccess; } -static ncclResult_t setCollWorkElem(uint64_t workCount, uint64_t workOffset, size_t lastChunkCount, struct ncclWorkElem* work) { - work->workCount = workCount; - work->workOffset = workOffset; - work->lastChunkCount = lastChunkCount; - return ncclSuccess; -} - -static ncclResult_t initCollWorkElemReg(struct ncclComm* comm, struct ncclWorkElem* work, struct ncclChannel* channel, ncclRegBufferType regBufType, void* regBufSend[], void* regBufRecv[], struct ncclWorkElemReg* workElemReg) { - if (regBufType == NCCL_IPC_REG_BUFFER) { - workElemReg->elem = *work; - workElemReg->elem.regUsed = NCCL_IPC_REG_BUFFER; - for (int i = 0; i < NCCL_MAX_DIRECT_ARITY; i++) { - int peer = channel->collnetDirect.down[i]; - if (peer == -1) break; - int j = comm->rankToLocalRank[peer]; // Get intra-node slot - workElemReg->dnInputs[i] = regBufSend[j]; // Input buffer of leaf peer - workElemReg->dnOutputs[i] = regBufRecv[j]; // Output buffer of leaf peer - } - for (int i = 0; i < NCCL_MAX_DIRECT_ARITY; i++) { - int peer = channel->collnetDirect.up[i]; - if (peer == -1) break; - int j = comm->rankToLocalRank[peer]; - // Output buffer of root peer - workElemReg->upOutputs[i] = regBufRecv[j]; - } - } else if (regBufType == NCCL_NVLS_REG_BUFFER) { - workElemReg->elem = *work; - workElemReg->elem.regUsed = NCCL_NVLS_REG_BUFFER; - /* NVLS only has one send and recv buffer registered */ - workElemReg->dnInputs[0] = regBufSend[0]; - workElemReg->dnOutputs[0] = regBufRecv[0]; - } else if (regBufType == NCCL_COLLNET_REG_BUFFER) { - workElemReg->elem = *work; - workElemReg->elem.regUsed = NCCL_COLLNET_REG_BUFFER; - } else { - /* impossible value */ - WARN("Invalid regBufType %d\n", regBufType); - return ncclInvalidArgument; - } +// Use the default topo-based tuner if tuner plugin is not successful. +// Call the plugin first. Let it set algo+proto, and/or nChannels. +// Then, topoGetAlgoInfo will set algo/proto if not set, then nChannels and nThreads based on algo/proto. +// Finally, nChannels will be overriden by the plugin setting. +static ncclResult_t getAlgoInfo( + struct ncclComm* comm, struct ncclTaskColl* info, + int collNetSupport, int nvlsSupport, int numPipeOps, ncclSimInfo_t* simInfo/* = NULL*/ + ) { + size_t nBytes = ncclTypeSize(info->datatype)*ncclFuncMaxSendRecvCount(info->func, comm->nRanks, info->count); + info->algorithm = NCCL_ALGO_UNDEF; + info->protocol = NCCL_PROTO_UNDEF; + int nMaxChannels = 0; + int backupAlgo = NCCL_ALGO_UNDEF; + int backupProto = NCCL_PROTO_UNDEF; + float backupTime = 3600000000.0; + float collCostTable[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + initCollCostTable((float **)collCostTable); + NCCLCHECK(updateCollCostTable(comm, info, nBytes, collNetSupport, nvlsSupport, numPipeOps, (float **)collCostTable, &backupAlgo, &backupProto, &backupTime)); + if (comm->tuner != NULL) { + NCCLCHECK(comm->tuner->getCollInfo( + comm->tunerContext, info->func, nBytes, + numPipeOps, (float **)collCostTable, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + &nMaxChannels)); + } + NCCLCHECK(topoGetAlgoInfo(comm, info, nBytes, (float **)collCostTable, backupAlgo, backupProto, backupTime, simInfo)); + info->nMaxChannels = nMaxChannels == 0 ? 
info->nMaxChannels : nMaxChannels; return ncclSuccess; } NCCL_PARAM(NvlsTreeMaxChunkSize, "NVLSTREE_MAX_CHUNKSIZE", -2); -static ncclResult_t computeCollChunkInfo(struct ncclInfo* collInfo, size_t nBytes, int nChannels) { - int stepSize = collInfo->comm->buffSizes[collInfo->protocol] / NCCL_STEPS; - int chunkSteps = (collInfo->protocol == NCCL_PROTO_SIMPLE && collInfo->algorithm == NCCL_ALGO_RING) ? collInfo->chunkSteps : 1; - int sliceSteps = (collInfo->protocol == NCCL_PROTO_SIMPLE && collInfo->algorithm == NCCL_ALGO_RING) ? collInfo->sliceSteps : 1; - int chunkSize = stepSize * chunkSteps; +static ncclResult_t calcCollChunking( + struct ncclComm* comm, struct ncclTaskColl* info, int nChannels, size_t nBytes, + /*outputs*/uint32_t* outChunkSize, uint32_t* outDirectFlags, struct ncclProxyOp* proxyOp + ) { + ncclPattern_t pattern; + size_t grainSize = ncclProtoGrainSize(info->protocol); + + switch (info->func) { + case ncclFuncBroadcast: + pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeDown : ncclPatternPipelineFrom; + break; + case ncclFuncReduce: + pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUp : ncclPatternPipelineTo; + break; + case ncclFuncReduceScatter: + case ncclFuncAllGather: + pattern = + info->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls : + info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect : + ncclPatternRing; + break; + case ncclFuncAllReduce: + pattern = + info->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls : + info->algorithm == NCCL_ALGO_NVLS_TREE ? ncclPatternNvlsTree : + info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect : + info->algorithm == NCCL_ALGO_COLLNET_CHAIN ? ncclPatternCollnetChain : + info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : + ncclPatternRingTwice; + break; + default: + WARN("Unknown pattern for collective %d algorithm %d", info->func, info->algorithm); + return ncclInternalError; + } + + int nstepsPerLoop, nchunksPerLoop; + switch (pattern) { + case ncclPatternTreeUp: + case ncclPatternTreeDown: + case ncclPatternTreeUpDown: + case ncclPatternPipelineFrom: + case ncclPatternPipelineTo: + case ncclPatternCollnetChain: + nstepsPerLoop = nchunksPerLoop = 1; + break; + case ncclPatternNvls: + nstepsPerLoop = 1; nchunksPerLoop = comm->channels[0].nvls.nHeads; + break; + case ncclPatternCollnetDirect: + nstepsPerLoop = 1; nchunksPerLoop = comm->channels[0].collnetDirect.nHeads; + break; + case ncclPatternRing: + nstepsPerLoop = comm->nRanks-1; nchunksPerLoop = comm->nRanks; + break; + case ncclPatternRingTwice: + nstepsPerLoop = 2*(comm->nRanks-1); nchunksPerLoop = comm->nRanks; + break; + case ncclPatternNvlsTree: + nstepsPerLoop = 1; nchunksPerLoop = comm->channels[0].nvls.nHeads; + break; + default: + WARN("Unknown pattern %d", pattern); + return ncclInternalError; + } - if (collInfo->protocol == NCCL_PROTO_LL) chunkSize /= 2; - if (collInfo->protocol == NCCL_PROTO_LL128) chunkSize = (chunkSize / NCCL_LL128_LINEELEMS) * NCCL_LL128_DATAELEMS; + int stepSize = comm->buffSizes[info->protocol]/NCCL_STEPS; + int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1; + int sliceSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? 
info->sliceSteps : 1; + int chunkSize = stepSize*chunkSteps; + if (info->protocol == NCCL_PROTO_LL) chunkSize /= 2; + if (info->protocol == NCCL_PROTO_LL128) chunkSize = (chunkSize / NCCL_LL128_LINEELEMS) * NCCL_LL128_DATAELEMS; - if (collInfo->algorithm == NCCL_ALGO_COLLNET_DIRECT) { + if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) { // Optimize chunkSize / nSteps - while (nBytes / (nChannels * collInfo->comm->channels[0].collnetDirect.nHeads * chunkSize) < collInfo->comm->channels[0].collnetDirect.depth * 64 && chunkSize > 131072) chunkSize /= 2; - while (nBytes / (nChannels * collInfo->comm->channels[0].collnetDirect.nHeads * chunkSize) < collInfo->comm->channels[0].collnetDirect.depth * 8 && chunkSize > 65536) chunkSize /= 2; - while (nBytes / (nChannels * collInfo->comm->channels[0].collnetDirect.nHeads * chunkSize) < collInfo->comm->channels[0].collnetDirect.depth * 8 && chunkSize > 32768) chunkSize /= 2; - } else if (collInfo->algorithm == NCCL_ALGO_COLLNET_CHAIN) { - stepSize = collInfo->comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS; + while (nBytes / (nChannels * comm->channels[0].collnetDirect.nHeads * chunkSize) < comm->channels[0].collnetDirect.depth * 64 && chunkSize > 131072) chunkSize /= 2; + while (nBytes / (nChannels * comm->channels[0].collnetDirect.nHeads * chunkSize) < comm->channels[0].collnetDirect.depth * 8 && chunkSize > 65536) chunkSize /= 2; + while (nBytes / (nChannels * comm->channels[0].collnetDirect.nHeads * chunkSize) < comm->channels[0].collnetDirect.depth * 8 && chunkSize > 32768) chunkSize /= 2; + } else if (info->algorithm == NCCL_ALGO_COLLNET_CHAIN) { + stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS; chunkSize = std::min(256 * 1024, stepSize * chunkSteps); - while (nBytes / (nChannels * chunkSize) < collInfo->comm->channels[0].collnetChain.depth * 64 && chunkSize > 131072) chunkSize /= 2; - while (nBytes / (nChannels * chunkSize) < collInfo->comm->channels[0].collnetChain.depth * 8 && chunkSize > 65536) chunkSize /= 2; - while (nBytes / (nChannels * chunkSize) < collInfo->comm->channels[0].collnetChain.depth && chunkSize > 32768) chunkSize /= 2; - } else if (collInfo->algorithm == NCCL_ALGO_NVLS) { - int maxChunkSize = collInfo->comm->nvlsChunkSize; - if (collInfo->comm->nNodes > 1 && collInfo->comm->bandwidths[ncclFuncAllReduce][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] < 150) maxChunkSize = 32768; + while (nBytes / (nChannels * chunkSize) < comm->channels[0].collnetChain.depth * 64 && chunkSize > 131072) chunkSize /= 2; + while (nBytes / (nChannels * chunkSize) < comm->channels[0].collnetChain.depth * 8 && chunkSize > 65536) chunkSize /= 2; + while (nBytes / (nChannels * chunkSize) < comm->channels[0].collnetChain.depth && chunkSize > 32768) chunkSize /= 2; + } else if (info->algorithm == NCCL_ALGO_NVLS) { + int maxChunkSize = comm->nvlsChunkSize; + if (comm->nNodes > 1 && comm->bandwidths[ncclFuncAllReduce][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] < 150) maxChunkSize = 32768; if (chunkSize > maxChunkSize) chunkSize = maxChunkSize; // Use uint64_t so that concurrentOps*chunkSize*X does not overflow - uint64_t concurrentOps = nChannels * collInfo->comm->channels[0].nvls.nHeads; + uint64_t concurrentOps = nChannels * comm->channels[0].nvls.nHeads; if ((nBytes < (64 * (concurrentOps * chunkSize))) && (chunkSize > 65536)) chunkSize = 65536; if ((nBytes < (8 * (concurrentOps * chunkSize))) && (chunkSize > 32768)) chunkSize = 32768; if ((nBytes < (2 * (concurrentOps * chunkSize))) && (chunkSize > 16384)) chunkSize = 16384; - } else if (collInfo->algorithm 
== NCCL_ALGO_NVLS_TREE) { + } else if (info->algorithm == NCCL_ALGO_NVLS_TREE) { // Use uint64_t so that concurrentOps*chunkSize*X does not overflow - uint64_t concurrentOps = nChannels * collInfo->comm->channels[0].nvls.nHeads; - chunkSize = collInfo->comm->nvlsChunkSize; + uint64_t concurrentOps = nChannels * comm->channels[0].nvls.nHeads; + chunkSize = comm->nvlsChunkSize; int maxChunkSize = (int)ncclParamNvlsTreeMaxChunkSize(); - if (maxChunkSize == -2) maxChunkSize = collInfo->comm->nNodes >= 4 ? 65536 : chunkSize; + if (maxChunkSize == -2) maxChunkSize = comm->nNodes >= 4 ? 65536 : chunkSize; chunkSize = std::min(chunkSize, maxChunkSize); if ((nBytes < (32 * (concurrentOps * chunkSize))) && (chunkSize > 262144)) chunkSize = 262144; if ((nBytes < (16 * (concurrentOps * chunkSize))) && (chunkSize > 131072)) chunkSize = 131072; if ((nBytes < (4 * (concurrentOps * chunkSize))) && (chunkSize > 65536)) chunkSize = 65536; if ((nBytes < (1 * (concurrentOps * chunkSize))) && (chunkSize > 32768)) chunkSize = 32768; - } else if (collInfo->algorithm == NCCL_ALGO_TREE && collInfo->protocol == NCCL_PROTO_LL128) { - int nNodes = collInfo->comm->nNodes; - float ppn = collInfo->comm->nRanks / (float)nNodes; + } else if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_LL128) { + int nNodes = comm->nNodes; + float ppn = comm->nRanks / (float)nNodes; float nstepsLL128 = 1+log2i(nNodes) + 0.1*ppn; while (nBytes / (nChannels*chunkSize) < nstepsLL128*64/ppn && chunkSize > 131072) chunkSize /= 2; while (nBytes / (nChannels*chunkSize) < nstepsLL128*16/ppn && chunkSize > 32768) chunkSize /= 2; } - collInfo->chunkSize = chunkSize; - collInfo->chunkCount = chunkSize / ncclTypeSize(collInfo->datatype); - collInfo->chunkSteps = chunkSteps; - collInfo->sliceSteps = sliceSteps; - collInfo->stepSize = stepSize; - return ncclSuccess; -} - -static ncclResult_t initCollProxyOp(struct ncclInfo* collInfo, int channelId, uint64_t opCount, uint32_t nsteps, struct ncclProxyOp* proxyOp) { - proxyOp->nsteps = nsteps; - proxyOp->sliceSteps = collInfo->sliceSteps; - proxyOp->chunkSteps = collInfo->chunkSteps; - proxyOp->chunkSize = collInfo->chunkSize; - proxyOp->protocol = collInfo->protocol; - proxyOp->dtype = collInfo->datatype; - // Network sees avg as sum - proxyOp->redOp = collInfo->opFull.op == ncclDevPreMulSum || collInfo->opFull.op == ncclDevSumPostDiv ? ncclSum : collInfo->opFull.proxyOp; - proxyOp->pattern = collInfo->pattern; - proxyOp->coll = collInfo->coll; - proxyOp->root = collInfo->root; + // Compute directFlags of work struct. + if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) { + // Set direct direction for broadcast-gather (read or write) + *outDirectFlags = (nBytes/nChannels <= 1024*1024) ? 
NCCL_DIRECT_WRITE : NCCL_DIRECT_READ; + } else { + *outDirectFlags = 0; + } + + // Compute nSteps for proxies + //if (comm->rank == 0) printf("Coll %d, size %ld -> %dx%d, chunkSize %d (algo %d proto%d)\n", info->func, info->nBytes, info->nChannels, info->nThreads, chunkSize, info->algorithm, info->protocol); + chunkSize = chunkSize / grainSize * grainSize; // align chunkSize to multiple grainSize + int nLoops = (int)DIVUP(nBytes, size_t(nChannels)*nchunksPerLoop*chunkSize); + memset(proxyOp, 0, sizeof(*proxyOp)); + proxyOp->nsteps = nstepsPerLoop * nLoops * chunkSteps; + proxyOp->sliceSteps = sliceSteps; + proxyOp->chunkSteps = chunkSteps; + proxyOp->chunkSize = chunkSize; + proxyOp->protocol = info->protocol; + proxyOp->dtype = info->datatype; + if (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv) { + proxyOp->redOp = ncclSum; // Network sees avg as sum + } else { + proxyOp->redOp = info->opHost; + } + proxyOp->pattern = pattern; + proxyOp->coll = info->func; + proxyOp->root = info->root; // This is used by P2P to reduce the receive buffer size. We don't use it in collectives // because some protocols need to transmit more than the total size, plus they sometimes // round up - proxyOp->nbytes = collInfo->stepSize * proxyOp->sliceSteps; - if (collInfo->regBufType == NCCL_COLLNET_REG_BUFFER) { + proxyOp->nbytes = stepSize*sliceSteps; + + if (info->regBufType == NCCL_COLLNET_REG_BUFFER) { proxyOp->reg = 1; - proxyOp->nsteps = DIVUP(collInfo->nBytes, NCCL_MAX_COLLNET_SIZE); - proxyOp->sendMhandle = collInfo->sendMhandle; - proxyOp->recvMhandle = collInfo->recvMhandle; - proxyOp->sendbuff = (uint8_t*)collInfo->sendbuff; - proxyOp->recvbuff = (uint8_t*)collInfo->recvbuff; - proxyOp->nbytes = collInfo->nBytes; + proxyOp->nsteps = DIVUP(nBytes, NCCL_MAX_COLLNET_SIZE); + proxyOp->sendMhandle = info->sendMhandle; + proxyOp->recvMhandle = info->recvMhandle; + proxyOp->sendbuff = (uint8_t*)info->sendbuff; + proxyOp->recvbuff = (uint8_t*)info->recvbuff; + proxyOp->nbytes = nBytes; } else { proxyOp->reg = 0; } - proxyOp->channelId = channelId; - proxyOp->opCount = opCount; - - if (collInfo->pattern == ncclPatternCollnetDirect) { - proxyOp->specifics.collnetDirect.nNodes = collInfo->comm->nNodes; - proxyOp->specifics.collnetDirect.node = collInfo->comm->node; - if (collInfo->coll == ncclFuncAllGather || collInfo->coll == ncclFuncReduceScatter) { - proxyOp->specifics.collnetDirect.sizePerRank = collInfo->count * ncclTypeSize(collInfo->datatype); + if (pattern == ncclPatternCollnetDirect) { + proxyOp->specifics.collnetDirect.nNodes = comm->nNodes; + proxyOp->specifics.collnetDirect.node = comm->node; + if (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) { + proxyOp->specifics.collnetDirect.sizePerRank = info->count*ncclTypeSize(info->datatype); } } + + *outChunkSize = chunkSize; return ncclSuccess; } @@ -1894,26 +1934,12 @@ static ncclResult_t hostToDevRedOp( return ncclSuccess; } -static int collCmp(struct ncclInfo *a, struct ncclInfo *b) { - if (a->coll > b->coll) - return 1; - else if (a->coll == b->coll && a->datatype > b->datatype) - return 1; - else if (a->coll == b->coll && a->datatype == b->datatype && a->opFull.op > b->opFull.op) - return 1; - else if (a->coll == b->coll && a->datatype == b->datatype && a->opFull.op == b->opFull.op && a->count > b->count) - return 1; - else - return -1; -} - -// Converts `info` to a task and adds it to `comm->tasks`. The exception is with +// Converts `info` to a task and adds it to `comm->planner`. 
The exception is with // single rank communicators, collectives are issued as `ncclMemcpyAsync`s and // thus don't need a task. static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { - ncclTasks *tasks = &comm->tasks; + struct ncclKernelPlanner *planner = &comm->planner; - if (info->count == 0 && info->coll != ncclFuncSend && info->coll != ncclFuncRecv) return ncclSuccess; if (info->coll == ncclFuncSend || info->coll == ncclFuncRecv) { int peer = info->root; ssize_t nBytes = info->count*ncclTypeSize(info->datatype); @@ -1924,21 +1950,23 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { struct ncclTaskP2p* p2p = ncclMemoryStackAlloc(&comm->memScoped); p2p->buff = (void*)info->recvbuff; p2p->bytes = nBytes; - p2p->chunk = 0; ncclIntruQueueEnqueue( - isSendNotRecv ? &tasks->peers[peer].sendQueue : &tasks->peers[peer].recvQueue, + isSendNotRecv ? &planner->peers[peer].sendQueue : &planner->peers[peer].recvQueue, p2p); - tasks->nTasksP2p += 1; + planner->nTasksP2p += 1; // Mark channels that need pre-connect if (comm->rank != peer) { - int channelBaseId; - NCCLCHECK(ncclChannelComputeBase(comm, peer, info->coll, &channelBaseId)); - if (!(isSendNotRecv ? tasks->peers[peer].sendSeen : tasks->peers[peer].recvSeen)) { - (isSendNotRecv ? tasks->peers[peer].sendSeen : tasks->peers[peer].recvSeen) = true; + if (!(isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen)) { + (isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen) = true; + int round = 0; + while (peer != (isSendNotRecv ? comm->p2pSchedule[round].sendRank + : comm->p2pSchedule[round].recvRank)) { + round += 1; + } + uint8_t base = ncclP2pChannelBaseForRound(comm, round); for (int c=0; c < comm->p2pnChannelsPerPeer; c++) { - int channelId; - NCCLCHECK(ncclChannelComputeFromBase(comm, channelBaseId, c, &channelId)); + int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, c); if (isSendNotRecv) { if (comm->channels[channelId].peers[peer]->send[1].connected == 0) { // P2P uses only 1 connector comm->connectSend[peer] |= (1UL<count == 0) return ncclSuccess; + // Copy reduction op state from op handle into info struct here since the // op handle may be destroyed before ncclGroupEnd(). - NCCLCHECK(hostToDevRedOp(&info->opFull, info->op, info->datatype, comm)); + struct ncclDevRedOpFull opDev; + NCCLCHECK(hostToDevRedOp(&opDev, info->op, info->datatype, comm)); if (comm->nRanks == 1) { - NCCLCHECK(ncclLaunchOneRank(info->recvbuff, info->sendbuff, info->count, info->opFull, info->datatype, info->stream)); + NCCLCHECK(ncclLaunchOneRank(info->recvbuff, info->sendbuff, info->count, opDev, info->datatype, info->stream)); return ncclSuccess; } else { // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`. 
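/* Editor's note on the send/recv branch above: the peer's position in
 * comm->p2pSchedule determines the connection round, ncclP2pChannelBaseForRound()
 * turns that round into a base channel, and ncclP2pChannelForPart() spreads the
 * p2pnChannelsPerPeer parts over the p2p channels. This replaces the old
 * ncclChannelComputeBase()/ncclChannelComputeFromBase() pair and relies on
 * comm->p2pnChannels being a power of two (see the pow2Up()/pow2Down() change
 * in src/graph/paths.cc later in this patch). */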
ncclGroupCommJoin(info->comm); - struct ncclInfo* t = ncclMemoryStackAlloc(&comm->memScoped); - info->nChannels = 0; - info->nThreads = 0; - info->algorithm = NCCL_ALGO_UNDEF; - info->protocol = NCCL_PROTO_UNDEF; - info->userTuned = false; - memcpy(t, info, sizeof(struct ncclInfo)); - ncclIntruQueueSortEnqueue(&tasks->collQueue, t, collCmp); - tasks->workBytesTotal += info->count * ncclTypeSize(info->datatype); - tasks->nTasksColl += 1; + struct ncclTaskColl* t = ncclMemoryStackAlloc(&comm->memScoped); + t->func = info->coll; + t->sendbuff = info->sendbuff; + t->recvbuff = info->recvbuff; + t->count = info->count; + t->root = info->root; + t->datatype = info->datatype; + size_t elementSize = ncclTypeSize(t->datatype); + if (t->func == ncclFuncAllGather || t->func == ncclFuncBroadcast) { + t->count *= elementSize; + t->datatype = ncclInt8; + elementSize = 1; + } + t->trafficBytes = t->count*elementSize*ncclFuncTrafficPerByte(t->func, comm->nRanks); + t->opHost = info->op; + t->opDev = opDev; // C++ struct assignment + t->chunkSteps = info->chunkSteps; + t->sliceSteps = info->sliceSteps; + + planner->nTasksColl += 1; + ncclTaskCollSorterInsert(&planner->collSorter, t, t->trafficBytes); } } - if (info->stream != tasks->streamRecent || tasks->streams == nullptr) { - tasks->streamRecent = info->stream; - struct ncclCudaStreamList* l = tasks->streams; + if (info->stream != planner->streamRecent || planner->streams == nullptr) { + planner->streamRecent = info->stream; + struct ncclCudaStreamList* l = planner->streams; while (true) { if (l == nullptr) { // Got to the end, this must be a new stream. struct ncclCudaGraph graph; NCCLCHECK(ncclCudaGetCapturingGraph(&graph, info->stream)) - if (tasks->streams != nullptr && !ncclCudaGraphSame(tasks->capturingGraph, graph)) { + if (planner->streams != nullptr && !ncclCudaGraphSame(planner->capturingGraph, graph)) { WARN("Streams given to a communicator within a NCCL group must either be all uncaptured or all captured by the same graph."); return ncclInvalidUsage; } - tasks->capturingGraph = graph; // C++ struct assignment + planner->capturingGraph = graph; // C++ struct assignment // Add stream to list l = ncclMemoryStackAlloc(&comm->memScoped); l->stream = info->stream; - l->next = tasks->streams; - tasks->streams = l; + l->next = planner->streams; + planner->streams = l; break; } if (l->stream == info->stream) @@ -2019,10 +2062,10 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { } NCCLCHECKGOTO(ArgsCheck(info), ret, fail); - INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", + INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zu datatype %d op %d root %d comm %p [nranks=%d] stream %p", info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count, info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream); - TRACE_CALL("nccl%s(%" PRIx64 ",%" PRIx64 ",%zi,%d,%d,%d,%p,%p)", info->opName, reinterpret_cast(info->sendbuff), reinterpret_cast(info->recvbuff), info->count, info->datatype, info->op, info->root, info->comm, info->stream); + TRACE_CALL("nccl%s(%" PRIx64 ",%" PRIx64 ",%zu,%d,%d,%d,%p,%p)", info->opName, reinterpret_cast(info->sendbuff), reinterpret_cast(info->recvbuff), info->count, info->datatype, info->op, info->root, info->comm, info->stream); NCCLCHECKGOTO(taskAppend(info->comm, info), ret, fail); diff --git a/src/graph/connect.cc b/src/graph/connect.cc index 90687bb6a..b1b99d4e3 100644 --- 
a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -5,7 +5,9 @@ ************************************************************************/ #include "comm.h" +#include "device.h" #include "graph.h" +#include "transport.h" #include "trees.h" #include "rings.h" #include "topo.h" @@ -84,6 +86,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs topoRanks->nvlsHeads[topoRanks->nvlsHeadNum++] = nvlsIntra[0]; } } + memcpy(comm->nvlsHeads, topoRanks->nvlsHeads, sizeof(int) * topoRanks->nvlsHeadNum); return ncclSuccess; } @@ -188,7 +191,7 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph* for (int c=0; cnChannels; c++) { struct ncclChannel* channel = comm->channels+c; char line[1024]; - sprintf(line, "CollNet channel %d rank %d ", c, rank); + sprintf(line, "CollNetDirect channel %d rank %d ", c, rank); int nDown = 0; for (int i=0; i MAXCHANNELS) maxNchannels = MAXCHANNELS; if (maxNchannels < 1) { WARN("User asked for a maximum of %d channels, setting it to 1", maxNchannels); @@ -363,6 +370,8 @@ void exchangeValues(int* v0, int* v1) { *v0 = tmp; } +NCCL_PARAM(UnpackDoubleNChannels, "UNPACK_DOUBLE_NCHANNELS", 1); + ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent) { // Gather data from all ranks int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1, *nvlsHeads; @@ -444,13 +453,13 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa // Setup CollNet if (comm->collNetSupport == 1) { - struct ncclTopoGraph* collNetGraph = graphs[NCCL_ALGO_COLLNET_DIRECT]; + struct ncclTopoGraph* collNetChainGraph = graphs[NCCL_ALGO_COLLNET_CHAIN]; // Add more channels to saturate intra-node bandwidth, except the 1 PPN case - if (collNetGraph->bwIntra > collNetGraph->bwInter && comm->nRanks > comm->nNodes) { + if (collNetChainGraph->bwIntra > collNetChainGraph->bwInter && comm->nRanks > comm->nNodes) { int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2); nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext); } - NCCLCHECK(connectCollNet(comm, collNetGraph)); + NCCLCHECK(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT])); } // Use 4 compute channels per search channel to reach peak BW on <8 PPN @@ -458,6 +467,12 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext); } + // Double the number of channels when using unpack networking (greater than 1 node) + // We won't automatically double past 16 channels, users can specify 32 if they want + if (comm->netDeviceType == NCCL_NET_DEVICE_UNPACK && comm->nNodes > 1 && nChannels < 16 && ncclParamUnpackDoubleNChannels()) { + nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext); + } + // Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS. // We permit combining max, then min, to only use the first channels, then duplicate them. 
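/* Editor's note: the UnpackDoubleNChannels parameter introduced a few lines
 * above gates the channel doubling for unpack-capable networks; following the
 * usual NCCL_PARAM naming it is presumably read from the
 * NCCL_UNPACK_DOUBLE_NCHANNELS environment variable, so setting it to 0 keeps
 * the original channel count. */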
if (comm->sharedRes->owner != comm) { diff --git a/src/graph/paths.cc b/src/graph/paths.cc index e033c5b45..1380d2449 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -10,6 +10,8 @@ #include "comm.h" #include "net.h" #include "channel.h" +#include "transport.h" +#include "device.h" // Pre-compute GPU->NIC, GPU->GPU and NIC->GPU paths @@ -732,12 +734,7 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclComm* comm, int g /*local gp NCCL_PARAM(MinP2pNChannels, "MIN_P2P_NCHANNELS", 1); NCCL_PARAM(MaxP2pNChannels, "MAX_P2P_NCHANNELS", MAXCHANNELS); - -static int nextPow2(int v) { - int pow2 = 1; - while (pow2 < v) pow2 <<= 1; - return pow2; -} +extern int64_t ncclParamWorkArgsBytes(); ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) { /* here we already honor comm->max/minCTAs for p2pnChannels. */ @@ -759,19 +756,17 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) { } } - // Round to next pow2 nChannelsPerPeer and nChannels - comm->p2pnChannelsPerPeer = nextPow2(minChannels); - comm->p2pnChannels = nextPow2(comm->p2pnChannels); + // Make nChannelsPerPeer and nChannels powers of 2. This is relied on when + // mapping p2p peers to channels. + comm->p2pnChannelsPerPeer = pow2Up(minChannels); + comm->p2pnChannels = pow2Up(comm->p2pnChannels); + + comm->p2pnChannels = std::min(comm->p2pnChannels, pow2Down(ncclDevMaxChannelsForArgsBytes(ncclParamWorkArgsBytes()))); + comm->p2pnChannelsPerPeer = std::min(comm->p2pnChannelsPerPeer, comm->p2pnChannels); // Init channels that weren't used so far for (int c=comm->nChannels; cp2pnChannels; c++) NCCLCHECK(initChannel(comm, c)); - // We want to spread channels used when there aren't many and progressively - // fill the whole space of nChannels. To do so we mirror the bits in the - // nChannels space. - for (int c=0; cp2pnChannels; c++) { - comm->p2pChannels[c] = mirrorBits(c, comm->p2pnChannels); - } return ncclSuccess; } diff --git a/src/graph/search.cc b/src/graph/search.cc index c7b4d96ae..7f16cb769 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -8,6 +8,7 @@ #include "core.h" #include "graph.h" #include "topo.h" +#include "transport.h" #include "xml.h" #include @@ -51,6 +52,15 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) { return ncclSuccess; } +ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm) { + // We assume there is at least one CPU and that the CPUs have the same + // architecture and vendor. + const struct ncclTopoNodeSet* cpus = &comm->topo->nodes[CPU]; + comm->cpuArch = cpus->nodes[0].cpu.arch; + comm->cpuVendor = cpus->nodes[0].cpu.vendor; + return ncclSuccess; +} + static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode* node2, int type, struct ncclTopoLink** revLink) { for (int l=0; lnlinks; l++) { struct ncclTopoLink* link = node2->links+l; @@ -104,7 +114,7 @@ static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNod } // Try to go from node type1/index1 to no type2/index2. mult indicates whether we are counting the bandwidth (1) or undoing (-1). 
-static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int type1, int index1, int type2, int index2, int mult, struct ncclTopoNode** node) { +static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int type1, int index1, int type2, int index2, float mult, struct ncclTopoNode** node) { // First handle easy cases *node = system->nodes[type2].nodes+index2; if (type1 == -1) return ncclSuccess; @@ -334,6 +344,42 @@ ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopo return ncclSuccess; } +ncclResult_t ncclTopoSearchTryCollnetDirect(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int g, int ngpus, int *time) { + int fwdg = 0; + int bwdg = 0; + struct ncclTopoNode* gpu = NULL; + float mul = 1.0 / (float)(system->nodes[GPU].count - 1); + do { + NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, GPU, fwdg, mul, &gpu)); + } while (gpu && ++fwdg < system->nodes[GPU].count); + + if (gpu != NULL) { + do { + NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, bwdg, GPU, g, mul, &gpu)); + } while (gpu && ++bwdg < system->nodes[GPU].count); + if (gpu != NULL) { + // Both directions worked. Now we already have head, so pop the all other intra ranks. + int step = 1; + for (int index = 0; index < ngpus; ++index) { + if (index != g) { + graph->intra[graph->nChannels * ngpus + step] = system->nodes[GPU].nodes[index].gpu.rank; + step++; + } + } + NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, NULL, ngpus, -1, -1, 0, time)); + } + while (bwdg) { + bwdg--; + NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, bwdg, GPU, g, -mul, &gpu)); + } + } + while (fwdg) { + fwdg--; + NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, GPU, fwdg, -mul, &gpu)); + } + return ncclSuccess; +} + ncclResult_t ncclTopoSearchTryNvls(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int g, int ngpus, int *time) { struct ncclTopoNode* nvs; struct ncclTopoNode* gpu; @@ -514,6 +560,8 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo } } else if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) { NCCLCHECK(ncclTopoSearchTryNvls(system, graph, saveGraph, g, ngpus, time)); + } else if (graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) { + NCCLCHECK(ncclTopoSearchTryCollnetDirect(system, graph, saveGraph, g, ngpus, time)); } else if (step < system->nodes[GPU].count-1) { // Go to next GPU int next[NCCL_TOPO_MAX_NODES]; @@ -552,9 +600,10 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo int* nets; NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); int netCount; + int graphFound = 0; NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount)); for (int i=0; ipattern == NCCL_TOPO_PATTERN_NVLS && i>0) continue; + if ((graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) && graphFound) continue; int n = nets[(graph->nChannels+i)%netCount]; struct ncclTopoNode* net = system->nodes[NET].nodes+n; if (graph->collNet && net->net.collSupport == 0) continue; @@ -571,12 +620,22 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo } } - if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) { + if (graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) { // NVLS search only tries to find NIC:GPU combinations to compute the heads. 
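/* Editor's note on ncclTopoSearchTryCollnetDirect() above: a candidate head
 * GPU g must reach every other GPU and be reachable from every other GPU,
 * with each path charged a 1/(ngpus-1) share of the bandwidth
 * (mul = 1.0 / (system->nodes[GPU].count - 1)), i.e. an alltoall-style
 * feasibility check. The net search below then handles
 * NCCL_TOPO_PATTERN_COLLNET_DIRECT like the NVLS pattern: one NIC/GPU head per
 * channel, stopping once a graph has been found and skipping heads that are
 * already in use (see the duplicate check that follows). */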
if (graph->nChannels < netCount) { int gpu; + int duplicate = 0; NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &gpu)); + // check whether there is duplicate head when one GPU connects with multiple NICs + for (int gc = 0; gc < graph->nChannels; gc++) { + if (graph->intra[gc * system->nodes[GPU].count] == system->nodes[GPU].nodes[gpu].gpu.rank) { + duplicate = 1; + break; + } + } + if (duplicate) continue; if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu)); + graphFound = 1; } } else { if (graph->nChannels > 0) { @@ -891,8 +950,9 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph int ccMin; NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL)); if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && (system->nodes[NVS].count == 0 || ccMin < 90)) return ncclSuccess; - // NVLS search must have ngpus heads at most. - if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) graph->maxChannels = system->nodes[GPU].count; + // NVLS and COLLNET_DIRECT search must have ngpus heads at most. + if (graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) + graph->maxChannels = system->nodes[GPU].count; if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE; @@ -1104,7 +1164,7 @@ ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, i exit: return ret; fail: - WARN("Could not find NIC for rank %d in NVLS graph\n", comm->rank); + WARN("Could not find NIC for rank %d in NVLS graph", comm->rank); goto exit; } diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 30304582f..d6af9282e 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -11,6 +11,7 @@ #include "nvmlwrap.h" #include "net.h" #include "coll_net.h" +#include "transport.h" #include #include #include "xml.h" @@ -51,7 +52,12 @@ static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode* return ncclSuccess; } for (int l=0; lnlinks; l++) { - if (node->links[l].type == LINK_PCI) NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu)); + // Go up the PCI tree to find the CPU. Follow only PCI switches. 
+ if (node->links[l].type == LINK_PCI + && (node->links[l].remNode->type == PCI + || node->links[l].remNode->type == CPU)) { + NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu)); + } if (*cpu != NULL) return ncclSuccess; } return ncclSuccess; @@ -109,11 +115,6 @@ ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNo n->type = type; n->id = id; if (type == GPU) { - // Create link to itself (used in some corner cases) - n->nlinks=1; - n->links[0].type = LINK_LOC; - n->links[0].remNode = n; - n->links[0].bw = LOC_BW; n->gpu.dev = NCCL_TOPO_UNDEF; n->gpu.rank = NCCL_TOPO_UNDEF; n->gpu.cudaCompCap = NCCL_TOPO_UNDEF; @@ -279,8 +280,10 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN for (int l=0; lnlinks; l++) { struct ncclTopoLink* link = node->links+l; - if (link->type == LINK_LOC) continue; - if (link->type != LINK_PCI || link->remNode != prevNode) { + if (link->type == LINK_LOC) { + sprintf(line+offset, "+ %s[%2.1f] - %s/%lX", topoLinkTypeStr[link->type], link->bw, topoNodeTypeStr[link->remNode->type], link->remNode->id); + INFO(NCCL_GRAPH, "%s", line); + } else if (link->type != LINK_PCI || link->remNode != prevNode) { sprintf(line+offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->bw); int nextOffset = strlen(line); if (link->type == LINK_PCI) { @@ -443,7 +446,9 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s for (int s=0; snSubs; s++) { struct ncclXmlNode* xmlSubPci = xmlPci->subs[s]; - NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node, systemId)); + if (strcmp(xmlSubPci->name, "pcilink") != 0) { // PCI links will be added later + NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node, systemId)); + } } } @@ -579,6 +584,38 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem* return ncclSuccess; } +ncclResult_t ncclTopoAddPciLinks(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId, int systemId) { + if (strcmp(node->name, "pcilink") == 0) { + struct ncclTopoNode* pci = NULL; + int64_t pBusId; + NCCLCHECK(busIdToInt64(parentBusId, &pBusId)); + pBusId = NCCL_TOPO_ID(systemId, pBusId); + NCCLCHECK(ncclTopoGetNode(system, &pci, PCI, pBusId)); + if (pci == NULL) { + WARN("Add PCI Link error : could not find PCI SW %lx", pBusId); + return ncclInternalError; + } + struct ncclTopoNode* remote = NULL; + const char* target; + NCCLCHECK(xmlGetAttrStr(node, "target", &target)); + int64_t busId; + NCCLCHECK(busIdToInt64(target, &busId)); + NCCLCHECK(ncclTopoGetNode(system, &remote, PCI, NCCL_TOPO_ID(systemId, busId))); + if (remote) NCCLCHECK(ncclTopoConnectNodes(pci, remote, LINK_LOC, LOC_BW)); + } else { + if (strcmp(node->name, "cpu") == 0) { + NCCLCHECK(ncclGetSystemId(system, node, &systemId)); + } + const char* busId; + NCCLCHECK(xmlGetAttr(node, "busid", &busId)); + for (int s=0; snSubs; s++) { + NCCLCHECK(ncclTopoAddPciLinks(node->subs[s], system, busId ? 
busId : parentBusId, systemId)); + } + } + return ncclSuccess; +} + + ncclResult_t ncclTopoAddC2c(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId, int systemId) { if (strcmp(node->name, "c2c") == 0) { struct ncclTopoNode* gpu = NULL; @@ -626,6 +663,7 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL, 0)); NCCLCHECK(ncclTopoAddC2c(topNode, *topoSystem, NULL, 0)); + NCCLCHECK(ncclTopoAddPciLinks(topNode, *topoSystem, NULL, 0)); NCCLCHECK(ncclTopoFlattenBcmSwitches(*topoSystem)); NCCLCHECK(ncclTopoConnectCpus(*topoSystem)); @@ -668,6 +706,18 @@ static ncclResult_t xmlInitAttrFloat(struct ncclXmlNode* node, const char* attrN return ncclSuccess; } +ncclResult_t ncclTopoRefreshBcmP2pLinks(void) { + //refresh the switch topology by reading the link below + FILE *fp = fopen("/sys/kernel/pci_switch_link/refresh_switch_toplogy", "r"); + if (fp != NULL) { + int tmp; + size_t r = fread(&tmp, sizeof(tmp), 1, fp); + if (r != 1) + INFO(NCCL_GRAPH, "Failed to read refresh_switch_toplogy"); + fclose(fp); + } + return ncclSuccess; +} ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) { struct ncclXml* xml; @@ -687,18 +737,17 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy NCCLCHECK(xmlSetAttrInt(top, "version", NCCL_TOPO_XML_VERSION)); } - // Auto-detect GPUs if needed - for (int r=0; rnRanks; r++) { - if (comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) { - char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; - NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId)); - struct ncclXmlNode* node; - NCCLCHECK(ncclTopoFillGpu(xml, busId, &node)); - if (node == NULL) continue; - NCCLCHECK(xmlSetAttrInt(node, "keep", 1)); - NCCLCHECK(xmlSetAttrInt(node, "rank", r)); - NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport)); - } + NCCLCHECK(ncclTopoRefreshBcmP2pLinks()); + + // Detect only the GPU managed by this process. We'll get any others through XML fusion. + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + NCCLCHECK(int64ToBusId(comm->peerInfo[comm->rank].busId, busId)); + struct ncclXmlNode* node; + NCCLCHECK(ncclTopoFillGpu(xml, busId, &node)); + if (node) { + NCCLCHECK(xmlSetAttrInt(node, "keep", 1)); + NCCLCHECK(xmlSetAttrInt(node, "rank", comm->rank)); + NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[comm->rank].gdrSupport)); } // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes, // so we start with collnet so that it has precedence. @@ -728,6 +777,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy for (int n=0; nncclNet->getProperties(n, &props)); + comm->netDeviceType = props.netDeviceType; struct ncclXmlNode* netNode; NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode)); NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1)); @@ -745,24 +795,46 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy // Remove XML branches which don't have a node with keep="1" (typically when importing a topology) NCCLCHECK(ncclTopoTrimXml(xml)); + // XML topo fusion. 
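/* Editor's note: with the change above, each rank now fills the XML only for
 * its own GPU; the per-rank XMLs are then exchanged with
 * bootstrapIntraNodeAllGather() and merged through ncclTopoFuseXml() to
 * reconstruct the full node topology. For MNNVL the same path fuses the XMLs
 * of the whole clique, which is why the destination XML is reallocated with
 * room for nLocalRanks*NCCL_TOPO_XML_MAX_NODES nodes in that case. */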
+ int* localRanks; + int localRank = -1, nLocalRanks = 0; if (comm->MNNVL) { // MNNVL clique support - char* mem; - NCCLCHECK(ncclCalloc(&mem, comm->clique.size * xmlMemSize(NCCL_TOPO_XML_MAX_NODES))); - struct ncclXml* rankXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*comm->cliqueRank); - memcpy(rankXml, xml, xmlMemSize(NCCL_TOPO_XML_MAX_NODES)); - NCCLCHECK(ncclTopoConvertXml(rankXml, (uintptr_t)xml->nodes, 1)); - NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->clique.ranks, comm->cliqueRank, comm->clique.size, mem, xmlMemSize(NCCL_TOPO_XML_MAX_NODES))); - struct ncclXml* cliqueXml; - NCCLCHECK(xmlAlloc(&cliqueXml, comm->clique.size*NCCL_TOPO_XML_MAX_NODES)); - for (int i = 0; i < comm->clique.size; i++) { - struct ncclXml* peerXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*i); - NCCLCHECK(ncclTopoConvertXml(peerXml, (uintptr_t)peerXml->nodes, 0)); - NCCLCHECK(ncclTopoFuseXml(cliqueXml, peerXml)); + nLocalRanks = comm->clique.size; + localRank = comm->cliqueRank; + localRanks = comm->clique.ranks; + } else { + // Intra-node fusion. Much of the comm is not initialized yet at this point so we need to do our own calculations. + NCCLCHECK(ncclCalloc(&localRanks, comm->nRanks)); + for (int i = 0; i < comm->nRanks; i++) { + if (comm->peerInfo[i].hostHash == comm->peerInfo[comm->rank].hostHash) { + if (i == comm->rank) + localRank = nLocalRanks; + localRanks[nLocalRanks++] = i; + } } + } + char* mem; + NCCLCHECK(ncclCalloc(&mem, nLocalRanks * xmlMemSize(NCCL_TOPO_XML_MAX_NODES))); + struct ncclXml* rankXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*localRank); + memcpy(rankXml, xml, xmlMemSize(NCCL_TOPO_XML_MAX_NODES)); + NCCLCHECK(ncclTopoConvertXml(rankXml, (uintptr_t)xml->nodes, 1)); + NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, localRanks, localRank, nLocalRanks, mem, xmlMemSize(NCCL_TOPO_XML_MAX_NODES))); + if (comm->MNNVL) { + // Ensure that we have enough room when fusing topos from multiple nodes. free(xml); - xml = cliqueXml; + NCCLCHECK(xmlAlloc(&xml, nLocalRanks*NCCL_TOPO_XML_MAX_NODES)); + } else { + // In the intra-node case there's no need to enlarge the topo xml. 
+ xml->maxIndex = 0; + free(localRanks); + } + for (int i = 0; i < nLocalRanks; i++) { + struct ncclXml* peerXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*i); + NCCLCHECK(ncclTopoConvertXml(peerXml, (uintptr_t)peerXml->nodes, 0)); + NCCLCHECK(ncclTopoFuseXml(xml, peerXml)); } + free(mem); xmlTopoFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE"); if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) { diff --git a/src/graph/topo.h b/src/graph/topo.h index 548747913..6613f3271 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -218,7 +218,7 @@ static ncclResult_t ncclTopoIdToNetDev(struct ncclTopoSystem* system, int64_t id return ncclSuccess; } } - WARN("Could not find NET with id %lx\n", id); + WARN("Could not find NET with id %lx", id); return ncclInternalError; } diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index d8f0b6e44..f9d814a25 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -110,11 +110,9 @@ NCCL_PARAM(NetOverhead, "NET_OVERHEAD", -2); static float getNetOverhead(struct ncclComm* comm) { if (ncclParamNetOverhead() != -2) return ncclParamNetOverhead() * .001; - int cpuArch, cpuVendor, cpuModel; - NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel)); - if (cpuArch == NCCL_TOPO_CPU_ARCH_X86 && cpuVendor == NCCL_TOPO_CPU_VENDOR_INTEL) return 1.0; - if (cpuArch == NCCL_TOPO_CPU_ARCH_X86 && cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD) return 2.0; - else return 1.0; + if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_X86 && comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_INTEL) return 1.0; + if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_X86 && comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD) return 2.0; + return 1.0; } ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs) { @@ -317,6 +315,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom } if (pEnable == 0) comm->bandwidths[c][a][p] = 0; if (algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0; + if (a == NCCL_ALGO_RING && pEnable == 0) comm->ringbdw[c][p] = 0; } for (int c = 0; c < NCCL_NUM_FUNCTIONS; c++) { @@ -415,15 +414,15 @@ static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][23] = { { .9, .9, .9, .9, .9, .9, .9, .8, .7, .6, .6, .5, .5, .5, .5, .6, .7, .8, .7, .7, .8, .9, .9 } }; -ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time, bool* backup) { - float bw = info->comm->bandwidths[info->coll][algorithm][protocol]; - float lat = info->comm->latencies[info->coll][algorithm][protocol]; +ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time, bool* backup) { + float bw = comm->bandwidths[coll][algorithm][protocol]; + float lat = comm->latencies[coll][algorithm][protocol]; if (backup) { *backup = false; if (algorithm == NCCL_ALGO_RING && bw == 0.0f) { /* try back up RING algorithm */ - bw = info->comm->ringbdw[info->coll][protocol]; + bw = comm->ringbdw[coll][protocol]; *backup = true; } } @@ -431,15 +430,14 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto if (bw == 0) { *time = -1.0; return ncclSuccess; } - int logSize = log2i(info->nBytes>>6); - if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize]; - if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels; - if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1 - && info->coll == 
ncclFuncAllReduce && info->nBytes/(info->comm->nChannels*info->comm->nRanks) >= 64) { - lat *= info->comm->minCompCap < 80 ? 1.9 : 1.4; // Plateau effect of ring + int logSize = log2i(nBytes>>6); + if (algorithm == NCCL_ALGO_TREE && logSize >= 0 && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize]; + if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && comm->nNodes > 1 + && coll == ncclFuncAllReduce && nBytes/(comm->nChannels*comm->nRanks) >= 64) { + lat *= comm->minCompCap < 80 ? 1.9 : 1.4; // Plateau effect of ring } // Tree pipelining saves latency in aggregation cases - int latCount = algorithm == NCCL_ALGO_RING ? numPipeOps : DIVUP(numPipeOps, NCCL_MAX_WORK_ELEMENTS); - *time = lat * latCount + (info->nBytes) / (1000 * bw); + int latCount = algorithm == NCCL_ALGO_RING ? numPipeOps : DIVUP(numPipeOps, NCCL_MAX_DEV_WORK_BATCH_COLLS); + *time = lat * latCount + nBytes / (1000 * bw); return ncclSuccess; } diff --git a/src/graph/xml.cc b/src/graph/xml.cc index b145d34ef..c2c6a1c81 100644 --- a/src/graph/xml.cc +++ b/src/graph/xml.cc @@ -272,56 +272,34 @@ ncclResult_t ncclTopoDumpXmlToFile(const char* xmlTopoFile, struct ncclXml* xml) return ncclSuccess; } +static ncclResult_t xmlTopoFuseXmlRecursive(struct ncclXml* dst, struct ncclXmlNode* dstParent, struct ncclXmlNode* srcParent) { + for (int i = 0; i < srcParent->nSubs; i++) { + struct ncclXmlNode* srcNode = srcParent->subs[i]; + struct ncclXmlNode* dstNode; + NCCLCHECK(xmlFindNode(dstParent, srcNode, &dstNode)); + if (dstNode == NULL) { + NCCLCHECK(xmlAddTree(dst, dstParent, srcNode)); + } else { + NCCLCHECK(xmlTopoFuseXmlRecursive(dst, dstNode, srcNode)); + } + } + return ncclSuccess; +} + ncclResult_t ncclTopoFuseXml(struct ncclXml* dst, struct ncclXml* src) { - struct ncclXmlNode* topNode; - NCCLCHECK(xmlFindTag(dst, "system", &topNode)); + struct ncclXmlNode* topNodeDst; + NCCLCHECK(xmlFindTag(dst, "system", &topNodeDst)); - if (topNode == NULL) { + if (topNodeDst == NULL) { xmlAddTree(dst, NULL, src->nodes); return ncclSuccess; } - // Fuse the CPUs with the first XML - struct ncclXmlNode* srcCpu; - NCCLCHECK(xmlFindTag(src, "cpu", &srcCpu)); - while (srcCpu) { - const char* srcNumaId; - const char* srcHostHash; - NCCLCHECK(xmlGetAttr(srcCpu, "numaid", &srcNumaId)); - if (srcNumaId == NULL) { - WARN("TopoFuseXmls : could not find CPU numa ID."); - return ncclInternalError; - } - xmlGetAttr(srcCpu, "host_hash", &srcHostHash); - if (srcHostHash == NULL) - srcHostHash = "0"; - - // Search through the destination for a duplicate. Note that - // this makes the complexity of this whole function O(n^2), but n - // is expected to be small. 
- struct ncclXmlNode* dstCpu; - NCCLCHECK(xmlFindTag(dst, "cpu", &dstCpu)); - while (dstCpu) { - const char* dstNumaId; - const char* dstHostHash; - NCCLCHECK(xmlGetAttr(dstCpu, "numaid", &dstNumaId)); - if (dstNumaId == NULL) { - WARN("TopoFuseXmls : could not find CPU numa ID."); - return ncclInternalError; - } - xmlGetAttr(dstCpu, "host_hash", &dstHostHash); - if (dstHostHash == NULL) - dstHostHash = "0"; - if (strcmp(srcNumaId, dstNumaId) == 0 && strcmp(srcHostHash, dstHostHash) == 0) - break; + struct ncclXmlNode* topNodeSrc; + NCCLCHECK(xmlFindTag(src, "system", &topNodeSrc)); + + NCCLCHECK(xmlTopoFuseXmlRecursive(dst, topNodeDst, topNodeSrc)); - NCCLCHECK(xmlFindNextTag(dst, "cpu", dstCpu, &dstCpu)); - } - // Only add the CPU if no duplicate was found - if (dstCpu == NULL) - NCCLCHECK(xmlAddTree(dst, topNode, srcCpu)); - NCCLCHECK(xmlFindNextTag(src, "cpu", srcCpu, &srcCpu)); - } return ncclSuccess; } @@ -335,6 +313,11 @@ ncclResult_t ncclTopoXmlLoadNvlink(FILE* file, struct ncclXml* xml, struct ncclX return ncclSuccess; } +ncclResult_t ncclTopoXmlLoadPciLink(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { + NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0)); + return ncclSuccess; +} + ncclResult_t ncclTopoXmlLoadC2c(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0)); return ncclSuccess; @@ -357,8 +340,8 @@ ncclResult_t ncclTopoXmlLoadNic(FILE* file, struct ncclXml* xml, struct ncclXmlN } ncclResult_t ncclTopoXmlLoadPci(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { - struct xmlHandler handlers[] = { { "pci", ncclTopoXmlLoadPci }, { "gpu", ncclTopoXmlLoadGpu }, { "nic", ncclTopoXmlLoadNic} }; - NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 3)); + struct xmlHandler handlers[] = { { "pci", ncclTopoXmlLoadPci }, { "gpu", ncclTopoXmlLoadGpu }, { "nic", ncclTopoXmlLoadNic}, { "pcilink", ncclTopoXmlLoadPciLink} }; + NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 4)); return ncclSuccess; } @@ -423,6 +406,28 @@ static ncclResult_t getPciPath(const char* busId, char** path) { return ncclSuccess; } +#include +static ncclResult_t getBcmLinks(const char* busId, int* nlinks, char** peers) { + *nlinks = 0; + *peers = NULL; + char dirPath[] = "/sys/kernel/pci_switch_link/virtual_switch_links/0000:00:00.0"; + memcpylower(dirPath+sizeof("/sys/kernel/pci_switch_link/virtual_switch_links/")-1, busId, BUSID_SIZE-1); + DIR *dir = opendir(dirPath); + if (dir) { + struct dirent* file; + while ((file = readdir(dir)) != NULL) { + if (strlen(file->d_name) != BUSID_SIZE-1) continue; + char* path; + if (getPciPath(file->d_name, &path) == ncclSystemError) continue; + free(path); + NCCLCHECK(ncclRealloc(peers, (*nlinks)*BUSID_SIZE, ((*nlinks)+1)*BUSID_SIZE)); + memcpy((*peers)+BUSID_SIZE*(*nlinks)++, file->d_name, BUSID_SIZE); + } + closedir(dir); + } + return ncclSuccess; +} + ncclResult_t ncclTopoGetStrFromSys(const char* path, const char* fileName, char* strValue) { char filePath[PATH_MAX]; sprintf(filePath, "%s/%s", path, fileName); @@ -541,10 +546,11 @@ ncclResult_t ncclTopoGetPciNode(struct ncclXml* xml, const char* busId, struct n // There can be trailing chars. 
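/* Editor's note: the "pcilink" handler and getBcmLinks() above tie the
 * split-switch support together. getBcmLinks() lists peer switch bus IDs from
 * /sys/kernel/pci_switch_link/virtual_switch_links/<busid>, and the resulting
 * topology XML can then carry entries of roughly this shape (bus IDs made up
 * for illustration):
 *
 *   <pci busid="0000:17:00.0" vendor="0x1000" ...>
 *     <pcilink target="0000:31:00.0"/>
 *   </pci>
 *
 * ncclTopoAddPciLinks() (earlier in this patch) converts each such entry into
 * a LINK_LOC connection between the two PCI switch nodes. */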
int isHex(char c) { return ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')); } int checkBDFFormat(char* bdf) { - if (bdf[4] != ':' || bdf[7] != ':' || bdf[10] != '.') return 0; - if (isHex(bdf[0]) == 0 || isHex(bdf[1] == 0) || isHex(bdf[2] == 0) || isHex(bdf[3] == 0) || - isHex(bdf[5] == 0) || isHex(bdf[6] == 0) || isHex(bdf[8] == 0) || isHex(bdf[9] == 0) || - isHex(bdf[11] == 0)) return 0; + if (strlen(bdf) != 12) return 0; + if ((bdf[4] != ':') || (bdf[7] != ':') || (bdf[10] != '.')) return 0; + if ((isHex(bdf[0]) == 0) || (isHex(bdf[1]) == 0) || (isHex(bdf[2]) == 0) || (isHex(bdf[3]) == 0) || + (isHex(bdf[5]) == 0) || (isHex(bdf[6]) == 0) || (isHex(bdf[8]) == 0) || (isHex(bdf[9]) == 0) || + (isHex(bdf[11]) == 0)) return 0; return 1; } @@ -608,6 +614,24 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* NCCLCHECK(xmlSetAttr(pciNode, "link_width", "")); } } + + const char* vendor; + NCCLCHECK(xmlGetAttr(pciNode, "vendor", &vendor)); + if (vendor != NULL && strcmp(vendor, "0x1000") == 0) { // BCM switch, look for P2P connections + int nlinks; + char* peers; + NCCLCHECK(getBcmLinks(busId, &nlinks, &peers)); + for (int l=0; lparent; if (parent == NULL) { if (path) { @@ -911,25 +935,33 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha return ncclSuccess; } -ncclResult_t ncclTopoTrimXmlRec(struct ncclXmlNode* node) { +ncclResult_t ncclTopoTrimXmlRec(struct ncclXmlNode* node, int* keep) { const char* str; NCCLCHECK(xmlGetAttr(node, "keep", &str)); if (str && strcmp(str, "1") == 0) { NCCLCHECK(xmlUnsetAttr(node, "keep")); + *keep = 1; } else { // Copy nSubs and subs as they could change as we trim recursively. struct ncclXmlNode* subs[MAX_SUBS]; int nSubs = node->nSubs; memcpy(subs, node->subs, node->nSubs*sizeof(struct ncclXmlNode*)); + *keep = 0; for (int s=0; sname, "pci") == 0 || strcmp(node->name, "cpu") == 0)) { + NCCLCHECK(xmlRemoveNode(node)); } - if (node->nSubs == 0) NCCLCHECK(xmlRemoveNode(node)); } return ncclSuccess; } ncclResult_t ncclTopoTrimXml(struct ncclXml* xml) { - NCCLCHECK(ncclTopoTrimXmlRec(xml->nodes)); + int keep = 0; + NCCLCHECK(ncclTopoTrimXmlRec(xml->nodes, &keep)); return ncclSuccess; } diff --git a/src/graph/xml.h b/src/graph/xml.h index 9090ecc0f..0ee56790b 100644 --- a/src/graph/xml.h +++ b/src/graph/xml.h @@ -55,7 +55,7 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha /* Remove unneeded parts */ ncclResult_t ncclTopoTrimXml(struct ncclXml* xml); -/* Fuse multiple system XMLs into one, skipping duplicate CPUs */ +/* Fuse multiple system XMLs into one, skipping duplicate entries */ ncclResult_t ncclTopoFuseXml(struct ncclXml* dst, struct ncclXml* src); /* Relocate pointers in XML to (de-)serialize the structure */ ncclResult_t ncclTopoConvertXml(struct ncclXml* xml, uintptr_t base, int exp); @@ -172,6 +172,29 @@ static ncclResult_t xmlFindTagKv(struct ncclXml* xml, const char* tagName, struc return ncclSuccess; } +static ncclResult_t xmlFindNode(struct ncclXmlNode* parentNode, struct ncclXmlNode* searchNode, struct ncclXmlNode** node) { + *node = NULL; + // Search for the node at the current level only. + for (int i=0; inSubs; i++) { + struct ncclXmlNode* n = parentNode->subs[i]; + if (strcmp(n->name, searchNode->name) == 0 && n->type == searchNode->type && n->nAttrs == searchNode->nAttrs) { + int a; + // Ensure that all the attributes are the same. 
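The checkBDFFormat() rewrite above fixes an operator-precedence bug: expressions like isHex(bdf[1] == 0) fed the result of the comparison into isHex(), so only the first hex digit was actually validated. It also adds a strict 12-character length check. A quick sanity check of the fixed validator (assumes the isHex() and checkBDFFormat() definitions above are in scope):

#include <assert.h>

int main(void) {
  assert(checkBDFFormat((char*)"0000:3b:00.0") == 1);   /* well-formed dddd:bb:dd.f */
  assert(checkBDFFormat((char*)"0000:3b:00.")  == 0);   /* 11 characters: too short */
  assert(checkBDFFormat((char*)"0000:3g:00.0") == 0);   /* 'g' is not a hex digit */
  return 0;
}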
+ for (a=0; anAttrs; a++) { + const char* val; + NCCLCHECK(xmlGetAttr(n, searchNode->attrs[a].key, &val)); + if (!val || strcmp(val, searchNode->attrs[a].value)) + break; + } + if (a == searchNode->nAttrs) { + *node = n; + return ncclSuccess; + } + } + } + return ncclSuccess; +} + static ncclResult_t xmlSetAttr(struct ncclXmlNode* node, const char* attrName, const char* value) { int index; NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); diff --git a/src/group.cc b/src/group.cc index eb45e31ac..7158b45c2 100644 --- a/src/group.cc +++ b/src/group.cc @@ -10,6 +10,7 @@ #include "transport.h" #include "channel.h" #include +#include "bootstrap.h" __thread int ncclGroupDepth = 0; // depth of ncclGroupStart nesting __thread ncclResult_t ncclGroupError = ncclSuccess; @@ -31,6 +32,7 @@ ncclResult_t ncclAsyncLaunch( ) { ncclResult_t ret = ncclSuccess; + job->destroyFlag = comm->destroyFlag; if (ncclGroupDepth == 0) { ret = func(job); if (ret != ncclSuccess && undo) undo(job); @@ -40,11 +42,15 @@ ncclResult_t ncclAsyncLaunch( job->undo = undo; job->destructor = destructor; job->abortFlag = comm->abortFlag; + job->abortFlagDev = comm->abortFlagDev; job->childAbortFlag = comm->childAbortFlag; + job->childAbortFlagDev = comm->childAbortFlagDev; job->state = ncclGroupJobRunning; job->comm = comm; /* check if there are blocking and nonblocking comms at the same time in group. */ - if (ncclGroupBlocking == -1) { + if (comm->destroyFlag) { + ncclGroupBlocking = 1; + } else if (ncclGroupBlocking == -1) { /* first met communicator */ ncclGroupBlocking = comm->config.blocking; } else if (ncclGroupBlocking != comm->config.blocking) { @@ -98,11 +104,23 @@ ncclResult_t ncclGroupEnd() { return ret; } +NCCL_API(ncclResult_t, ncclGroupSimulateEnd, ncclSimInfo_t* simInfo); +ncclResult_t ncclGroupSimulateEnd(ncclSimInfo_t* simInfo) { + ncclResult_t ret = ncclSuccess; + NVTX3_FUNC_RANGE_IN(nccl_domain); + NCCLCHECKGOTO(ncclGroupEndInternal(simInfo), ret, exit); + TRACE_CALL("ncclGroupSimulateEnd()"); +exit: + return ret; +} + struct ncclPreconnectJob { struct ncclAsyncJob base; struct ncclComm* comm; + bool* algoNeedConnect; }; -ncclResult_t ncclPreconnectFunc(struct ncclAsyncJob* job_) { + +ncclResult_t ncclP2PPreconnectFunc(struct ncclAsyncJob* job_) { struct ncclPreconnectJob* job = (struct ncclPreconnectJob*)job_; struct ncclComm* comm = job->comm; CUDACHECK(cudaSetDevice(comm->cudaDev)); @@ -111,6 +129,57 @@ ncclResult_t ncclPreconnectFunc(struct ncclAsyncJob* job_) { return ncclSuccess; } +ncclResult_t ncclCollPreconnectFunc(struct ncclAsyncJob* job_) { + struct ncclPreconnectJob* job = (struct ncclPreconnectJob*)job_; + struct ncclComm* comm = job->comm; + ncclResult_t ret = ncclSuccess; + + CUDACHECK(cudaSetDevice(comm->cudaDev)); + if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); + for (int i = 0; i < NCCL_NUM_ALGORITHMS; ++i) { + if (job->algoNeedConnect[i]) { + switch (i) { + case NCCL_ALGO_RING: { + NCCLCHECKGOTO(ncclTransportRingConnect(comm), ret, fail); + break; + } + case NCCL_ALGO_TREE: { + NCCLCHECKGOTO(ncclTransportTreeConnect(comm), ret, fail); + break; + } + case NCCL_ALGO_NVLS: { + /* If we are using NVLS_TREE algo, we must mark NVLS algo to set up + * NVLS intra-node buffer */ + NCCLCHECKGOTO(ncclNvlsBufferSetup(comm), ret, fail); + break; + } + case NCCL_ALGO_NVLS_TREE: { + NCCLCHECKGOTO(ncclNvlsTreeConnect(comm), ret, fail); + break; + } + case NCCL_ALGO_COLLNET_CHAIN: { + NCCLCHECKGOTO(ncclCollNetChainBufferSetup(comm), ret, fail); + break; + } 
+ case NCCL_ALGO_COLLNET_DIRECT: { + NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail); + break; + } + default: { + ret = ncclInternalError; + goto fail; + } + } + } + } + +exit: + free(job->algoNeedConnect); + return ret; +fail: + goto exit; +} + static ncclResult_t doLaunches(struct ncclComm* head) { ncclResult_t result = ncclSuccess; struct ncclComm* cliqueComm0 = head->intraComm0; @@ -124,7 +193,7 @@ static ncclResult_t doLaunches(struct ncclComm* head) { struct ncclComm* comm = cliqueHead; bool capturingYes = false, capturingNo = false; do { - (ncclCudaGraphValid(comm->tasks.capturingGraph) ? capturingYes : capturingNo) = true; + (ncclCudaGraphValid(comm->planner.capturingGraph) ? capturingYes : capturingNo) = true; CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure); NCCLCHECKGOTO(ncclLaunchPrepare(comm), result, failure); if (useBarrier) ncclCommIntraBarrierIn(comm, 1); @@ -150,19 +219,19 @@ static ncclResult_t doLaunches(struct ncclComm* head) { // Barrier reduction result tells us if this was the final round. moreRounds = 0 != ncclCommIntraBarrierOut(comm); } else { - moreRounds |= comm->unlaunchedPlansHead != nullptr; + moreRounds |= comm->planner.unlaunchedPlansHead != nullptr; } if (moreRounds) { // Pop next unlaunched kernel - struct ncclKernelPlan* plan = comm->unlaunchedPlansHead; + struct ncclKernelPlan* plan = comm->planner.unlaunchedPlansHead; if (plan != nullptr) { - comm->unlaunchedPlansHead = plan->next; + comm->planner.unlaunchedPlansHead = plan->next; CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure); NCCLCHECKGOTO(ncclLaunchKernelBefore_NoUncapturedCuda(comm, plan), result, failure); NCCLCHECKGOTO(ncclLaunchKernel(comm, plan), result, failure); } // Barrier reduction input indicates if we require further rounds. - if (useBarrier) ncclCommIntraBarrierIn(comm, comm->unlaunchedPlansHead != nullptr ? 1 : 0); + if (useBarrier) ncclCommIntraBarrierIn(comm, comm->planner.unlaunchedPlansHead != nullptr ? 1 : 0); if (plan != nullptr) { NCCLCHECKGOTO(ncclLaunchKernelAfter_NoCuda(comm, plan), result, failure); } @@ -210,37 +279,29 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g // is needed. comm->preconnectNext = reinterpret_cast(0x1); for (int i = 0; i < comm->nRanks; i++) { - comm->tasks.peers[i].sendSeen = false; - comm->tasks.peers[i].recvSeen = false; comm->connectSend[i] = 0UL; comm->connectRecv[i] = 0UL; } - comm->unlaunchedPlansHead = nullptr; // Reclaim abandoned kernel plan memory. Note ncclWork structs were already // reclaimed by a `ncclMemoryStackPop(&comm->memScoped)` during `ncclGroupCommLeave()`. - while (!ncclIntruQueueEmpty(&comm->planQueue)) { - struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planQueue); + while (!ncclIntruQueueEmpty(&comm->planner.planQueue)) { + struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planner.planQueue); // Persistent plans will be reclaimed via the callbackQueue when the // graph drops its UserObject reference. 
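Stepping back to the ncclGroupSimulateEnd() entry point added earlier in this group.cc diff: it drives the group through task preparation only and copies the estimate back into the caller's ncclSimInfo_t (note the !simInfo guards around preconnect and doLaunches further down, and the 0x74685283 magic check in ncclGroupEndInternal). A hedged usage sketch; the helper below and the estimatedTime field name are assumptions based on this release's public header, not verbatim NCCL sample code:

#include <nccl.h>
#include <stdio.h>

static ncclResult_t estimateGroupTime(ncclComm_t comm, cudaStream_t stream,
                                      const void* sendbuff, void* recvbuff, size_t count) {
  ncclSimInfo_t sim = NCCL_SIM_INFO_INITIALIZER;  /* sets size and the expected magic */
  ncclResult_t res;
  if ((res = ncclGroupStart()) != ncclSuccess) return res;
  if ((res = ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, ncclSum,
                           comm, stream)) != ncclSuccess) return res;
  if ((res = ncclGroupSimulateEnd(&sim)) != ncclSuccess) return res;  /* plans, does not launch */
  printf("estimated group time: %f\n", sim.estimatedTime);            /* assumed field name */
  return ncclSuccess;
}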
if (!plan->persistent) { - for (int c = 0; c < MAXCHANNELS; c++) { - while (!ncclIntruQueueEmpty(&plan->channels[c].proxyOpQueue)) { - struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->channels[c].proxyOpQueue); - ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop); - } + while (!ncclIntruQueueEmpty(&plan->proxyOpQueue)) { + struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->proxyOpQueue); + ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop); } ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan); } } - // Reset comm->tasks to empty. - comm->tasks.nTasksColl = 0; - comm->tasks.nTasksP2p = 0; - comm->tasks.workBytesTotal = 0; - comm->tasks.streams = nullptr; - ncclIntruQueueConstruct(&comm->tasks.collQueue); - for (int i = 0; i < comm->nRanks; i++) { - ncclIntruQueueConstruct(&comm->tasks.peers[i].sendQueue); - ncclIntruQueueConstruct(&comm->tasks.peers[i].recvQueue); + + { // Reset comm->planner to empty. + ncclKernelPlanner::Peer* tmp = comm->planner.peers; + memset(&comm->planner, 0, sizeof(comm->planner)); + comm->planner.peers = tmp; + memset(comm->planner.peers, 0, comm->nRanks*sizeof(comm->planner.peers[0])); } if (!comm->config.blocking) @@ -260,37 +321,10 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g return; } -static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) { - int savedDev; +static ncclResult_t asyncJobLaunch(struct ncclIntruQueue *asyncJobsMain, volatile bool *groupAbortFlag) { ncclResult_t ret = ncclSuccess; bool jobsDone = false; bool errorJobAbortFlag = false; - struct ncclGroupJob *gjob = (struct ncclGroupJob*) job_; - struct ncclComm *groupCommHeadMain = *gjob->groupCommHeadPtr; - struct ncclComm *groupCommPreconnectHeadMain = *gjob->groupCommPreconnectHeadPtr; - struct ncclIntruQueue *asyncJobsMain = gjob->asyncJobsPtr; - volatile bool *groupAbortFlag = gjob->abortFlagPtr; - - CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail); - - if (groupCommPreconnectHeadMain != nullptr) { - struct ncclComm* comm = groupCommPreconnectHeadMain; - do { - struct ncclPreconnectJob* job; - NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); - job->base.func = ncclPreconnectFunc; - job->base.undo = nullptr; - job->base.destructor = free; - job->base.state = ncclGroupJobRunning; - job->base.abortFlag = comm->abortFlag; - job->comm = comm; - ncclIntruQueueEnqueue(asyncJobsMain, &job->base); - - struct ncclComm* next = comm->preconnectNext; - comm->preconnectNext = reinterpret_cast(0x1); - comm = next; - } while (comm != nullptr); - } if (!ncclIntruQueueEmpty(asyncJobsMain)) { struct ncclAsyncJob* job = ncclIntruQueueHead(asyncJobsMain); @@ -321,9 +355,13 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) { assert(state == ncclGroupJobJoined); } - if (__atomic_load_n(groupAbortFlag, __ATOMIC_RELAXED) || errorJobAbortFlag == true) { - __atomic_store_n(job->abortFlag, 1, __ATOMIC_RELAXED); - if (job->childAbortFlag) __atomic_store_n(job->childAbortFlag, 1, __ATOMIC_RELAXED); + if (!job->destroyFlag && (__atomic_load_n(groupAbortFlag, __ATOMIC_ACQUIRE) || errorJobAbortFlag == true)) { + __atomic_store_n(job->abortFlag, 1, __ATOMIC_RELEASE); + __atomic_store_n(job->abortFlagDev, 1, __ATOMIC_RELEASE); + if (job->childAbortFlag) { + __atomic_store_n(job->childAbortFlag, 1, __ATOMIC_RELEASE); + __atomic_store_n(job->childAbortFlagDev, 1, __ATOMIC_RELEASE); + } } job = job->next; @@ -335,17 +373,86 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) { if (ret != ncclSuccess) goto fail; } - if (groupCommHeadMain != 
nullptr) { - NCCLCHECKGOTO(doLaunches(groupCommHeadMain), ret, fail); - } - while (!ncclIntruQueueEmpty(asyncJobsMain)) { struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain); - if (job->comm && !job->comm->config.blocking) + if (!job->destroyFlag && job->comm && !job->comm->config.blocking) (void) ncclCommSetAsyncError(job->comm, ret); if (job->destructor) job->destructor((void*)job); } +exit: + return ret; +fail: + goto exit; +} + +static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInfo = NULL) { + int savedDev; + ncclResult_t ret = ncclSuccess; + struct ncclGroupJob *gjob = (struct ncclGroupJob*) job_; + struct ncclComm *groupCommHeadMain = *gjob->groupCommHeadPtr; + struct ncclComm *groupCommPreconnectHeadMain = *gjob->groupCommPreconnectHeadPtr; + struct ncclIntruQueue *asyncJobsMain = gjob->asyncJobsPtr; + bool *groupAbortFlag = gjob->abortFlagPtr; + + CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail); + + if (!simInfo && groupCommPreconnectHeadMain != nullptr) { + struct ncclComm* comm = groupCommPreconnectHeadMain; + do { + struct ncclPreconnectJob* job; + NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); + job->base.func = ncclP2PPreconnectFunc; + job->base.undo = nullptr; + job->base.destructor = free; + job->base.state = ncclGroupJobRunning; + job->base.abortFlag = comm->abortFlag; + job->base.abortFlagDev = comm->abortFlagDev; + job->comm = comm; + ncclIntruQueueEnqueue(asyncJobsMain, &job->base); + + struct ncclComm* next = comm->preconnectNext; + comm->preconnectNext = reinterpret_cast(0x1); + comm = next; + } while (comm != nullptr); + } + + NCCLCHECKGOTO(asyncJobLaunch(asyncJobsMain, groupAbortFlag), ret, fail); + + /* Connect channels at runtime if cumem is supported */ + if (groupCommHeadMain != nullptr) { + struct ncclComm* comm = groupCommHeadMain; + + do { + bool needConnect = false; + bool algoNeedConnect[NCCL_NUM_ALGORITHMS]; + memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS); + + NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail); + + if (comm->cuMemSupport && needConnect) { + struct ncclPreconnectJob* job; + NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); + job->base.func = ncclCollPreconnectFunc; + job->base.undo = nullptr; + job->base.destructor = free; + job->base.state = ncclGroupJobRunning; + job->base.abortFlag = comm->abortFlag; + job->comm = comm; + NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail); + memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS); + ncclIntruQueueEnqueue(asyncJobsMain, &job->base); + } + comm = comm->groupNext; + } while (comm); + + NCCLCHECKGOTO(asyncJobLaunch(asyncJobsMain, groupAbortFlag), ret, fail); + } + + if ((!simInfo) && (groupCommHeadMain != nullptr)) { + NCCLCHECKGOTO(doLaunches(groupCommHeadMain), ret, fail); + } + while (groupCommHeadMain != nullptr) { struct ncclComm* comm = groupCommHeadMain; struct ncclComm* next = comm->groupNext; @@ -365,8 +472,17 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) { goto exit; } -ncclResult_t ncclGroupEndInternal() { +static ncclResult_t groupLaunchNonBlocking(struct ncclAsyncJob *job_) { + return groupLaunch(job_ /* estimatedTime = NULL */); +} + +ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) { ncclResult_t ret = ncclSuccess; + ncclSimInfo_t internalSimInfo = NCCL_SIM_INFO_INITIALIZER; + ncclSimInfo_t* internalSimInfoPtr = NULL; + size_t realSize = 0; + + internalSimInfo.magic = 0; if (ncclGroupDepth == 0) { 
WARN("ncclGroupEnd: not in a group call."); @@ -378,6 +494,18 @@ ncclResult_t ncclGroupEndInternal() { if ((ret = ncclGroupError) != ncclSuccess) goto fail; + if (simInfo) { + memcpy((void*)&realSize, (void*)&simInfo->size, sizeof(size_t)); + realSize = realSize > sizeof(ncclSimInfo_t) ? sizeof(ncclSimInfo_t) : realSize; + memcpy((void*)&internalSimInfo, (void*)simInfo, realSize); + if (internalSimInfo.magic != 0x74685283) { + WARN("ncclSimInfo_t argument not initialized via NCCL_SIM_INFO_INITIALIZER"); + ret = ncclInvalidArgument; + goto fail; + } + internalSimInfoPtr = &internalSimInfo; + } + if (ncclGroupCommHead != nullptr || !ncclIntruQueueEmpty(&ncclAsyncJobs) || ncclGroupCommPreconnectHead != nullptr) { ncclGroupJobMain.groupCommHeadPtr = &ncclGroupCommHead; ncclGroupJobMain.groupCommPreconnectHeadPtr = &ncclGroupCommPreconnectHead; @@ -410,12 +538,13 @@ ncclResult_t ncclGroupEndInternal() { } while (comm); } - ncclGroupJobMainPtr->base.func = groupLaunch; + ncclGroupJobMainPtr->base.func = groupLaunchNonBlocking; SYSCHECKGOTO(pthread_create(&ncclGroupJobMainPtr->base.thread, NULL, ncclAsyncJobMain, (void*)&ncclGroupJobMainPtr->base), ret, fail); ret = ncclInProgress; } else { /* blocking group */ - NCCLCHECKGOTO(groupLaunch(&ncclGroupJobMainPtr->base), ret, fail); + NCCLCHECKGOTO(groupLaunch(&ncclGroupJobMainPtr->base, internalSimInfoPtr), ret, fail); + if (simInfo) memcpy((void*)simInfo, (void*)internalSimInfoPtr, realSize); groupResetJobState(ncclGroupJobMainPtr); } } @@ -438,7 +567,7 @@ ncclResult_t ncclGroupJobComplete(struct ncclGroupJob* groupJob) { ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob) { if (groupJob && groupJob->initialized) { - __atomic_store_n(groupJob->abortFlagPtr, true, __ATOMIC_RELAXED); + __atomic_store_n(groupJob->abortFlagPtr, true, __ATOMIC_RELEASE); NCCLCHECK(ncclGroupJobComplete(groupJob)); } return ncclSuccess; diff --git a/src/include/align.h b/src/include/align.h deleted file mode 100644 index 2a71dd1bc..000000000 --- a/src/include/align.h +++ /dev/null @@ -1,47 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_ALIGN_H_ -#define NCCL_ALIGN_H_ - -#define DIVUP(x, y) \ - (((x)+(y)-1)/(y)) - -#define ROUNDUP(x, y) \ - (DIVUP((x), (y))*(y)) - -#define ALIGN_POWER(x, y) \ - ((x) > (y) ? 
ROUNDUP(x, y) : ((y)/((y)/(x)))) - -#define ALIGN_SIZE(size, align) \ - size = ((size + (align) - 1) / (align)) * (align); - -#if !__CUDA_ARCH__ - #ifndef __host__ - #define __host__ - #endif - #ifndef __device__ - #define __device__ - #endif -#endif - -template -__host__ __device__ constexpr Z divUp(X x, Y y) { - return (x+y-1)/y; -} - -template -__host__ __device__ constexpr Z roundUp(X x, Y y) { - return (x+y-1) - (x+y-1)%y; -} - -// assumes second argument is a power of 2 -template -__host__ __device__ constexpr Z alignUp(X x, int a) { - return (x+a-1) & Z(-a); -} - -#endif diff --git a/src/include/alloc.h b/src/include/alloc.h index aa522ea1a..71d0777cc 100644 --- a/src/include/alloc.h +++ b/src/include/alloc.h @@ -9,7 +9,7 @@ #include "nccl.h" #include "checks.h" -#include "align.h" +#include "bitops.h" #include "utils.h" #include "p2p.h" #include @@ -19,18 +19,25 @@ uint64_t clockNano(); // from utils.h with which we have a circular dependency +template +constexpr size_t ncclSizeOfT() { return sizeof(T); } +template<> +constexpr size_t ncclSizeOfT() { return 1; } + template ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - CUDACHECKGOTO(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped), result, finish); - memset(*ptr, 0, nelem*sizeof(T)); + if (nelem > 0) { + CUDACHECKGOTO(cudaHostAlloc(ptr, nelem*ncclSizeOfT(), cudaHostAllocMapped), result, finish); + memset(*ptr, 0, nelem*ncclSizeOfT()); + } finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (*ptr == nullptr) WARN("Failed to CUDA host alloc %ld bytes", nelem*sizeof(T)); - INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); + if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA host alloc %ld bytes", nelem*ncclSizeOfT()); + INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT(), *ptr); return result; } #define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__) @@ -42,14 +49,18 @@ inline ncclResult_t ncclCudaHostFree(void* ptr) { template ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { - void* p = malloc(nelem*sizeof(T)); - if (p == NULL) { - WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); - return ncclSystemError; + if (nelem > 0) { + void* p = malloc(nelem*ncclSizeOfT()); + if (p == NULL) { + WARN("Failed to malloc %ld bytes", nelem*ncclSizeOfT()); + return ncclSystemError; + } + //INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT(), p); + memset(p, 0, nelem*ncclSizeOfT()); + *ptr = (T*)p; + } else { + *ptr = NULL; } - //INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), p); - memset(p, 0, nelem*sizeof(T)); - *ptr = (T*)p; return ncclSuccess; } #define ncclCalloc(...) 
ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__) @@ -60,16 +71,16 @@ ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) { if (nelem == oldNelem) return ncclSuccess; T* oldp = *ptr; - T* p = (T*)malloc(nelem*sizeof(T)); + T* p = (T*)malloc(nelem*ncclSizeOfT()); if (p == NULL) { - WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); + WARN("Failed to malloc %ld bytes", nelem*ncclSizeOfT()); return ncclSystemError; } - memcpy(p, oldp, oldNelem*sizeof(T)); + memcpy(p, oldp, oldNelem*ncclSizeOfT()); free(oldp); - memset(p+oldNelem, 0, (nelem-oldNelem)*sizeof(T)); + memset(p+oldNelem, 0, (nelem-oldNelem)*ncclSizeOfT()); *ptr = (T*)p; - INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*sizeof(T), nelem*sizeof(T), *ptr); + INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*ncclSizeOfT(), nelem*ncclSizeOfT(), *ptr); return ncclSuccess; } @@ -111,7 +122,7 @@ static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHand accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1)); if (handlep) *handlep = handle; - TRACE(NCCL_ALLOC, "CuMem Alloc Size %zi pointer %p handle %llx", size, *ptr, handle); + TRACE(NCCL_ALLOC, "CuMem Alloc Size %zu pointer %p handle %llx", size, *ptr, handle); return result; } @@ -123,7 +134,7 @@ static inline ncclResult_t ncclCuMemFree(void *ptr) { CUCHECK(cuMemRetainAllocationHandle(&handle, ptr)); CUCHECK(cuMemRelease(handle)); CUCHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); - TRACE(NCCL_ALLOC, "CuMem Free Size %zi pointer %p handle 0x%llx", size, ptr, handle); + TRACE(NCCL_ALLOC, "CuMem Free Size %zu pointer %p handle 0x%llx", size, ptr, handle); CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size)); CUCHECK(cuMemRelease(handle)); CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); @@ -151,15 +162,17 @@ ncclResult_t ncclCudaMallocDebug(T** ptr, size_t nelem, const char *filefunc, in cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (ncclCuMemEnable()) { - NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish); - } else { - CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); + if (nelem > 0) { + if (ncclCuMemEnable()) { + NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT()), result, finish); + } else { + CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT()), result, finish); + } } finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (*ptr == nullptr) WARN("Failed to CUDA malloc %ld bytes", nelem*sizeof(T)); - INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); + if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA malloc %ld bytes", nelem*ncclSizeOfT()); + INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT(), *ptr); return result; } #define ncclCudaMalloc(...) ncclCudaMallocDebug(__VA_ARGS__, __FILE__, __LINE__) @@ -170,21 +183,23 @@ ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, in cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - // Need a side stream so as not to interfere with graph capture. 
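The ncclSizeOfT() helper added at the top of this alloc.h diff exists because these allocators can be instantiated with T = void, and sizeof(void) is ill-formed in standard C++ (GCC only accepts it as an extension); routing every size computation through a helper that treats void as one byte keeps the byte arithmetic valid everywhere. A minimal self-contained restatement of the idea, using a local name rather than claiming NCCL's exact declaration:

#include <cstddef>

// sizeof(T) for concrete types, one byte for void so byte-count math still works.
template <typename T>
constexpr std::size_t sizeOfT() { return sizeof(T); }
template <>
constexpr std::size_t sizeOfT<void>() { return 1; }

static_assert(sizeOfT<int>() == sizeof(int), "unchanged for concrete types");
static_assert(sizeOfT<void>() == 1, "void treated as raw bytes");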
- cudaStream_t stream; - CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - if (ncclCuMemEnable()) { - NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish); - } else { - CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); + if (nelem > 0) { + // Need a side stream so as not to interfere with graph capture. + cudaStream_t stream; + CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + if (ncclCuMemEnable()) { + NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT()), result, finish); + } else { + CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT()), result, finish); + } + CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*ncclSizeOfT(), stream), result, finish); + CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish); + CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish); } - CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish); - CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish); - CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish); finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (*ptr == nullptr) WARN("Failed to CUDA calloc %ld bytes", nelem*sizeof(T)); - INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); + if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA calloc %ld bytes", nelem*ncclSizeOfT()); + INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT(), *ptr); return result; } #define ncclCudaCalloc(...) ncclCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__) @@ -195,16 +210,18 @@ ncclResult_t ncclCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (ncclCuMemEnable()) { - NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish); - } else { - CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); + if (nelem > 0) { + if (ncclCuMemEnable()) { + NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT()), result, finish); + } else { + CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT()), result, finish); + } + CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*ncclSizeOfT(), stream), result, finish); } - CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish); finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (*ptr == nullptr) WARN("Failed to CUDA calloc async %ld bytes", nelem*sizeof(T)); - INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); + if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA calloc async %ld bytes", nelem*ncclSizeOfT()); + INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT(), *ptr); return result; } #define ncclCudaCallocAsync(...) 
ncclCudaCallocAsyncDebug(__VA_ARGS__, __FILE__, __LINE__) @@ -230,7 +247,7 @@ ncclResult_t ncclCudaMemcpyAsync(T* dst, T* src, size_t nelem, cudaStream_t stre ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - CUDACHECKGOTO(cudaMemcpyAsync(dst, src, nelem*sizeof(T), cudaMemcpyDefault, stream), result, finish); + CUDACHECKGOTO(cudaMemcpyAsync(dst, src, nelem*ncclSizeOfT(), cudaMemcpyDefault, stream), result, finish); finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); return result; @@ -256,13 +273,17 @@ ncclResult_t ncclCudaFree(T* ptr) { // allocated on separate pages as those pages will be marked DONTFORK // and if they are shared, that could cause a crash in a child process inline ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) { - size_t page_size = sysconf(_SC_PAGESIZE); - void* p; - int size_aligned = ROUNDUP(size, page_size); - int ret = posix_memalign(&p, page_size, size_aligned); - if (ret != 0) return ncclSystemError; - memset(p, 0, size); - *ptr = p; + if (size > 0) { + size_t page_size = sysconf(_SC_PAGESIZE); + void* p; + int size_aligned = ROUNDUP(size, page_size); + int ret = posix_memalign(&p, page_size, size_aligned); + if (ret != 0) return ncclSystemError; + memset(p, 0, size); + *ptr = p; + } else { + *ptr = NULL; + } INFO(NCCL_ALLOC, "%s:%d Ib Alloc Size %ld pointer %p", filefunc, line, size, *ptr); return ncclSuccess; } diff --git a/src/include/bitops.h b/src/include/bitops.h new file mode 100644 index 000000000..95620cbe3 --- /dev/null +++ b/src/include/bitops.h @@ -0,0 +1,277 @@ +/************************************************************************* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_BITOPS_H_ +#define NCCL_BITOPS_H_ + +#include + +#if !__NVCC__ + #ifndef __host__ + #define __host__ + #endif + #ifndef __device__ + #define __device__ + #endif +#endif + +#define DIVUP(x, y) \ + (((x)+(y)-1)/(y)) + +#define ROUNDUP(x, y) \ + (DIVUP((x), (y))*(y)) + +#define ALIGN_POWER(x, y) \ + ((x) > (y) ? 
ROUNDUP(x, y) : ((y)/((y)/(x)))) + +#define ALIGN_SIZE(size, align) \ + size = ((size + (align) - 1) / (align)) * (align); + +template +__host__ __device__ constexpr Z divUp(X x, Y y) { + return (x+y-1)/y; +} + +template +__host__ __device__ constexpr Z roundUp(X x, Y y) { + return (x+y-1) - (x+y-1)%y; +} +template +__host__ __device__ constexpr Z roundDown(X x, Y y) { + return x - x%y; +} + +// assumes second argument is a power of 2 +template +__host__ __device__ constexpr Z alignUp(X x, int a) { + return (x + a-1) & Z(-a); +} +// assumes second argument is a power of 2 +template +__host__ __device__ constexpr Z alignDown(X x, int a) { + return x & Z(-a); +} + +template +inline __host__ __device__ int countOneBits(Int x) { +#if __CUDA_ARCH__ + if (sizeof(Int) <= sizeof(unsigned int)) { + return __popc((unsigned int)x); + } else if (sizeof(Int) <= sizeof(unsigned long long)) { + return __popcll((unsigned long long)x); + } else { + static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer size."); + return -1; + } +#else + if (sizeof(Int) <= sizeof(unsigned int)) { + return __builtin_popcount((unsigned int)x); + } else if (sizeof(Int) <= sizeof(unsigned long)) { + return __builtin_popcountl((unsigned long)x); + } else if (sizeof(Int) <= sizeof(unsigned long long)) { + return __builtin_popcountll((unsigned long long)x); + } else { + static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer size."); + return -1; + } +#endif +} + +// Returns index of first one bit or returns -1 if mask is zero. +template +inline __host__ __device__ int firstOneBit(Int mask) { + int i; +#if __CUDA_ARCH__ + if (sizeof(Int) <= sizeof(int)) { + i = __ffs((int)mask); + } else if (sizeof(Int) <= sizeof(long long)) { + i = __ffsll((long long)mask); + } else { + static_assert(sizeof(Int) <= sizeof(long long), "Unsupported integer size."); + } +#else + if (sizeof(Int) <= sizeof(int)) { + i = __builtin_ffs((int)mask); + } else if (sizeof(Int) <= sizeof(long)) { + i = __builtin_ffsl((long)mask); + } else if (sizeof(Int) <= sizeof(long long)) { + i = __builtin_ffsll((long long)mask); + } else { + static_assert(sizeof(Int) <= sizeof(long long), "Unsupported integer size."); + } +#endif + return i-1; +} + +template +inline __host__ __device__ int popFirstOneBit(Int* mask) { + Int tmp = *mask; + *mask &= *mask-1; + return firstOneBit(tmp); +} + +template +inline __host__ __device__ int log2Down(Int x) { + int w, n; +#if __CUDA_ARCH__ + if (sizeof(Int) <= sizeof(int)) { + w = 8*sizeof(int); + n = __clz((int)x); + } else if (sizeof(Int) <= sizeof(long long)) { + w = 8*sizeof(long long); + n = __clzll((long long)x); + } else { + static_assert(sizeof(Int) <= sizeof(long long), "Unsupported integer size."); + } +#else + if (x == 0) { + return -1; + } else if (sizeof(Int) <= sizeof(unsigned int)) { + w = 8*sizeof(unsigned int); + n = __builtin_clz((unsigned int)x); + } else if (sizeof(Int) <= sizeof(unsigned long)) { + w = 8*sizeof(unsigned long); + n = __builtin_clzl((unsigned long)x); + } else if (sizeof(Int) <= sizeof(unsigned long long)) { + w = 8*sizeof(unsigned long long); + n = __builtin_clzll((unsigned long long)x); + } else { + static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer size."); + } +#endif + return (w-1)-n; +} + +template +inline __host__ __device__ int log2Up(Int x) { + int w, n; + if (x != 0) x -= 1; +#if __CUDA_ARCH__ + if (sizeof(Int) <= sizeof(int)) { + w = 8*sizeof(int); + n = __clz((int)x); + } else if (sizeof(Int) <= sizeof(long long)) { 
+ w = 8*sizeof(long long); + n = __clzll((long long)x); + } else { + static_assert(sizeof(Int) <= sizeof(long long), "Unsupported integer size."); + } +#else + if (x == 0) { + return 0; + } else if (sizeof(Int) <= sizeof(unsigned int)) { + w = 8*sizeof(unsigned int); + n = __builtin_clz((unsigned int)x); + } else if (sizeof(Int) <= sizeof(unsigned long)) { + w = 8*sizeof(unsigned long); + n = __builtin_clzl((unsigned long)x); + } else if (sizeof(Int) <= sizeof(unsigned long long)) { + w = 8*sizeof(unsigned long long); + n = __builtin_clzll((unsigned long long)x); + } else { + static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer size."); + } +#endif + return w-n; +} + +template +inline __host__ __device__ Int pow2Up(Int x) { + return Int(1)< +inline __host__ __device__ Int pow2Down(Int x) { + return Int(1)< +inline __host__ UInt reverseSubBits(UInt x) { + if (nSubBits >= 16 && 8*sizeof(UInt) == nSubBits) { + switch (8*sizeof(UInt)) { + case 16: x = __builtin_bswap16(x); break; + case 32: x = __builtin_bswap32(x); break; + case 64: x = __builtin_bswap64(x); break; + default: static_assert(8*sizeof(UInt) <= 64, "Unsupported integer type."); + } + return reverseSubBits(x); + } else if (nSubBits == 1) { + return x; + } else { + UInt m = UInt(-1)/((UInt(1)<<(nSubBits/2))+1); + x = (x & m)<<(nSubBits/2) | (x & ~m)>>(nSubBits/2); + return reverseSubBits(x); + } +} + +template struct ncclToUnsigned; +template<> struct ncclToUnsigned { using type = unsigned char; }; +template<> struct ncclToUnsigned { using type = unsigned char; }; +template<> struct ncclToUnsigned { using type = unsigned char; }; +template<> struct ncclToUnsigned { using type = unsigned short; }; +template<> struct ncclToUnsigned { using type = unsigned short; }; +template<> struct ncclToUnsigned { using type = unsigned int; }; +template<> struct ncclToUnsigned { using type = unsigned int; }; +template<> struct ncclToUnsigned { using type = unsigned long; }; +template<> struct ncclToUnsigned { using type = unsigned long; }; +template<> struct ncclToUnsigned { using type = unsigned long long; }; +template<> struct ncclToUnsigned { using type = unsigned long long; }; + +// Reverse the bottom nBits bits of x. The top bits will be overwritten with 0's. +template +inline __host__ __device__ Int reverseBits(Int x, int nBits) { + using UInt = typename ncclToUnsigned::type; + union { UInt ux; Int sx; }; + sx = x; + #if __CUDA_ARCH__ + if (sizeof(Int) <= sizeof(unsigned int)) { + ux = __brev(ux); + } else if (sizeof(Int) <= sizeof(unsigned long long)) { + ux = __brevll(ux); + } else { + static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer type."); + } + #else + ux = reverseSubBits(ux); + #endif + ux = nBits==0 ? 0 : ux>>(8*sizeof(UInt)-nBits); + return sx; +} + +//////////////////////////////////////////////////////////////////////////////// +// Custom 8 bit floating point format for approximating 32 bit uints. This format +// has nearly the full range of uint32_t except it only keeps the top 3 bits +// beneath the leading 1 bit and thus has a max value of 0xf0000000. + +inline __host__ __device__ uint32_t u32fpEncode(uint32_t x, int bitsPerPow2) { + int log2x; + #if __CUDA_ARCH__ + log2x = 31-__clz(x|1); + #else + log2x = 31-__builtin_clz(x|1); + #endif + uint32_t mantissa = x>>(log2x >= bitsPerPow2 ? log2x-bitsPerPow2 : 0) & ((1u<= bitsPerPow2 ? 
log2x-(bitsPerPow2-1) : 0; + return exponent<>bitsPerPow2; + uint32_t mantissa = (x & ((1u< ncclResult_t initChannel(struct ncclComm* comm, int channelid); ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share); ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share); ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks); -static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int coll, int*channelBase) { - int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2; - int peerNode = comm->rankToNode[peer]; - int peerIndex = comm->rankToLocalRank[peer]; - int nsteps = comm->maxLocalRanks; - int rankIndex = comm->rankToLocalRank[comm->rank]; - int step, delta; - if (coll == ncclFuncSend) { - step = (nsteps + peerIndex - rankIndex)%nsteps; - delta = (comm->nNodes + peerNode - comm->node) % comm->nNodes; - } else if (coll == ncclFuncRecv) { - step = (nsteps + rankIndex - peerIndex)%nsteps; - delta = (comm->nNodes + comm->node - peerNode) % comm->nNodes; + +inline uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2pRound) { + if (comm->nNodes > 1) { + int nodeDelta = p2pRound/comm->maxLocalRanks; + int localDelta = p2pRound%comm->maxLocalRanks; + int base = nodeDelta*divUp(comm->maxLocalRanks, NCCL_MAX_DEV_WORK_P2P_PER_BATCH); + base += localDelta/NCCL_MAX_DEV_WORK_P2P_PER_BATCH; + return base & 0xff; } else { - return ncclInternalError; + return p2pRound & 0xff; } - *channelBase = comm->nNodes > 1 ? delta+(step/p2pGroupSize) : step; - return ncclSuccess; -} - -static ncclResult_t ncclChannelComputeFromBase(struct ncclComm* comm, int base, int channelInc, int*channelId) { - //*channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels; - *channelId = (comm->p2pChannels[base%comm->p2pnChannels]+channelInc) % comm->p2pnChannels; - return ncclSuccess; -} - -static ncclResult_t ncclChannelCompute(struct ncclComm* comm, int peer, int channelInc, int coll, int*channelId) { - int base; - NCCLCHECK(ncclChannelComputeBase(comm, peer, coll, &base)); - NCCLCHECK(ncclChannelComputeFromBase(comm, base, channelInc, channelId)); - return ncclSuccess; } #endif diff --git a/src/include/checks.h b/src/include/checks.h index c9fd16176..89355c3da 100644 --- a/src/include/checks.h +++ b/src/include/checks.h @@ -123,23 +123,23 @@ } while (0); #define NCCLWAIT(call, cond, abortFlagPtr) do { \ - volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ + uint32_t* tmpAbortFlag = (abortFlagPtr); \ ncclResult_t RES = call; \ if (RES != ncclSuccess && RES != ncclInProgress) { \ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ return ncclInternalError; \ } \ - if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \ + if (__atomic_load(tmpAbortFlag, __ATOMIC_ACQUIRE)) NEQCHECK(*tmpAbortFlag, 0); \ } while (!(cond)); #define NCCLWAITGOTO(call, cond, abortFlagPtr, RES, label) do { \ - volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ + uint32_t* tmpAbortFlag = (abortFlagPtr); \ RES = call; \ if (RES != ncclSuccess && RES != ncclInProgress) { \ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ goto label; \ } \ - if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \ + if (__atomic_load(tmpAbortFlag, __ATOMIC_ACQUIRE)) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \ } while (!(cond)); #define NCCLCHECKTHREAD(a, args) do { \ diff --git a/src/include/collectives.h 
b/src/include/collectives.h index 888df728f..fb7af3bff 100644 --- a/src/include/collectives.h +++ b/src/include/collectives.h @@ -8,6 +8,8 @@ #define NCCL_COLLECTIVES_H_ #include "nccl.h" +#include "nccl_common.h" +#include "device.h" // CHUNKSIZE must be a multiple of SLICESIZE #define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4) @@ -22,6 +24,12 @@ #define REDUCE_CHUNKSTEPS 1 #define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above +const char* ncclFuncToString(ncclFunc_t op); +const char* ncclDevRedOpToString(ncclDevRedOp_t op); +const char* ncclDatatypeToString(ncclDataType_t type); +const char* ncclAlgoToString(int algo); +const char* ncclProtoToString(int proto); + inline int ncclTypeSize(ncclDataType_t type) { switch (type) { case ncclInt8: diff --git a/src/include/comm.h b/src/include/comm.h index 0ba913ada..0cc0a8911 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -7,7 +7,7 @@ #ifndef NCCL_COMM_H_ #define NCCL_COMM_H_ -#include "transport.h" +//#include "transport.h" #include "p2p.h" #include "collectives.h" #include "nccl_tuner.h" @@ -15,6 +15,7 @@ #include "strongstream.h" #include "nccl_net.h" #include "register.h" +#include "graph.h" #if CUDART_VERSION < 9000 struct cudaLaunchParams { @@ -144,7 +145,7 @@ struct ncclChannel { struct ncclNvls nvls; int id; // index of this channel - uint32_t workFifoSent; // last used work index+1 + uint32_t workFifoProduced; // +1 successor of last used work fifo byte /* comm split sharable resources */ struct ncclChannelPeer* collnetPeers; @@ -153,22 +154,15 @@ struct ncclChannel { struct ncclDevChannelPeer* nvlsDevPeers; }; -struct ncclWorkList { - struct ncclWorkList* next; - struct ncclWork work; -}; - -struct ncclPointerList { - struct ncclPointerList* next; - void *ptr; +struct ncclWorkBatchList { + struct ncclWorkBatchList* next; + struct ncclDevWorkBatch batch; }; - -struct ncclNvlsMcHandleList { - struct ncclNvlsMcHandleList *next; - CUmemGenericAllocationHandle mcHandle; - CUdeviceptr ptr; - int dev; - size_t size; +struct alignas(16) ncclWorkList { + struct ncclWorkList* next; + enum ncclDevWorkType workType; + int size; // Size of struct following this node + // ncclDevWorkColl, ncclDevWorkColLReg, ncclDevWorkP2p[]... }; struct ncclCollnetHandleList { @@ -188,33 +182,190 @@ struct ncclKernelPlan { struct ncclKernelPlan* next; bool persistent; // aka captured in a graph + enum ncclDevWorkStorageType workStorageType; bool kernelSpecialized; void *kernelFn; - int channelUbound; // only channels c < channelUbound are present - int channelCount; // number of channels present - uint64_t channelMask; // which channels are present, channelCount == popcount(channelMask) + struct ncclDevKernelArgs* kernelArgs; + size_t kernelArgsSize; + uint64_t channelMask; // bitset of which channels are present bool hasProxyOps; // does any channel have a non-empty proxyOpQueue int threadPerBlock; - // workHeap fields are null until uploadWorkFifo() or preparePersistentKernel() - struct ncclWork* workHead; - int collOpCount; // zero based for this plan + int collOpCount; // Number of collectives in this plan. + int nWorkBatches; // Number of work batches. + size_t workBytes; // Sum size of all work (in the fifo) in bytes. 
+ struct ncclIntruQueue workQueue; + struct ncclIntruQueue cleanupQueue; + void* workBufPersistent; - struct ncclIntruQueue ipcMemQueue; - struct ncclIntruQueue nvlsMcHandleQueue; - struct ncclIntruQueue collnetHandleQueue; + struct ncclIntruQueue proxyOpQueue; +}; - struct Channel { - int nWork; - union { - int nWorkElem; // used for coll and reg coll - int p2pTailElem[2]; // used for p2p, indexed by ncclWorkElemP2pType-1 - }; - size_t collBytes; - struct ncclIntruQueue workQueue; - struct ncclIntruQueue proxyOpQueue; - } channels[MAXCHANNELS]; - size_t maxBytesPerChannel; +//////////////////////////////////////////////////////////////////////////////// + +struct ncclTaskColl { + struct ncclTaskColl* next; + ncclFunc_t func; + void const* sendbuff; + void* recvbuff; + size_t count; + int root; + ncclDataType_t datatype; + ncclRedOp_t opHost; + struct ncclDevRedOpFull opDev; + int chunkSteps, sliceSteps; + // Computed later: + size_t trafficBytes; + int32_t nMaxChannels:8; + int32_t nWarps:8; + int32_t algorithm:8, protocol:8; + uint32_t isCollnet:1, isNvls:1; + uint32_t devFuncId:30; + enum ncclRegBufferType regBufType; + // number of elements in planner->ipcMemQueue associated with this collective + int nCleanupQueueElts; + + void* sendMhandle; + void* recvMhandle; +}; +struct ncclTaskP2p { + struct ncclTaskP2p* next; + void* buff; + size_t bytes; +}; + +//////////////////////////////////////////////////////////////////////////////// +// Roughly sorts ncclTaskColl's by their size descending. This structure is +// self-referential, meaning that pointers it contains internally may point +// into the structure itself. This means that it is NOT memcpy-moveable: + +struct ncclTaskCollSorter { + static constexpr int UnitLog2 = 10; // 1K + static constexpr size_t UnitSize = 1<>UnitLog2, BitsPerPow2); + bin = BinCount-1 - bin; // descending bin + + if (me->bins[bin] == nullptr) { + if (me->binEdge <= bin) { + me->binEdge = bin+1; + me->bins[bin] = me->tail ? &me->tail->next : &me->head; + me->tail = x; + } else { + // Find successor non-empty bin after this one. + int succ = bin+1; + while (me->bins[succ] == nullptr) succ++; + // What was our successor's head's previous is now our head's previous. + me->bins[bin] = me->bins[succ]; + // The first node we insert is our tail, so that becomes our successor's + // head's new previous. + me->bins[succ] = &x->next; + } + } + // Push a new head for this bin. + x->next = *me->bins[bin]; + *me->bins[bin] = x; +} + +inline bool ncclTaskCollSorterEmpty(struct ncclTaskCollSorter* me) { + return me->head == nullptr; +} + +// Reset sorter and return sorted linked list of its coll tasks. 
+inline struct ncclTaskColl* ncclTaskCollSorterDequeueAll(struct ncclTaskCollSorter* me) { + struct ncclTaskColl* head = me->head; + if (head != nullptr) memset(me, 0, sizeof(*me)); + return head; +} + +//////////////////////////////////////////////////////////////////////////////// + +struct ncclCudaStreamList { + struct ncclCudaStreamList *next; + cudaStream_t stream; +}; + +struct ncclKernelPlanner { + ////////////////////////////////////////////////////////////////////////////// + // State for accumulating tasks between ncclGroupStart/End() + ////////////////////////////////////////////////////////////////////////////// + + struct Peer { + bool sendSeen, recvSeen; + struct ncclIntruQueue sendQueue; + struct ncclIntruQueue recvQueue; + }; + struct ncclTaskCollSorter collSorter; + struct Peer* peers/*[nRanks]*/; + int nTasksColl, nTasksP2p; + bool persistent; + + // The list of user streams aggregated over all tasks present. + struct ncclCudaStreamList* streams; + // The most recent user stream. Ignored if streams==nullptr + cudaStream_t streamRecent; + // The graph capturing all user streams or invalid if none. Thus we restrict the + // user that all streams must be captured in the same graph or not captured + // at all. Technically we could probably relax this, but that would mean + // collecting a different `ncclTasks` per graph and one for non-graph. + struct ncclCudaGraph capturingGraph; + + ////////////////////////////////////////////////////////////////////////////// + // Lists of tasks to be assembled into plans. + ////////////////////////////////////////////////////////////////////////////// + + struct ncclIntruQueue collTaskQueue; + struct ncclIntruQueue collWorkQueue; + struct ncclIntruQueue collCleanupQueue; + + ////////////////////////////////////////////////////////////////////////////// + // State for building current (Work-In-Progress) plan: + ////////////////////////////////////////////////////////////////////////////// + + struct WipPlan { + struct Channel { + struct { + int workBytes; // Sum size of work metadata referenced by this batch. + int nP2ps; // Number of p2p works in this batch + int p2pRounds[NCCL_MAX_DEV_WORK_P2P_PER_BATCH]; // which rounds are present in this batch. + } wipBatch; // work-in-progress batch which will be next tail of workBatchQueue + int nWorkBatchesP2p; // number of p2p batches for this channel. + struct ncclIntruQueue workBatchQueue; + struct ncclIntruQueue proxyOpQueue; + } channels[MAXCHANNELS]; + } wipPlan; + + ////////////////////////////////////////////////////////////////////////////// + // State for launching built plans: + ////////////////////////////////////////////////////////////////////////////// + + // List of kernel plans built form tasks. + struct ncclIntruQueue planQueue; + // First of the unlaunched kernels in `planQueue` + struct ncclKernelPlan* unlaunchedPlansHead; }; #define NCCL_MAGIC 0x0280028002800280 // Nickel atomic number is 28. @@ -233,12 +384,18 @@ struct ncclComm { struct ncclPeerInfo* peerInfo; struct ncclTopoSystem* topo; + int netPluginLoaded; ncclNet_t* ncclNet; + ncclNetDeviceType netDeviceType; ncclCollNet_t* ncclCollNet; void* bootstrap; // Bitmasks for ncclTransportP2pSetup uint64_t* connectSend; uint64_t* connectRecv; + struct ncclTopoGraph graphs[NCCL_NUM_ALGORITHMS]; + bool initAlgoChannels[NCCL_NUM_ALGORITHMS]; + bool runtimeConn; // if dynamic connection is supported + int cuMemSupport; uint64_t magic; // Magic number for all network communication. 
Not a security key -- only goal is to detect mismatches. @@ -253,6 +410,9 @@ struct ncclComm { cpu_set_t cpuAffinity; // CPU affinity of the GPU int cudaArch; // matches __CUDA_ARCH__ of device + int cpuArch; // architecture - As defined in src/include/graph.h, e.g. x86/arm/ppc/mixed + int cpuVendor; // vendor - As defined in src/include/graph.h + int node; int nNodes; int localRank; @@ -278,10 +438,11 @@ struct ncclComm { int nChannels; // connection nChannels int collChannels; // enqueue nChannels int nvlsChannels; // enqueue nChannels + // all nvls heads stored to check if we can splitShare + int nvlsHeads[MAXCHANNELS]; // Channels (per peer) for p2p int p2pnChannels; int p2pnChannelsPerPeer; - int p2pChannels[MAXCHANNELS]; // Should this comm allocate LL buffers for network P2P connections? bool allocP2pNetLLBuffers; @@ -303,23 +464,28 @@ struct ncclComm { ncclResult_t asyncResult; // Flag to ask NCCL kernels to abort - volatile uint32_t *abortFlag; - volatile uint32_t *childAbortFlag; - uint32_t *abortFlagRefCount; + uint32_t* abortFlag; + uint32_t* abortFlagDev; + int* abortFlagRefCount; + uint32_t* childAbortFlag; + uint32_t* childAbortFlagDev; + uint32_t destroyFlag; // Device side of the communicator (for cudaFree's) struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm - // Operation pool. - int workFifoDepth; // size of workFifoHeap[], power of 2 - struct ncclWork* workFifoHeap; - struct ncclWork* devWorkFifoHeap; - void* workFifoHeapGdrHandle; + uint32_t workArgsBytes; // max size of kernel args + uint32_t workFifoBytes; // size of workFifoBuf, power of 2 + void* workFifoBuf; + void* workFifoBufDev; + void* workFifoBufGdrHandle; - // Work completion notificaion - uint32_t* workFifoDone/*[MAXCHANNELS]*/; // in cudaHost memory - uint32_t workFifoSent; // Monotonic (mod 1<<32) index of next unused fifo slot. - uint32_t workFifoAckdMin; // Monotonic index of least unprocessed fifo slot over all channels. + // Monotonic number of bytes (mod 1<<32) consumed per channel. In cudaHost memory. + uint32_t* workFifoConsumed/*[MAXCHANNELS]*/; + // Last observed value of: min(workFifoConsumed[c] for c < MAXCHANNELS) + uint32_t workFifoConsumedLeast; + // Monotonic number of bytes (mod 1<<32) sent to fifo. + uint32_t workFifoProduced; // Intra-process sync struct ncclComm* intraComm0; // leader of intra-process comms (self possible) @@ -337,7 +503,7 @@ struct ncclComm { // Whether this communicator uses collNet int collNetSupport; bool collNetRegSupport; - uint8_t collNetSupportMatrix[4/*sum,prod,min,max*/][ncclNumTypes]; + uint8_t collNetSupportMatrix[4/*sum,prod,max,min*/][ncclNumTypes]; int intraHighestTransportType; int* collNetHeads; int collNetHeadsNum; @@ -355,16 +521,16 @@ struct ncclComm { // pools backed by comm->memPermanent struct ncclMemoryPool memPool_ncclProxyOp; struct ncclMemoryPool memPool_ncclKernelPlan; - struct ncclMemoryPool memPool_ncclPointerList; - struct ncclMemoryPool memPool_ncclNvlsHandleList; - struct ncclMemoryPool memPool_ncclCollnetHandleList; + // Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when // this comm is not yet in a group. struct ncclComm* groupNext; // Subset of those in groupNext list. Holds 0x1 if not needing preconnect. 
struct ncclComm* preconnectNext; int persistentRefs; // number of persistent plan-lists capturing this comm - struct ncclTasks tasks; + struct P2pSchedulePair { int sendRank; int recvRank; } *p2pSchedule; + + struct ncclKernelPlanner planner; // user-created reduction ops int userRedOpCapacity, userRedOpFreeHead; @@ -373,11 +539,6 @@ struct ncclComm { // Queue of things for the main thread to do struct ncclIntruQueueMpsc callbackQueue; - // List of kernel plans built form tasks. - struct ncclIntruQueue planQueue; - // First of the unlaunched kernels in `planQueue` - struct ncclKernelPlan* unlaunchedPlansHead; - ncclConfig_t config; // initState is to more conveniently reclaim resources when errors happen. ncclResult_t initState; @@ -389,6 +550,7 @@ struct ncclComm { struct ncclGroupJob *groupJob; // Tuning plugin + int tunerPluginLoaded; ncclTuner_t* tuner; void *tunerContext; // buffer registration cache diff --git a/src/include/cudawrap.h b/src/include/cudawrap.h index b4eb5c312..fd7b0310e 100644 --- a/src/include/cudawrap.h +++ b/src/include/cudawrap.h @@ -80,6 +80,10 @@ DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent); DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent); DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice); DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute); +DECLARE_CUDA_PFN_EXTERN(cuLaunchKernel); +#if CUDART_VERSION >= 11080 +DECLARE_CUDA_PFN_EXTERN(cuLaunchKernelEx); +#endif // cuMem API support DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve); DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree); diff --git a/src/include/debug.h b/src/include/debug.h index eb5189058..491ac3e12 100644 --- a/src/include/debug.h +++ b/src/include/debug.h @@ -10,21 +10,14 @@ #include "nccl.h" #include "nccl_common.h" #include -#include -#include -#include -#include #include // Conform to pthread and NVTX standard #define NCCL_THREAD_NAMELEN 16 extern int ncclDebugLevel; -extern uint64_t ncclDebugMask; -extern pthread_mutex_t ncclDebugLock; extern FILE *ncclDebugFile; -extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim); void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6))); @@ -32,13 +25,13 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file extern thread_local int ncclDebugNoWarn; extern char ncclLastError[]; +#define VERSION(...) ncclDebugLog(NCCL_LOG_VERSION, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) #define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) #define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) #define TRACE_CALL(...) ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__) #ifdef ENABLE_TRACE #define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__) -extern std::chrono::steady_clock::time_point ncclEpoch; #else #define TRACE(...) 
#endif diff --git a/src/include/device.h b/src/include/device.h index 50f841bfc..76a909f7a 100644 --- a/src/include/device.h +++ b/src/include/device.h @@ -9,8 +9,10 @@ #include "nccl.h" #include "nccl_common.h" -#include "align.h" +#include "bitops.h" +#include #include +#include extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS]; @@ -21,6 +23,12 @@ extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS]; #define NCCL_MAX_OPS 2048 #define NCCL_STEPS 8 +#ifdef __CUDA_ARCH__ + #define NCCL_CUDA_ARCH __CUDA_ARCH__ +#else + #define NCCL_CUDA_ARCH 0 +#endif + #include "net_device.h" enum ncclDevRedOp_t { @@ -52,8 +60,11 @@ union ncclLLFifoLine { #define WARP_SIZE 32 #define MAXCHANNELS 32 +#define NCCL_MAX_LOCAL_RANKS 64 #define NCCL_MAX_NTHREADS 640 +#define NCCL_MIN_NTHREADS (4*WARP_SIZE) #define NCCL_SIMPLE_MAX_NTHREADS 512 +#define NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE (3*WARP_SIZE) #define NCCL_LL_MAX_NTHREADS 512 #define NCCL_LL_LINES_PER_THREAD 8 #ifdef TEST_LL_CLEANUP @@ -84,6 +95,9 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK #define NCCL_IPC_READ 0x10 #define NCCL_NVLS_MIN_POLL 0x20 +// Number of named barriers supported by CUDA +#define NCCL_MAX_GROUPS 16 + #define NCCL_MAX_COLLNET_SIZE (1L << 29) enum ncclRegBufferType { @@ -196,112 +210,155 @@ struct ncclChannelPeer { struct ncclDevComm; -/* ncclWork is to be a power of two, currently 8x64 bytes, */ -/* to make sure reads to host from the CUDA kernel are aligned. */ -/* Make sure to adjust padding at the end of ncclWorkElem. */ -#define NCCL_WORK_SIZE 512 - -enum ncclWorkType : uint8_t { - ncclWorkTypeUnused=0, - ncclWorkTypeColl=1, - ncclWorkTypeP2p=2, - ncclWorkTypeRegColl=3 -}; -enum ncclWorkP2PType : uint8_t { - ncclWorkP2pTypeUnused=0, - ncclWorkP2pTypeSend, - ncclWorkP2pTypeRecv +struct alignas(16) ncclDevWorkP2p { + void *sendAddr, *recvAddr; + size_t sendBytes, recvBytes; + int sendRank, recvRank; + // From the part index, nP2pChannels, and channelBase the device code can + // calculate which part of the transfer a channel is responsible for. + uint8_t nP2pChannels; // Always equal to comm->p2pnChannels + uint8_t channelBase; // Channel owning first part. + // Zero channels indicates no work in that direction. + uint8_t nSendChannels, nRecvChannels; + // Chunk size stored in 8 bits via u32fp8Encode/Decode. + uint8_t sendChunkSize_u32fp8, recvChunkSize_u32fp8; + + uint8_t sendProtoLL:1, recvProtoLL:1; + uint8_t sendRegistered:1, recvRegistered:1; }; -struct ncclWorkHeader { - union { - int32_t workNext; // when isLast=0: Offset from kernel argument workHead - uint32_t doneAcks; // when isLast=1: Monotonic (mod 1<<32) ack value to send back. - }; - uint16_t funcIndex; - uint8_t isLast:1; // last work for this kernel - uint8_t inFifo:1; // is this work in the fifo - enum ncclWorkType type; -}; +// Compute the subset of the data transfer corresponding to the given part index. 
+inline __host__ __device__ void ncclP2pPartBounds(int nParts, int part, size_t bytes, size_t* partBeg, size_t* partEnd) { + size_t partBytes = alignUp(divUp(bytes, nParts), 4<<10); + #if __CUDA_ARCH__ + *partBeg = min((part+0)*partBytes, bytes); + *partEnd = min((part+1)*partBytes, bytes); + #else + *partBeg = std::min((part+0)*partBytes, bytes); + *partEnd = std::min((part+1)*partBytes, bytes); + #endif +} -struct ncclWorkElem { - union { - uint8_t flagBits; - struct { - uint8_t isUsed:1, redOpArgIsPtr:1, oneNode:1; - }; - }; - uint8_t regUsed; - uint8_t nWarps; - uint8_t direct; - uint32_t root; - const void *sendbuff; - void *recvbuff; +// implemented in channel.h +inline __host__ uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2pRound); - size_t count; - uint64_t redOpArg; - uint64_t chunkCount:25, workCount:39; +// ncclP2pChannelToPart and ncclP2pChannelForPart are inverses. The device code +// uses ncclP2pChannelToPart to determine which part "this" channel is responsible for. +inline __host__ int ncclP2pChannelForPart(int nP2pChannels, int base, int part) { + // Only works because nP2pChannels is pow2 + int nChannelsLog2 = countOneBits(nP2pChannels-1); + int delta = reverseBits(part, nChannelsLog2); + return (base + delta) & (nP2pChannels-1); +} +inline __device__ int ncclP2pChannelToPart(int nP2pChannels, int base, int channel) { + // Only works because nP2pChannels is pow2 + int nChannelsLog2 = countOneBits(nP2pChannels-1); + int delta = (channel-base) & (nP2pChannels-1); + return reverseBits(delta, nChannelsLog2); +} + +struct alignas(16) ncclDevWorkColl { + // Running on channels [channelLo..channelHi], hi is inclusive. + // nChannels == (channelHi - channelLo) + 1 + uint32_t channelLo:8, channelHi:8; + uint32_t nWarps:8; + uint32_t redOpArgIsPtr:1, regUsed:2, oneNode:1, direct:4; + uint32_t root; + void* recvbuff; + void* sendbuff; union { + // Continuous-byte-distribution scheduling. The lo and hi channels are of + // different size than the channels in the middle. struct { - uint64_t lastChunkCount:25; - uint64_t workOffset:39; - }; + size_t countLo, countMid, countHi; + // Chunk counts where units are ncclProtoGrainSize(protocol) bytes + uint64_t chunkGrainsLo:21, chunkGrainsMid:21, chunkGrainsHi:21; + } cbd; + // Collnet scheduling. All channels divide work evenly. struct { - uint64_t bid:32; - uint64_t nChannels:32; - }; + size_t count; // Total size, not divided per channel. + uint32_t chunkCount; + } collnet; }; + uint64_t redOpArg; }; -#define NCCL_MAX_WORK_ELEMENTS ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElem)))/sizeof(ncclWorkElem)) -static_assert(NCCL_MAX_WORK_ELEMENTS == 9, "Sanity check: NCCL_MAX_WORK_ELEMENTS == 9"); - -struct ncclWorkElemP2p { - int peer : 30; - int proto : 2; - - enum ncclWorkP2PType p2pType; - uint8_t reg:1; - uint8_t nWarps:5; - uint8_t warpStart; - uint8_t ngroups; - // Important not to use any fields with greater than 4-byte alignment since - // we need sizeof(ncclWorkElemP2p)==28, but that would be padded up to 32 if - // there were 8-byte fields. 
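// Illustrative check, not part of this patch: ncclP2pChannelForPart and
// ncclP2pChannelToPart above are inverses for any power-of-two nP2pChannels.
// sketchPopcount/sketchReverseBits below are local stand-ins for the bitops.h
// countOneBits/reverseBits helpers; the real implementations may differ.
#include <cassert>
static int sketchPopcount(unsigned v) { int n = 0; while (v) { n += v & 1; v >>= 1; } return n; }
static int sketchReverseBits(int x, int nBits) {
  int r = 0;
  for (int i = 0; i < nBits; i++) r |= ((x >> i) & 1) << (nBits - 1 - i);
  return r;
}
static void checkP2pChannelMapping(int nP2pChannels /*pow2*/, int base) {
  int nLog2 = sketchPopcount(nP2pChannels - 1);
  for (int part = 0; part < nP2pChannels; part++) {
    int channel = (base + sketchReverseBits(part, nLog2)) & (nP2pChannels - 1); // ...ForPart
    int delta = (channel - base) & (nP2pChannels - 1);
    assert(sketchReverseBits(delta, nLog2) == part);                            // ...ToPart
  }
}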
- //void* buff; - uint32_t buffHi32, buffLo32; // buff = buffHi32<<32 | buffLo32; - //size_t count; - uint32_t countHi32, countLo32; // count = countHi32<<32 | countLo32; - int chunkSize; -}; -static_assert(((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemP2p)))/sizeof(ncclWorkElemP2p)) >= 16, "Sanity check: NCCL_MAX_WORK_ELEMENTS_P2P == 16"); -#define NCCL_MAX_WORK_ELEMENTS_P2P 16 +__host__ __device__ constexpr int ncclProtoGrainSize(int proto) { + return proto == NCCL_PROTO_LL ? 16 : + proto == NCCL_PROTO_LL128 ? WARP_SIZE*NCCL_LL128_SHMEM_ELEMS_PER_THREAD/NCCL_LL128_LINEELEMS*NCCL_LL128_DATAELEMS*sizeof(uint64_t) : + proto == NCCL_PROTO_SIMPLE ? 512 : + -1; +} + +template +__host__ __device__ inline void ncclCollCbdPart( + struct ncclDevWorkColl* work, uint32_t channelId, int proto, int eltSize, + Int* count, Int* partOffset, Int* partCount, Int* chunkCount + ) { + int eltPerGrain = ncclProtoGrainSize(proto)/eltSize; + int nMidChannels = work->channelHi - work->channelLo - 1; + // We can assum that nMidChannels<0 implies countMid==0, which let's us assume + // that countMid*nMidChannels == 0. + if (count != nullptr) { + *count = work->cbd.countLo + work->cbd.countMid*nMidChannels + work->cbd.countHi; + } + if (channelId == work->channelLo) { + *partOffset = 0; + *partCount = work->cbd.countLo; + *chunkCount = work->cbd.chunkGrainsLo*eltPerGrain; + } else if (channelId == work->channelHi) { + *partOffset = work->cbd.countLo + nMidChannels*work->cbd.countMid; + *partCount = work->cbd.countHi; + *chunkCount = work->cbd.chunkGrainsHi*eltPerGrain; + } else { + int mid = channelId - work->channelLo - 1; + *partOffset = work->cbd.countLo + mid*work->cbd.countMid; + *partCount = work->cbd.countMid; + *chunkCount = work->cbd.chunkGrainsMid*eltPerGrain; + } +} -struct ncclWorkElemReg { - struct ncclWorkElem elem; +struct alignas(16) ncclDevWorkCollReg { + struct ncclDevWorkColl coll; void* dnInputs[NCCL_MAX_DIRECT_ARITY+1]; void* dnOutputs[NCCL_MAX_DIRECT_ARITY+1]; void* upOutputs[NCCL_MAX_DIRECT_ARITY+1]; }; -#define NCCL_MAX_WORK_ELEMENTS_REG ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemReg)))/sizeof(ncclWorkElemReg)) -static_assert(NCCL_MAX_WORK_ELEMENTS_REG == 2, "Sanity check: NCCL_MAX_WORK_ELEMENTS_REG == 2"); +enum ncclDevWorkType: uint8_t { + ncclDevWorkTypeP2p, + ncclDevWorkTypeColl, + ncclDevWorkTypeCollReg +}; -// Number of named barriers supported by CUDA -#define NCCL_MAX_GROUPS 16 +constexpr size_t ncclDevWorkSize(enum ncclDevWorkType type) { + return type == ncclDevWorkTypeP2p ? sizeof(ncclDevWorkP2p) : + type == ncclDevWorkTypeColl ? sizeof(ncclDevWorkColl) : sizeof(ncclDevWorkCollReg); +} -struct ncclWork { - struct ncclWorkHeader header; +#define NCCL_MAX_DEV_WORK_BATCH_BYTES 1024 +#define NCCL_MAX_DEV_WORK_BATCH_COLLS (NCCL_MAX_DEV_WORK_BATCH_BYTES/sizeof(ncclDevWorkColl)) +#define NCCL_MAX_DEV_WORK_P2P_PER_BATCH 8 +struct alignas(16) ncclDevWorkBatch { union { - char pad[NCCL_WORK_SIZE - sizeof(struct ncclWorkHeader)]; - struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS]; - struct ncclWorkElemP2p p2pElems[NCCL_MAX_WORK_ELEMENTS_P2P]; - struct ncclWorkElemReg regElems[NCCL_MAX_WORK_ELEMENTS_REG]; + struct { + // nextExtends: should next one be merged into this one. 
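// Worked example, not part of this patch: the continuous-byte-distribution (CBD)
// layout consumed by ncclCollCbdPart above. The lo and hi channels carry countLo
// and countHi elements, every middle channel carries countMid, so the per-channel
// pieces tile the total exactly. Assumes at least two channels (channelHi > channelLo);
// the helper name is hypothetical.
#include <cassert>
#include <cstddef>
static void cbdSanityCheck(int channelLo, int channelHi,
                           size_t countLo, size_t countMid, size_t countHi) {
  int nMidChannels = channelHi - channelLo - 1;
  size_t total = countLo + (size_t)nMidChannels*countMid + countHi;
  size_t covered = 0;
  for (int c = channelLo; c <= channelHi; c++) {
    if (c == channelLo)      covered += countLo;
    else if (c == channelHi) covered += countHi;
    else                     covered += countMid;
  }
  assert(covered == total);   // every element lands on exactly one channel
}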
+ // nextJump=0: end of this channel's batch list + // nextJump>0: batches[thisIndex+nextJump] is next batch in this list + uint32_t nextJump:14, nextExtends:1; + uint32_t workType:2, funcId:15; + }; + // Unioning bitfields with underlying type hints compiler to emit the best + // SASS LD/ST accesses. + uint32_t flags; }; + // Rolling offset in fifo where this batch's work structs begin + uint32_t offsetBase; + // Set of relative offsets from offsetBase for this channel's subset of the batch: + // For each bit index i in offsetMask, find work at fifo offset: offsetBase + i*sizeof(WorkStructType) + uint64_t offsetBitset; }; -static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "Sanity check: sizeof(struct ncclWork) == NCCL_WORK_SIZE"); -static_assert(sizeof(struct ncclWork)%16 == 0, "Sanity check: sizeof(struct ncclWork)%16 == 0"); struct ncclDevChannelPeer { // Stripped version of ncclChannelPeer where we only keep the ncclConnInfo @@ -328,9 +385,8 @@ struct ncclDevComm { int buffSizes[NCCL_NUM_PROTOCOLS]; int p2pChunkSize; - // Operation list for aggregation - int workFifoDepth; - struct ncclWork* workFifoHeap; // may be cudaHost or GDR memory + // Work fifo return credits + uint32_t* workConsumed/*[MAXCHANNELS]*/; int* collNetDenseToUserRank; @@ -346,11 +402,37 @@ struct alignas(16) ncclDevCommAndChannels { struct ncclDevChannel channels[MAXCHANNELS]; }; -#ifdef __CUDA_ARCH__ - #define NCCL_CUDA_ARCH __CUDA_ARCH__ -#else - #define NCCL_CUDA_ARCH 0 -#endif +enum ncclDevWorkStorageType: uint8_t { + ncclDevWorkStorageTypeArgs=0, + ncclDevWorkStorageTypeFifo=1, + ncclDevWorkStorageTypePersistent=2 +}; + +struct alignas(16) ncclDevKernelArgs { + struct ncclDevComm* comm; + uint64_t channelMask; + enum ncclDevWorkStorageType workStorageType; + uint32_t workMask; + void* workBuf; + // A channel's first batch is at `blockIdx.x`. Use `nextJump` to follow rest of list. + // struct ncclDevWorkBatch batches[]; +}; + +__host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) { + //return (cudaArch < 700 || cudaDriver < 12010) ? 4<<10 : (32<<10)-4; + return 4<<10; +} + +template +struct alignas(16) ncclDevKernelArgsStorage { + union { + struct ncclDevKernelArgs args; + ulong2 storage[capacity/sizeof(ulong2)]; + }; +}; + +typedef ncclDevKernelArgsStorage<(4<<10)> ncclDevKernelArgs4K; +//typedef ncclDevKernelArgsStorage<(32<<10)-4> ncclDevKernelArgs31K; template __host__ __device__ constexpr T min_constexpr(T a) { return a; } @@ -366,6 +448,10 @@ __host__ __device__ constexpr T max_constexpr(T a, T b, Ts ...c) { return max_constexpr((a > b ? a : b), c...); } +constexpr int ncclDevMaxChannelsForArgsBytes(size_t argsBytes) { + return min_constexpr(MAXCHANNELS, (argsBytes - sizeof(struct ncclDevKernelArgs))/sizeof(struct ncclDevWorkBatch)); +} + // Calculate the unroll factor given: // * bytePerPack: number of bytes accessed per instruction // * insns: max permissible unroll value @@ -412,6 +498,7 @@ extern int const ncclDevKernelCount; extern void* const ncclDevKernelList[/*ncclDevKernelCount*/]; // Table of most specialized kernel function to run given func index. 
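// Illustrative sketch, not part of this patch: how a channel can walk one
// ncclDevWorkBatch defined above. Each set bit i in offsetBitset names a work
// struct at byte offset offsetBase + i*sizeof(work struct) in the fifo/args
// space. The ffs-style loop and helper name are generic illustrations (using a
// GCC/Clang builtin), not the kernel's actual code.
#include <cstdint>
static void visitBatchWorks(uint64_t offsetBitset, uint32_t offsetBase,
                            uint32_t workSize /* ncclDevWorkSize(type) */,
                            void (*visit)(uint32_t fifoOffset, void* ctx), void* ctx) {
  uint64_t bits = offsetBitset;
  while (bits != 0) {
    int i = __builtin_ctzll(bits);                    // index of lowest set bit
    visit(offsetBase + (uint32_t)i*workSize, ctx);
    bits &= bits - 1;                                 // clear that bit
  }
}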
+extern int const ncclDevFuncIdCount; extern int const ncclDevFuncRowToId[]; extern void* const ncclDevKernelForFunc[/*funcIndex*/]; extern bool const ncclDevKernelForFuncIsSpecialized[/*funcIndex*/]; diff --git a/src/include/enqueue.h b/src/include/enqueue.h index 8ab59607d..1bb5a604f 100644 --- a/src/include/enqueue.h +++ b/src/include/enqueue.h @@ -24,5 +24,6 @@ ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, stru ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan); ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); ncclResult_t ncclLaunchFinish(struct ncclComm* comm); +ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool* needConnect, ncclSimInfo_t* simInfo); #endif // End include guard diff --git a/src/include/gdrwrap.h b/src/include/gdrwrap.h index a64674cc5..705f866ea 100644 --- a/src/include/gdrwrap.h +++ b/src/include/gdrwrap.h @@ -8,6 +8,7 @@ #define NCCL_GDRWRAP_H_ #include "nccl.h" +#include "alloc.h" #include // for standard [u]intX_t types #include #include @@ -194,7 +195,7 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** char *devMem; void *gdrMap; - mapSize = sizeof(T)*nelem; + mapSize = ncclSizeOfT()*nelem; // GDRCOPY Pinned buffer has to be a minimum of a GPU_PAGE_SIZE ALIGN_SIZE(mapSize, GPU_PAGE_SIZE); @@ -203,7 +204,7 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** uint64_t alignedAddr = (((uint64_t) devMem) + GPU_PAGE_OFFSET) & GPU_PAGE_MASK; size_t align = alignedAddr - (uint64_t)devMem; - //TRACE(NCCL_INIT, "GDRCOPY: Pin buffer 0x%lx (%p) align %zi size %zi", alignedAddr, devMem, align, mapSize); + //TRACE(NCCL_INIT, "GDRCOPY: Pin buffer 0x%lx (%p) align %zu size %zu", alignedAddr, devMem, align, mapSize); NCCLCHECK(wrap_gdr_pin_buffer(ncclGdrCopy, alignedAddr, mapSize, 0, 0, &mh)); NCCLCHECK(wrap_gdr_map(ncclGdrCopy, mh, &gdrMap, mapSize)); @@ -226,7 +227,7 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** *ptr = (T *)((char *)gdrMap+off); if (devPtr) *devPtr = (T *)(devMem+off+align); - TRACE(NCCL_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zi at %p", + TRACE(NCCL_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zu at %p", md->gdrDevMem, md->gdrMap, md->gdrOffset, md->gdrMh.h, md->gdrMapSize, *ptr); return ncclSuccess; @@ -235,7 +236,7 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** template static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) { gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle; - NCCLCHECK(wrap_gdr_copy_to_mapping(md->gdrMh, dst, src, nelem*sizeof(T))); + NCCLCHECK(wrap_gdr_copy_to_mapping(md->gdrMh, dst, src, nelem*ncclSizeOfT())); return ncclSuccess; } diff --git a/src/include/graph.h b/src/include/graph.h index 08cfba4fd..0271b52d1 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -29,6 +29,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm); ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks); int ncclTopoPathAllNVLink(struct ncclTopoSystem* system); +ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm); // Query topology ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, 
int64_t* id, int* dev, int* proxyRank); @@ -46,9 +47,11 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu #define NCCL_TOPO_CPU_ARCH_X86 1 #define NCCL_TOPO_CPU_ARCH_POWER 2 #define NCCL_TOPO_CPU_ARCH_ARM 3 +#define NCCL_TOPO_CPU_ARCH_MIXED 4 #define NCCL_TOPO_CPU_VENDOR_INTEL 1 #define NCCL_TOPO_CPU_VENDOR_AMD 2 #define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3 +#define NCCL_TOPO_CPU_VENDOR_MIXED 4 #define NCCL_TOPO_CPU_TYPE_BDW 1 #define NCCL_TOPO_CPU_TYPE_SKL 2 #define NCCL_TOPO_CPU_TYPE_YONGFENG 1 @@ -70,6 +73,7 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system); #define NCCL_TOPO_PATTERN_TREE 3 // All NIC traffic going to/from the same GPU #define NCCL_TOPO_PATTERN_RING 4 // Ring #define NCCL_TOPO_PATTERN_NVLS 5 // NVLS+SHARP and NVLS+Tree +#define NCCL_TOPO_PATTERN_COLLNET_DIRECT 6 // Collnet Direct struct ncclTopoGraph { // Input / output int id; // ring : 0, tree : 1, collnet : 2 @@ -113,7 +117,6 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent); ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs); -#include "info.h" -ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time, bool* backup = NULL); +ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time, bool* backup=nullptr); #endif diff --git a/src/include/group.h b/src/include/group.h index 72251147f..91bc19068 100644 --- a/src/include/group.h +++ b/src/include/group.h @@ -35,9 +35,12 @@ struct ncclAsyncJob { void(*undo)(struct ncclAsyncJob*); void(*destructor)(void*); ncclGroupJobState_t state; - volatile uint32_t *abortFlag; /* point to comm abortFlag */ - volatile uint32_t *childAbortFlag; /* point to child abortFlag */ + uint32_t* abortFlag; /* point to comm abortFlag */ + uint32_t* abortFlagDev; /* point to comm abortFlagDev */ + uint32_t* childAbortFlag; /* point to child abortFlag */ + uint32_t* childAbortFlagDev; /* point to child abortFlagDev */ ncclComm_t comm; + int destroyFlag; }; ncclResult_t ncclAsyncLaunch( @@ -52,14 +55,14 @@ struct ncclGroupJob { struct ncclComm **groupCommHeadPtr; struct ncclComm **groupCommPreconnectHeadPtr; ncclResult_t *groupErrorPtr; - volatile bool *abortFlagPtr; + bool *abortFlagPtr; int *groupBlockingPtr; struct ncclIntruQueue *asyncJobsPtr; bool initialized; }; ncclResult_t ncclGroupStartInternal(); -ncclResult_t ncclGroupEndInternal(); +ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo = NULL); ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job); //////////////////////////////////////////////////////////////////////////////// @@ -114,6 +117,10 @@ inline void ncclGroupCommJoin(struct ncclComm* comm) { // Comms gets a new memory stack scope upon joining. Each task batched for // this comm is allocated there. 
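// Illustrative sketch, not part of this patch: one plausible way the paired
// abortFlag/abortFlagDev pointers introduced above can be kept in sync, by
// mirroring the host value to the device copy with an explicit async copy.
// Assumes abortFlag points at pinned host memory; the helper name is made up
// and the actual NCCL abort paths are more involved than shown here.
#include <cuda_runtime.h>
#include <cstdint>
static cudaError_t mirrorAbortFlag(uint32_t* abortFlag,      /* host-side flag   */
                                   uint32_t* abortFlagDev,   /* device-side copy */
                                   cudaStream_t stream) {
  *abortFlag = 1;
  return cudaMemcpyAsync(abortFlagDev, abortFlag, sizeof(uint32_t),
                         cudaMemcpyHostToDevice, stream);
}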
ncclMemoryStackPush(&comm->memScoped); + // Initialize planner + ncclKernelPlanner::Peer* tmp = comm->planner.peers; + memset(&comm->planner, 0, sizeof(comm->planner)); + comm->planner.peers = tmp; } ncclGroupBlocking = comm->config.blocking; diff --git a/src/include/info.h b/src/include/info.h index 0d53b9a21..3cabae866 100644 --- a/src/include/info.h +++ b/src/include/info.h @@ -8,28 +8,9 @@ #define NCCL_INFO_H_ #include "nccl.h" -#include "device.h" #include "collectives.h" #include "core.h" #include "utils.h" -#include "strongstream.h" -#define NCCL_MAX_LOCAL_RANKS 64 - -typedef enum : uint8_t { - ncclPatternRing, - ncclPatternRingTwice, - ncclPatternPipelineFrom, - ncclPatternPipelineTo, - ncclPatternTreeUp, - ncclPatternTreeDown, - ncclPatternTreeUpDown, - ncclPatternCollnetChain, - ncclPatternCollnetDirect, - ncclPatternNvls, - ncclPatternNvlsTree, - ncclPatternSend, - ncclPatternRecv -} ncclPattern_t; // Used to pass NCCL call information between functions struct ncclInfo { @@ -47,110 +28,6 @@ struct ncclInfo { // Algorithm details int chunkSteps; int sliceSteps; - // Computed later - ncclDevRedOpFull opFull; - ncclPattern_t pattern; - size_t nBytes; - size_t aggnBytes; - size_t workBytes; - size_t sendbuffSize; - size_t recvbuffSize; - int stepSize; - int chunkCount; - int chunkSize; - int channelId; - int workFuncIndex; - ncclRegBufferType regBufType; - void* regBufSend[NCCL_MAX_LOCAL_RANKS]; - void* regBufRecv[NCCL_MAX_LOCAL_RANKS]; - // collnet buffer reg handles - void* sendMhandle; - void* recvMhandle; - // Need to initialize - int nThreads; - int nChannels; - int algorithm; - int protocol; - bool userTuned; - struct ncclInfo *next; -}; - -inline ncclResult_t ncclInfoSetDerived(struct ncclInfo* info, int nRanks) { - info->nBytes = info->workBytes = info->count * ncclTypeSize(info->datatype); - if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast) { - info->count = info->workBytes; - info->datatype = ncclInt8; - } - if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= nRanks; // count is per rank - - /* compute buffer size for NVLS buffer registration */ - if (info->coll == ncclFuncAllGather) { - info->sendbuffSize = info->workBytes; - info->recvbuffSize = info->sendbuffSize * nRanks; - } else if (info->coll == ncclFuncReduceScatter) { - info->recvbuffSize = info->workBytes; - info->sendbuffSize = info->recvbuffSize * nRanks; - } else { - info->sendbuffSize = info->recvbuffSize = info->workBytes; - } - return ncclSuccess; -} - -struct ncclTaskColl { - struct ncclTaskColl* next; - ncclFunc_t func; - void const* sendbuff; - void* recvbuff; - size_t count; - int root; - ncclDataType_t datatype; - ncclDevRedOpFull op; - int chunkSteps, sliceSteps; - struct ncclInfo info; -}; -struct ncclTaskP2p { - ncclTaskP2p *next; - void *buff; - size_t bytes; - // Stateful chunk index. If a p2p gets "cut" over two plans this keeps track - // of where it left off. 
- int chunk; -}; - -struct ncclCudaStreamList { - struct ncclCudaStreamList *next; - cudaStream_t stream; -}; -struct ncclTasks { - struct Peer { - bool sendSeen, recvSeen; - struct ncclIntruQueue sendQueue; - struct ncclIntruQueue recvQueue; - }; - struct ncclIntruQueue collQueue; - // Queue for user-tuned executed collectives - struct ncclIntruQueue collTunedQueue; - // Queue for continuous bytes distribution (CBD) collectives - struct ncclIntruQueue collCBDQueue; - // Queue for collnet - struct ncclIntruQueue collnetQueue; - size_t workBytesTotal; - int usableChannels; - bool sorted; - struct Peer* peers/*[nRanks]*/; - int *p2pSendOrder, *p2pRecvOrder; - int p2pOrderSteps; - int nTasksColl, nTasksP2p; - - // The list of user streams aggregated over all tasks present. - struct ncclCudaStreamList* streams; - // The most recent user stream. Ignored if streams==nullptr - cudaStream_t streamRecent; - // The graph capturing all user streams or invalid if none. Thus we restrict the - // user that all streams must be captured in the same graph or not captured - // at all. Technically we could probably relax this, but that would mean - // collecting a different `ncclTasks` per graph and one for non-graph. - struct ncclCudaGraph capturingGraph; }; #endif diff --git a/src/include/nccl_common.h b/src/include/nccl_common.h index 5796eb9fb..a0fb3a55f 100644 --- a/src/include/nccl_common.h +++ b/src/include/nccl_common.h @@ -7,8 +7,33 @@ #ifndef NCCL_DEBUG_H_ #define NCCL_DEBUG_H_ -typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; -typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; +typedef enum { + NCCL_LOG_NONE = 0, + NCCL_LOG_VERSION = 1, + NCCL_LOG_WARN = 2, + NCCL_LOG_INFO = 3, + NCCL_LOG_ABORT = 4, + NCCL_LOG_TRACE = 5 +} ncclDebugLogLevel; + +typedef enum { + NCCL_INIT = 0x1, + NCCL_COLL = 0x2, + NCCL_P2P = 0x4, + NCCL_SHM = 0x8, + NCCL_NET = 0x10, + NCCL_GRAPH = 0x20, + NCCL_TUNING = 0x40, + NCCL_ENV = 0x80, + NCCL_ALLOC = 0x100, + NCCL_CALL = 0x200, + NCCL_PROXY = 0x400, + NCCL_NVLS = 0x800, + NCCL_BOOTSTRAP = 0x1000, + NCCL_REG = 0x2000, + NCCL_PROFILE = 0x4000, + NCCL_ALL = ~0 +} ncclDebugLogSubSys; typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); @@ -40,4 +65,5 @@ typedef enum { #define NCCL_PROTO_LL128 1 #define NCCL_PROTO_SIMPLE 2 +#define NCCL_ALGO_PROTO_IGNORE -1.0 #endif diff --git a/src/include/nccl_tuner.h b/src/include/nccl_tuner.h index 1917e2815..5cd02149f 100644 --- a/src/include/nccl_tuner.h +++ b/src/include/nccl_tuner.h @@ -11,6 +11,54 @@ #include "nccl.h" #include "nccl_common.h" +// API to be implemented by external tuner +typedef struct { + // Name of the tuner + const char* name; + + // Initializes tuner states. + // Inputs: + // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. + // - nNodes: number of nodes in current communicator. + // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. + // Outputs: + // - context: tuner context object + ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); + + // Gets info (algo, protocol, number of ctas and threads) for a given collective. 
+ // Inputs: + // - context: tuner context object + // - collType: collective type , e.g., allreduce, allgather… + // - nBytes: collective size in bytes + // - numPipeOps: number of operations in the group + // - numAlgo: number of algorithms in collCostTable + // - numProto: number of protocols in collCostTable + // + // Outputs: + // - nChannels: number of channels (hence SMs) to be used. + // + // InOut: + // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType. + // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE). + // + // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the + // default tuning for the given collective. + // Also, the plugin is allowed to not set any output, or set only the + // algorithm and protocol, but not only the algorithm or only the protocol. + // Unset fields will be set automatically by NCCL. + ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, + int numPipeOps, float** collCostTable, int numAlgo, int numProto, + int* nChannels); + + // Terminates the plugin and cleans up any resources that the plugin allocated. + // context: tuner context object + ncclResult_t (*destroy)(void* context); +} ncclTuner_v3_t; + +typedef ncclTuner_v3_t ncclTuner_t; + +#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3" + // API to be implemented by external tuner typedef struct { // Name of the tuner @@ -36,7 +84,7 @@ typedef struct { // // Outputs: // - algorithm: selected algorithm to be used for the given collective - // - protocol: selected protocol to be used for the given collective + // - protocol: selected protocol to be used for the give collective // - nChannels: number of channels (hence SMs) to be used. // // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the @@ -46,15 +94,11 @@ typedef struct { // Unset fields will be set automatically by NCCL. ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, int collNetSupport, int nvlsSupport, int numPipeOps, - int *algorithm, int *protocol, int* nChannels); + int* algorithm, int* protocol, int* nChannels); // Terminates the plugin and cleans up any resources that the plugin allocated. // context: tuner context object ncclResult_t (*destroy)(void* context); } ncclTuner_v2_t; -typedef ncclTuner_v2_t ncclTuner_t; - -#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v2" - #endif diff --git a/src/include/net.h b/src/include/net.h index b5df58968..d1926ccd8 100644 --- a/src/include/net.h +++ b/src/include/net.h @@ -14,8 +14,10 @@ typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; -ncclResult_t ncclNetPluginInit(); +ncclResult_t ncclNetPluginLoad(struct ncclComm* comm); +ncclResult_t ncclNetPluginUnload(struct ncclComm* comm); ncclResult_t ncclNetInit(struct ncclComm* comm); +ncclResult_t ncclNetFinalize(struct ncclComm* comm); int ncclNetVersion(struct ncclComm* comm); // Test whether the current GPU support GPU Direct RDMA. 
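// Minimal sketch, not part of this patch: a getCollInfo callback for the
// ncclTuner_v3_t interface above. It assumes the cost table behind collCostTable is
// a flat, row-major [numAlgo][numProto] array of float costs; entries equal to
// NCCL_ALGO_PROTO_IGNORE must be left untouched. The function name and the
// "favor the cheapest entry" policy are purely illustrative.
static ncclResult_t exampleGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
                                       int numPipeOps, float** collCostTable,
                                       int numAlgo, int numProto, int* nChannels) {
  float* table = (float*)collCostTable;
  int bestAlgo = -1, bestProto = -1; float bestTime = 0.0f;
  for (int a = 0; a < numAlgo; a++) {
    for (int p = 0; p < numProto; p++) {
      float t = table[a*numProto + p];
      if (t == NCCL_ALGO_PROTO_IGNORE) continue;       // disabled combination: skip
      if (bestAlgo < 0 || t < bestTime) { bestAlgo = a; bestProto = p; bestTime = t; }
    }
  }
  if (bestAlgo >= 0) table[bestAlgo*numProto + bestProto] = 0.0f;  // favor this combo
  // Leave *nChannels alone so NCCL picks the channel count automatically.
  (void)context; (void)collType; (void)nBytes; (void)numPipeOps; (void)nChannels;
  return ncclSuccess;
}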
diff --git a/src/include/nvmlwrap.h b/src/include/nvmlwrap.h index bad0b7937..7dee7d4ae 100644 --- a/src/include/nvmlwrap.h +++ b/src/include/nvmlwrap.h @@ -253,6 +253,38 @@ typedef nvmlGpuFabricInfo_v2_t nvmlGpuFabricInfoV_t; */ #define nvmlGpuFabricInfo_v2 NVML_STRUCT_VERSION(GpuFabricInfo, 2) +/** + * Confidential Compute Feature Status values + */ +#define NVML_CC_SYSTEM_FEATURE_DISABLED 0 +#define NVML_CC_SYSTEM_FEATURE_ENABLED 1 + +typedef struct nvmlConfComputeSystemState_st { + unsigned int environment; + unsigned int ccFeature; + unsigned int devToolsMode; +} nvmlConfComputeSystemState_t; + +/** + * Confidential Compute Multigpu mode values + */ +#define NVML_CC_SYSTEM_MULTIGPU_NONE 0 +#define NVML_CC_SYSTEM_MULTIGPU_PROTECTED_PCIE 1 + +/** + * Confidential Compute System settings + */ +typedef struct { + unsigned int version; + unsigned int environment; + unsigned int ccFeature; + unsigned int devToolsMode; + unsigned int multiGpuMode; +} nvmlSystemConfComputeSettings_v1_t; + +typedef nvmlSystemConfComputeSettings_v1_t nvmlSystemConfComputeSettings_t; +#define nvmlSystemConfComputeSettings_v1 NVML_STRUCT_VERSION(SystemConfComputeSettings, 1) + /* End of nvml.h */ #endif // NCCL_NVML_DIRECT @@ -268,6 +300,11 @@ extern int ncclNvmlDeviceCount; extern ncclNvmlDeviceInfo ncclNvmlDevices[ncclNvmlMaxDevices]; extern ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMaxDevices]; +struct ncclNvmlCCStatus { + bool CCEnabled; + bool multiGpuCCEnabled; +}; + // All ncclNvmlFoo() functions call ncclNvmlEnsureInitialized() implicitly. // Outsiders need only call it if they want to inspect the ncclNvml global // tables above. @@ -283,5 +320,6 @@ ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* ma ncclResult_t ncclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus); ncclResult_t ncclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values); ncclResult_t ncclNvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricInfoV_t *gpuFabricInfo); +ncclResult_t ncclNvmlGetCCStatus(struct ncclNvmlCCStatus *status); #endif // End include guard diff --git a/src/include/nvtx.h b/src/include/nvtx.h index ab32ef27f..3bdfec59d 100644 --- a/src/include/nvtx.h +++ b/src/include/nvtx.h @@ -63,7 +63,7 @@ class payload_schema { nullptr, NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, - nullptr, 0, 0, 0}; + nullptr, 0, 0, 0, 0, nullptr}; }; // Create NVTX push/pop range with parameters diff --git a/src/include/nvtx3/nvToolsExt.h b/src/include/nvtx3/nvToolsExt.h index 10938385d..1ba00bef7 100644 --- a/src/include/nvtx3/nvToolsExt.h +++ b/src/include/nvtx3/nvToolsExt.h @@ -25,9 +25,9 @@ * * \section INITIALIZATION_SECTION Initialization * - * Typically the tool's library that plugs into NVTX is indirectly - * loaded via enviromental properties that are platform specific. - * For some platform or special cases, the user may be required + * Typically the tool's library that plugs into NVTX is indirectly + * loaded via enviromental properties that are platform specific. + * For some platform or special cases, the user may be required * to instead explicity initialize instead though. This can also * be helpful to control when the API loads a tool's library instead * of what would typically be the first function call to emit info. 
@@ -37,16 +37,16 @@ * * Markers and ranges are used to describe events at a specific time (markers) * or over a time span (ranges) during the execution of the application - * respectively. + * respectively. * * \subsection MARKERS Markers - * + * * Markers denote specific moments in time. - * - * + * + * * See \ref DOMAINS and \ref EVENT_ATTRIBUTES for additional information on * how to specify the domain. - * + * * \subsection THREAD_RANGES Thread Ranges * * Thread ranges denote nested time ranges. Nesting is maintained per thread @@ -59,9 +59,9 @@ * * \subsection PROCESS_RANGES Process Ranges * - * Process ranges denote a time span that can expose arbitrary concurrency, as + * Process ranges denote a time span that can expose arbitrary concurrency, as * opposed to thread ranges that only support nesting. In addition the range - * start event can happen on a different thread than the end marker. For the + * start event can happen on a different thread than the end marker. For the * correlation of a start/end pair an unique correlation ID is used that is * returned from the start API call and needs to be passed into the end API * call. @@ -87,15 +87,15 @@ * * The function ::nvtxDomainCreateA or ::nvtxDomainCreateW is used to create * a named domain. - * + * * Each domain maintains its own * - categories * - thread range stacks * - registered strings * - * The function ::nvtxDomainDestroy marks the end of the domain. Destroying - * a domain unregisters and destroys all objects associated with it such as - * registered strings, resource objects, named categories, and started ranges. + * The function ::nvtxDomainDestroy marks the end of the domain. Destroying + * a domain unregisters and destroys all objects associated with it such as + * registered strings, resource objects, named categories, and started ranges. * * \section RESOURCE_NAMING Resource Naming * @@ -105,41 +105,41 @@ * The functions can be called multiple times during the execution of an * application, however, in that case it is implementation dependent which * name will be reported by the tool. - * + * * \subsection CATEGORY_NAMING Category Naming * - * Some function in this library support associating an integer category - * to enable filtering and sorting. The category naming functions allow - * the application to associate a user friendly name with the integer - * category. Support for domains have been added in NVTX_VERSION_2 to - * avoid collisions when domains are developed independantly. + * Some function in this library support associating an integer category + * to enable filtering and sorting. The category naming functions allow + * the application to associate a user friendly name with the integer + * category. Support for domains have been added in NVTX_VERSION_2 to + * avoid collisions when domains are developed independantly. * * \subsection RESOURCE_OBJECTS Resource Objects * - * Resource objects are a generic mechanism for attaching data to an application - * resource. The identifier field makes the association to a pointer or handle, - * while the type field helps provide deeper understanding of the identifier as + * Resource objects are a generic mechanism for attaching data to an application + * resource. The identifier field makes the association to a pointer or handle, + * while the type field helps provide deeper understanding of the identifier as * well as enabling differentiation in cases where handles generated by different * APIs may collide. 
The resource object may also have an associated message to - * associate with the application resource, enabling further annotation of this + * associate with the application resource, enabling further annotation of this * object and how it is used. - * + * * The resource object was introduced in NVTX_VERSION_2 to supersede existing naming * functions and allow the application resource identified by those functions to be * associated to a domain. The other naming functions are still supported for backward * compatibility but will be associated only to the default domain. * * \subsection RESOURCE_NAMING_OS Resource Naming - * - * Some operating system resources creation APIs do not support providing a user friendly - * name, such as some OS thread creation APIs. This API support resource naming though - * both through resource objects and functions following the pattern - * nvtxName[RESOURCE_TYPE][A|W](identifier, name). Resource objects introduced in NVTX_VERSION 2 + * + * Some operating system resources creation APIs do not support providing a user friendly + * name, such as some OS thread creation APIs. This API support resource naming though + * both through resource objects and functions following the pattern + * nvtxName[RESOURCE_TYPE][A|W](identifier, name). Resource objects introduced in NVTX_VERSION 2 * supersede the other functions with a a more general method of assigning names to OS resources, - * along with associating them to domains too. The older nvtxName* functions are only associated + * along with associating them to domains too. The older nvtxName* functions are only associated * with the default domain. * \section EXTENSIONS Optional Extensions - * Optional extensions will either appear within the existing sections the extend or appear + * Optional extensions will either appear within the existing sections the extend or appear * in the "Related Pages" when they introduce new concepts. */ @@ -159,7 +159,11 @@ #define NVTX_INLINE_STATIC __inline static #else /*defined(__GNUC__)*/ #define NVTX_API +#if defined(__cplusplus) || (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) #define NVTX_INLINE_STATIC inline static +#else +#define NVTX_INLINE_STATIC __inline__ static +#endif #endif /* Platform */ #if defined(NVTX_NO_IMPL) @@ -212,7 +216,7 @@ extern "C" { #endif /* __cplusplus */ -/** +/** * Result Codes */ @@ -281,12 +285,12 @@ typedef enum nvtxColorType_t * ------------------------------------------------------------------------- */ typedef enum nvtxMessageType_t { - NVTX_MESSAGE_UNKNOWN = 0, /**< Message payload is unused. */ + NVTX_MESSAGE_UNKNOWN = 0, /**< Message attribute is unused. */ NVTX_MESSAGE_TYPE_ASCII = 1, /**< A character sequence is used as payload. */ NVTX_MESSAGE_TYPE_UNICODE = 2, /**< A wide character sequence is used as payload. */ /* NVTX_VERSION_2 */ NVTX_MESSAGE_TYPE_REGISTERED = 3, /**< A unique string handle that was registered - with \ref nvtxDomainRegisterStringA() or + with \ref nvtxDomainRegisterStringA() or \ref nvtxDomainRegisterStringW(). */ } nvtxMessageType_t; @@ -338,7 +342,7 @@ NVTX_DECLSPEC void NVTX_API nvtxInitialize(const void* reserved); * ------------------------------------------------------------------------- */ typedef enum nvtxPayloadType_t { - NVTX_PAYLOAD_UNKNOWN = 0, /**< Color payload is unused. */ + NVTX_PAYLOAD_UNKNOWN = 0, /**< Payload attribute is unused. */ NVTX_PAYLOAD_TYPE_UNSIGNED_INT64 = 1, /**< A 64 bit unsigned integer value is used as payload. 
*/ NVTX_PAYLOAD_TYPE_INT64 = 2, /**< A 64 bit signed integer value is used as payload. */ NVTX_PAYLOAD_TYPE_DOUBLE = 3, /**< A 64 bit floating point value is used as payload. */ @@ -714,10 +718,10 @@ NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartW(const wchar_t* message); /* ------------------------------------------------------------------------- */ /** \brief Ends a process range. * -* \param domain - The domain +* \param domain - The domain * \param id - The correlation ID returned from a nvtxRangeStart call. * -* \remarks This function is offered completeness but is an alias for ::nvtxRangeEnd. +* \remarks This function is offered completeness but is an alias for ::nvtxRangeEnd. * It does not need a domain param since that is associated iwth the range ID at ::nvtxDomainRangeStartEx * * \par Example: @@ -929,10 +933,10 @@ NVTX_DECLSPEC int NVTX_API nvtxRangePop(void); /* ------------------------------------------------------------------------- */ /** \cond SHOW_HIDDEN -* \brief Resource typing helpers. +* \brief Resource typing helpers. * -* Classes are used to make it easy to create a series of resource types -* per API without collisions +* Classes are used to make it easy to create a series of resource types +* per API without collisions */ #define NVTX_RESOURCE_MAKE_TYPE(CLASS, INDEX) ((((uint32_t)(NVTX_RESOURCE_CLASS_ ## CLASS))<<16)|((uint32_t)(INDEX))) #define NVTX_RESOURCE_CLASS_GENERIC 1 @@ -1062,7 +1066,7 @@ typedef struct nvtxResourceAttributes_v0 int32_t identifierType; /* values from enums following the pattern nvtxResource[name]Type_t */ /** - * \brief Identifier for the resource. + * \brief Identifier for the resource. * \anchor RESOURCE_IDENTIFIER_FIELD * * An identifier may be a pointer or a handle to an OS or middleware API object. @@ -1093,7 +1097,7 @@ typedef struct nvtxResourceAttributes_v0 typedef struct nvtxResourceAttributes_v0 nvtxResourceAttributes_t; -/* \cond SHOW_HIDDEN +/* \cond SHOW_HIDDEN * \version \NVTX_VERSION_2 */ #define NVTX_RESOURCE_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxResourceAttributes_v0) ) ) @@ -1106,7 +1110,7 @@ typedef struct nvtxResourceHandle* nvtxResourceHandle_t; /** \brief Create a resource object to track and associate data with OS and middleware objects * * Allows users to associate an API handle or pointer with a user-provided name. -* +* * * \param domain - Domain to own the resource object * \param attribs - Attributes to be associated with the resource @@ -1240,7 +1244,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCategoryW(uint32_t category, const wchar_t* * POSIX pthread_t type returned by pthread_self() may not comply with these * expectations. Please use OS-specific thread ID instead of pthread_t. * - * The thread name is associated to the default domain. To support domains + * The thread name is associated to the default domain. To support domains * use resource objects via ::nvtxDomainResourceCreate. * * \param threadId - The ID of the thread to name. 
@@ -1457,7 +1461,7 @@ NVTX_DECLSPEC void NVTX_API nvtxDomainDestroy(nvtxDomainHandle_t domain); } /* extern "C" */ #endif /* __cplusplus */ -#define NVTX_IMPL_GUARD /* Ensure other headers cannot included directly */ +#define NVTX_IMPL_GUARD /* Ensure other headers cannot be included directly */ #include "nvtxDetail/nvtxTypes.h" diff --git a/src/include/nvtx3/nvToolsExtCounters.h b/src/include/nvtx3/nvToolsExtCounters.h new file mode 100644 index 000000000..00e2b7f8f --- /dev/null +++ b/src/include/nvtx3/nvToolsExtCounters.h @@ -0,0 +1,335 @@ +/** + * The NVTX counters extension is intended to collect counter values of various + * sources. It uses the NVTX payload extension to specify the data layout a + * counter group. + * + * A counter group is a set of counters that are collected together (at the same + * time). Counters are always registered as a group. Hence, a single counter is + * represented by a group with one counter. + * + * A sample refers to all values for a given timestamp. These values must + * include counter values and may include multiple instances of a counter group. + * + * The NVTX domain handle is the first argument to all counter collect + * functions. 0/NULL/nullptr represents the default domain (no domain). + */ + +#include "nvToolsExtPayload.h" + +#ifndef NVTOOLSEXT_COUNTERS_H +#define NVTOOLSEXT_COUNTERS_H + +/** + * \brief The compatibility ID is used for versioning of this extension. + */ +#ifndef NVTX_EXT_COUNTERS_COMPATID +#define NVTX_EXT_COUNTERS_COMPATID 0x0101 +#endif + +/** + * \brief The module ID identifies the payload extension. It has to be unique + * among the extension modules. + */ +#ifndef NVTX_EXT_COUNTERS_MODULEID +#define NVTX_EXT_COUNTERS_MODULEID 4 +#endif + + +/** Identifies an invalid scope and indicates an error if returned by `nvtxScopeRegister`. */ +#define NVTX_SCOPE_NONE 0 /* no scope */ + +#define NVTX_SCOPE_ROOT 1 + +#define NVTX_SCOPE_CURRENT_HW_MACHINE 2 /* Node/machine name, Device? */ +#define NVTX_SCOPE_CURRENT_HW_SOCKET 3 +#define NVTX_SCOPE_CURRENT_HW_CPU 4 +#define NVTX_SCOPE_CURRENT_HW_CPU_LOGICAL 5 +/* Innermost HW execution context at registration time */ +#define NVTX_SCOPE_CURRENT_HW_INNERMOST 6 + +/* Virtualized hardware, virtual machines, OS (if you don't know any better) */ +#define NVTX_SCOPE_CURRENT_HYPERVISOR 7 +#define NVTX_SCOPE_CURRENT_VM 8 +#define NVTX_SCOPE_CURRENT_KERNEL 9 +#define NVTX_SCOPE_CURRENT_CONTAINER 10 +#define NVTX_SCOPE_CURRENT_OS 11 + +/* Software scopes */ +#define NVTX_SCOPE_CURRENT_SW_PROCESS 12 /* Process scope */ +#define NVTX_SCOPE_CURRENT_SW_THREAD 13 /* Thread scope */ +#define NVTX_SCOPE_CURRENT_SW_FIBER 14 +/* Innermost SW execution context at registration time */ +#define NVTX_SCOPE_CURRENT_SW_INNERMOST 15 + +/** Static (user-provided) scope IDs (feed forward) */ +#define NVTX_SCOPE_ID_STATIC_START (1 << 24) + +/** Dynamically (tool) generated scope IDs */ +#define NVTX_SCOPE_ID_DYNAMIC_START 4294967296 /* 1 << 32 */ + + +/** Identifier of the semantic extension for counters. */ +#define NVTX_SEMANTIC_ID_COUNTERS_V1 5 + +/*** Flags to augment the counter value. ***/ +#define NVTX_COUNTERS_FLAG_NONE 0 + +/** + * Convert the fixed point value to a normalized floating point. + * Use the sign/unsign from the underlying type this flag is applied to. + * Unsigned [0f : 1f] or signed [-1f : 1f] + */ +#define NVTX_COUNTERS_FLAG_NORM (1 << 1) + +/** + * Tools should apply scale and limits when graphing, ideally in a "soft" way to + * to see when limits are exceeded. 
+ */ +#define NVTX_COUNTERS_FLAG_LIMIT_MIN (1 << 2) +#define NVTX_COUNTERS_FLAG_LIMIT_MAX (1 << 3) +#define NVTX_COUNTERS_FLAG_LIMITS \ + (NVTX_COUNTERS_FLAG_LIMIT_MIN | NVTX_COUNTERS_FLAG_LIMIT_MAX) + +/** Counter time scope **/ +#define NVTX_COUNTERS_FLAG_TIME_POINT (1 << 5) +#define NVTX_COUNTERS_FLAG_TIME_SINCE_LAST (2 << 5) +#define NVTX_COUNTERS_FLAG_TIME_UNTIL_NEXT (3 << 5) +#define NVTX_COUNTERS_FLAG_TIME_SINCE_START (4 << 5) + +/** Counter value type **/ +#define NVTX_COUNTERS_FLAG_VALUE_ABSOLUTE (1 << 10) +#define NVTX_COUNTERS_FLAG_VALUE_DELTA (2 << 10) // delta to previous counter sample + +/** Counter visualization hints **/ +#define NVTX_COUNTERS_FLAG_INTERPOLATE (1 << 14) + +/** Datatypes for limits union (value of `limitType`). */ +#define NVTX_COUNTERS_LIMIT_I64 0 +#define NVTX_COUNTERS_LIMIT_U64 1 +#define NVTX_COUNTERS_LIMIT_F64 2 + +/** Reasons for the missing sample value. */ +#define NVTX_COUNTERS_SAMPLE_ZERO 0 +#define NVTX_COUNTERS_SAMPLE_UNCHANGED 1 +#define NVTX_COUNTERS_SAMPLE_UNAVAILABLE 2 + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/** + * \brief Specify additional properties of a counter or counter group. + */ +typedef struct nvtxSemanticsCounter_v1 +{ + /** Header of the semantic extension (with identifier, version, etc.). */ + struct nvtxSemanticsHeader_v1 header; + + /** + * Flag if normalization, scale limits, etc. should be applied to counter + * values. + */ + uint64_t flags; + + /** Unit of the counter value (case insensitive) */ + const char* unit; + + /** Should be 1 if not used. */ + uint64_t unitScaleNumerator; + + /** Should be 1 if not used. */ + uint64_t unitScaleDenominator; + + /** Determines the used union member. Use defines `NVTX_COUNTERS_LIMIT_*`. */ + int64_t limitType; + + /** Soft graph limit. */ + union limits_t { + int64_t i64[2]; + uint64_t u64[2]; + double d[2]; + } limits; +} nvtxSemanticsCounter_t; + +typedef struct nvtxCountersAttr_v1 +{ + size_t structSize; + + /** + * A schema ID referring to the data layout of the counter group or a + * predefined NVTX payloads number type. + */ + uint64_t schemaId; + + /** Name of the counter group. */ + const char* name; + + /** Identifier of the scope of the counters. */ + uint64_t scopeId; + + /** + * (Optional) Specify additional semantics for a counter (group). The + * semantics provided are applied to the all counters in a group. If the + * semantics should only refer to a single counter in a group, the semantics + * field of the payload entry has to be used. Accepted semantics are + * `nvtxSemanticsCounter_t` and `nvtxSemanticsTime_t`. + */ + const nvtxSemanticsHeader_t* semantics; +} nvtxCountersAttr_t; + +/* Forward declaration of opaque counter group registration structure */ +struct nvtxCountersRegistration_st; +typedef struct nvtxCountersRegistration_st nvtxCountersRegistration; + +/* \brief Counters Handle Structure. +* \anchor COUNTERS_HANDLE_STRUCTURE +* +* This structure is opaque to the user and is used as a handle to reference a counter group. +* This type is returned from tools when using the NVTX API to create a counters group. +*/ +typedef nvtxCountersRegistration* nvtxCountersHandle_t; + +typedef struct nvtxCountersBatch_v1 +{ + /** Handle to attributes (data layout, scope, etc.) of a counter (group). */ + nvtxCountersHandle_t hCounter; + + /** Array of counter samples. */ + const void* counters; + + /** Size of the `counters` array (in bytes). */ + size_t cntArrSize; + + /** Array of timestamps or reference-time plus delta pair. 
`NULL` is used, if + timestamps are part of the counter (group) layout.) */ + const void* timestamps; + + /** Size of the `timestamps` array or definition (in bytes). */ + size_t tsSize; +} nvtxCountersBatch_t; + +/** + * \brief Register a counter group. + * + * @param hDomain NVTX domain handle. + * @param attr Pointer to the attributes of the counter (group). + * + * @return Counter handle identifying a counter or counter (group). + * The counter handle is unique within the NVTX domain. + */ +NVTX_DECLSPEC nvtxCountersHandle_t NVTX_API nvtxCountersRegister( + nvtxDomainHandle_t hDomain, + const nvtxCountersAttr_t* attr); + +/** + * \brief Sample one integer counter by value immediately (the NVTX tool determines the timestamp). + * + * @param hDomain handle of the NVTX domain. + * @param hCounter handle of the NVTX counter (group). + * @param value 64-bit integer counter value. + */ +NVTX_DECLSPEC void NVTX_API nvtxCountersSampleInt64( + nvtxDomainHandle_t hDomain, + nvtxCountersHandle_t hCounter, + int64_t value); + +/** + * \brief Sample one floating point counter by value immediately (the NVTX tool determines the timestamp). + * + * @param hDomain handle of the NVTX domain. + * @param hCounter handle of the NVTX counter (group). + * @param value 64-bit floating-point counter value. + */ +NVTX_DECLSPEC void NVTX_API nvtxCountersSampleFloat64( + nvtxDomainHandle_t hDomain, + nvtxCountersHandle_t hCounter, + double value); + +/** + * \brief Sample a counter group by reference immediately (the NVTX tool determines the timestamp). + * + * @param hDomain handle of the NVTX domain. + * @param hCounter handle of the NVTX counter (group). + * @param counters pointer to one or more counter values. + * @param size size of the counter value(s) in bytes. + */ +NVTX_DECLSPEC void NVTX_API nvtxCountersSample( + nvtxDomainHandle_t hDomain, + nvtxCountersHandle_t hCounter, + void* values, + size_t size); + +/** + * \brief Sample without value. + * + * @param hDomain handle of the NVTX domain. + * @param hCounter handle of the NVTX counter (group). + * @param reason reason for the missing sample value. + */ +NVTX_DECLSPEC void NVTX_API nvtxCountersSampleNoValue( + nvtxDomainHandle_t hDomain, + nvtxCountersHandle_t hCounter, + uint8_t reason); + +/** + * \brief Submit a batch of counters in the given domain. + * Timestamps are part of the counter sample data. + * + * The size of a data sampling point is defined by the `staticSize` field of the + * payload schema. An NVTX tool can assume that the counter samples are stored + * as an array with each entry being `staticSize` bytes. + * + * @param hDomain handle of the NVTX domain + * @param hCounter handle of the counter group (includes counter data decoding schema) + * @param counters blob containing counter data and timestamps + * @param size size of the counter data blob in bytes + */ +NVTX_DECLSPEC void NVTX_API nvtxCountersSubmitBatch( + nvtxDomainHandle_t hDomain, + nvtxCountersHandle_t hCounter, + const void* counters, + size_t size); + +/** + * \brief Submit a batch of counters in the given domain. + * Timestamps are separated from the counter data. + * + * @param hDomain handle of the NVTX domain + * @param counterBatch Pointer to the counter data to be submitted. 
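// Usage sketch, not part of this patch: registering a one-value counter group and
// sampling it with the immediate-sample entry points declared above. Assumes the
// domain is created with nvtxDomainCreateA and that NVTX_PAYLOAD_ENTRY_TYPE_INT64
// from nvToolsExtPayload.h is a valid "predefined number type" schemaId; the
// function and counter names are made up, and all calls are no-ops without a tool.
static void exampleCounterUsage(void) {
  nvtxDomainHandle_t domain = nvtxDomainCreateA("example.domain");

  nvtxCountersAttr_t attr = {};
  attr.structSize = sizeof(attr);
  attr.schemaId   = NVTX_PAYLOAD_ENTRY_TYPE_INT64;   // one 64-bit integer per sample
  attr.name       = "queued work bytes";
  attr.scopeId    = NVTX_SCOPE_CURRENT_SW_THREAD;    // per-thread counter
  attr.semantics  = NULL;                            // no extra counter semantics

  nvtxCountersHandle_t hCounter = nvtxCountersRegister(domain, &attr);
  nvtxCountersSampleInt64(domain, hCounter, 4096);   // the tool supplies the timestamp
}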
+ */ +NVTX_DECLSPEC void NVTX_API nvtxCountersSubmitBatchEx( + nvtxDomainHandle_t hDomain, + const nvtxCountersBatch_t* counterBatch); + + +#define NVTX3EXT_CBID_nvtxCountersRegister 0 +#define NVTX3EXT_CBID_nvtxCountersSampleInt64 1 +#define NVTX3EXT_CBID_nvtxCountersSampleFloat64 2 +#define NVTX3EXT_CBID_nvtxCountersSample 3 +#define NVTX3EXT_CBID_nvtxCountersSampleNoValue 4 +#define NVTX3EXT_CBID_nvtxCountersSubmitBatch 5 +#define NVTX3EXT_CBID_nvtxCountersSubmitBatchEx 6 + +#ifdef __GNUC__ +#pragma GCC visibility push(internal) +#endif + +#define NVTX_EXT_TYPES_GUARD /* Ensure other headers cannot be included directly */ +#include "nvtxDetail/nvtxExtTypes.h" +#undef NVTX_EXT_TYPES_GUARD + +#ifndef NVTX_NO_IMPL +#define NVTX_EXT_IMPL_COUNTERS_GUARD /* Ensure other headers cannot be included directly */ +#include "nvtxDetail/nvtxExtImplCounters_v1.h" +#undef NVTX_EXT_IMPL_COUNTERS_GUARD +#endif /*NVTX_NO_IMPL*/ + +#ifdef __GNUC__ +#pragma GCC visibility pop +#endif + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif /* NVTOOLSEXT_COUNTERS_H */ \ No newline at end of file diff --git a/src/include/nvtx3/nvToolsExtCuda.h b/src/include/nvtx3/nvToolsExtCuda.h index b1b80ad67..de9aa9d48 100644 --- a/src/include/nvtx3/nvToolsExtCuda.h +++ b/src/include/nvtx3/nvToolsExtCuda.h @@ -30,7 +30,7 @@ extern "C" { */ /* ------------------------------------------------------------------------- */ -/* \cond SHOW_HIDDEN +/* \cond SHOW_HIDDEN * \brief Used to build a non-colliding value for resource types separated class * \version \NVTX_VERSION_2 */ @@ -133,7 +133,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name) #endif /* __cplusplus */ #ifndef NVTX_NO_IMPL -#define NVTX_IMPL_GUARD_CUDA /* Ensure other headers cannot included directly */ +#define NVTX_IMPL_GUARD_CUDA /* Ensure other headers cannot be included directly */ #include "nvtxDetail/nvtxImplCuda_v3.h" #undef NVTX_IMPL_GUARD_CUDA #endif /*NVTX_NO_IMPL*/ diff --git a/src/include/nvtx3/nvToolsExtCudaRt.h b/src/include/nvtx3/nvToolsExtCudaRt.h index 1e19958ec..6a85da816 100644 --- a/src/include/nvtx3/nvToolsExtCudaRt.h +++ b/src/include/nvtx3/nvToolsExtCudaRt.h @@ -31,7 +31,7 @@ extern "C" { */ /* ------------------------------------------------------------------------- */ -/* \cond SHOW_HIDDEN +/* \cond SHOW_HIDDEN * \brief Used to build a non-colliding value for resource types separated class * \version \NVTX_VERSION_2 */ @@ -109,7 +109,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* #endif /* __cplusplus */ #ifndef NVTX_NO_IMPL -#define NVTX_IMPL_GUARD_CUDART /* Ensure other headers cannot included directly */ +#define NVTX_IMPL_GUARD_CUDART /* Ensure other headers cannot be included directly */ #include "nvtxDetail/nvtxImplCudaRt_v3.h" #undef NVTX_IMPL_GUARD_CUDART #endif /*NVTX_NO_IMPL*/ diff --git a/src/include/nvtx3/nvToolsExtMem.h b/src/include/nvtx3/nvToolsExtMem.h new file mode 100644 index 000000000..3b3406e35 --- /dev/null +++ b/src/include/nvtx3/nvToolsExtMem.h @@ -0,0 +1,694 @@ +/* +* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. +* +* Licensed under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. 
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ + +#include "nvToolsExt.h" + +#ifndef NVTOOLSEXTV3_MEM_V1 +#define NVTOOLSEXTV3_MEM_V1 + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#define NVTX_EXT_MODULEID_MEM 1 + +/* \cond SHOW_HIDDEN + * \brief A compatibility ID value used in structures and initialization to + * identify version differences. + */ +#define NVTX_EXT_COMPATID_MEM 0x0102 + +/* \cond SHOW_HIDDEN + * \brief This value is returned by functions that return `nvtxMemHeapHandle_t`, + * if a tool is not attached. + */ +#define NVTX_MEM_HEAP_HANDLE_NO_TOOL ((nvtxMemHeapHandle_t)(intptr_t)-1) + +/* \cond SHOW_HIDDEN + * \brief This value is returned by functions that return `nvtxMemRegionHandle_t` + * if a tool is not attached. + */ +#define NVTX_MEM_REGION_HANDLE_NO_TOOL ((nvtxMemRegionHandle_t)(intptr_t)-1) + +/* \cond SHOW_HIDDEN + * \brief This value is returned by functions that return `nvtxMemPermissionsHandle_t` + * if a tool is not attached. + */ +#define NVTX_MEM_PERMISSIONS_HANDLE_NO_TOOL ((nvtxMemPermissionsHandle_t)-1) + + +/* \cond SHOW_HIDDEN + * \brief This should not be used and is considered an error but defined to + * detect an accidental use of zero or NULL. + */ +#define NVTX_MEM_HEAP_USAGE_UNKNOWN 0x0 + + +/* \cond SHOW_HIDDEN + * \brief This should not be used and is considered an error but defined to + * detect an accidental use of zero or NULL. + */ +#define NVTX_MEM_TYPE_UNKNOWN 0x0 + + +/* ------------------------------------------------------------------------- */ +/** \defgroup MEMORY Memory + * See page \ref PAGE_MEMORY. + * @{ + */ + +/** + * \brief To indicate the full process virtual address space as a heap for + * functions where a nvtxMemHeapHandle_t is accepted. + * + * The heap by default is always read-write-execute permissions without creating regions. + * Regions created in this heap have read-write access by default but not execute. + */ +#define NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE ((nvtxMemHeapHandle_t)0) + +/** \brief This heap is a sub-allocator. + * + * Heap created with this usage should not be accessed by the user until regions are registered. + * Regions from a heap with this usage have read-write access by default but not execute. + */ +#define NVTX_MEM_HEAP_USAGE_TYPE_SUB_ALLOCATOR 0x1 + +/** + * \brief This is a heap of memory that has an explicit layout. + * + * The layout could be static or dynamic (calculated). This often represents an algorithm's + * structures that are packed together. By default this heap is assumed to be accessible for + * scopes where the memory is naturally accessible by hardware. Regions may be use to further + * annotate or restrict access. A tool may have an option to be more strict, but special + * consideration must be made for `NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE`. + * + * The behavior of this usage is similar to NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE but + * a tool can use it to track special behaviors and reservation. + * + * Memory in a heap with this usage has read-write permissions by default but not execute without + * creating regions. Regions created in this heap have the same default permission access. + */ +#define NVTX_MEM_HEAP_USAGE_TYPE_LAYOUT 0x2 + + +/** + * \brief Standard process userspace virtual addresses for linear allocations. + * + * APIs that map into this space, such as CUDA UVA should use this type. 
+ * + * Relevant functions: cudaMalloc, cudaMallocManaged, cudaHostAlloc, cudaMallocHost + * NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is supported + * + * nvtxMemHeapRegister receives a heapDesc of type nvtxMemVirtualRangeDesc_t + */ +#define NVTX_MEM_TYPE_VIRTUAL_ADDRESS 0x1 + + +/** + * \brief To indicate you are modifying permissions to the process-wide + * full virtual address space. + * + * This is a companion object to `NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE`. + */ +#define NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE ((nvtxMemPermissionsHandle_t)0) + +#define NVTX_MEM_PERMISSIONS_CREATE_FLAGS_NONE 0x0 +#define NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_READ 0x1 +#define NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_WRITE 0x2 +#define NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_ATOMIC 0x4 + + +/* \cond SHOW_HIDDEN + * \brief Forward declaration of opaque memory heap structure. + */ +struct nvtxMemHeap_v1; +typedef struct nvtxMemHeap_v1 nvtxMemHeap_t; + +/** \brief A handle returned by a tool to represent a memory heap. */ +typedef nvtxMemHeap_t* nvtxMemHeapHandle_t; + +/* \cond SHOW_HIDDEN + * \brief Forward declaration of opaque memory heap structure. + */ +struct nvtxMemRegion_v1; +typedef struct nvtxMemRegion_v1 nvtxMemRegion_t; + +/** \brief A handle returned by a tool to represent a memory region. */ +typedef nvtxMemRegion_t* nvtxMemRegionHandle_t; + +/** \brief A reference to a memory region (by pointer or handle). + * Which member of the union will be determined by a type or flag field outside. + */ +typedef union nvtxMemRegionRef_t +{ + void const* pointer; + nvtxMemRegionHandle_t handle; +} nvtxMemRegionRef_t; + +/* \cond SHOW_HIDDEN + * \brief Forward declaration of opaque memory permissions structure + */ +struct nvtxMemPermissions_v1; +typedef struct nvtxMemPermissions_v1 nvtxMemPermissions_t; + +/** \brief A handle returned by a tool to represent a memory permissions mask. */ +typedef nvtxMemPermissions_t* nvtxMemPermissionsHandle_t; + + +typedef struct nvtxMemVirtualRangeDesc_v1 +{ + size_t size; + void const* ptr; +} nvtxMemVirtualRangeDesc_v1 ; +typedef nvtxMemVirtualRangeDesc_v1 nvtxMemVirtualRangeDesc_t; + + +/** \brief structure to describe a heap in process virtual memory. */ +typedef struct nvtxMemHeapDesc_v1 +{ + uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */ + uint16_t structSize; /* Size of the structure. */ + uint32_t reserved0; + + /** \brief Usage characteristics of the heap + * + * Usage characteristics help tools like memcheckers, santiizer, + * as well as other debugging and profiling tools to determine some + * special behaviors they should apply to the heap and it's regions. + * The value follows the convention NVTX_MEM_HEAP_USAGE_* + * + * Default Value is 0, which is invalid. + */ + uint32_t usage; + + /** \brief Memory type characteristics of the heap + * + * The 'type' indicates how to interpret the ptr field of the heapDesc. + * This is intended to support many additional types of memory, beyond + * standard process virtual memory, such as API specific memory only + * addressed by handles or multi-dimensional memory requiring more complex + * descriptions to handle features like strides, tiling, or interlace. + * + * The values conforms to NVTX_MEM_TYPE_* + * + * The value in the field 'type' identifies the descriptor type that will + * be in the field 'typeSpecificDesc'. 'typeSpecificDesc' is void* because + * it is extensible. 
Example usage is if type is NVTX_MEM_TYPE_VIRTUAL_ADDRESS, + * then typeSpecificDesc points to a nvtxMemVirtualRangeDesc_t. + * + * Default Value is 0, which is invalid. + */ + uint32_t type; + + /** \brief size of the heap memory descriptor pointed to by typeSpecificDesc + * + * Default Value is 0 which is invalid. + */ + size_t typeSpecificDescSize; + + /** \brief Pointer to the heap memory descriptor + * + * The value in the field 'type' identifies the descriptor type that will + * be in the field 'typeSpecificDesc'. 'typeSpecificDesc' is void* because + * it is extensible. Example usage is if type is NVTX_MEM_TYPE_VIRTUAL_ADDRESS, + * then typeSpecificDesc points to a nvtxMemVirtualRangeDesc_t. + * + * Default Value is 0, which is invalid. + */ + void const* typeSpecificDesc; + + /** \brief ID of the category the event is assigned to. + * + * A category is a user-controlled ID that can be used to group + * events. The tool may use category IDs to improve filtering or + * enable grouping of events in the same category. The functions + * \ref ::nvtxNameCategoryA or \ref ::nvtxNameCategoryW can be used + * to name a category. + * + * Default Value is 0. + */ + uint32_t category; + + /** \brief Message type specified in this attribute structure. + * + * Defines the message format of the attribute structure's \ref MESSAGE_FIELD + * "message" field. + * + * Default Value is `NVTX_MESSAGE_UNKNOWN`. + */ + uint32_t messageType; /* nvtxMessageType_t */ + + /** \brief Message assigned to this attribute structure. \anchor MESSAGE_FIELD + * + * The text message that is attached to an event. + */ + nvtxMessageValue_t message; + +} nvtxMemHeapDesc_v1 ; +typedef nvtxMemHeapDesc_v1 nvtxMemHeapDesc_t; + +/** + * \brief Create a memory heap to represent a object or range of memory that will be further + * sub-divided into regions. + * + * The handle used to addrss the heap will depend on the heap's type. Where the heap is virtual + * memory accessible, the addrss of the heap's memory itself is it's handle. This will likewise + * be returned from the function. + * + * For more advanced types, where the heap is not virtual memory accessible the tools may be + * responsible for returning a void const * that that uniquely identifies the object. Please see + * the description of each heap type for more details on whether this is expected to be a uniquely + * generated by the tool or otherwise. + */ +NVTX_DECLSPEC nvtxMemHeapHandle_t NVTX_API nvtxMemHeapRegister( + nvtxDomainHandle_t domain, + nvtxMemHeapDesc_t const* desc); + + /** \brief Destroy a memory heap. */ +NVTX_DECLSPEC void NVTX_API nvtxMemHeapUnregister( + nvtxDomainHandle_t domain, + nvtxMemHeapHandle_t heap);/* NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is not supported */ + +/** + * \brief Reset the memory heap wipes out any changes, as if it were a fresh heap. + * + * This includes invalidating all regions and their handles. + */ +NVTX_DECLSPEC void NVTX_API nvtxMemHeapReset( + nvtxDomainHandle_t domain, + nvtxMemHeapHandle_t heap); /* NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is supported */ + +/** + * \brief Register a region of memory inside of a heap. + * + * The heap refers the the heap within which the region resides. This can be from + * `nvtxMemHeapRegister`, `NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE`, or one provided + * from other extension API. + * + * The regionType arg will define which type is used in regionDescArray. + * The most commonly used type is `NVTX_MEM_TYPE_VIRTUAL_ADDRESS`. 
+ * In this case regionDescElements is an array of `nvtxMemVirtualRangeDesc_t`. + * + * The regionCount arg is how many element are in regionDescArray and regionHandleArrayOut. + * + * The regionHandleArrayOut arg points to an array where the tool will provide region handles. If + * a pointer is provided, it is expected to have regionCount elements. This pointer can be NULL if + * regionType is NVTX_MEM_TYPE_VIRTUAL_ADDRESS. In this case, the user can use the pointer to the + * virtual memory to reference the region in other related functions which accept nvtMemRegionRef_t. + */ +typedef struct nvtxMemRegionsRegisterBatch_v1 +{ + uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */ + uint16_t structSize; /* Size of the structure. */ + + uint32_t regionType; /* NVTX_MEM_TYPE_* */ + + nvtxMemHeapHandle_t heap; + + size_t regionCount; + size_t regionDescElementSize; + void const* regionDescElements; /* This will also become the handle for this region. */ + nvtxMemRegionHandle_t* regionHandleElementsOut; /* This will also become the handle for this region. */ + +} nvtxMemRegionsRegisterBatch_v1; +typedef nvtxMemRegionsRegisterBatch_v1 nvtxMemRegionsRegisterBatch_t; + + /** \brief Register a region of memory inside of a heap of linear process virtual memory + */ +NVTX_DECLSPEC void NVTX_API nvtxMemRegionsRegister( + nvtxDomainHandle_t domain, + nvtxMemRegionsRegisterBatch_t const* desc); + + + +/** + * \brief Register a region of memory inside of a heap. + * + * The heap refers the the heap within which the region resides. + * This can be from nvtxMemHeapRegister, NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE, or + * one provided from other extension API. + * + * The regionType arg will define which type is used in regionDescArray. + * The most commonly used type is NVTX_MEM_TYPE_VIRTUAL_ADDRESS. + * + * The regionCount arg is how many element are in regionDescArray and regionHandleArrayOut. + * + * The regionHandleArrayOut arg points to an array where the tool will provide region handles. If + * a pointer if provided, it is expected to have regionCount elements. This pointer can be NULL if + * regionType is NVTX_MEM_TYPE_VIRTUAL_ADDRESS. In this case, the user can use the pointer to the + * virtual memory to reference the region in other related functions which accept nvtMemRegionRef_t. + */ +typedef struct nvtxMemRegionsResizeBatch_v1 +{ + uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */ + uint16_t structSize; /* Size of the structure. */ + + uint32_t regionType; /* NVTX_MEM_TYPE_* */ + + size_t regionDescCount; + size_t regionDescElementSize; + void const* regionDescElements; /* This will also become the handle for this region. */ + +} nvtxMemRegionsResizeBatch_v1; +typedef nvtxMemRegionsResizeBatch_v1 nvtxMemRegionsResizeBatch_t; + + /** \brief Register a region of memory inside of a heap of linear process virtual memory + */ +NVTX_DECLSPEC void NVTX_API nvtxMemRegionsResize( + nvtxDomainHandle_t domain, + nvtxMemRegionsResizeBatch_t const* desc); + + +#define NVTX_MEM_REGION_REF_TYPE_UNKNOWN 0x0 +#define NVTX_MEM_REGION_REF_TYPE_POINTER 0x1 +#define NVTX_MEM_REGION_REF_TYPE_HANDLE 0x2 + +/** + * \brief Register a region of memory inside of a heap. + * + * The heap refers the the heap within which the region resides. + * This can be from nvtxMemHeapRegister, `NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE`, or + * one provided from other extension API. + * + * The regionType arg will define which type is used in `regionDescArray`. + * The most commonly used type is NVTX_MEM_TYPE_VIRTUAL_ADDRESS. 
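For orientation (not part of the patch itself), here is a minimal sketch of how the heap and region registration declared above could be used. It assumes the added header is reachable as "nvtx3/nvToolsExtMem.h", that `domain` comes from the core nvtxDomainCreateA, and that NVTX_MESSAGE_TYPE_ASCII / nvtxMessageValue_t come from the core nvToolsExt.h; the function and variable names (annotatePool, pool, regionPtr) are illustrative only.

#include <stddef.h>
#include "nvtx3/nvToolsExtMem.h"

/* Sketch: annotate a malloc'ed pool as a sub-allocator heap and register
 * one region (an allocation handed out by the pool) inside it. */
static nvtxMemHeapHandle_t annotatePool(nvtxDomainHandle_t domain,
                                        void* pool, size_t poolSize,
                                        void* regionPtr, size_t regionSize)
{
    nvtxMemVirtualRangeDesc_t heapRange = {0};
    heapRange.ptr  = pool;
    heapRange.size = poolSize;

    nvtxMemHeapDesc_t heapDesc = {0};
    heapDesc.extCompatID          = NVTX_EXT_COMPATID_MEM;
    heapDesc.structSize           = sizeof(heapDesc);
    heapDesc.usage                = NVTX_MEM_HEAP_USAGE_TYPE_SUB_ALLOCATOR;
    heapDesc.type                 = NVTX_MEM_TYPE_VIRTUAL_ADDRESS;
    heapDesc.typeSpecificDescSize = sizeof(heapRange);
    heapDesc.typeSpecificDesc     = &heapRange;
    heapDesc.messageType          = NVTX_MESSAGE_TYPE_ASCII; /* from core nvToolsExt.h */
    heapDesc.message.ascii        = "my allocator pool";

    nvtxMemHeapHandle_t heap = nvtxMemHeapRegister(domain, &heapDesc);

    nvtxMemVirtualRangeDesc_t regionRange = {0};
    regionRange.ptr  = regionPtr;
    regionRange.size = regionSize;

    nvtxMemRegionsRegisterBatch_t batch = {0};
    batch.extCompatID             = NVTX_EXT_COMPATID_MEM;
    batch.structSize              = sizeof(batch);
    batch.regionType              = NVTX_MEM_TYPE_VIRTUAL_ADDRESS;
    batch.heap                    = heap;
    batch.regionCount             = 1;
    batch.regionDescElementSize   = sizeof(regionRange);
    batch.regionDescElements      = &regionRange;
    batch.regionHandleElementsOut = NULL; /* region is referenced by its pointer */

    nvtxMemRegionsRegister(domain, &batch);
    return heap;
}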
+ * + * The regionCount arg is how many element are in regionDescArray and regionHandleArrayOut. + * + * The regionHandleArrayOut arg points to an array where the tool will provide region handles. + * If a pointer if provided, it is expected to have regionCount elements. + * This pointer can be NULL if regionType is NVTX_MEM_TYPE_VIRTUAL_ADDRESS. In this case, + * the user can use the pointer to the virtual memory to reference the region in other + * related functions which accept a nvtMemRegionRef_t. + */ +typedef struct nvtxMemRegionsUnregisterBatch_v1 +{ + uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */ + uint16_t structSize; /* Size of the structure. */ + + uint32_t refType; /* NVTX_MEM_REGION_REF_TYPE_* */ + + size_t refCount; /* count of elements in refArray */ + size_t refElementSize; + nvtxMemRegionRef_t const* refElements; /* This will also become the handle for this region. */ + +} nvtxMemRegionsUnregisterBatch_v1; +typedef nvtxMemRegionsUnregisterBatch_v1 nvtxMemRegionsUnregisterBatch_t; + +/** + * \brief Unregistration for regions of process virtual memory + * + * This is not necessary if the nvtx heap destroy function has been called that + * contains this object. + */ +NVTX_DECLSPEC void NVTX_API nvtxMemRegionsUnregister( + nvtxDomainHandle_t domain, + nvtxMemRegionsUnregisterBatch_t const* desc); + +typedef struct nvtxMemRegionNameDesc_v1 +{ + uint32_t regionRefType; /* NVTX_MEM_REGION_REF_TYPE_* */ + uint32_t nameType; /* nvtxMessageType_t */ + + nvtxMemRegionRef_t region; + nvtxMessageValue_t name; + + uint32_t category; + uint32_t reserved0; +} nvtxMemRegionNameDesc_v1; +typedef nvtxMemRegionNameDesc_v1 nvtxMemRegionNameDesc_t; + + +typedef struct nvtxMemRegionsNameBatch_v1 +{ + uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */ + uint16_t structSize; /* Size of the structure. */ + + uint32_t reserved0; + + size_t regionCount; + size_t regionElementSize; + nvtxMemRegionNameDesc_t const* regionElements; + size_t reserved1; +} nvtxMemRegionsNameBatch_v1 ; +typedef nvtxMemRegionsNameBatch_v1 nvtxMemRegionsNameBatch_t; + + + /** \brief Name or rename a region. */ +NVTX_DECLSPEC void NVTX_API nvtxMemRegionsName( + nvtxDomainHandle_t domain, + nvtxMemRegionsNameBatch_t const* desc); + +/** \brief There are no permissions for this memory. */ +#define NVTX_MEM_PERMISSIONS_REGION_FLAGS_NONE 0x0 + +/** \brief The memory is readable. */ +#define NVTX_MEM_PERMISSIONS_REGION_FLAGS_READ 0x1 + +/** \brief The memory is writable. */ +#define NVTX_MEM_PERMISSIONS_REGION_FLAGS_WRITE 0x2 + +/** \brief The memory is for atomic RW. */ +#define NVTX_MEM_PERMISSIONS_REGION_FLAGS_ATOMIC 0x4 + +/** + * \brief The memory access permissions are reset for a region. + * + * This is as if never set, rather than documented defaults. As as result any flags + * indicating how unspecified regions are handle will affect this area. + * + * This should not be used with READ, WRITE, nor ATOMIC, as those flags would have no effect. + */ +#define NVTX_MEM_PERMISSIONS_REGION_FLAGS_RESET 0x8 + + +typedef struct nvtxMemPermissionsAssignRegionDesc_v1 +{ + uint32_t flags; /* NVTX_MEM_PERMISSIONS_REGION_FLAGS_* */ + uint32_t regionRefType; /* NVTX_MEM_REGION_REF_TYPE_* */ + nvtxMemRegionRef_t region; + +} nvtxMemPermissionsAssignRegionDesc_v1 ; +typedef nvtxMemPermissionsAssignRegionDesc_v1 nvtxMemPermissionsAssignRegionDesc_t; + + +typedef struct nvtxMemPermissionsAssignBatch_v1 +{ + uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */ + uint16_t structSize; /* Size of the structure. 
*/ + + uint32_t reserved0; + + nvtxMemPermissionsHandle_t permissions; + + size_t regionCount; + size_t regionElementSize; + nvtxMemPermissionsAssignRegionDesc_t const* regionElements; + + size_t reserved1; +} nvtxMemPermissionsAssignBatch_v1 ; +typedef nvtxMemPermissionsAssignBatch_v1 nvtxMemPermissionsAssignBatch_t; + + + /** \brief Change the permissions of a region of process virtual memory. */ +NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsAssign( + nvtxDomainHandle_t domain, + nvtxMemPermissionsAssignBatch_t const* desc); + + +/** + * \brief Create a permissions object for fine grain thread-local control in + * multi-threading scenarios + * + * Unlike the global permissions object (NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE), a new + * permissions object is empty. There are no regions registered to it, so more memory is accessible + * if bound(bind) without calls to nvtxMemPermissionsSetAccess* first. The permissions are not + * active until nvtxMemPermissionsBind. See `nvtxMemPermissionsBind` for more details. + * + * Use the flags NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_* to control how the regions in + * this permission object will interact with global permissions when bound. You may choose to + * either replace global memory regions setting or overlay on top of them. The most common uses are + * as follows: + * * To limit tools to validate writing exclusively specified in this object but inherit all + * global read access regions use `NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_WRITE` + * * To limit tools to validate both read & write permissions exclusively specified in this + * object use NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_READ + * & NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_WRITE + * + * Also see `nvtxMemPermissionsBind` & `nvtxMemPermissionsSetAccess*`. + */ +NVTX_DECLSPEC nvtxMemPermissionsHandle_t NVTX_API nvtxMemPermissionsCreate( + nvtxDomainHandle_t domain, + int32_t creationflags); /* NVTX_MEM_PERMISSIONS_CREATE_FLAGS_* */ + +/** + * \brief Destroy the permissions object. + * + * If bound(bind), destroy will also unbind it. + */ +NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsDestroy( + nvtxDomainHandle_t domain, + nvtxMemPermissionsHandle_t permissionsHandle); /* only supported on objects from nvtxMemPermissionsCreate */ + +/** \brief Reset the permissions object back to its created state. */ +NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsReset( + nvtxDomainHandle_t domain, + nvtxMemPermissionsHandle_t permissionsHandle); +/* NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE and other special handles are supported */ + + +#define NVTX_MEM_PERMISSIONS_BIND_FLAGS_NONE 0x0 + + /** \brief Upon binding, with the thread, exclude parent scope write regions instead of overlaying on top of them. + * + * EX A developer may chose to first prevent all writes except the ones specified to avoid + * OOB writes, since there are typically less regions written to than read from. + **/ +#define NVTX_MEM_PERMISSIONS_BIND_FLAGS_STRICT_WRITE 0x2 + + /** \brief Upon binding, with the thread, exclude parent scope read regions instead of overlaying on top of them. + * + * EX After eliminating any errors when applying strict writes, a developer may then choose to + * annotate and enforce strict reads behaviors in segments of code. + **/ +#define NVTX_MEM_PERMISSIONS_BIND_FLAGS_STRICT_READ 0x1 + + /** \brief Upon binding, with the thread, exclude parent scope atomic RW regions instead of overlaying on top of them. 
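A minimal sketch of the create/assign pair declared above, under the assumption that `domain` and `buf` are illustrative names and that `buf` refers to a region previously registered by pointer (NVTX_MEM_TYPE_VIRTUAL_ADDRESS):

#include "nvtx3/nvToolsExtMem.h"

/* Sketch: create a permissions object that keeps global read access but
 * drops global write access, then mark one region as read-only. The
 * permissions only take effect once bound (see nvtxMemPermissionsBind). */
static nvtxMemPermissionsHandle_t makeReadOnly(nvtxDomainHandle_t domain,
                                               const void* buf)
{
    nvtxMemPermissionsHandle_t perm = nvtxMemPermissionsCreate(
        domain, NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_WRITE);

    nvtxMemPermissionsAssignRegionDesc_t regionPerm = {0};
    regionPerm.flags          = NVTX_MEM_PERMISSIONS_REGION_FLAGS_READ;
    regionPerm.regionRefType  = NVTX_MEM_REGION_REF_TYPE_POINTER;
    regionPerm.region.pointer = buf;

    nvtxMemPermissionsAssignBatch_t batch = {0};
    batch.extCompatID       = NVTX_EXT_COMPATID_MEM;
    batch.structSize        = sizeof(batch);
    batch.permissions       = perm;
    batch.regionCount       = 1;
    batch.regionElementSize = sizeof(regionPerm);
    batch.regionElements    = &regionPerm;

    nvtxMemPermissionsAssign(domain, &batch);
    return perm;
}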
+ * + * EX After eliminating any errors from read and write, a developer may chose to ensure + * that atomics are in their own region, removing standard read/write, and replacing with + * this strict atomic only access. This way they know that conventional reads or writes + * will not cause unepected issues. + **/ +#define NVTX_MEM_PERMISSIONS_BIND_FLAGS_STRICT_ATOMIC 0x4 + + +#define NVTX_MEM_PERMISSIONS_BIND_SCOPE_UNKNOWN 0x0 + + /** \brief Bind to thread scope. In this case, tools should validate that local thread's + * execution is honoring the permissions as well as the state of NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE + * at the time of binding. If this is not bound then NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE should be + * used to validate the memory. + * + * Not all tools will support every scope, such a GPU sanitizer. + **/ +#define NVTX_MEM_PERMISSIONS_BIND_SCOPE_CPU_THREAD 0x1 + +/** + * \brief Bind to CUDA stream scope. + * + * In this case, work enqueued to a CUDA stream should be validated by the tool, + * when it executes, that it respect the permission of the permission at the point + * of binding, as well as the appropriate nvtxMemCudaGetDevicePermissions at the + * time of binding. If this is not bound then nvtxMemCudaGetDevicePermissions at + * the time of stream enqueue should be used to validate the memory. + * + * This could apply to work done either on the GPU like a kernel launch or to + * CPU based callbacks like cudaStreamAddCallback if the tools supports it. + * + * Binding is applies locally to a CPU thread so that if N CPU threads are enqueing + * work to the same stream (like the default stream) that there cannot be a race + * condition between thread binding vs launching their work. IE users should + * expect the permissions bound in the thread to be honored by the proceeding + * work (launches, copies, etc) invoked from in the CPU thread until unbound. + */ +#define NVTX_MEM_PERMISSIONS_BIND_SCOPE_CUDA_STREAM 0x2 + + +/** + * \brief Bind the permissions object into a particular scope on the caller thread + * + * Permissions do not take affect until binding. Binding permissions is a thread local + * activity that overrides global behaviors. This is to avoid multi-threaded race conditions, + * + * The scope dictates what type of processing it applies to, and when in some cases. + * EX1: NVTX_MEM_PERMISSIONS_BIND_SCOPE_CPU_THREAD applies to CPU code accessing memory while bound. + * EX2: NVTX_MEM_PERMISSIONS_BIND_SCOPE_CUDA_STREAM applies to CUDA streams, and the permissions + * must be recorded and applied when the work in the stream dequeues to executes. In this case + * it could be GPU or CPU, if the tool support both. + * + * Bind can be called again on the same object and thread to take any updates to the + * specified permission object or the inherited properties. + * + * Bind flags support changing how the binding process inherits region access control. + * In the case of thread scope this is NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE and from CUDA_STREAM + * this is nvtxMemCudaGetDevicePermissions. Choosing stricter modes allows the user to + * further reduce the access with less work, since memory by default, behaves as natural + * until the NVTX annotations instructs a tool to treat it anther way. See strict flags + * for more details. 
+ * + * Also see nvtxMemPermissionsUnbind + */ +NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsBind( + nvtxDomainHandle_t domain, + nvtxMemPermissionsHandle_t permissions, /* special object like NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE are not supported */ + uint32_t bindScope, /* NVTX_MEM_PERMISSIONS_BIND_SCOPE_* */ + uint32_t bindFlags); /* NVTX_MEM_PERMISSIONS_BIND_FLAGS_* */ + +/** + * \brief Unbind the permissions object bound to the caller thread. + * + * Upon unbind, the thread local permissions for a scope are restored to the default + * behavior defined by the scope. + */ +NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsUnbind( + nvtxDomainHandle_t domain, + uint32_t bindScope); + +/** @} */ /*END defgroup*/ + +typedef enum NvtxExtMemCallbackId +{ + /* CBID 0 is invalid */ + NVTX3EXT_CBID_nvtxMemHeapRegister = 1, + NVTX3EXT_CBID_nvtxMemHeapUnregister = 2, + NVTX3EXT_CBID_nvtxMemHeapReset = 3, + NVTX3EXT_CBID_nvtxMemRegionsRegister = 4, + NVTX3EXT_CBID_nvtxMemRegionsResize = 5, + NVTX3EXT_CBID_nvtxMemRegionsUnregister = 6, + NVTX3EXT_CBID_nvtxMemRegionsName = 7, + NVTX3EXT_CBID_nvtxMemPermissionsAssign = 8, + NVTX3EXT_CBID_nvtxMemPermissionsCreate = 9, + NVTX3EXT_CBID_nvtxMemPermissionsDestroy = 10, + NVTX3EXT_CBID_nvtxMemPermissionsReset = 11, + NVTX3EXT_CBID_nvtxMemPermissionsBind = 12, + NVTX3EXT_CBID_nvtxMemPermissionsUnbind = 13, + + /* 14-16 in nvtExtImplMemCudaRt1.h */ + NVTX3EXT_CBID_nvtxMemCudaGetProcessWidePermissions = 14, + NVTX3EXT_CBID_nvtxMemCudaGetDeviceWidePermissions = 15, + NVTX3EXT_CBID_nvtxMemCudaSetPeerAccess = 16, + + NVTX3EXT_CBID_MEM_FN_NUM = 17 +} NvtxExtMemCallbackId; + +#ifdef __GNUC__ +#pragma GCC visibility push(internal) +#endif + +/* Extension types are required for the implementation and the NVTX handler. */ +#define NVTX_EXT_TYPES_GUARD /* Ensure other headers cannot be included directly */ +#include "nvtxDetail/nvtxExtTypes.h" +#undef NVTX_EXT_TYPES_GUARD + +#ifndef NVTX_NO_IMPL +/* Ensure other headers cannot be included directly */ +#define NVTX_EXT_IMPL_MEM_GUARD +#include "nvtxDetail/nvtxExtImplMem_v1.h" +#undef NVTX_EXT_IMPL_MEM_GUARD +#endif /*NVTX_NO_IMPL*/ + +#ifdef __GNUC__ +#pragma GCC visibility pop +#endif + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif /* NVTOOLSEXTV3_MEM_V1 */ diff --git a/src/include/nvtx3/nvToolsExtMemCudaRt.h b/src/include/nvtx3/nvToolsExtMemCudaRt.h new file mode 100644 index 000000000..2b374bff9 --- /dev/null +++ b/src/include/nvtx3/nvToolsExtMemCudaRt.h @@ -0,0 +1,150 @@ +/* +* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. +* +* Licensed under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. +* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ +#ifndef NVTOOLSEXTV3_MEM_CUDART_V1 +#define NVTOOLSEXTV3_MEM_CUDART_V1 + +#include "nvToolsExtMem.h" + +#include "cuda.h" +#include "cuda_runtime.h" + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + + +/** \brief The memory is from a CUDA runtime array. 
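Putting the bind/unbind calls from nvToolsExtMem.h together with the thread scope described above, a minimal sketch (illustrative function name; `perm` is assumed to come from nvtxMemPermissionsCreate, since the process-wide handle is not allowed here):

#include "nvtx3/nvToolsExtMem.h"

/* Sketch: bind a permissions object to the calling CPU thread so an attached
 * tool can validate accesses made from this thread, then unbind when done. */
static void checkSection(nvtxDomainHandle_t domain,
                         nvtxMemPermissionsHandle_t perm)
{
    nvtxMemPermissionsBind(domain, perm,
                           NVTX_MEM_PERMISSIONS_BIND_SCOPE_CPU_THREAD,
                           NVTX_MEM_PERMISSIONS_BIND_FLAGS_STRICT_WRITE);

    /* ... code whose memory accesses should honor `perm` runs here ... */

    nvtxMemPermissionsUnbind(domain, NVTX_MEM_PERMISSIONS_BIND_SCOPE_CPU_THREAD);
}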
+ * + * Relevant functions: cudaMallocArray, cudaMalloc3DArray + * Also cudaArray_t from other types such as cudaMipmappedArray_t + * + * NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is not supported + * + * nvtxMemHeapRegister receives a heapDesc of type cudaArray_t because the description can be retrieved by tools through cudaArrayGetInfo() + * nvtxMemRegionRegisterEx receives a regionDesc of type nvtxMemCudaArrayRangeDesc_t + */ +#define NVTX_MEM_TYPE_CUDA_ARRAY 0x11 + +/** \brief structure to describe memory in a CUDA array object + */ +typedef struct nvtxMemCudaArrayRangeDesc_v1 +{ + uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */ + uint16_t structSize; /* Size of the structure. */ + uint32_t reserved0; + cudaArray_t src; + size_t offset[3]; + size_t extent[3]; +} nvtxMemCudaArrayRangeDesc_v1; +typedef nvtxMemCudaArrayRangeDesc_v1 nvtxMemCudaArrayRangeDesc_t; + + +/** \brief The memory is from a CUDA device array. + * + * Relevant functions: cuArrayCreate, cuArray3DCreate + * Also CUarray from other types such as CUmipmappedArray + * + * NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is not supported + * + * nvtxMemHeapRegister receives a heapDesc of type cudaArray_t because the description can be retrieved by tools through cudaArrayGetInfo() + * nvtxMemRegionRegisterEx receives a regionDesc of type nvtxMemCuArrayRangeDesc_t + */ +#define NVTX_MEM_TYPE_CU_ARRAY 0x12 + +/** \brief structure to describe memory in a CUDA array object + */ +typedef struct nvtxMemCuArrayRangeDesc_v1 +{ + uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */ + uint16_t structSize; /* Size of the structure. */ + uint32_t reserved0; + CUarray src; + size_t offset[3]; + size_t extent[3]; +} nvtxMemCuArrayRangeDesc_v1; +typedef nvtxMemCuArrayRangeDesc_v1 nvtxMemCuArrayRangeDesc_t; + +/* Reserving 0x2-0xF for more common types */ + +#define NVTX_MEM_CUDA_PEER_ALL_DEVICES -1 + +/** \brief Get the permission object that represent the CUDA runtime device + * or cuda driver context + * + * This object will allow developers to adjust permissions applied to work executed + * on the GPU. It may be inherited or overridden by permissions object bound + * with NVTX_MEM_PERMISSIONS_BIND_SCOPE_CUDA_STREAM, depending on the binding flags. + * + * Ex. change the peer to peer access permissions between devices in entirety + * or punch through special holes + * + * By default, all memory is accessible that naturally would be to a CUDA kernel until + * modified otherwise by nvtxMemCudaSetPeerAccess or changing regions. + * + * This object should also represent the CUDA driver API level context. +*/ +NVTX_DECLSPEC nvtxMemPermissionsHandle_t NVTX_API nvtxMemCudaGetProcessWidePermissions( + nvtxDomainHandle_t domain); + +/** \brief Get the permission object that represent the CUDA runtime device + * or cuda driver context + * + * This object will allow developers to adjust permissions applied to work executed + * on the GPU. It may be inherited or overridden by permissions object bound + * with NVTX_MEM_PERMISSIONS_BIND_SCOPE_CUDA_STREAM, depending on the binding flags. + * + * Ex. change the peer to peer access permissions between devices in entirety + * or punch through special holes + * + * By default, all memory is accessible that naturally would be to a CUDA kernel until + * modified otherwise by nvtxMemCudaSetPeerAccess or changing regions. + * + * This object should also represent the CUDA driver API level context. 
+*/ +NVTX_DECLSPEC nvtxMemPermissionsHandle_t NVTX_API nvtxMemCudaGetDeviceWidePermissions( + nvtxDomainHandle_t domain, + int device); + +/** \brief Change the default behavior for all memory mapped in from a particular device. + * + * While typically all memory defaults to readable and writable, users may desire to limit + * access to reduced default permissions such as read-only and a per-device basis. + * + * Regions can used to further override smaller windows of memory. + * + * devicePeer can be NVTX_MEM_CUDA_PEER_ALL_DEVICES + * +*/ +NVTX_DECLSPEC void NVTX_API nvtxMemCudaSetPeerAccess( + nvtxDomainHandle_t domain, + nvtxMemPermissionsHandle_t permissions, + int devicePeer, /* device number such as from cudaGetDevice() or NVTX_MEM_CUDA_PEER_ALL_DEVICES */ + uint32_t flags); /* NVTX_MEM_PERMISSIONS_REGION_FLAGS_* */ + +/** @} */ /*END defgroup*/ + +#ifdef __GNUC__ +#pragma GCC visibility push(internal) +#endif + +#ifndef NVTX_NO_IMPL +#define NVTX_EXT_IMPL_MEM_CUDART_GUARD /* Ensure other headers cannot be included directly */ +#include "nvtxDetail/nvtxExtImplMemCudaRt_v1.h" +#undef NVTX_EXT_IMPL_MEM_CUDART_GUARD +#endif /*NVTX_NO_IMPL*/ + +#ifdef __GNUC__ +#pragma GCC visibility pop +#endif + + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif /* NVTOOLSEXTV3_MEM_CUDART_V1 */ diff --git a/src/include/nvtx3/nvToolsExtOpenCL.h b/src/include/nvtx3/nvToolsExtOpenCL.h index a7b8a19b0..7b40b4115 100644 --- a/src/include/nvtx3/nvToolsExtOpenCL.h +++ b/src/include/nvtx3/nvToolsExtOpenCL.h @@ -30,11 +30,11 @@ extern "C" { */ /* ------------------------------------------------------------------------- */ -/* \cond SHOW_HIDDEN +/* \cond SHOW_HIDDEN * \brief Used to build a non-colliding value for resource types separated class * \version \NVTX_VERSION_2 */ -#define NVTX_RESOURCE_CLASS_OPENCL 6 +#define NVTX_RESOURCE_CLASS_OPENCL 6 /** \endcond */ /* ------------------------------------------------------------------------- */ @@ -183,7 +183,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name) #endif /* __cplusplus */ #ifndef NVTX_NO_IMPL -#define NVTX_IMPL_GUARD_OPENCL /* Ensure other headers cannot included directly */ +#define NVTX_IMPL_GUARD_OPENCL /* Ensure other headers cannot be included directly */ #include "nvtxDetail/nvtxImplOpenCL_v3.h" #undef NVTX_IMPL_GUARD_OPENCL #endif /*NVTX_NO_IMPL*/ diff --git a/src/include/nvtx3/nvToolsExtPayload.h b/src/include/nvtx3/nvToolsExtPayload.h index a46c833e2..c775738b1 100644 --- a/src/include/nvtx3/nvToolsExtPayload.h +++ b/src/include/nvtx3/nvToolsExtPayload.h @@ -1,5 +1,5 @@ /* -* Copyright 2021-2022 NVIDIA Corporation. All rights reserved. +* Copyright 2021-2024 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. @@ -8,34 +8,41 @@ #include "nvToolsExt.h" -#ifndef NVTOOLSEXT_PAYLOAD_H -#define NVTOOLSEXT_PAYLOAD_H +/* Optionally include helper macros. */ +/* #include "nvToolsExtPayloadHelper.h" */ -#ifdef __cplusplus -extern "C" { -#endif /* __cplusplus */ +/** + * If needed, semantic extension headers can be included after this header. + */ /** - * \brief A compatibility ID value used in initialization to identify version - * differences. + * \brief The compatibility ID is used for versioning of this extension. 
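Tying together the device-wide permission query and the peer-access control from nvToolsExtMemCudaRt.h above, a minimal sketch (illustrative names; `device` is a CUDA runtime device ordinal, and the call only changes what an attached tool validates, not real CUDA peer access):

#include "nvtx3/nvToolsExtMemCudaRt.h"

/* Sketch: restrict the default access that kernels running on `device` have
 * to memory mapped in from all peer devices to read-only. */
static void restrictPeerWrites(nvtxDomainHandle_t domain, int device)
{
    nvtxMemPermissionsHandle_t devPerm =
        nvtxMemCudaGetDeviceWidePermissions(domain, device);

    nvtxMemCudaSetPeerAccess(domain, devPerm,
                             NVTX_MEM_CUDA_PEER_ALL_DEVICES,
                             NVTX_MEM_PERMISSIONS_REGION_FLAGS_READ);
}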
*/ -#define NVTX_EXT_COMPATID_PAYLOAD 0x0103 +#ifndef NVTX_EXT_PAYLOAD_COMPATID +#define NVTX_EXT_PAYLOAD_COMPATID 0x0103 +#endif /** - * \brief This module ID identifies the payload extension. It has to be unique + * \brief The module ID identifies the payload extension. It has to be unique * among the extension modules. */ -#define NVTX_EXT_MODULEID_PAYLOAD 2 +#ifndef NVTX_EXT_PAYLOAD_MODULEID +#define NVTX_EXT_PAYLOAD_MODULEID 2 +#endif /** - * \brief Additional values for the enum @ref nvtxPayloadType_t + * \brief Additional value for the enum @ref nvtxPayloadType_t */ -#define NVTX_PAYLOAD_TYPE_BINARY ((int32_t)0xDFBD0009) - +#ifndef NVTX_PAYLOAD_TYPE_EXT +#define NVTX_PAYLOAD_TYPE_EXT ((int32_t)0xDFBD0009) +#endif /** --------------------------------------------------------------------------- - * Payload schema entry flags. + * Payload schema entry flags. Used for @ref nvtxPayloadSchemaEntry_t::flags. * ------------------------------------------------------------------------- */ +#ifndef NVTX_PAYLOAD_ENTRY_FLAGS_V1 +#define NVTX_PAYLOAD_ENTRY_FLAGS_V1 + #define NVTX_PAYLOAD_ENTRY_FLAG_UNUSED 0 /** @@ -56,37 +63,79 @@ extern "C" { /** * The value is an array with fixed length, set with the field `arrayLength`. */ -#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_FIXED_SIZE (1 << 4) +#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_FIXED_SIZE (1 << 4) /** * The value is a zero-/null-terminated array. */ -#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED (2 << 4) +#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED (2 << 4) /** * \brief A single or multi-dimensional array of variable length. * - * The field `arrayLength` contains the index of the schema entry that holds the - * length(s). If the other field points to a scalar entry then this will be the - * 1D array. If the other field points to a FIXED_SIZE array, then the number of - * dimensions is defined with the registration of the scheme. If the other field - * is ZERO_TERMINATED, the array the dimensions can be determined at runtime. + * The field `arrayOrUnionDetail` contains the index of the schema entry that + * holds the length(s). If the length entry is a scalar, then this entry is a 1D + * array. If the length entry is a fixed-size array, then the number of + * dimensions is defined with the registration of the schema. If the length + * entry is a zero-terminated array, then the array of the dimensions can be + * determined at runtime. + * For multidimensional arrays, values are stored in row-major order, with rows + * being stored consecutively in contiguous memory. The size of the entry (in + * bytes) is the product of the dimensions multiplied with size of the array + * element. */ -#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX (3 << 4) +#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX (3 << 4) /** + * \brief A single or multi-dimensional array of variable length, where the + * dimensions are stored in a different payload (index) of the same event. + * + * This enables an existing address to an array to be directly passed, while the + * dimensions are defined in a separate payload (with only one payload entry). + */ +#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_PAYLOAD_INDEX (4 << 4) + +/** + * \brief The value or data that is pointed to by this payload entry value shall + * be copied by the NVTX handler. + * * A tool may not support deep copy and just ignore this flag. * See @ref NVTX_PAYLOAD_SCHEMA_FLAG_DEEP_COPY for more details. 
*/ -#define NVTX_PAYLOAD_ENTRY_FLAG_DEEP_COPY (1 << 9) +#define NVTX_PAYLOAD_ENTRY_FLAG_DEEP_COPY (1 << 8) + +/** + * Notifies the NVTX handler to hide this entry in case of visualization. + */ +#define NVTX_PAYLOAD_ENTRY_FLAG_HIDE (1 << 9) + +/** + * The entry specifies the event message. Any string type can be used. + */ +#define NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE (1 << 10) /** - * The entry specifies the message in a deferred event. The entry type can be - * any string type. The flag is ignored for schemas that are not flagged with - * `NVTX_PAYLOAD_SCHEMA_FLAG_RANGE*` or `NVTX_PAYLOAD_SCHEMA_FLAG_MARK`. + * \brief The entry contains an event timestamp. + * + * The time source might be provided via the entry semantics field. In most + * cases, the timestamp (entry) type is @ref NVTX_PAYLOAD_ENTRY_TYPE_UINT64. */ -#define NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE (1 << 10) +#define NVTX_PAYLOAD_ENTRY_FLAG_EVENT_TIMESTAMP (2 << 10) +/** + * These flags specify the NVTX event type to which an entry refers. + */ +#define NVTX_PAYLOAD_ENTRY_FLAG_RANGE_BEGIN (1 << 12) +#define NVTX_PAYLOAD_ENTRY_FLAG_RANGE_END (2 << 12) +#define NVTX_PAYLOAD_ENTRY_FLAG_MARK (3 << 12) +#define NVTX_PAYLOAD_ENTRY_FLAG_COUNTER (4 << 12) + +#endif /* NVTX_PAYLOAD_ENTRY_FLAGS_V1 */ +/** --------------------------------------------------------------------------- + * END: Payload schema entry flags. + * ------------------------------------------------------------------------- */ + +/** \todo: Keep this in the header? */ /** * @note The ‘array’ flags assume that the array is embedded. Otherwise, * @ref NVTX_PAYLOAD_ENTRY_FLAG_POINTER has to be additionally specified. Some @@ -103,11 +152,14 @@ extern "C" { NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED | \ NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX) +#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_TYPE(F) \ + (F & NVTX_PAYLOAD_ENTRY_FLAG_IS_ARRAY) +/** \todo end */ + + /** --------------------------------------------------------------------------- * Types of entries in a payload schema. - * ------------------------------------------------------------------------- */ - -/** + * * @note Several of the predefined types contain the size (in bits) in their * names. For some data types the size (in bytes) is not fixed and may differ * for different platforms/operating systems/compilers. To provide portability, @@ -116,9 +168,11 @@ extern "C" { * is passed to the NVTX extension initialization function * @ref InitializeInjectionNvtxExtension via the `extInfo` field of * @ref nvtxExtModuleInfo_t. - */ + * ------------------------------------------------------------------------- */ +#ifndef NVTX_PAYLOAD_ENTRY_TYPES_V1 +#define NVTX_PAYLOAD_ENTRY_TYPES_V1 -#define NVTX_PAYLOAD_ENTRY_TYPE_INVALID 0 +#define NVTX_PAYLOAD_ENTRY_TYPE_INVALID 0 /** * Basic integer types. @@ -147,14 +201,14 @@ extern "C" { #define NVTX_PAYLOAD_ENTRY_TYPE_UINT64 18 /** - * C floating point types + * Floating point types */ #define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT 19 #define NVTX_PAYLOAD_ENTRY_TYPE_DOUBLE 20 #define NVTX_PAYLOAD_ENTRY_TYPE_LONGDOUBLE 21 /** - * Size type (`size_t`) + * Size type (`size_t` in C). */ #define NVTX_PAYLOAD_ENTRY_TYPE_SIZE 22 @@ -179,7 +233,7 @@ extern "C" { /** * Store raw 8-bit binary data. As with `char`, 1-byte alignment is assumed. - * Typically a tool will display this as hex or binary. + * Typically, a tool will display this as hex or binary. 
*/ #define NVTX_PAYLOAD_ENTRY_TYPE_BYTE 32 @@ -201,36 +255,37 @@ extern "C" { #define NVTX_PAYLOAD_ENTRY_TYPE_TF32 52 /** - * These types are normalized numbers stored in integers. UNORMs represent 0.0 - * to 1.0 and SNORMs represent -1.0 to 1.0. The number after represents the - * number of integer bits. Alignment is take from equivalent types INT# matching - * to SNORM# and UINT# matching to UNORM#. + * Data types are as defined by NVTXv3 core. */ -#define NVTX_PAYLOAD_ENTRY_TYPE_SNORM8 61 -#define NVTX_PAYLOAD_ENTRY_TYPE_UNORM8 62 -#define NVTX_PAYLOAD_ENTRY_TYPE_SNORM16 63 -#define NVTX_PAYLOAD_ENTRY_TYPE_UNORM16 64 -#define NVTX_PAYLOAD_ENTRY_TYPE_SNORM32 65 -#define NVTX_PAYLOAD_ENTRY_TYPE_UNORM32 66 -#define NVTX_PAYLOAD_ENTRY_TYPE_SNORM64 67 -#define NVTX_PAYLOAD_ENTRY_TYPE_UNORM64 68 +#define NVTX_PAYLOAD_ENTRY_TYPE_CATEGORY 68 /* uint32_t */ +#define NVTX_PAYLOAD_ENTRY_TYPE_COLOR_ARGB 69 /* uint32_t */ /** - * String types. - * - * If `arrayOrUnionDetail` is greater than `0`, the entry is a fixed-size string - * with the provided length. + * The scope of events or counters (see `nvtxScopeRegister`). + */ +#define NVTX_PAYLOAD_ENTRY_TYPE_SCOPE_ID 70 /* uint64_t */ + +/** + * Thread ID as scope. + */ +#define NVTX_PAYLOAD_ENTRY_TYPE_TID_UINT32 73 +#define NVTX_PAYLOAD_ENTRY_TYPE_TID_UINT64 74 + +/** + * \brief String types. * - * `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_FIXED_SIZE` is ignored for string types. It - * just specifies once more that the entry is a fixed-size string. + * If no flags are set for the entry and `arrayOrUnionDetail > 0`, the entry is + * assumed to be a fixed-size string with the given length, embedded in the payload. + * `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_FIXED_SIZE` is redundant for fixed-size strings. * - * Setting the flag `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED` indicates a - * zero-terminated string. If `arrayOrUnionDetail` is greater than `0`, a zero- - * terminated array of fixed-size strings is assumed. + * \todo(Revise the following paragraph.) + * Setting the flag `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED` specifies a + * zero-terminated string. If `arrayOrUnionDetail > 0`, the entry is handled as + * a zero-terminated array of fixed-size strings. * - * Setting the flag `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX` specifies the - * entry index of the entry which contains the string length. It is not possible - * to describe a variable length array of strings. + * Setting the flag `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX` specifies a + * variable-length string with the length given in the entry specified by the + * field `arrayOrUnionDetail`. */ #define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING 75 /* `char*`, system LOCALE */ #define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING_UTF8 76 @@ -238,93 +293,194 @@ extern "C" { #define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING_UTF32 78 /** - * @ref nvtxStringHandle_t returned by @ref nvtxDomainRegisterString + * The entry value is of type @ref nvtxStringHandle_t returned by + * @ref nvtxDomainRegisterString. */ #define NVTX_PAYLOAD_ENTRY_TYPE_NVTX_REGISTERED_STRING_HANDLE 80 -/** - * Entry types to be used in deferred events. Data types are as defined by - * NVTXv3 core: category -> uint32_t, color -> uint32_t, color type -> int32_t. - */ -#define NVTX_PAYLOAD_ENTRY_TYPE_NVTX_CATEGORY 90 -#define NVTX_PAYLOAD_ENTRY_TYPE_NVTX_COLORTYPE 91 -#define NVTX_PAYLOAD_ENTRY_TYPE_NVTX_COLOR 92 - /** * This type marks the union selector member (entry index) in schemas used by - * a union with internal internal selector. 
+ * a union with internal selector. * See @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION_WITH_INTERNAL_SELECTOR. */ #define NVTX_PAYLOAD_ENTRY_TYPE_UNION_SELECTOR 100 /** - * Timestamp types occupy the range from 128 to 255 + * \brief Predefined schema ID for payload data that is referenced in another payload. + * + * This schema ID can be used in @ref nvtxPayloadData_t::schema_id to indicate that the + * payload is a blob of memory which other payload entries may point into. + * A tool will not expose this payload directly. + * + * This schema ID cannot be used as schema entry type! */ -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP64 128 /* data type is uint64_t */ +#define NVTX_TYPE_PAYLOAD_SCHEMA_REFERENCED 1022 + +/** + * \brief Predefined schema ID for raw payload data. + * + * This schema ID can be used in @ref nvtxPayloadData_t::schema_id to indicate + * that the payload is a blob, which can be shown with an arbitrary data viewer. + * This schema ID cannot be used as schema entry type! + */ +#define NVTX_TYPE_PAYLOAD_SCHEMA_RAW 1023 + +/** + * \deprecated: Remove for official release! + * In the initial version of this header custom schema IDs started + * here. Unless predefined types require more than 16 bits we can keep this + * value to preserve backwards compatibility. The value is not used as first + * ID for custom schemas any more, but in the analysis every entry type >= this + * value is assumed to be a custom schema. + */ +#define NVTX_PAYLOAD_ENTRY_TYPE_CUSTOM_BASE 65536 + +/* Custom (static) schema IDs. */ +#define NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START (1 << 24) + +/* Dynamic schema IDs (generated by the tool) start here. */ +#define NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START 4294967296 /* 1 << 32 */ + +#endif /* NVTX_PAYLOAD_ENTRY_TYPES_V1 */ +/** --------------------------------------------------------------------------- + * END: Payload schema entry types. + * ------------------------------------------------------------------------- */ + + +#ifndef NVTX_PAYLOAD_SCHEMA_TYPES_V1 +#define NVTX_PAYLOAD_SCHEMA_TYPES_V1 /** - * CPU timestamp sources. - * \todo All 64 bits? + * \brief The payload schema type. + * + * A schema can be either of the following types. It is set with + * @ref nvtxPayloadSchemaAttr_t::type. 
*/ -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_TSC 129 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_TSC_NONVIRTUALIZED 130 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_REALTIME 131 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_REALTIME_COARSE 132 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_MONOTONIC 133 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_MONOTONIC_RAW 134 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_MONOTONIC_COARSE 135 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_BOOTTIME 136 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_PROCESS_CPUTIME_ID 137 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_THREAD_CPUTIME_ID 138 +#define NVTX_PAYLOAD_SCHEMA_TYPE_INVALID 0 +#define NVTX_PAYLOAD_SCHEMA_TYPE_STATIC 1 +#define NVTX_PAYLOAD_SCHEMA_TYPE_DYNAMIC 2 +#define NVTX_PAYLOAD_SCHEMA_TYPE_UNION 3 +#define NVTX_PAYLOAD_SCHEMA_TYPE_UNION_WITH_INTERNAL_SELECTOR 4 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_WIN_QPC 160 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_WIN_GSTAFT 161 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_WIN_GSTAFTP 162 +#endif /* NVTX_PAYLOAD_SCHEMA_TYPES_V1 */ -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_C_TIME 163 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_C_CLOCK 164 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_C_TIMESPEC_GET 165 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_STEADY_CLOCK 166 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_HIGH_RESOLUTION_CLOCK 167 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_SYSTEM_CLOCK 168 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_UTC_CLOCK 169 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_TAI_CLOCK 170 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_GPS_CLOCK 171 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_FILE_CLOCK 172 +#ifndef NVTX_PAYLOAD_SCHEMA_FLAGS_V1 +#define NVTX_PAYLOAD_SCHEMA_FLAGS_V1 /** - * \brief GPU timestamp sources. + * \brief Flags for static and dynamic schemas. + * + * The schema flags are used with @ref nvtxPayloadSchemaAttr_t::flags. */ -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_GPU_GLOBALTIMER 192 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_GPU_SM_CLOCK 193 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_GPU_SM_CLOCK64 194 -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_GPU_CUPTI 195 +#define NVTX_PAYLOAD_SCHEMA_FLAG_NONE 0 /** - * The timestamp was provided by the NVTX handler’s timestamp routine. + * This flag indicates that a schema and the corresponding payloads can + * contain fields which require a deep copy. */ -#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_TOOL_PROVIDED 224 +#define NVTX_PAYLOAD_SCHEMA_FLAG_DEEP_COPY (1 << 1) /** - * This predefined schema ID can be used in `nvtxPayloadData_t` to indicate that - * the payload is a blob of memory which other payload entries may point into. - * A tool will not expose this payload directly. + * This flag indicates that a schema and the corresponding payload can be + * referenced by another payload of the same event. If the schema is not + * intended to be visualized directly, it is possible use + * @ref NVTX_TYPE_PAYLOAD_SCHEMA_REFERENCED instead. */ -#define NVTX_TYPE_PAYLOAD_SCHEMA_REFERENCED 1022 +#define NVTX_PAYLOAD_SCHEMA_FLAG_REFERENCED (1 << 2) /** - * This predefined schema ID can be used in `nvtxPayloadData_t` to indicate that - * the payload is a blob which can be shown with an arbitrary data viewer. + * The schema defines a counter group. An NVTX handler can expect that the schema + * contains entries with counter semantics. 
*/ -#define NVTX_TYPE_PAYLOAD_SCHEMA_RAW 1023 +#define NVTX_PAYLOAD_SCHEMA_FLAG_COUNTER_GROUP (1 << 3) -/* Custom (static) schema IDs. */ -#define NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START (1 << 24) -/* Dynamic schema IDs (generated by the tool) start here. */ -#define NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START 4294967296 // 1 << 32 +#endif /* NVTX_PAYLOAD_SCHEMA_FLAGS_V1 */ + + +#ifndef NVTX_PAYLOAD_SCHEMA_ATTRS_V1 +#define NVTX_PAYLOAD_SCHEMA_ATTRS_V1 + +/** + * The values allow the valid fields in @ref nvtxPayloadSchemaAttr_t to be + * specified via setting the field `fieldMask`. + */ +#define NVTX_PAYLOAD_SCHEMA_ATTR_NAME (1 << 1) +#define NVTX_PAYLOAD_SCHEMA_ATTR_TYPE (1 << 2) +#define NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS (1 << 3) +#define NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES (1 << 4) +#define NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES (1 << 5) +#define NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE (1 << 6) +#define NVTX_PAYLOAD_SCHEMA_ATTR_ALIGNMENT (1 << 7) +#define NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID (1 << 8) +#define NVTX_PAYLOAD_SCHEMA_ATTR_EXTENSION (1 << 9) + +#endif /* NVTX_PAYLOAD_SCHEMA_ATTRS_V1 */ + + +#ifndef NVTX_PAYLOAD_ENUM_ATTRS_V1 +#define NVTX_PAYLOAD_ENUM_ATTRS_V1 + +/** + * The values are used to set the field `fieldMask` and specify which fields in + * @ref nvtxPayloadEnumAttr_t are set. + */ +#define NVTX_PAYLOAD_ENUM_ATTR_NAME (1 << 1) +#define NVTX_PAYLOAD_ENUM_ATTR_ENTRIES (1 << 2) +#define NVTX_PAYLOAD_ENUM_ATTR_NUM_ENTRIES (1 << 3) +#define NVTX_PAYLOAD_ENUM_ATTR_SIZE (1 << 4) +#define NVTX_PAYLOAD_ENUM_ATTR_SCHEMA_ID (1 << 5) +#define NVTX_PAYLOAD_ENUM_ATTR_EXTENSION (1 << 6) +#endif /* NVTX_PAYLOAD_ENUM_ATTRS_V1 */ + +/** + * An NVTX scope specifies the execution scope or source of events or counters. + */ +#ifndef NVTX_SCOPES_V1 +#define NVTX_SCOPES_V1 + +/** Identifies an invalid scope and indicates an error if returned by `nvtxScopeRegister`. */ +#define NVTX_SCOPE_NONE 0 /* no scope */ + +#define NVTX_SCOPE_ROOT 1 + +#define NVTX_SCOPE_CURRENT_HW_MACHINE 2 /* Node/machine name */ +#define NVTX_SCOPE_CURRENT_HW_SOCKET 3 +#define NVTX_SCOPE_CURRENT_HW_CPU_PHYSICAL 4 /* Physical CPU core */ +#define NVTX_SCOPE_CURRENT_HW_CPU_LOGICAL 5 /* Logical CPU core */ +/* Innermost HW execution context at registration time */ +#define NVTX_SCOPE_CURRENT_HW_INNERMOST 15 + +/* Virtualized hardware, virtual machines, OS (if you don't know any better) +\todo: Need to be more precise what information is expected for each of these scopes. */ +#define NVTX_SCOPE_CURRENT_HYPERVISOR 16 +#define NVTX_SCOPE_CURRENT_VM 17 +#define NVTX_SCOPE_CURRENT_KERNEL 18 +#define NVTX_SCOPE_CURRENT_CONTAINER 19 +#define NVTX_SCOPE_CURRENT_OS 20 + +/* Software scopes */ +#define NVTX_SCOPE_CURRENT_SW_PROCESS 21 /* Process scope */ +#define NVTX_SCOPE_CURRENT_SW_THREAD 22 /* Thread scope */ +/* Innermost SW execution context at registration time */ +#define NVTX_SCOPE_CURRENT_SW_INNERMOST 31 + +/** Static (user-provided) scope IDs (feed forward) */ +#define NVTX_SCOPE_ID_STATIC_START (1 << 24) + +/** Dynamically (tool) generated scope IDs */ +#define NVTX_SCOPE_ID_DYNAMIC_START 4294967296 /* 1 << 32 */ + +#endif /* NVTX_SCOPES_V1 */ + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#ifndef NVTX_PAYLOAD_TYPEDEFS_V1 +#define NVTX_PAYLOAD_TYPEDEFS_V1 /** * \brief Size and alignment information for predefined payload entry types. @@ -333,12 +489,64 @@ extern "C" { * array for the predefined types is passed via nvtxExtModuleInfo_t to the NVTX * client/handler. 
The type (ID) is used as index into this array. */ -typedef struct nvtxPayloadEntryTypeInfo_t +typedef struct nvtxPayloadEntryTypeInfo_v1 { uint16_t size; uint16_t align; } nvtxPayloadEntryTypeInfo_t; +/** + * \brief Binary payload data, size and decoding information. + * + * An array of type `nvtxPayloadData_t` is passed to the NVTX event attached to + * an NVTX event via the `payload.ullvalue` field of NVTX event attributes. + * + * The `schemaId` be a predefined schema entry type (`NVTX_PAYLOAD_ENTRY_TYPE*`), + * a schema ID (statically specified or dynamically created) or one of + * `NVTX_PAYLOAD_TYPE_REFERENCED` or `NVTX_PAYLOAD_TYPE_RAW`. + * + * Setting the size of a payload to `MAX_SIZE` can be useful to reduce the + * overhead of NVTX instrumentation, when no NVTX handler is attached. However, + * a tool might not be able to detect the size of a payload and thus skip it. + * A reasonable use case is a payload that represents a null-terminated + * C string, where the NVTX handler can call `strlen()`. + */ +typedef struct nvtxPayloadData_v1 +{ + /** + * The schema ID, which defines the layout of the binary data. + */ + uint64_t schemaId; + + /** + * Size of the payload (blob) in bytes. `SIZE_MAX` (`-1`) indicates the tool + * that it should figure out the size, which might not be possible. + */ + size_t size; + + /** + * Pointer to the binary payload data. + */ + const void* payload; +} nvtxPayloadData_t; + + +/** + * \brief Header of the payload entry's semantic field. + * + * If the semantic field of the payload schema entry is set, the first four + * fields (header) are defined with this type. A tool can iterate through the + * extensions and check, if it supports (can handle) it. + */ +typedef struct nvtxSemanticsHeader_v1 +{ + uint32_t structSize; /** Size of semantic extension struct. */ + uint16_t semanticId; + uint16_t version; + const struct nvtxSemanticsHeader_v1* next; /** linked list */ + /* Additional fields are defined by the specific semantic extension. */ +} nvtxSemanticsHeader_t; + /** * \brief Entry in a schema. * @@ -349,12 +557,12 @@ typedef struct nvtxPayloadEntryTypeInfo_t * and the offset is determined based on self-alignment rules. * * Example schema: - * nvtxPayloadSchemaEntry_t desc[] = { + * nvtxPayloadSchemaEntry_t schema[] = { * {0, NVTX_EXT_PAYLOAD_TYPE_UINT8, "one byte"}, * {0, NVTX_EXT_PAYLOAD_TYPE_INT32, "four bytes"} * }; */ -typedef struct nvtxPayloadSchemaEntry_t +typedef struct nvtxPayloadSchemaEntry_v1 { /** * \brief Flags to augment the basic type. @@ -365,37 +573,39 @@ typedef struct nvtxPayloadSchemaEntry_t uint64_t flags; /** - * \brief Predefined payload schema entry type or ID of a registered payload - * schema. + * \brief Predefined payload schema entry type or custom schema ID. + * + * Predefined types are `NVTX_PAYLOAD_ENTRY_TYPE_*`. Passing a schema ID + * enables nesting of schemas. */ uint64_t type; /** - * \brief Name of the payload entry. (Optional) + * \brief Name or label of the payload entry. (Optional) * - * Providing a name is useful to give a meaning to the associated value. + * A meaningful name or label can help organizing and interpreting the data. */ const char* name; /** * \brief Description of the payload entry. (Optional) + * + * A more detail description of the data that is stored with this entry. */ const char* description; /** - * \brief String or array length or union selector for union types. + * \brief String length, array length or member selector for union types. 
* - * If @ref type is a C string type, this defines the length of the string. + * If @ref type is a C string type, this field specifies the string length. * - * If @ref flags specify that the entry is an array, this field defines the - * length of the array. See `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_*` for more - * details. + * If @ref flags specify that the entry is an array, this field specifies + * the array length. See `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_*` for more details. * - * If @ref type implies that the entry is a union with schema type - * @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION (external selection of the union - * member), this field contains the index (starting with 0) to an entry of - * integer type in the same schema. The associated field contains the - * selected union member. + * If @ref type is a union with schema type @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION + * (external selection of the union member), this field contains the index + * (starting with 0) to an entry of integral type in the same schema. The + * associated field value specifies the selected union member. * * @note An array of schema type @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION is not * supported. @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION_WITH_INTERNAL_SELECTOR can @@ -407,176 +617,53 @@ typedef struct nvtxPayloadSchemaEntry_t * \brief Offset in the binary payload data (in bytes). * * This field specifies the byte offset from the base address of the actual - * binary data (blob) to the data of this entry. + * binary data (blob) to the start address of the data of this entry. + * + * It is recommended (but not required) to provide the offset it. Otherwise, + * the NVTX handler will determine the offset from natural alignment rules. + * In some cases, e.g. dynamic schema layouts, the offset cannot be set and + * has to be determined based on the data of prior entries. * - * This is an optional field, but it is recommended to specify this field to - * avoid issues in the automatic detection of the offset by a tool/handler. + * Setting the offset can also be used to skip entries during payload parsing. */ uint64_t offset; /** - * Semantics are not yet defined. + * \brief Additional semantics of the payload entry. + * + * The field points to the first element in a linked list, which enables + * multiple semantic extensions. */ - void* semantics; + const nvtxSemanticsHeader_t* semantics; /** - * Reserved for future use. Do not use it! + * \brief Reserved for future use. Do not use it! */ - void* reserved; + const void* reserved; } nvtxPayloadSchemaEntry_t; -/** - * \brief Binary payload data, size and decoding information. - * - * An array of nvtxPayloadData_t is passed to the NVTX event attribute payload - * member. To attach a single payload the macro @ref NVTX_EXT_PAYLOAD_SET_ATTR - * can be used. - */ -typedef struct nvtxPayloadData_t -{ - /** - * The schema ID, which defines the layout of the binary data. - */ - uint64_t schemaId; - - /** - * Size of the binary payload (blob) in bytes. - */ - size_t size; - - /** - * Pointer to the binary payload data. 
- */ - const void* payload; -} nvtxPayloadData_t; - -/* Helper macros for safe double-cast of pointer to uint64_t value */ -#ifndef NVTX_POINTER_AS_PAYLOAD_ULLVALUE -# ifdef __cplusplus -# define NVTX_POINTER_AS_PAYLOAD_ULLVALUE(p) \ - static_cast(reinterpret_cast(p)) -# else -#define NVTX_POINTER_AS_PAYLOAD_ULLVALUE(p) ((uint64_t)(uintptr_t)p) -# endif -#endif - - -#define NVTX_PAYLOAD_CONCAT2(a,b) a##b -#define NVTX_PAYLOAD_CONCAT(a,b) NVTX_PAYLOAD_CONCAT2(a,b) -#define NVTX_DATA_VAR NVTX_PAYLOAD_CONCAT(nvtxDFDB,__LINE__) - -/** - * \brief Helper macro to attach a single payload to an NVTX event attribute. - * - * @note The NVTX push, start or mark operation must not be in the same or a - * nested scope. - */ -#define NVTX_PAYLOAD_EVTATTR_SET(EVTATTR, SCHEMA_ID, PAYLOAD_ADDR, SIZE) \ - nvtxPayloadData_t NVTX_DATA_VAR[] = {{SCHEMA_ID, SIZE, PAYLOAD_ADDR}}; \ - (EVTATTR).payload.ullValue = \ - NVTX_POINTER_AS_PAYLOAD_ULLVALUE(NVTX_DATA_VAR); \ - (EVTATTR).payloadType = NVTX_PAYLOAD_TYPE_BINARY; \ - (EVTATTR).reserved0 = 1; - -/** - * \brief Helper macro to attach multiple payloads to an NVTX event attribute. - * - * The payload data array (`nvtxPayloadData_t`) is passed as first argument to - * this macro. - */ -#define NVTX_PAYLOAD_EVTATTR_SET_MULTIPLE(EVTATTR, PAYLOADS) \ - (EVTATTR).payloadType = NVTX_PAYLOAD_TYPE_BINARY; \ - (EVTATTR).reserved0 = sizeof(PAYLOADS)/sizeof(nvtxPayloadData_t); \ - (EVTATTR).payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(PAYLOADS); - /** - * \brief The payload schema type. - * - * A schema can be either of these types. + * \brief Header of the schema attribute extension field. */ -enum nvtxPayloadSchemaType +typedef struct nvtxPayloadSchemaExtension_v1 { - NVTX_PAYLOAD_SCHEMA_TYPE_INVALID = 0, - - NVTX_PAYLOAD_SCHEMA_TYPE_STATIC = 1, - NVTX_PAYLOAD_SCHEMA_TYPE_DYNAMIC = 2, - - NVTX_PAYLOAD_SCHEMA_TYPE_UNION = 3, - NVTX_PAYLOAD_SCHEMA_TYPE_UNION_WITH_INTERNAL_SELECTOR = 4 -}; - -/** - * \brief Flags for static and dynamic schemas. - */ -enum nvtxPayloadSchemaFlags -{ - NVTX_PAYLOAD_SCHEMA_FLAG_NONE = 0, - - /** - * This flag indicates that a schema and the corresponding payloads can - * contain fields which require a deep copy. - */ - NVTX_PAYLOAD_SCHEMA_FLAG_DEEP_COPY = (1 << 1), - - /** - * This flag indicates that a schema and the corresponding payloads can - * be referenced by another payload of the same event. - */ - NVTX_PAYLOAD_SCHEMA_FLAG_REFERENCED = (1 << 2), - - /** - * The schema describes a deferred event/marker. Such a schema requires one - * timestamp entry and one string entry with the flag - * `NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE`. Category and color can be - * optionally specified with the respective entry types. The deferred event - * can contain a binary payload itself by using a custom schema ID as type - * its schema description. Multiple occurrences of the same event can be - * described by specifying an array timestamps. - */ - NVTX_PAYLOAD_SCHEMA_FLAG_DEFERRED_EVENT = (1 << 3), - /** - * The schema describes a deferred event/marker. Such a schema requires - * one start timestamp, one end timestamp and one string entry with the flag - * `NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE`. Category and color can be - * optionally specified with the respective entry types. The deferred range - * can contain a binary payload itself by using a custom schema ID as type - * its schema description. 
- * - * Timestamps can be provided in different ways: - * - A single range has two timestamp entries with the first (smaller entry - * index) being used as the start/push timestamp. - * - If the range schema contains one array of timestamps, the tool assumes - * that the array contains alternating start and end timestamps. - * - If two timestamp arrays are specified the first entry (with the - * smaller entry index) is assumed to contain the start timestamps. Both - * arrays have to be of the same size. - */ - NVTX_PAYLOAD_SCHEMA_FLAG_DEFERRED_RANGE = (2 << 3) -}; + uint32_t structSize; /** Size of schema extension struct. */ + uint16_t schemaExtId; + uint16_t version; + const struct nvtxPayloadSchemaExtension_v1* next; /** linked list */ + /* Additional fields are defined by the specific schema extension. */ +} nvtxPayloadSchemaExtension_t; /** - * The values allow the valid fields in @ref nvtxPayloadSchemaAttr_t to be - * specified via setting the field `fieldMask`. + * \brief NVTX payload schema attributes. */ -#define NVTX_PAYLOAD_SCHEMA_ATTR_NAME (1 << 1) -#define NVTX_PAYLOAD_SCHEMA_ATTR_TYPE (1 << 2) -#define NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS (1 << 3) -#define NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES (1 << 4) -#define NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES (1 << 5) -#define NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE (1 << 6) -#define NVTX_PAYLOAD_SCHEMA_ATTR_ALIGNMENT (1 << 7) -#define NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID (1 << 8) - -/** - * NVTX payload schema attributes. - */ -typedef struct nvtxPayloadSchemaAttr_t +typedef struct nvtxPayloadSchemaAttr_v1 { /** - * \brief Mask of valid fields in this structure. + * \brief Mask of valid fields in this struct. * - * The values from `enum nvtxPayloadSchemaAttributes` have to be used. + * Use the `NVTX_PAYLOAD_SCHEMA_ATTR_*` defines. */ uint64_t fieldMask; @@ -588,14 +675,14 @@ typedef struct nvtxPayloadSchemaAttr_t /** * \brief Payload schema type. (Mandatory) \anchor PAYLOAD_TYPE_FIELD * - * A value from `enum nvtxPayloadSchemaType` has to be used. + * Use the `NVTX_PAYLOAD_SCHEMA_TYPE_*` defines. */ uint64_t type; /** * \brief Payload schema flags. (Optional) * - * Flags defined in `enum nvtxPayloadSchemaFlags` can be used to set + * Flags defined by `NVTX_PAYLOAD_SCHEMA_FLAG_*` can be used to set * additional properties of the schema. */ uint64_t flags; @@ -638,26 +725,23 @@ typedef struct nvtxPayloadSchemaAttr_t >= NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START and < NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START */ uint64_t schemaId; + + /* Flexible extension for schema attributes. */ + void* extension; } nvtxPayloadSchemaAttr_t; /** - * \brief Register a payload schema. + * \brief This type is used to describe an enumeration. * - * @param domain NVTX domain handle. - * @param attr NVTX payload schema attributes. - */ -NVTX_DECLSPEC uint64_t NVTX_API nvtxPayloadSchemaRegister( - nvtxDomainHandle_t domain, const nvtxPayloadSchemaAttr_t* attr); - -/** - * \brief Enumeration entry. + * Since the value of an enum entry might not be meaningful for the analysis + * and/or visualization, a tool can show the name of enum entry instead. * - * Since the value of an enum entry might not be meaningful for the analysis, - * a tool can show the name of enum entry instead. + * An array of this struct is passed to @ref nvtxPayloadEnumAttr_t::entries to be + * finally registered via @ref nvtxPayloadEnumRegister with the NVTX handler. * * @note EXPERIMENTAL */ -typedef struct nvtxPayloadEnum_t +typedef struct nvtxPayloadEnum_v1 { /** * Name of the enum value. 
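/*
 * Editorial sketch (not part of the upstream header): one plausible way to fill
 * the schema attributes above for a simple static struct and register it.
 * `exampleDomain` is a hypothetical domain handle; the entry-type constants and
 * the `entries`/`numEntries`/`payloadStaticSize` field names are assumed from
 * the `NVTX_PAYLOAD_SCHEMA_ATTR_*` masks and the `NVTX_PAYLOAD_ENTRY_TYPE_*`
 * naming referenced in this header. Requires <stddef.h> for offsetof().
 * \code{.c}
 * typedef struct { uint32_t id; double value; } example_t;
 *
 * nvtxPayloadSchemaEntry_t exampleEntries[] = {
 *     {0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "id",    NULL, 0, offsetof(example_t, id)},
 *     {0, NVTX_PAYLOAD_ENTRY_TYPE_DOUBLE, "value", NULL, 0, offsetof(example_t, value)}
 * };
 *
 * nvtxPayloadSchemaAttr_t attr = {0};
 * attr.fieldMask = NVTX_PAYLOAD_SCHEMA_ATTR_TYPE | NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES
 *                | NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES
 *                | NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE;
 * attr.type = NVTX_PAYLOAD_SCHEMA_TYPE_STATIC;
 * attr.entries = exampleEntries;
 * attr.numEntries = sizeof(exampleEntries) / sizeof(exampleEntries[0]);
 * attr.payloadStaticSize = sizeof(example_t);
 *
 * uint64_t exampleSchemaId = nvtxPayloadSchemaRegister(exampleDomain, &attr);
 * \endcode
 */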
@@ -671,28 +755,20 @@ typedef struct nvtxPayloadEnum_t /** * Indicates that this entry sets a specific set of bits, which can be used - * to easily define bitsets. + * to define bitsets. */ int8_t isFlag; } nvtxPayloadEnum_t; /** - * The values are used to set the field `fieldMask` and specify which fields in - * `nvtxPayloadEnumAttr_t` are set. - */ -#define NVTX_PAYLOAD_ENUM_ATTR_NAME (1 << 1) -#define NVTX_PAYLOAD_ENUM_ATTR_ENTRIES (1 << 2) -#define NVTX_PAYLOAD_ENUM_ATTR_NUM_ENTRIES (1 << 3) -#define NVTX_PAYLOAD_ENUM_ATTR_SIZE (1 << 4) -#define NVTX_PAYLOAD_ENUM_ATTR_SCHEMA_ID (1 << 5) - -/** - * NVTX payload enumeration type attributes. + * \brief NVTX payload enumeration type attributes. + * + * A pointer to this struct is passed to @ref nvtxPayloadEnumRegister. */ -typedef struct nvtxPayloadEnumAttr_t { +typedef struct nvtxPayloadEnumAttr_v1 +{ /** - * Mask of valid fields in this struct. - * The values from `enum nvtxPayloadSchemaAttributes` have to be used. + * Mask of valid fields in this struct. See `NVTX_PAYLOAD_ENUM_ATTR_*`. */ uint64_t fieldMask; @@ -722,17 +798,168 @@ typedef struct nvtxPayloadEnumAttr_t { * < NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START */ uint64_t schemaId; + + /* Flexible extension for enumeration attributes. */ + void* extension; } nvtxPayloadEnumAttr_t; +typedef struct nvtxScopeAttr_v1 +{ + size_t structSize; + + /** Path delimited by '/' characters, relative to parentScope. Leading + slashes are ignored. Nodes in the path may use name[key] syntax to indicate + an array of sibling nodes, which may be combined with other non-array nodes + or different arrays at the same scope. Node names should be UTF8 printable + characters, excluding '/', '[', and ']' characters which have special + meaning here. An empty C string "" and `NULL` are valid inputs and treated + equivalently. */ + const char* path; + + uint64_t parentScope; + + /** The static scope ID must be unique within the domain, + >= NVTX_EVENT_SCOPE_ID_STATIC_START, and + < NVTX_EVENT_SCOPE_ID_DYNAMIC_START. */ + uint64_t scopeId; +} nvtxScopeAttr_t; + + +#endif /* NVTX_PAYLOAD_TYPEDEFS_V1 */ + +#ifndef NVTX_PAYLOAD_API_FUNCTIONS_V1 +#define NVTX_PAYLOAD_API_FUNCTIONS_V1 + +/** + * \brief Register a payload schema. + * + * @param domain NVTX domain handle. + * @param attr NVTX payload schema attributes. + */ +NVTX_DECLSPEC uint64_t NVTX_API nvtxPayloadSchemaRegister( + nvtxDomainHandle_t domain, + const nvtxPayloadSchemaAttr_t* attr); + /** * \brief Register an enumeration type with the payload extension. * * @param domain NVTX domain handle * @param attr NVTX payload enumeration type attributes. */ -NVTX_DECLSPEC uint64_t nvtxPayloadEnumRegister(nvtxDomainHandle_t domain, +NVTX_DECLSPEC uint64_t NVTX_API nvtxPayloadEnumRegister( + nvtxDomainHandle_t domain, const nvtxPayloadEnumAttr_t* attr); +/** + * \brief Register a scope. + * + * @param domain NVTX domain handle (0 for default domain) + * @param attr Scope attributes. + * + * @return an identifier for the scope. If the operation was not successful, + * `NVTX_SCOPE_NONE` is returned. + */ +NVTX_DECLSPEC uint64_t NVTX_API nvtxScopeRegister( + nvtxDomainHandle_t domain, + const nvtxScopeAttr_t* attr); + +/** + * \brief Marks an instantaneous event in the application with the attributes + * being passed via the extended payload. + * + * An NVTX handler can assume that the payload contains the event message. + * Otherwise, it might ignore the event. 
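 *
 * Editorial usage sketch: `exampleDomain` and `exampleSchemaId` are hypothetical
 * handles obtained from nvtxDomainCreateA() and nvtxPayloadSchemaRegister(), and
 * the registered schema is assumed to contain an entry carrying the event message.
 * \code{.c}
 * example_t ev = {42u, 3.14};  // layout described by exampleSchemaId
 * nvtxPayloadData_t data[] = {{exampleSchemaId, sizeof(ev), &ev}};
 * nvtxMarkPayload(exampleDomain, data, 1);
 * \endcode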
+ * + * @param domain NVTX domain handle + * @param payloadData pointer to an array of structured payloads. + * @param count number of payload BLOBs. + */ +NVTX_DECLSPEC void NVTX_API nvtxMarkPayload( + nvtxDomainHandle_t domain, + const nvtxPayloadData_t* payloadData, + size_t count); + +/** + * \brief Begin a nested thread range with the attributes being passed via the + * payload. + * + * @param domain NVTX domain handle + * @param payloadData pointer to an array of structured payloads. + * @param count number of payload BLOBs. + * + * @return The level of the range being ended. If an error occurs a negative + * value is returned on the current thread. + */ +NVTX_DECLSPEC int NVTX_API nvtxRangePushPayload( + nvtxDomainHandle_t domain, + const nvtxPayloadData_t* payloadData, + size_t count); + +/** + * \brief End a nested thread range with an additional custom payload. + * + * NVTX event attributes passed to this function (via the payloads) overwrite + * event attributes (message and color) that have been set in the push event. + * Other payload entries extend the data of the range. + * + * @param domain NVTX domain handle + * @param payloadData pointer to an array of structured payloads. + * @param count number of payload BLOBs. + * + * @return The level of the range being ended. If an error occurs a negative + * value is returned on the current thread. + */ +NVTX_DECLSPEC int NVTX_API nvtxRangePopPayload( + nvtxDomainHandle_t domain, + const nvtxPayloadData_t* payloadData, + size_t count); + +/** + * \brief Start a thread range with attributes passed via the extended payload. + * + * @param domain NVTX domain handle + * @param payloadData pointer to an array of structured payloads. + * @param count number of payload BLOBs. + * + * @return The level of the range being ended. If an error occurs a negative + * value is returned on the current thread. + */ +NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartPayload( + nvtxDomainHandle_t domain, + const nvtxPayloadData_t* payloadData, + size_t count); + +/** + * \brief End a thread range and pass a custom payload. + * + * NVTX event attributes passed to this function (via the payloads) overwrite + * event attributes (message and color) that have been set in the start event. + * Other payload entries extend the data of the range. + * + * @param domain NVTX domain handle + * @param id The correlation ID returned from a NVTX range start call. + * @param payloadData pointer to an array of structured payloads. + * @param count number of payload BLOBs. + */ +NVTX_DECLSPEC void NVTX_API nvtxRangeEndPayload( + nvtxDomainHandle_t domain, + nvtxRangeId_t id, + const nvtxPayloadData_t* payloadData, + size_t count); + +/** + * @brief Checks if an NVTX domain is enabled (unofficial and may not work) + * + * @param domain NVTX domain handle + * @return 0 if the domain is not enabled. + */ +NVTX_DECLSPEC uint8_t NVTX_API nvtxDomainIsEnabled( + nvtxDomainHandle_t domain); + +#endif /* NVTX_PAYLOAD_API_FUNCTIONS_V1 */ + +#ifndef NVTX_PAYLOAD_CALLBACK_ID_V1 +#define NVTX_PAYLOAD_CALLBACK_ID_V1 /** * \brief Callback Ids of API functions in the payload extension. 
* @@ -740,30 +967,130 @@ NVTX_DECLSPEC uint64_t nvtxPayloadEnumRegister(nvtxDomainHandle_t domain, * InitializeInjectionNvtxExtension(nvtxExtModuleInfo_t* moduleInfo) is * executed, a handler routine 'handlenvtxPayloadRegisterSchema' can be * registered as follows: + * \code{.c} * moduleInfo->segments->slots[NVTX3EXT_CBID_nvtxPayloadSchemaRegister] = - * (intptr_t)handlenvtxPayloadRegisterSchema; + * (intptr_t)YourPayloadRegisterSchemaHandlerFn; + * \endcode */ -typedef enum NvtxExtPayloadCallbackId -{ - NVTX3EXT_CBID_nvtxPayloadSchemaRegister = 0, - NVTX3EXT_CBID_nvtxPayloadEnumRegister = 1, - NVTX3EXT_CBID_PAYLOAD_FN_NUM = 2 -} NvtxExtPayloadCallbackId; +#define NVTX3EXT_CBID_nvtxPayloadSchemaRegister 0 +#define NVTX3EXT_CBID_nvtxPayloadEnumRegister 1 +#define NVTX3EXT_CBID_nvtxMarkPayload 2 +#define NVTX3EXT_CBID_nvtxRangePushPayload 3 +#define NVTX3EXT_CBID_nvtxRangePopPayload 4 +#define NVTX3EXT_CBID_nvtxRangeStartPayload 5 +#define NVTX3EXT_CBID_nvtxRangeEndPayload 6 +#define NVTX3EXT_CBID_nvtxDomainIsEnabled 7 +#define NVTX3EXT_CBID_nvtxScopeRegister 12 +#endif /* NVTX_PAYLOAD_CALLBACK_ID_V1 */ + +/*** Helper utilities ***/ + +/** \brief Helper macro for safe double-cast of pointer to uint64_t value. */ +#ifndef NVTX_POINTER_AS_PAYLOAD_ULLVALUE +# ifdef __cplusplus +# define NVTX_POINTER_AS_PAYLOAD_ULLVALUE(p) \ + static_cast(reinterpret_cast(p)) +# else +#define NVTX_POINTER_AS_PAYLOAD_ULLVALUE(p) ((uint64_t)(uintptr_t)p) +# endif +#endif + +#ifndef NVTX_PAYLOAD_EVTATTR_SET_DATA +/** + * \brief Helper macro to attach a single payload to an NVTX event attribute. + * + * @param evtAttr NVTX event attribute (variable name) + * @param pldata_addr Adress of `nvtxPayloadData_t` variable. + * @param schema_id NVTX binary payload schema ID. + * @param pl_addr Address of the (actual) payload. + * @param sz size of the (actual) payload. + */ +#define NVTX_PAYLOAD_EVTATTR_SET_DATA(evtAttr, pldata_addr, schema_id, pl_addr, sz) \ + (pldata_addr)->schemaId = schema_id; \ + (pldata_addr)->size = sz; \ + (pldata_addr)->payload = pl_addr; \ + (evtAttr).payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(pldata_addr); \ + (evtAttr).payloadType = NVTX_PAYLOAD_TYPE_EXT; \ + (evtAttr).reserved0 = 1; +#endif /* NVTX_PAYLOAD_EVTATTR_SET_DATA */ + +#ifndef NVTX_PAYLOAD_EVTATTR_SET_MULTIPLE +/** + * \brief Helper macro to attach multiple payloads to an NVTX event attribute. + * + * @param evtAttr NVTX event attribute (variable name) + * @param pldata Payload data array (of type `nvtxPayloadData_t`) + */ +#define NVTX_PAYLOAD_EVTATTR_SET_MULTIPLE(evtAttr, pldata) \ + (evtAttr).payloadType = NVTX_PAYLOAD_TYPE_EXT; \ + (evtAttr).reserved0 = sizeof(pldata)/sizeof(nvtxPayloadData_t); \ + (evtAttr).payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(pldata); +#endif /* NVTX_PAYLOAD_EVTATTR_SET_MULTIPLE */ + +#ifndef NVTX_PAYLOAD_EVTATTR_SET +/* + * Do not use this macro directly! It is a helper to attach a single payload to + * an NVTX event attribute. + * @warning The NVTX push, start or mark operation must not be in an outer scope. + */ +#define NVTX_PAYLOAD_EVTATTR_SET(evtAttr, schema_id, pl_addr, sz) \ + nvtxPayloadData_t _NVTX_PAYLOAD_DATA_VAR[] = \ + {{schema_id, sz, pl_addr}}; \ + (evtAttr)->payload.ullValue = \ + NVTX_POINTER_AS_PAYLOAD_ULLVALUE(_NVTX_PAYLOAD_DATA_VAR); \ + (evtAttr)->payloadType = NVTX_PAYLOAD_TYPE_EXT; \ + (evtAttr)->reserved0 = 1; +#endif /* NVTX_PAYLOAD_EVTATTR_SET */ + +#ifndef nvtxPayloadRangePush +/** + * \brief Helper macro to push a range with extended payload. 
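 *
 * Editorial usage sketch (`exampleDomain`, `exampleSchemaId` and `example_t` are
 * hypothetical); a matching pop, e.g. nvtxRangePopPayload() or
 * nvtxDomainRangePop(), is expected to end the range:
 * \code{.c}
 * nvtxEventAttributes_t attr = {0};
 * attr.version = NVTX_VERSION;
 * attr.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
 * example_t pl = {42u, 3.14};
 * nvtxPayloadRangePush(exampleDomain, &attr, exampleSchemaId, &pl, sizeof(pl));
 * \endcode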
+ * + * @param domain NVTX domain handle (0 for default domain) + * @param evtAttr pointer to NVTX event attribute. + * @param schemaId NVTX payload schema ID + * @param plAddr Pointer to the binary data (actual payload) + * @param size Size of the binary payload data in bytes. + */ +#define nvtxPayloadRangePush(domain, evtAttr, schemaId, plAddr, size) \ +do { \ + NVTX_PAYLOAD_EVTATTR_SET(evtAttr, schemaId, plAddr, size) \ + nvtxDomainRangePushEx(domain, evtAttr); \ +} while (0) +#endif /* nvtxPayloadRangePush */ + +#ifndef nvtxPayloadMark +/** + * \brief Helper macro to set a marker with extended payload. + * + * @param domain NVTX domain handle (0 for default domain) + * @param evtAttr pointer to NVTX event attribute. + * @param schemaId NVTX payload schema ID + * @param plAddr Pointer to the binary data (actual payload) + * @param size Size of the binary payload data in bytes. + */ +#define nvtxPayloadMark(domain, evtAttr, schemaId, plAddr, size) \ +do { \ + NVTX_PAYLOAD_EVTATTR_SET(evtAttr, schemaId, plAddr, size) \ + nvtxDomainMarkEx(domain, evtAttr); \ +} while (0) +#endif /* nvtxPayloadMark */ + #ifdef __GNUC__ #pragma GCC visibility push(internal) #endif -#define NVTX_EXT_TYPES_GUARD /* Ensure other headers cannot include directly */ -#include "nvtxExtDetail/nvtxExtTypes.h" +/* Extension types are required for the implementation and the NVTX handler. */ +#define NVTX_EXT_TYPES_GUARD +#include "nvtxDetail/nvtxExtTypes.h" #undef NVTX_EXT_TYPES_GUARD #ifndef NVTX_NO_IMPL -#define NVTX_EXT_IMPL_PAYLOAD_GUARD /* Ensure other headers cannot included directly */ -#include "nvtxExtDetail/nvtxExtPayloadTypeInfo.h" -#include "nvtxExtDetail/nvtxExtImplPayload_v1.h" +#define NVTX_EXT_IMPL_PAYLOAD_GUARD +#include "nvtxDetail/nvtxExtImplPayload_v1.h" #undef NVTX_EXT_IMPL_PAYLOAD_GUARD -#endif /*NVTX_NO_IMPL*/ +#endif /* NVTX_NO_IMPL */ #ifdef __GNUC__ #pragma GCC visibility pop @@ -772,5 +1099,3 @@ typedef enum NvtxExtPayloadCallbackId #ifdef __cplusplus } #endif /* __cplusplus */ - -#endif /* NVTOOLSEXT_PAYLOAD_H */ diff --git a/src/include/nvtx3/nvToolsExtPayloadHelper.h b/src/include/nvtx3/nvToolsExtPayloadHelper.h new file mode 100644 index 000000000..304d5d6a5 --- /dev/null +++ b/src/include/nvtx3/nvToolsExtPayloadHelper.h @@ -0,0 +1,170 @@ +/* +* Copyright 2023 NVIDIA Corporation. All rights reserved. +* +* Licensed under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. +* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ + +#include "nvtxDetail/nvtxExtPayloadHelperInternal.h" + + +/* This is just an empty marker (for readability), which can be omitted. */ +/* TODO: Fix issue with trailing comma at end of entry list. */ +#define NVTX_PAYLOAD_ENTRIES + + +/** + * Use this macro for payload entries that are defined by a schema (nested + * payload schema). + */ +#define NVTX_PAYLOAD_NESTED(schemaId) _NVTX_PAYLOAD_NESTED(schemaId) + + +/** + * \brief Define a payload schema for an existing C `struct` definition. + * + * This macro does + * 1) create schema description (array of schema entries). + * 2) set the schema attributes for a static data layout. + * + * It can be used in static code or within a function context. 
+ * + * Example: + * NVTX_DEFINE_SCHEMA_FOR_STRUCT(your_struct, "SchemaName", + * NVTX_PAYLOAD_ENTRIES( + * (index, TYPE_INT, "integer value"), + * (dpfloat, TYPE_DOUBLE, "fp64 value"), + * (text, TYPE_CSTRING, "text", NULL, 24) + * ) + * ) + * + * It is required to at least provide the struct name and the payload entries. + * The first two fields (member name and NVTX entry type) of each payload entry + * are required. + * + * The optional parameters are only allowed to be passed in the predefined order. + * Hence, `payload_flags` requires `payload_schema` to be given and + * `prefix` requires `payload_flags` and `payload_schema` to be given. + * The payload entries are always the last parameter. A maximum of 16 schema + * entries is supported. + * + * It is recommended to use `NVTX_PAYLOAD_SCHEMA_REGISTER` to register the schema. + * + * @param struct_id The name of the struct. + * @param schema_name (Optional 1) name of the payload schema. Default is `NULL`. + * @param prefix (Optional 2) prefix before the schema and attributes variables, + * e.g. `static const`. Leave this empty, if no prefix is desired. + * @param schema_flags (Optional 2) flags to augment the payload schema. + * Default is `NVTX_PAYLOAD_SCHEMA_FLAG_NONE`. + * @param schema_id (Optional 4) User-defined payload schema ID. + * @param entries (Mandatory) Payload schema entries. This is always the last + * parameter to the macro. + */ +#define NVTX_DEFINE_SCHEMA_FOR_STRUCT(struct_id, ...) \ + _NVTX_DEFINE_SCHEMA_FOR_STRUCT(struct_id, __VA_ARGS__) + + +/** + * \brief Define a C struct together with a matching schema. + * + * This macro does + * 1) define the payload type (typedef struct). + * 2) create schema description (array of schema entries). + * 3) set the schema attributes for a static data layout. + * + * The macro can be used in static code or within a function context. + * + * It defines the schema attributes in `struct_id##Attr`. Thus, it is recommended + * to use `NVTX_PAYLOAD_SCHEMA_REGISTER(domain, struct_id)` to register the schema. + * + * Example: + * NVTX_DEFINE_STRUCT_WITH_SCHEMA(your_struct_name, "Your schema name", + * NVTX_PAYLOAD_ENTRIES( + * (int, index, TYPE_INT, "integer value"), + * (double, dpfloat, TYPE_DOUBLE, "fp64 value"), + * (const char, (text, 24), TYPE_CSTRING, "text", NULL, 24) + * ) + * ) + * + * The first three fields (C type, member, entry type) of each entry are required. + * A fixed-size array or string requires a special notation with the member + * name and the size separated by comma and put into brackets (see last entry + * in the example). + * + * The optional parameters are positional (only allowed to be passed in the + * predefined order). A maximum of 16 schema entries is supported. + * + * @param struct_id The name of the struct. + * @param schema_name (Optional 1) name of the payload schema. Default is `NULL`. + * @param prefix (Optional 2) prefix before the schema and attributes variables, + * e.g. `static const`. Leave this empty, if no prefix is desired. + * @param schema_flags (Optional 3) flags to augment the payload schema. + * Default is `NVTX_PAYLOAD_SCHEMA_FLAG_NONE`. + * @param schema_id (Optional 4) User-defined payload schema ID. + * @param entries (Mandatory) The schema entries. This is always the last + * parameter to the macro. + */ +#define NVTX_DEFINE_STRUCT_WITH_SCHEMA(struct_id, ...) \ + _NVTX_DEFINE_STRUCT_WITH_SCHEMA(struct_id, __VA_ARGS__) + +/** + * \brief Initialize and register the NVTX binary payload schema. 
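 *
 * As described below, this forwards to `NVTX_DEFINE_STRUCT_WITH_SCHEMA` and then
 * registers the schema. A hypothetical use (entry-type shorthands follow the
 * notation of the examples above; `exampleDomain` is an assumed domain handle):
 * \code{.c}
 * NVTX_DEFINE_STRUCT_WITH_SCHEMA_AND_REGISTER(exampleDomain, example_struct,
 *     "Example schema",
 *     NVTX_PAYLOAD_ENTRIES(
 *         (int,    index,   TYPE_INT,    "integer value"),
 *         (double, dpfloat, TYPE_DOUBLE, "fp64 value")
 *     )
 * )
 * \endcode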
+ * + * This does essentially the same as `NVTX_DEFINE_STRUCT_WITH_SCHEMA`, but in + * addition the schema is registered. The schema ID will be defined as follows: + * `const uint64_t struct_id##_schemaId`. + * + * @param domain The NVTX domain handle (0 for default domain). + * All other parameters are similar to `NVTX_DEFINE_STRUCT_WITH_SCHEMA`. + */ +#define NVTX_DEFINE_STRUCT_WITH_SCHEMA_AND_REGISTER(domain, struct_id, ...) \ + _NVTX_DEFINE_STRUCT_WITH_SCHEMA(struct_id, __VA_ARGS__) \ + const uint64_t struct_id##_schemaId = nvtxPayloadSchemaRegister(domain, &struct_id##Attr); + +/** + * \brief Define payload schema for an existing `struct` and register the schema. + * + * This does essentially the same as `NVTX_PAYLOAD_STATIC_SCHEMA_DEFINE`, but in + * addition, the schema is registered and `uint64_t struct_id##_schemaId` set. + * + * @param domain The NVTX domain handle (0 for default domain). + * All other parameters are similar to `NVTX_PAYLOAD_STATIC_SCHEMA_DEFINE`. + */ +#define NVTX_DEFINE_SCHEMA_FOR_STRUCT_AND_REGISTER(domain, struct_id, ...) \ + _NVTX_DEFINE_SCHEMA_FOR_STRUCT(struct_id, __VA_ARGS__) \ + const uint64_t struct_id##_schemaId = nvtxPayloadSchemaRegister(domain, &struct_id##Attr); + +/** + * \brief Create a type definition for the given struct ID and members. + * + * This is a convenience macro. A normal `typedef` can be used instead. + * + * Example usage: + * NVTX_DEFINE_STRUCT(your_struct, + * (double, fp64), + * (uint8_t, u8), + * (float, fp32[3]) + * ) + * + * @param struct_id The name of the struct. + * @param members The members of the struct. + */ +#define NVTX_DEFINE_STRUCT(struct_id, ...) \ + _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, __VA_ARGS__) + +/** + * \brief Register an NVTX binary payload schema. + * + * This is a convenience macro, which takes the same `struct_id` that has been + * used in other helper macros. Instead, `nvtxPayloadSchemaRegister` can also be + * used, but `&struct_id##Attr` has to be passed. + * + * @param domain The NVTX domain handle (0 for default domain). + * @param struct_id The name of the struct. + * + * @return NVTX schema ID + */ +#define NVTX_PAYLOAD_SCHEMA_REGISTER(domain, struct_id) \ + nvtxPayloadSchemaRegister(domain, &struct_id##Attr); + diff --git a/src/include/nvtx3/nvToolsExtSemanticsCounters.h b/src/include/nvtx3/nvToolsExtSemanticsCounters.h new file mode 100644 index 000000000..f97624a07 --- /dev/null +++ b/src/include/nvtx3/nvToolsExtSemanticsCounters.h @@ -0,0 +1,88 @@ +/* +* Copyright 2024 NVIDIA Corporation. All rights reserved. +* +* Licensed under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. +* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ + +/** + * NVTX semantic headers require nvToolsExtPayload.h to be included beforehand. + */ + +#ifndef NVTX_SEMANTIC_ID_COUNTERS_V1 +#define NVTX_SEMANTIC_ID_COUNTERS_V1 2 + +/** + * Flags to extend the semantics of counters. + */ +#define NVTX_COUNTERS_FLAGS_NONE 0 + +/** + * Convert the fixed point value to a normalized floating point value. + * Unsigned [0f : 1f] or signed [-1f : 1f] is determined by the underlying type + * this flag is applied to. + */ +#define NVTX_COUNTERS_FLAG_NORMALIZE (1 << 1) + +/** + * Visual tools should apply scale and limits when graphing. 
+ */ +#define NVTX_COUNTERS_FLAG_LIMIT_MIN (1 << 2) +#define NVTX_COUNTERS_FLAG_LIMIT_MAX (1 << 3) +#define NVTX_COUNTERS_FLAG_LIMITS \ + (NVTX_COUNTERS_FLAG_LIMIT_MIN | NVTX_COUNTERS_FLAG_LIMIT_MAX) + +/** + * Counter time scopes. + */ +#define NVTX_COUNTERS_FLAG_TIMESCOPE_POINT (1 << 5) +#define NVTX_COUNTERS_FLAG_TIMESCOPE_SINCE_LAST (2 << 5) +#define NVTX_COUNTERS_FLAG_TIMESCOPE_UNTIL_NEXT (3 << 5) +#define NVTX_COUNTERS_FLAG_TIMESCOPE_SINCE_START (4 << 5) + +/** + * Counter value types. + */ +#define NVTX_COUNTERS_FLAG_VALUETYPE_ABSOLUTE (1 << 10) +/** Delta to previous value of same counter type. */ +#define NVTX_COUNTERS_FLAG_VALUETYPE_DELTA (2 << 10) + +/** + * Datatypes for the `limits` union. + */ +#define NVTX_COUNTERS_LIMIT_I64 0 +#define NVTX_COUNTERS_LIMIT_U64 1 +#define NVTX_COUNTERS_LIMIT_F64 2 + +/** + *\brief Specify counter semantics. + */ +typedef struct nvtxSemanticsCounter_v1 { + /** Header of the semantic extensions (with identifier, version, etc.). */ + struct nvtxSemanticsHeader_v1 header; + + /** Flags to provide more context about the counter value. */ + uint64_t flags; + + /** Unit of the counter value (case-insensitive). */ + const char* unit; + + /** Should be 1 if not used. */ + uint64_t unitScaleNumerator; + + /** Should be 1 if not used. */ + uint64_t unitScaleDenominator; + + /** Determines the used union member. Use defines `NVTX_COUNTER_LIMIT_*`. */ + int64_t limitType; + + /** Graph limits {minimum, maximum}. */ + union limits_t { + int64_t i64[2]; + uint64_t u64[2]; + double d[2]; + } limits; +} nvtxSemanticsCounter_t; + +#endif /* NVTX_SEMANTIC_ID_COUNTERS_V1 */ \ No newline at end of file diff --git a/src/include/nvtx3/nvToolsExtSemanticsScope.h b/src/include/nvtx3/nvToolsExtSemanticsScope.h new file mode 100644 index 000000000..eed6f3095 --- /dev/null +++ b/src/include/nvtx3/nvToolsExtSemanticsScope.h @@ -0,0 +1,30 @@ +/* +* Copyright 2024 NVIDIA Corporation. All rights reserved. +* +* Licensed under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. +* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ + +/** + * NVTX semantic headers require nvToolsExtPayload.h to be included beforehand. + */ + +#ifndef NVTX_SEMANTIC_ID_SCOPE_V1 +#define NVTX_SEMANTIC_ID_SCOPE_V1 1 + +/** + * \brief Specify the NVTX scope for a payload entry. + * + * This allows the scope to be set for a specific value or counter in a payload. + * The scope must be known at schema registration time. + */ +typedef struct nvtxSemanticsScope_v1 +{ + struct nvtxSemanticsHeader_v1 header; + + /** Specifies the scope of a payload entry, e.g. a counter or timestamp. */ + uint64_t scopeId; +} nvtxSemanticsScope_t; + +#endif /* NVTX_SEMANTIC_ID_SCOPE_V1 */ \ No newline at end of file diff --git a/src/include/nvtx3/nvToolsExtSync.h b/src/include/nvtx3/nvToolsExtSync.h index 113fcd191..6578516d4 100644 --- a/src/include/nvtx3/nvToolsExtSync.h +++ b/src/include/nvtx3/nvToolsExtSync.h @@ -15,23 +15,23 @@ extern "C" { #endif /* __cplusplus */ -/* \cond SHOW_HIDDEN +/* \cond SHOW_HIDDEN * \version \NVTX_VERSION_2 */ #define NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxSyncUserAttributes_v0) ) ) /** \endcond */ -/** +/** * \page PAGE_SYNCHRONIZATION Synchronization * * This section covers a subset of the API that allow users to track additional -* synchronization details of their application. 
Naming OS synchronization primitives -* may allow users to better understand the data collected by traced synchronization +* synchronization details of their application. Naming OS synchronization primitives +* may allow users to better understand the data collected by traced synchronization * APIs. Additionally, a user defined synchronization object can allow the users to * to tell the tools when the user is building their own synchronization system * that do not rely on the OS to provide behaviors and instead use techniques like -* atomic operations and spinlocks. +* atomic operations and spinlocks. * * See module \ref SYNCHRONIZATION for details. * @@ -59,7 +59,7 @@ extern "C" { * * bool Lock() { * nvtxDomainSyncUserAcquireStart(hSync); -* bool acquired = __sync_bool_compare_and_swap(&bLocked, 0, 1);//atomic compiler intrinsic +* bool acquired = __sync_bool_compare_and_swap(&bLocked, 0, 1);//atomic compiler intrinsic * if (acquired) { * nvtxDomainSyncUserAcquireSuccess(hSync); @@ -76,12 +76,12 @@ extern "C" { * } * }; * \endcode -* +* * \version \NVTX_VERSION_2 */ /* ------------------------------------------------------------------------- */ -/* \cond SHOW_HIDDEN +/* \cond SHOW_HIDDEN * \brief Used to build a non-colliding value for resource types separated class * \version \NVTX_VERSION_2 */ @@ -154,8 +154,8 @@ typedef struct nvtxSyncUser* nvtxSyncUser_t; /** \brief User Defined Synchronization Object Attributes Structure. * \anchor USERDEF_SYNC_ATTRIBUTES_STRUCTURE * -* This structure is used to describe the attributes of a user defined synchronization -* object. The layout of the structure is defined by a specific version of the tools +* This structure is used to describe the attributes of a user defined synchronization +* object. The layout of the structure is defined by a specific version of the tools * extension library and can change between different versions of the Tools Extension * library. * @@ -259,7 +259,7 @@ typedef struct nvtxSyncUserAttributes_v0 typedef struct nvtxSyncUserAttributes_v0 nvtxSyncUserAttributes_t; /* ------------------------------------------------------------------------- */ -/** \brief Create a user defined synchronization object +/** \brief Create a user defined synchronization object * This is used to track non-OS synchronization working with spinlocks and atomics * * \param domain - Domain to own the resource @@ -317,7 +317,7 @@ NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle /* ------------------------------------------------------------------------- */ /** \brief Signal to tools of failure in acquiring a user defined synchronization object * This should be called after \ref nvtxDomainSyncUserAcquireStart -* +* * \param handle - A handle to the object to operate on. * * \sa @@ -374,7 +374,7 @@ NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle); #endif /* __cplusplus */ #ifndef NVTX_NO_IMPL -#define NVTX_IMPL_GUARD_SYNC /* Ensure other headers cannot included directly */ +#define NVTX_IMPL_GUARD_SYNC /* Ensure other headers cannot be included directly */ #include "nvtxDetail/nvtxImplSync_v3.h" #undef NVTX_IMPL_GUARD_SYNC #endif /*NVTX_NO_IMPL*/ diff --git a/src/include/nvtx3/nvtx3.hpp b/src/include/nvtx3/nvtx3.hpp index 8c62acd46..a2f46c37f 100644 --- a/src/include/nvtx3/nvtx3.hpp +++ b/src/include/nvtx3/nvtx3.hpp @@ -12,6 +12,11 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. + * + * Licensed under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ /* Temporary helper #defines, #undef'ed at end of header */ @@ -1937,9 +1942,9 @@ class event_attributes { 0, // color value NVTX_PAYLOAD_UNKNOWN, // payload type 0, // reserved 4B - 0, // payload value (union) + {0}, // payload value (union) NVTX_MESSAGE_UNKNOWN, // message type - 0 // message value (union) + {0} // message value (union) } { } @@ -2003,20 +2008,20 @@ class event_attributes { attributes_.messageType = m.get_type(); } - /** - * @brief Variadic constructor where the first argument is a binary payload. + /** + * @brief Variadic constructor where the first argument is an extended payload. * - * Sets the value of the `EventAttribute`s message based on `m` and forwards + * Sets the `ullValue` of the `EventAttribute`s payload and forwards * the remaining variadic parameter pack to the next constructor. * */ template - NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(nvtxPayloadData_t const* bpl, Args const&... args) noexcept + NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(nvtxPayloadData_t const* p, Args const&... args) noexcept : event_attributes(args...) { - attributes_.payloadType = NVTX_PAYLOAD_TYPE_BINARY; + attributes_.payloadType = NVTX_PAYLOAD_TYPE_EXT; attributes_.reserved0 = 1; // NCCL uses only a single binary payload per event. - attributes_.payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(bpl); + attributes_.payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(p); } ~event_attributes() = default; diff --git a/src/include/nvtx3/nvtxDetail/nvtxExtHelperMacros.h b/src/include/nvtx3/nvtxDetail/nvtxExtHelperMacros.h new file mode 100644 index 000000000..00fc81768 --- /dev/null +++ b/src/include/nvtx3/nvtxDetail/nvtxExtHelperMacros.h @@ -0,0 +1,31 @@ +/* +* Copyright 2023 NVIDIA Corporation. All rights reserved. +* +* Licensed under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. +* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ + +#ifndef NVTX_EXT_HELPER_MACROS_H +#define NVTX_EXT_HELPER_MACROS_H + +/* Combine tokens */ +#define _NVTX_EXT_CONCAT(a, b) a##b +#define NVTX_EXT_CONCAT(a, b) _NVTX_EXT_CONCAT(a, b) + +/* Resolves to the number of arguments passed. */ +#define NVTX_EXT_NUM_ARGS(...) \ + NVTX_EXT_SELECTA16(__VA_ARGS__, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, throwaway) +#define NVTX_EXT_SELECTA16(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, ...) a16 + +/* Cast argument(s) to void to prevent unused variable warnings. */ +#define _NVTX_EXT_VOIDIFY1(a1) (void)a1; +#define _NVTX_EXT_VOIDIFY2(a1, a2) (void)a1; (void)a2; +#define _NVTX_EXT_VOIDIFY3(a1, a2, a3) (void)a1; (void)a2; (void)a3; +#define _NVTX_EXT_VOIDIFY4(a1, a2, a3, a4) (void)a1; (void)a2; (void)a3; (void)a4; + +/* Mark function arguments as unused. */ +#define NVTX_EXT_HELPER_UNUSED_ARGS(...) 
\ + NVTX_EXT_CONCAT(_NVTX_EXT_VOIDIFY, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__) + +#endif /* NVTX_EXT_HELPER_MACROS_H */ \ No newline at end of file diff --git a/src/include/nvtx3/nvtxExtDetail/nvtxExtImpl.h b/src/include/nvtx3/nvtxDetail/nvtxExtImpl.h similarity index 79% rename from src/include/nvtx3/nvtxExtDetail/nvtxExtImpl.h rename to src/include/nvtx3/nvtxDetail/nvtxExtImpl.h index 5e4277805..79bb0c1c5 100644 --- a/src/include/nvtx3/nvtxExtDetail/nvtxExtImpl.h +++ b/src/include/nvtx3/nvtxDetail/nvtxExtImpl.h @@ -14,7 +14,12 @@ #define NVTX_EXT_IMPL_H /* ---- Include required platform headers ---- */ -#if defined(_WIN32) +#include +#include +#include +#include + +#if defined(_WIN32) #include @@ -22,27 +27,19 @@ #include #if defined(__ANDROID__) -#include +#include #endif #if defined(__linux__) || defined(__CYGWIN__) #include #endif +#include #include #include #include -#include -#include -#include -#include #include - -#include -#include #include -#include -#include #endif @@ -66,26 +63,35 @@ #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ - -// #ifdef __GNUC__ -// #pragma GCC visibility push(hidden) -// #endif - +/* +#ifdef __GNUC__ +#pragma GCC visibility push(hidden) +#endif +*/ #define NVTX_EXTENSION_FRESH 0 #define NVTX_EXTENSION_DISABLED 1 #define NVTX_EXTENSION_STARTING 2 #define NVTX_EXTENSION_LOADED 3 -NVTX_LINKONCE_DEFINE_GLOBAL NvtxExtInitializeInjectionFunc_t NVTX_VERSIONED_IDENTIFIER(injectionFnPtr) = (NvtxExtInitializeInjectionFunc_t)0; +/* Function slots are local to each extension */ +typedef struct nvtxExtGlobals1_t +{ + NvtxExtInitializeInjectionFunc_t injectionFnPtr; +} nvtxExtGlobals1_t; + +NVTX_LINKONCE_DEFINE_GLOBAL nvtxExtGlobals1_t NVTX_VERSIONED_IDENTIFIER(nvtxExtGlobals1) = +{ + (NvtxExtInitializeInjectionFunc_t)0 +}; #define NVTX_EXT_INIT_GUARD #include "nvtxExtInit.h" #undef NVTX_EXT_INIT_GUARD - -// #ifdef __GNUC__ -// #pragma GCC visibility pop -// #endif - +/* +#ifdef __GNUC__ +#pragma GCC visibility pop +#endif +*/ #ifdef __cplusplus } /* extern "C" */ #endif /* __cplusplus */ diff --git a/src/include/nvtx3/nvtxDetail/nvtxExtImplCounters_v1.h b/src/include/nvtx3/nvtxDetail/nvtxExtImplCounters_v1.h new file mode 100644 index 000000000..0f6ff9667 --- /dev/null +++ b/src/include/nvtx3/nvtxDetail/nvtxExtImplCounters_v1.h @@ -0,0 +1,148 @@ +/* +* Copyright 2023-2024 NVIDIA Corporation. All rights reserved. +* +* Licensed under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. +* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ + +#ifndef NVTX_EXT_IMPL_COUNTERS_GUARD +#error Never include this file directly -- it is automatically included by nvToolsExtCounters.h (except when NVTX_NO_IMPL is defined). +#endif + +#define NVTX_EXT_IMPL_GUARD +#include "nvtxExtImpl.h" +#undef NVTX_EXT_IMPL_GUARD + +#ifndef NVTX_EXT_IMPL_COUNTERS_V1 +#define NVTX_EXT_IMPL_COUNTERS_V1 + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/* Macros to create versioned symbols. 
*/ +#define NVTX_EXT_COUNTERS_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) \ + NAME##_v##VERSION##_bpl##COMPATID +#define NVTX_EXT_COUNTERS_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) \ + NVTX_EXT_COUNTERS_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) +#define NVTX_EXT_COUNTERS_VERSIONED_ID(NAME) \ + NVTX_EXT_COUNTERS_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_COUNTERS_COMPATID) + +#ifdef NVTX_DISABLE + +#include "nvtxExtHelperMacros.h" + +#define NVTX_EXT_COUNTERS_IMPL_FN_V1(ret_val, fn_name, signature, arg_names) \ +ret_val fn_name signature { \ + NVTX_EXT_HELPER_UNUSED_ARGS arg_names \ + return ((ret_val)(intptr_t)-1); \ +} + +#else /* NVTX_DISABLE */ + +/* + * Function slots for the counters extension. First entry is the module state, + * initialized to `0` (`NVTX_EXTENSION_FRESH`). + */ +#define NVTX_EXT_COUNTERS_SLOT_COUNT 63 +NVTX_LINKONCE_DEFINE_GLOBAL intptr_t +NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots)[NVTX_EXT_COUNTERS_SLOT_COUNT + 1] + = {0}; + +/* Avoid warnings about missing prototype. */ +NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersInitOnce)(void); +NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersInitOnce)() +{ + intptr_t* fnSlots = NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots) + 1; + nvtxExtModuleSegment_t segment = { + 0, /* unused (only one segment) */ + NVTX_EXT_COUNTERS_SLOT_COUNT, + fnSlots + }; + + nvtxExtModuleInfo_t module = { + NVTX_VERSION, sizeof(nvtxExtModuleInfo_t), + NVTX_EXT_COUNTERS_MODULEID, NVTX_EXT_COUNTERS_COMPATID, + 1, &segment, /* number of segments, segments */ + NULL, /* no export function needed */ + /* bake type sizes and alignment information into program binary */ + NULL + }; + + NVTX_INFO( "%s\n", __FUNCTION__ ); + + NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&module, + NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots)); +} + +#define NVTX_EXT_COUNTERS_IMPL_FN_V1(ret_type, fn_name, signature, arg_names) \ +typedef ret_type (*fn_name##_impl_fntype)signature; \ + NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \ + intptr_t slot = NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ + if (slot != NVTX_EXTENSION_DISABLED) { \ + if (slot != NVTX_EXTENSION_FRESH) { \ + return (*(fn_name##_impl_fntype)slot) arg_names; \ + } else { \ + NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersInitOnce)(); \ + /* Re-read function slot after extension initialization. */ \ + slot = NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ + if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \ + return (*(fn_name##_impl_fntype)slot) arg_names; \ + } \ + } \ + } \ + NVTX_EXT_FN_RETURN_INVALID(ret_type) \ +} + +#endif /*NVTX_DISABLE*/ + +/* Non-void functions. */ +#define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1); + +NVTX_EXT_COUNTERS_IMPL_FN_V1(nvtxCountersHandle_t, nvtxCountersRegister, + (nvtxDomainHandle_t domain, const nvtxCountersAttr_t* attr), + (domain, attr)) + +#undef NVTX_EXT_FN_RETURN_INVALID +/* END: Non-void functions. */ + +/* void functions. 
*/ +#define NVTX_EXT_FN_RETURN_INVALID(rtype) +#define return + +NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSampleInt64, + (nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounter, int64_t value), + (domain, hCounter, value)) + +NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSampleFloat64, + (nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounter, double value), + (domain, hCounter, value)) + +NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSample, + (nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounter, void* values, size_t size), + (domain, hCounter, values, size)) + +NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSampleNoValue, + (nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounter, uint8_t reason), + (domain, hCounter, reason)) + +NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSubmitBatch, + (nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounters, + const void* counters, size_t size), (domain, hCounters, counters, size)) + +NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSubmitBatchEx, + (nvtxDomainHandle_t domain, const nvtxCountersBatch_t* countersBatch), + (domain, countersBatch)) + +#undef return +#undef NVTX_EXT_FN_RETURN_INVALID +/* END: void functions. */ + +/* Keep NVTX_EXT_COUNTERS_IMPL_FN_V1 defined for a future version of this extension. */ + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif /* NVTX_EXT_IMPL_COUNTERS_V1 */ \ No newline at end of file diff --git a/src/include/nvtx3/nvtxDetail/nvtxExtImplMemCudaRt_v1.h b/src/include/nvtx3/nvtxDetail/nvtxExtImplMemCudaRt_v1.h new file mode 100644 index 000000000..5a5286df3 --- /dev/null +++ b/src/include/nvtx3/nvtxDetail/nvtxExtImplMemCudaRt_v1.h @@ -0,0 +1,74 @@ +/* +* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. +* +* Licensed under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. +* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ + +#ifndef NVTX_EXT_IMPL_MEM_CUDART_GUARD +#error Never include this file directly -- it is automatically included by nvToolsExtMemCudaRt.h (except when NVTX_NO_IMPL is defined). +#endif + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#ifdef NVTX_DISABLE + +#include "nvtxExtHelperMacros.h" + +#define NVTX_EXT_FN_IMPL(ret_val, fn_name, signature, arg_names) \ +ret_val fn_name signature { \ + NVTX_EXT_HELPER_UNUSED_ARGS arg_names \ + return ((ret_val)(intptr_t)-1); \ +} + +#else /* NVTX_DISABLE */ + +#define NVTX_EXT_FN_IMPL(ret_type, fn_name, signature, arg_names) \ +typedef ret_type ( * fn_name##_impl_fntype )signature; \ + NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \ + intptr_t slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ + if (slot != NVTX_EXTENSION_DISABLED) { \ + if (slot != NVTX_EXTENSION_FRESH) { \ + return (*(fn_name##_impl_fntype)slot) arg_names; \ + } else { \ + NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemInitOnce)(); \ + /* Re-read function slot after extension initialization. */ \ + slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ + if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \ + return (*(fn_name##_impl_fntype)slot) arg_names; \ + } \ + } \ + } \ + NVTX_EXT_FN_RETURN_INVALID(ret_type) \ +} + +#endif /*NVTX_DISABLE*/ + +/* Non-void functions. 
*/ +#define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1); + +NVTX_EXT_FN_IMPL(nvtxMemPermissionsHandle_t, nvtxMemCudaGetProcessWidePermissions, (nvtxDomainHandle_t domain), (domain)) + +NVTX_EXT_FN_IMPL(nvtxMemPermissionsHandle_t, nvtxMemCudaGetDeviceWidePermissions, (nvtxDomainHandle_t domain, int device), (domain, device)) + +#undef NVTX_EXT_FN_RETURN_INVALID +/* END: Non-void functions. */ + +/* void functions. */ +#define NVTX_EXT_FN_RETURN_INVALID(rtype) +#define return + +NVTX_EXT_FN_IMPL(void, nvtxMemCudaSetPeerAccess, (nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions, int devicePeer, uint32_t flags), (domain, permissions, devicePeer, flags)) + +#undef return +#undef NVTX_EXT_FN_RETURN_INVALID +/* END: void functions. */ + +#undef NVTX_EXT_FN_IMPL + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ diff --git a/src/include/nvtx3/nvtxDetail/nvtxExtImplMem_v1.h b/src/include/nvtx3/nvtxDetail/nvtxExtImplMem_v1.h new file mode 100644 index 000000000..7e316d379 --- /dev/null +++ b/src/include/nvtx3/nvtxDetail/nvtxExtImplMem_v1.h @@ -0,0 +1,133 @@ +/* +* Copyright 2009-2020,2023 NVIDIA Corporation. All rights reserved. +* +* Licensed under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. +* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ + +#ifndef NVTX_EXT_IMPL_MEM_GUARD +#error Never include this file directly -- it is automatically included by nvToolsExtMem.h (except when NVTX_NO_IMPL is defined). +#endif + +#define NVTX_EXT_IMPL_GUARD +#include "nvtxExtImpl.h" +#undef NVTX_EXT_IMPL_GUARD + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#define NVTXMEM_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) NAME##_v##VERSION##_mem##COMPATID +#define NVTXMEM_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) NVTXMEM_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) +#define NVTX_EXT_MEM_VERSIONED_ID(NAME) NVTXMEM_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_COMPATID_MEM) + +#ifdef NVTX_DISABLE + +#include "nvtxExtHelperMacros.h" + +#define NVTX_EXT_FN_IMPL(ret_val, fn_name, signature, arg_names) \ +ret_val fn_name signature { \ + NVTX_EXT_HELPER_UNUSED_ARGS arg_names \ + return ((ret_val)(intptr_t)-1); \ +} + +#else /* NVTX_DISABLE */ + +/* + * Function slots for the memory extension. First entry is the module + * state, initialized to `0` (`NVTX_EXTENSION_FRESH`). 
+ */ +NVTX_LINKONCE_DEFINE_GLOBAL intptr_t +NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_MEM_FN_NUM + 2] + = {0}; + +NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemInitOnce)() +{ + intptr_t* fnSlots = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots) + 1; + nvtxExtModuleSegment_t segment = { + 0, /* unused (only one segment) */ + NVTX3EXT_CBID_MEM_FN_NUM, + fnSlots + }; + + nvtxExtModuleInfo_t module = { + NVTX_VERSION, sizeof(nvtxExtModuleInfo_t), + NVTX_EXT_MODULEID_MEM, NVTX_EXT_COMPATID_MEM, + 1, &segment, + NULL, /* no export function needed */ + NULL + }; + + NVTX_INFO( "%s\n", __FUNCTION__ ); + + NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&module, + NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)); +} + +#define NVTX_EXT_FN_IMPL(ret_type, fn_name, signature, arg_names) \ +typedef ret_type ( * fn_name##_impl_fntype )signature; \ + NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \ + intptr_t slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ + if (slot != NVTX_EXTENSION_DISABLED) { \ + if (slot != NVTX_EXTENSION_FRESH) { \ + return (*(fn_name##_impl_fntype)slot) arg_names; \ + } else { \ + NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemInitOnce)(); \ + /* Re-read function slot after extension initialization. */ \ + slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ + if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \ + return (*(fn_name##_impl_fntype)slot) arg_names; \ + } \ + } \ + } \ + NVTX_EXT_FN_RETURN_INVALID(ret_type) \ +} + +#endif /*NVTX_DISABLE*/ + +/* Non-void functions. */ +#define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1); + +NVTX_EXT_FN_IMPL(nvtxMemHeapHandle_t, nvtxMemHeapRegister, (nvtxDomainHandle_t domain, nvtxMemHeapDesc_t const* desc), (domain, desc)) + +NVTX_EXT_FN_IMPL(nvtxMemPermissionsHandle_t, nvtxMemPermissionsCreate, (nvtxDomainHandle_t domain, int32_t creationflags), (domain, creationflags)) + +#undef NVTX_EXT_FN_RETURN_INVALID +/* END: Non-void functions. */ + +/* void functions. 
*/ +#define NVTX_EXT_FN_RETURN_INVALID(rtype) +#define return + +NVTX_EXT_FN_IMPL(void, nvtxMemHeapUnregister, (nvtxDomainHandle_t domain, nvtxMemHeapHandle_t heap), (domain, heap)) + +NVTX_EXT_FN_IMPL(void, nvtxMemHeapReset, (nvtxDomainHandle_t domain, nvtxMemHeapHandle_t heap), (domain, heap)) + +NVTX_EXT_FN_IMPL(void, nvtxMemRegionsRegister, (nvtxDomainHandle_t domain, nvtxMemRegionsRegisterBatch_t const* desc), (domain, desc)) + +NVTX_EXT_FN_IMPL(void, nvtxMemRegionsResize, (nvtxDomainHandle_t domain,nvtxMemRegionsResizeBatch_t const* desc), (domain, desc)) + +NVTX_EXT_FN_IMPL(void, nvtxMemRegionsUnregister, (nvtxDomainHandle_t domain,nvtxMemRegionsUnregisterBatch_t const* desc), (domain, desc)) + +NVTX_EXT_FN_IMPL(void, nvtxMemRegionsName, (nvtxDomainHandle_t domain,nvtxMemRegionsNameBatch_t const* desc), (domain, desc)) + +NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsAssign, (nvtxDomainHandle_t domain,nvtxMemPermissionsAssignBatch_t const* desc), (domain, desc)) + +NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsDestroy, (nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions), (domain, permissions)) + +NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsReset, (nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions), (domain, permissions)) + +NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsBind, (nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions, uint32_t bindScope, uint32_t bindFlags), (domain, permissions, bindScope, bindFlags)) + +NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsUnbind, (nvtxDomainHandle_t domain, uint32_t bindScope), (domain, bindScope)) + +#undef return +#undef NVTX_EXT_FN_RETURN_INVALID +/* END: void functions. */ + +#undef NVTX_EXT_FN_IMPL + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ diff --git a/src/include/nvtx3/nvtxDetail/nvtxExtImplPayload_v1.h b/src/include/nvtx3/nvtxDetail/nvtxExtImplPayload_v1.h new file mode 100644 index 000000000..8f9c79961 --- /dev/null +++ b/src/include/nvtx3/nvtxDetail/nvtxExtImplPayload_v1.h @@ -0,0 +1,155 @@ +/* +* Copyright 2021-2023 NVIDIA Corporation. All rights reserved. +* +* Licensed under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. +* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ + +#ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD +#error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined). +#endif + +#define NVTX_EXT_IMPL_GUARD +#include "nvtxExtImpl.h" +#undef NVTX_EXT_IMPL_GUARD + +#ifndef NVTX_EXT_IMPL_PAYLOAD_V1 +#define NVTX_EXT_IMPL_PAYLOAD_V1 + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/* Macros to create versioned symbols. */ +#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) \ + NAME##_v##VERSION##_bpl##COMPATID +#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) \ + NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) +#define NVTX_EXT_PAYLOAD_VERSIONED_ID(NAME) \ + NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_PAYLOAD_COMPATID) + +#ifdef NVTX_DISABLE + +#include "nvtxExtHelperMacros.h" + +#define NVTX_EXT_PAYLOAD_IMPL_FN_V1(ret_val, fn_name, signature, arg_names) \ +ret_val fn_name signature { \ + NVTX_EXT_HELPER_UNUSED_ARGS arg_names \ + return ((ret_val)(intptr_t)-1); \ +} + +#else /* NVTX_DISABLE */ + +#include "nvtxExtPayloadTypeInfo.h" + +/* + * Function slots for the payload extension. 
First entry is the module state, + * initialized to `0` (`NVTX_EXTENSION_FRESH`). + */ +#define NVTX_EXT_PAYLOAD_SLOT_COUNT 63 +NVTX_LINKONCE_DEFINE_GLOBAL intptr_t +NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX_EXT_PAYLOAD_SLOT_COUNT + 1] + = {0}; + +/* Avoid warnings about missing prototype. */ +NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(void); +NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)() +{ + intptr_t* fnSlots = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots) + 1; + nvtxExtModuleSegment_t segment = { + 0, /* unused (only one segment) */ + NVTX_EXT_PAYLOAD_SLOT_COUNT, + fnSlots + }; + + nvtxExtModuleInfo_t module = { + NVTX_VERSION, sizeof(nvtxExtModuleInfo_t), + NVTX_EXT_PAYLOAD_MODULEID, NVTX_EXT_PAYLOAD_COMPATID, + 1, &segment, /* number of segments, segments */ + NULL, /* no export function needed */ + /* bake type sizes and alignment information into program binary */ + &(NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadTypeInfo)) + }; + + NVTX_INFO( "%s\n", __FUNCTION__ ); + + NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&module, + NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)); +} + +#define NVTX_EXT_PAYLOAD_IMPL_FN_V1(ret_type, fn_name, signature, arg_names) \ +typedef ret_type (*fn_name##_impl_fntype)signature; \ + NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \ + intptr_t slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ + if (slot != NVTX_EXTENSION_DISABLED) { \ + if (slot != NVTX_EXTENSION_FRESH) { \ + return (*(fn_name##_impl_fntype)slot) arg_names; \ + } else { \ + NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(); \ + /* Re-read function slot after extension initialization. */ \ + slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ + if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \ + return (*(fn_name##_impl_fntype)slot) arg_names; \ + } \ + } \ + } \ + NVTX_EXT_FN_RETURN_INVALID(ret_type) \ +} + +#endif /*NVTX_DISABLE*/ + +/* Non-void functions. */ +#define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1); + +NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint64_t, nvtxPayloadSchemaRegister, + (nvtxDomainHandle_t domain, const nvtxPayloadSchemaAttr_t* attr), + (domain, attr)) + +NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint64_t, nvtxPayloadEnumRegister, + (nvtxDomainHandle_t domain, const nvtxPayloadEnumAttr_t* attr), + (domain, attr)) + +NVTX_EXT_PAYLOAD_IMPL_FN_V1(int, nvtxRangePushPayload, + (nvtxDomainHandle_t domain, const nvtxPayloadData_t* payloadData, size_t count), + (domain, payloadData, count)) + +NVTX_EXT_PAYLOAD_IMPL_FN_V1(int, nvtxRangePopPayload, + (nvtxDomainHandle_t domain, const nvtxPayloadData_t* payloadData, size_t count), + (domain, payloadData, count)) + +NVTX_EXT_PAYLOAD_IMPL_FN_V1(nvtxRangeId_t, nvtxRangeStartPayload, + (nvtxDomainHandle_t domain, const nvtxPayloadData_t* payloadData, size_t count), + (domain, payloadData, count)) + +NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint8_t, nvtxDomainIsEnabled, (nvtxDomainHandle_t domain), (domain)) + +NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint64_t, nvtxScopeRegister, (nvtxDomainHandle_t domain, + const nvtxScopeAttr_t* attr), (domain, attr)) + +#undef NVTX_EXT_FN_RETURN_INVALID +/* END: Non-void functions. */ + +/* void functions. 
*/ +#define NVTX_EXT_FN_RETURN_INVALID(rtype) +#define return + +NVTX_EXT_PAYLOAD_IMPL_FN_V1(void, nvtxMarkPayload, (nvtxDomainHandle_t domain, + const nvtxPayloadData_t* payloadData, size_t count), (domain, payloadData, count)) + +NVTX_EXT_PAYLOAD_IMPL_FN_V1(void, nvtxRangeEndPayload, (nvtxDomainHandle_t domain, + nvtxRangeId_t id, const nvtxPayloadData_t* payloadData, size_t count), + (domain, id, payloadData, count)) + +#undef return +#undef NVTX_EXT_FN_RETURN_INVALID +/* END: void functions. */ + +/* Keep NVTX_EXT_PAYLOAD_IMPL_FN_V1 defined for a future version of this extension. */ + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif /* NVTX_EXT_IMPL_PAYLOAD_V1 */ + diff --git a/src/include/nvtx3/nvtxExtDetail/nvtxExtInit.h b/src/include/nvtx3/nvtxDetail/nvtxExtInit.h similarity index 71% rename from src/include/nvtx3/nvtxExtDetail/nvtxExtInit.h rename to src/include/nvtx3/nvtxDetail/nvtxExtInit.h index 724c217a5..abb993e2d 100644 --- a/src/include/nvtx3/nvtxExtDetail/nvtxExtInit.h +++ b/src/include/nvtx3/nvtxDetail/nvtxExtInit.h @@ -1,5 +1,5 @@ /* -* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. +* Copyright 2009-2023 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. @@ -22,7 +22,7 @@ extern "C" { #define NVTX_PATHCHAR wchar_t #define NVTX_STR(x) L##x #define NVTX_GETENV _wgetenv -#define NVTX_BUFSIZE MAX_PATH +#define NVTX_BUFSIZE 16384 #define NVTX_DLLHANDLE HMODULE #define NVTX_DLLOPEN(x) LoadLibraryW(x) #define NVTX_DLLFUNC GetProcAddress @@ -39,14 +39,14 @@ extern "C" { #define NVTX_PATHCHAR char #define NVTX_STR(x) x #define NVTX_GETENV getenv -#define NVTX_BUFSIZE PATH_MAX +#define NVTX_BUFSIZE 16384 #define NVTX_DLLHANDLE void* #define NVTX_DLLOPEN(x) dlopen(x, RTLD_LAZY) #define NVTX_DLLFUNC dlsym #define NVTX_DLLCLOSE dlclose #define NVTX_YIELD() sched_yield() #define NVTX_MEMBAR() __sync_synchronize() -/* Ensure full memory barrier for atomics, to match Windows functions */ +/* Ensure full memory barrier for atomics, to match Windows functions. */ #define NVTX_ATOMIC_WRITE_32(address, value) __sync_synchronize(); __sync_lock_test_and_set(address, value) #define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) __sync_synchronize(); old = __sync_val_compare_and_swap(address, exchange, comparand) #define NVTX_ATOMIC_WRITE_PTR(address, value) __sync_synchronize(); __sync_lock_test_and_set(address, value) @@ -63,7 +63,7 @@ extern "C" { #define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0 #endif -/* Define this to 1 for platforms that support environment variables */ +/* Define this to 1 for platforms that support environment variables. */ /* TODO: Detect UWP, a.k.a. Windows Store app, and set this to 0. */ /* Try: #if defined(WINAPI_FAMILY_PARTITION) && WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) */ #define NVTX_SUPPORT_ENV_VARS 1 @@ -72,16 +72,16 @@ extern "C" { #define NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY 1 /* Injection libraries implementing InitializeInjectionNvtxExtension may be statically linked, -* and this will override any dynamic injection. Useful for platforms where dynamic -* injection is not available. Since weak symbols not explicitly marked extern are -* guaranteed to be initialized to zero if no definitions are found by the linker, the -* dynamic injection process proceeds normally if pfnInitializeInjectionNvtx2 is 0. */ + * which will override any dynamic injection. 
This is useful for platforms, where dynamic + * injection is not available. Since weak symbols, not explicitly marked extern, are + * guaranteed to be initialized to zero, if no definitions are found by the linker, the + * dynamic injection process proceeds normally, if pfnInitializeInjectionNvtx2 is 0. */ #if defined(__GNUC__) && !defined(_WIN32) && !defined(__CYGWIN__) #define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 1 /* To statically inject an NVTX library, define InitializeInjectionNvtxExtension_fnptr as a normal -* symbol (not weak) pointing to the implementation of InitializeInjectionNvtxExtension (which -* does not need to be named "InitializeInjectionNvtxExtension" as is necessary in a dynamic -* injection library. */ + * symbol (not weak) pointing to the implementation of InitializeInjectionNvtxExtension, which + * does not need to be named "InitializeInjectionNvtxExtension" as it is necessary in a dynamic + * injection library. */ __attribute__((weak)) NvtxExtInitializeInjectionFunc_t InitializeInjectionNvtxExtension_fnptr; #else #define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 0 @@ -89,35 +89,37 @@ __attribute__((weak)) NvtxExtInitializeInjectionFunc_t InitializeInjectionNvtxEx -/* This function tries to find or load an NVTX injection library and get the -* address of its InitializeInjectionExtension function. If such a function pointer -* is found, it is called, and passed the address of this NVTX instance's -* nvtxGetExportTable function, so the injection can attach to this instance. -* If the initialization fails for any reason, any dynamic library loaded will -* be freed, and all NVTX implementation functions will be set to no-ops. If -* initialization succeeds, NVTX functions not attached to the tool will be set -* to no-ops. This is implemented as one function instead of several small -* functions to minimize the number of weak symbols the linker must resolve. -* Order of search is: -* - Pre-injected library exporting InitializeInjectionNvtxExtension -* - Loadable library exporting InitializeInjectionNvtxExtension -* - Path specified by env var NVTX_INJECTION??_PATH (?? is 32 or 64) -* - On Android, libNvtxInjection??.so within the package (?? is 32 or 64) -* - Statically-linked injection library defining InitializeInjectionNvtx2_fnptr -*/ -NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(NvtxExtInitializeInjectionFunc_t* out_init_fnptr); -NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(NvtxExtInitializeInjectionFunc_t* out_init_fnptr) +/* This function tries to find or load an NVTX injection library and get the address of its + * `InitializeInjectionExtension` function. If such a function pointer is found, it is called and + * passed the address of this NVTX instance's `nvtxGetExportTable` function, so that the injection + * can attach to this instance. + * If the initialization fails for any reason, any dynamic library loaded will be freed, and all + * NVTX implementation functions will be set to no-ops. If the initialization succeeds, NVTX + * functions that are not attached to the tool will be set to no-ops. This is implemented as one + * function instead of several small functions to minimize the number of weak symbols the linker + * must resolve. The order of search is: + * 1) Pre-injected library exporting InitializeInjectionNvtxExtension + * 2) Loadable library exporting InitializeInjectionNvtxExtension + * - Path specified by env var NVTX_INJECTION??_PATH (?? 
is 32 or 64) + * - On Android, libNvtxInjection??.so within the package (?? is 32 or 64) + * 3) Statically-linked injection library defining InitializeInjectionNvtx2_fnptr + */ +NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)( + NvtxExtInitializeInjectionFunc_t* out_init_fnptr); +NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)( + NvtxExtInitializeInjectionFunc_t* out_init_fnptr) { const char* const initFuncName = "InitializeInjectionNvtxExtension"; NvtxExtInitializeInjectionFunc_t init_fnptr = (NvtxExtInitializeInjectionFunc_t)0; NVTX_DLLHANDLE injectionLibraryHandle = (NVTX_DLLHANDLE)0; - if(out_init_fnptr){ + if (out_init_fnptr) + { *out_init_fnptr = (NvtxExtInitializeInjectionFunc_t)0; } #if NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY - /* Use POSIX global symbol chain to query for init function from any module */ + /* Use POSIX global symbol chain to query for init function from any module. */ init_fnptr = (NvtxExtInitializeInjectionFunc_t)NVTX_DLLFUNC(0, initFuncName); #endif @@ -127,7 +129,7 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection { #if NVTX_SUPPORT_ENV_VARS /* If env var NVTX_INJECTION64_PATH is set, it should contain the path - * to a 64-bit dynamic NVTX injection library (and similar for 32-bit). */ + to a 64-bit dynamic NVTX injection library (and similar for 32-bit). */ const NVTX_PATHCHAR* const nvtxEnvVarName = (sizeof(void*) == 4) ? NVTX_STR("NVTX_INJECTION32_PATH") : NVTX_STR("NVTX_INJECTION64_PATH"); @@ -135,12 +137,12 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection NVTX_PATHCHAR injectionLibraryPathBuf[NVTX_BUFSIZE]; const NVTX_PATHCHAR* injectionLibraryPath = (const NVTX_PATHCHAR*)0; - /* Refer to this variable explicitly in case all references to it are #if'ed out */ + /* Refer to this variable explicitly in case all references to it are #if'ed out. */ (void)injectionLibraryPathBuf; #if NVTX_SUPPORT_ENV_VARS /* Disable the warning for getenv & _wgetenv -- this usage is safe because - * these functions are not called again before using the returned value. */ + these functions are not called again before using the returned value. */ #if defined(_MSC_VER) #pragma warning( push ) #pragma warning( disable : 4996 ) @@ -188,7 +190,8 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection pkgName[bytesRead] = 0; - /* String can contain colon as a process separator. In this case the package name is before the colon. */ + /* String can contain colon as a process separator. In this case the + package name is before the colon. */ pos = 0; while (pos < bytesRead && pkgName[pos] != ':' && pkgName[pos] != '\0') { @@ -223,8 +226,8 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection } #endif - /* At this point, injectionLibraryPath is specified if a dynamic - * injection library was specified by a tool. */ + /* At this point, `injectionLibraryPath` is specified if a dynamic + injection library was specified by a tool. */ if (injectionLibraryPath) { /* Load the injection library */ @@ -236,7 +239,7 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection } else { - /* Attempt to get the injection library's entry-point */ + /* Attempt to get the injection library's entry-point. 
*/ init_fnptr = (NvtxExtInitializeInjectionFunc_t)NVTX_DLLFUNC(injectionLibraryHandle, initFuncName); if (!init_fnptr) { @@ -252,8 +255,8 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection #if NVTX_SUPPORT_STATIC_INJECTION_LIBRARY if (!init_fnptr) { - /* Check weakly-defined function pointer. A statically-linked injection can define this as - * a normal symbol and it will take precedence over a dynamic injection. */ + /* Check weakly-defined function pointer. A statically-linked injection can define + this as a normal symbol and it will take precedence over a dynamic injection. */ if (InitializeInjectionNvtxExtension_fnptr) { init_fnptr = InitializeInjectionNvtxExtension_fnptr; @@ -261,13 +264,13 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection } #endif - if(out_init_fnptr){ + if (out_init_fnptr) + { *out_init_fnptr = init_fnptr; } - /* At this point, if init_fnptr is not set, then no tool has specified - * an NVTX injection library -- return non-success result so all NVTX - * API functions will be set to no-ops. */ + /* At this point, if `init_fnptr` is not set, no tool has specified an NVTX injection library. + Non-success result is returned, so that all NVTX API functions will be set to no-ops. */ if (!init_fnptr) { return NVTX_ERR_NO_INJECTION_LIBRARY_AVAILABLE; @@ -276,16 +279,19 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection return NVTX_SUCCESS; } +/* Avoid warnings about missing prototypes. */ +NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) ( + nvtxExtModuleInfo_t* moduleInfo, intptr_t* moduleState); NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) ( - nvtxExtModuleInfo_t* moduleInfo, - intptr_t* moduleState - ) + nvtxExtModuleInfo_t* moduleInfo, intptr_t* moduleState) { intptr_t old; NVTX_INFO( "%s\n", __FUNCTION__ ); - if( *moduleState == NVTX_EXTENSION_LOADED) { + if (*moduleState == NVTX_EXTENSION_LOADED) + { + NVTX_INFO("Module loaded\n"); return; } @@ -296,45 +302,55 @@ NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) ( NVTX_EXTENSION_FRESH); if (old == NVTX_EXTENSION_FRESH) { - NvtxExtInitializeInjectionFunc_t init_fnptr = NVTX_VERSIONED_IDENTIFIER(injectionFnPtr); + NvtxExtInitializeInjectionFunc_t init_fnptr = + NVTX_VERSIONED_IDENTIFIER(nvtxExtGlobals1).injectionFnPtr; int entryPointStatus = 0; int forceAllToNoops = 0; + size_t s; - /* Load & initialize injection library -- it will assign the function pointers */ - if(init_fnptr == 0){ + /* Load and initialize injection library, which will assign the function pointers. */ + if (init_fnptr == 0) + { int result = 0; - /* try to load vanilla NVTX first*/ + /* Try to load vanilla NVTX first. */ nvtxInitialize(0); result = NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(&init_fnptr); - /*at this point init_fnptr will be either 0 or a real function*/ + /* At this point `init_fnptr` will be either 0 or a real function. */ - if(result == NVTX_SUCCESS) { - NVTX_VERSIONED_IDENTIFIER(injectionFnPtr) = init_fnptr; + if (result == NVTX_SUCCESS) + { + NVTX_VERSIONED_IDENTIFIER(nvtxExtGlobals1).injectionFnPtr = init_fnptr; } - else { + else + { NVTX_ERR("Failed to load injection library\n"); } } - if(init_fnptr != 0) { - /* Invoke injection library's initialization function. If it returns - * 0 (failure) and a dynamic injection was loaded, unload it. */ + if (init_fnptr != 0) + { + /* Invoke injection library's initialization function. 
If it returns + 0 (failure) and a dynamic injection was loaded, unload it. */ entryPointStatus = init_fnptr(moduleInfo); - if (entryPointStatus == 0) { + if (entryPointStatus == 0) + { NVTX_ERR("Failed to initialize injection library -- initialization function returned 0\n"); } } - /* Clean up any functions that are still uninitialized so that they are skipped. - * Set all to null if injection init function failed as well. - */ + /* Clean up any functions that are still uninitialized so that they are + skipped. Set all to null if injection init function failed as well. */ forceAllToNoops = (init_fnptr == 0) || (entryPointStatus == 0); - for(size_t s = 0; s < moduleInfo->segmentsCount; ++s){ - nvtxExtModuleSegment_t* segment = moduleInfo->segments+s; - for(size_t i = 0; i < segment->slotCount; ++i){ - if(forceAllToNoops || (segment->functionSlots[i] == NVTX_EXTENSION_FRESH)){ + for (s = 0; s < moduleInfo->segmentsCount; ++s) + { + nvtxExtModuleSegment_t* segment = moduleInfo->segments + s; + size_t i; + for (i = 0; i < segment->slotCount; ++i) + { + if (forceAllToNoops || (segment->functionSlots[i] == NVTX_EXTENSION_FRESH)) + { segment->functionSlots[i] = NVTX_EXTENSION_DISABLED; } } @@ -342,12 +358,11 @@ NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) ( NVTX_MEMBAR(); - /* Signal that initialization has finished, so now the assigned function pointers will be used */ - NVTX_ATOMIC_WRITE_PTR( - moduleState, - NVTX_EXTENSION_LOADED); + /* Signal that initialization has finished and the assigned function + pointers will be used. */ + NVTX_ATOMIC_WRITE_PTR(moduleState, NVTX_EXTENSION_LOADED); } - else /* Spin-wait until initialization has finished */ + else /* Spin-wait until initialization has finished. */ { NVTX_MEMBAR(); while (*moduleState != NVTX_EXTENSION_LOADED) diff --git a/src/include/nvtx3/nvtxDetail/nvtxExtPayloadHelperInternal.h b/src/include/nvtx3/nvtxDetail/nvtxExtPayloadHelperInternal.h new file mode 100644 index 000000000..71e30bc37 --- /dev/null +++ b/src/include/nvtx3/nvtxDetail/nvtxExtPayloadHelperInternal.h @@ -0,0 +1,272 @@ +/* +* Copyright 2023 NVIDIA Corporation. All rights reserved. +* +* Licensed under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. +* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ + +#ifndef NVTX_EXT_PAYLOAD_HELPER_INTERNAL_H +#define NVTX_EXT_PAYLOAD_HELPER_INTERNAL_H + +/* General helper macros */ +#include "nvtxExtHelperMacros.h" + +/* Get variable name with line number (almost unique per file). */ +#define _NVTX_PAYLOAD_DATA_VAR NVTX_EXT_CONCAT(nvtxDFDB,__LINE__) + +/* Create real arguments from just pasting tokens next to each other. */ +#define _NVTX_PAYLOAD_PASS_THROUGH(...) __VA_ARGS__ + +/* Avoid prefixing `NVTX_PAYLOAD_ENTRY_` for nested payloads. */ +#define NVTX_PAYLOAD_ENTRY_THROWAWAY +#define _NVTX_PAYLOAD_NESTED(id) THROWAWAY id + +/* + * Create the NVTX binary payloads schema attributes. + * + * @param struct_id The name of the struct. + * @param schema_name The name of the schema. + * @param schema_flags Additional schema flags + * @param mask_add Fields to be added to the mask. + * @param num_entries The number schema entries. 
+ */ +#define NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, schema_id, mask_add, num_entries) \ + nvtxPayloadSchemaAttr_t struct_id##Attr = { \ + /*.fieldMask = */NVTX_PAYLOAD_SCHEMA_ATTR_TYPE | mask_add \ + NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | \ + NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES | \ + NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE, \ + /*.name = */schema_name, \ + /*.type = */NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, \ + /*.flags = */schema_flags, \ + /*.entries = */struct_id##Schema, /*.numEntries = */num_entries, \ + /*.payloadStaticSize = */sizeof(struct_id), \ + /*.packAlign = */0, /*.schemaId = */schema_id}; + + +/*****************************************************************/ +/*** Helper for `NVTX_DEFINE_SCHEMA_FOR_STRUCT[_AND_REGISTER]` ***/ + +/* First part of schema entry for different number of arguments. */ +#define _NVTX_PAYLOAD_SCHEMA_EF2(member, etype) \ + 0, NVTX_PAYLOAD_ENTRY_##etype, NULL, NULL, 0, +#define _NVTX_PAYLOAD_SCHEMA_EF3(member, etype, name) \ + 0, NVTX_PAYLOAD_ENTRY_##etype, name, NULL, 0, +#define _NVTX_PAYLOAD_SCHEMA_EF4(member, etype, name, desc) \ + 0, NVTX_PAYLOAD_ENTRY_##etype, name, desc, 0, +#define _NVTX_PAYLOAD_SCHEMA_EF5(member, etype, name, desc, arraylen) \ + 0, NVTX_PAYLOAD_ENTRY_##etype, name, desc, arraylen, +#define _NVTX_PAYLOAD_SCHEMA_EF6(member, etype, name, desc, arraylen, flags) \ + NVTX_PAYLOAD_ENTRY_FLAG_##flags, NVTX_PAYLOAD_ENTRY_##etype, name, desc, arraylen, + +#define _NVTX_PAYLOAD_SCHEMA_ENTRY_FRONT(...) \ + NVTX_EXT_CONCAT(_NVTX_PAYLOAD_SCHEMA_EF, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__) + +/* Second part of schema entry (append struct member). + (At least two arguments are passed (`member` and `etype`). */ +#define _NVTX_PAYLOAD_SCHEMA_ENTRY_END(member, ...) member + +/* Resolve to schema entry. `entry` is `(ctype, name, ...)`. */ +#define _NVTX_PAYLOAD_SCHEMA_ENTRY(struct_id, entry) \ + {_NVTX_PAYLOAD_SCHEMA_ENTRY_FRONT entry \ + offsetof(struct_id, _NVTX_PAYLOAD_SCHEMA_ENTRY_END entry)}, + +/* Handle up to 16 schema entries. */ +#define _NVTX_PAYLOAD_SME1(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) +#define _NVTX_PAYLOAD_SME2(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME1(s,__VA_ARGS__) +#define _NVTX_PAYLOAD_SME3(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME2(s,__VA_ARGS__) +#define _NVTX_PAYLOAD_SME4(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME3(s,__VA_ARGS__) +#define _NVTX_PAYLOAD_SME5(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME4(s,__VA_ARGS__) +#define _NVTX_PAYLOAD_SME6(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME5(s,__VA_ARGS__) +#define _NVTX_PAYLOAD_SME7(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME6(s,__VA_ARGS__) +#define _NVTX_PAYLOAD_SME8(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME7(s,__VA_ARGS__) +#define _NVTX_PAYLOAD_SME9(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME8(s,__VA_ARGS__) +#define _NVTX_PAYLOAD_SME10(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME9(s,__VA_ARGS__) +#define _NVTX_PAYLOAD_SME11(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME10(s,__VA_ARGS__) +#define _NVTX_PAYLOAD_SME12(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME11(s,__VA_ARGS__) +#define _NVTX_PAYLOAD_SME13(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME12(s,__VA_ARGS__) +#define _NVTX_PAYLOAD_SME14(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME13(s,__VA_ARGS__) +#define _NVTX_PAYLOAD_SME15(s,e1,...) 
_NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME14(s,__VA_ARGS__) +#define _NVTX_PAYLOAD_SME16(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME15(s,__VA_ARGS__) + +#define _NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, ...) \ + nvtxPayloadSchemaEntry_t struct_id##Schema[] = { \ + NVTX_EXT_CONCAT(_NVTX_PAYLOAD_SME, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(struct_id, __VA_ARGS__) \ + {0, 0} \ + }; + +/* + * Handle optional parameters for `NVTX_DEFINE_SCHEMA_FOR_STRUCT[_AND_REGISTER]`. + */ +#define _NVTX_DEFINE_S4S_6(struct_id, schema_name, prefix, schema_flags, schema_id, entries) \ + prefix _NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ + prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, schema_id, \ + NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID |,\ + NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries)) +#define _NVTX_DEFINE_S4S_5(struct_id, schema_name, prefix, schema_flags, entries) \ + prefix _NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ + prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, 0, \ + NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS |, \ + NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries)) +#define _NVTX_DEFINE_S4S_4(struct_id, schema_name, prefix, entries) \ + prefix _NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ + prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, 0, \ + NVTX_PAYLOAD_SCHEMA_ATTR_NAME |, \ + NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries)) +#define _NVTX_DEFINE_S4S_3(struct_id, schema_name, entries) \ + _NVTX_DEFINE_S4S_4(struct_id, schema_name, /*prefix*/, entries) +#define _NVTX_DEFINE_S4S_2(struct_id, entries) \ + _NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ + NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, NULL, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, 0, ,\ + NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries)) + +#define _NVTX_DEFINE_SCHEMA_FOR_STRUCT(struct_id, ...) \ + NVTX_EXT_CONCAT(_NVTX_DEFINE_S4S_, \ + NVTX_EXT_NUM_ARGS(struct_id, __VA_ARGS__))(struct_id, __VA_ARGS__) + +/*** END: Helper for `NVTX_PAYLOAD_STATIC_SCHEMA_{DEFINE,SETUP}` ***/ + + +/******************************************************************/ +/*** Helper for `NVTX_DEFINE_STRUCT_WITH_SCHEMA[_AND_REGISTER]` ***/ + +/* Extract struct member for fixed-size arrays. */ +#define _NVTX_PAYLOAD_STRUCT_ARR_MEM1(name) name +#define _NVTX_PAYLOAD_STRUCT_ARR_MEM2(name, count) name[count] + +/* Extract type and member name and handle special case of fixed-size array. */ +#define _NVTX_PAYLOAD_STRUCT_E2(type, member) type member; +#define _NVTX_PAYLOAD_STRUCT_E3(type, member, etype) type member; +#define _NVTX_PAYLOAD_STRUCT_E4(type, member, etype, name) type member; +#define _NVTX_PAYLOAD_STRUCT_E5(type, member, etype, name, desc) type member; +#define _NVTX_PAYLOAD_STRUCT_E6(type, member, etype, name, desc, arraylen) \ + type NVTX_EXT_CONCAT(_NVTX_PAYLOAD_STRUCT_ARR_MEM, NVTX_EXT_NUM_ARGS member) member; +#define _NVTX_PAYLOAD_STRUCT_E7(type, member, etype, name, desc, arraylen, flags) \ + _NVTX_PAYLOAD_STRUCT_E6(type, member, etype, name, desc, arraylen) + +/* Handle different number of arguments per struct entry. */ +#define _NVTX_PAYLOAD_STRUCT_ENTRY_(...) \ + NVTX_EXT_CONCAT(_NVTX_PAYLOAD_STRUCT_E, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__) + +/* Handle up to 16 struct members. 
*/ +#define _NVTX_PAYLOAD_STRUCT_ENTRY(entry) _NVTX_PAYLOAD_STRUCT_ENTRY_ entry +#define _NVTX_PAYLOAD_STRUCT1(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) +#define _NVTX_PAYLOAD_STRUCT2(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT1(__VA_ARGS__) +#define _NVTX_PAYLOAD_STRUCT3(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT2(__VA_ARGS__) +#define _NVTX_PAYLOAD_STRUCT4(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT3(__VA_ARGS__) +#define _NVTX_PAYLOAD_STRUCT5(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT4(__VA_ARGS__) +#define _NVTX_PAYLOAD_STRUCT6(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT5(__VA_ARGS__) +#define _NVTX_PAYLOAD_STRUCT7(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT6(__VA_ARGS__) +#define _NVTX_PAYLOAD_STRUCT8(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT7(__VA_ARGS__) +#define _NVTX_PAYLOAD_STRUCT9(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT8(__VA_ARGS__) +#define _NVTX_PAYLOAD_STRUCT10(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT9(__VA_ARGS__) +#define _NVTX_PAYLOAD_STRUCT11(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT10(__VA_ARGS__) +#define _NVTX_PAYLOAD_STRUCT12(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT11(__VA_ARGS__) +#define _NVTX_PAYLOAD_STRUCT13(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT12(__VA_ARGS__) +#define _NVTX_PAYLOAD_STRUCT14(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT13(__VA_ARGS__) +#define _NVTX_PAYLOAD_STRUCT15(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT14(__VA_ARGS__) +#define _NVTX_PAYLOAD_STRUCT16(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT15(__VA_ARGS__) + +/* Generate the typedef. */ +#define _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, ...) \ + typedef struct { \ + NVTX_EXT_CONCAT(_NVTX_PAYLOAD_STRUCT, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__) \ + } struct_id; + +/* Generate first part of the schema entry. */ +#define _NVTX_PAYLOAD_INIT_SCHEMA_N3(type, memberId, etype) \ + 0, NVTX_PAYLOAD_ENTRY_##etype, NULL, NULL, 0, +#define _NVTX_PAYLOAD_INIT_SCHEMA_N4(type, memberId, etype, name) \ + 0, NVTX_PAYLOAD_ENTRY_##etype, name, NULL, 0, +#define _NVTX_PAYLOAD_INIT_SCHEMA_N5(type, memberId, etype, name, desc) \ + 0, NVTX_PAYLOAD_ENTRY_##etype, name, desc, 0, +#define _NVTX_PAYLOAD_INIT_SCHEMA_N6(type, memberId, etype, name, desc, arraylen) \ + 0, NVTX_PAYLOAD_ENTRY_##etype, name, desc, arraylen, +#define _NVTX_PAYLOAD_INIT_SCHEMA_N7(type, memberId, etype, name, desc, arraylen, flags) \ + NVTX_PAYLOAD_ENTRY_FLAG_##flags, NVTX_PAYLOAD_ENTRY_##etype, name, desc, arraylen, + +#define _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY_FRONT(...) \ + NVTX_EXT_CONCAT(_NVTX_PAYLOAD_INIT_SCHEMA_N, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__) + +#define _NVTX_PAYLOAD_ARRAY_MEMBER1(name) name +#define _NVTX_PAYLOAD_ARRAY_MEMBER2(name, count) name + +/* Resolve to last part of schema entry (append struct member). */ +#define _NVTX_PAYLOAD_INIT_SCHEMA_NX3(type, memberId, ...) memberId +#define _NVTX_PAYLOAD_INIT_SCHEMA_NX4(type, memberId, ...) memberId +#define _NVTX_PAYLOAD_INIT_SCHEMA_NX5(type, memberId, ...) memberId +#define _NVTX_PAYLOAD_INIT_SCHEMA_NX6(type, memberId, ...) \ + NVTX_EXT_CONCAT(_NVTX_PAYLOAD_ARRAY_MEMBER, NVTX_EXT_NUM_ARGS memberId) memberId +#define _NVTX_PAYLOAD_INIT_SCHEMA_NX7(type, memberId, ...) \ + _NVTX_PAYLOAD_INIT_SCHEMA_NX6(type, memberId, __VA_ARGS__) + +#define _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY_END(...) 
\ + NVTX_EXT_CONCAT(_NVTX_PAYLOAD_INIT_SCHEMA_NX, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__) + +/* Resolve to schema entry. `entry` is `(ctype, name, ...)`. */ +#define _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(struct_id, entry) \ + {_NVTX_PAYLOAD_SCHEMA_INIT_ENTRY_FRONT entry \ + offsetof(struct_id, _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY_END entry)}, + +/* Handle up to 16 schema entries. */ +#define _NVTX_PAYLOAD_INIT_SME1(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) +#define _NVTX_PAYLOAD_INIT_SME2(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME1(s, __VA_ARGS__) +#define _NVTX_PAYLOAD_INIT_SME3(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME2(s, __VA_ARGS__) +#define _NVTX_PAYLOAD_INIT_SME4(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME3(s, __VA_ARGS__) +#define _NVTX_PAYLOAD_INIT_SME5(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME4(s, __VA_ARGS__) +#define _NVTX_PAYLOAD_INIT_SME6(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME5(s, __VA_ARGS__) +#define _NVTX_PAYLOAD_INIT_SME7(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME6(s, __VA_ARGS__) +#define _NVTX_PAYLOAD_INIT_SME8(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME7(s, __VA_ARGS__) +#define _NVTX_PAYLOAD_INIT_SME9(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME8(s, __VA_ARGS__) +#define _NVTX_PAYLOAD_INIT_SME10(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME9(s, __VA_ARGS__) +#define _NVTX_PAYLOAD_INIT_SME11(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME10(s, __VA_ARGS__) +#define _NVTX_PAYLOAD_INIT_SME12(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME11(s, __VA_ARGS__) +#define _NVTX_PAYLOAD_INIT_SME13(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME12(s, __VA_ARGS__) +#define _NVTX_PAYLOAD_INIT_SME14(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME13(s, __VA_ARGS__) +#define _NVTX_PAYLOAD_INIT_SME15(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME14(s, __VA_ARGS__) +#define _NVTX_PAYLOAD_INIT_SME16(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME15(s, __VA_ARGS__) + +#define _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, ...) \ + nvtxPayloadSchemaEntry_t struct_id##Schema[] = { \ + NVTX_EXT_CONCAT(_NVTX_PAYLOAD_INIT_SME, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(struct_id, __VA_ARGS__) \ + {0, 0} \ + }; + +/* + * Handle optional parameters for `NVTX_DEFINE_STRUCT_WITH_SCHEMA[_AND_REGISTER]`. 
+ */ +#define _NVTX_DEFINE_SWS_6(struct_id, schema_name, prefix, schema_flags, schema_id, entries) \ + _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ + prefix _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ + prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, schema_id, \ + NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS | \ + NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID |, \ + NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries)) +#define _NVTX_DEFINE_SWS_5(struct_id, schema_name, prefix, schema_flags, entries) \ + _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ + prefix _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ + prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, 0, \ + NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS |, \ + NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries)) +#define _NVTX_DEFINE_SWS_4(struct_id, schema_name, prefix, entries) \ + _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ + prefix _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ + prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, 0, \ + NVTX_PAYLOAD_SCHEMA_ATTR_NAME |, \ + NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries)) +#define _NVTX_DEFINE_SWS_3(struct_id, schema_name, entries) \ + _NVTX_DEFINE_SWS_4(struct_id, schema_name, /* no prefix */, entries) +#define _NVTX_DEFINE_SWS_2(struct_id, entries) \ + _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ + _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ + NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, NULL, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, 0, , \ + NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries)) + +#define _NVTX_DEFINE_STRUCT_WITH_SCHEMA(struct_id, ...) \ + NVTX_EXT_CONCAT(_NVTX_DEFINE_SWS_, \ + NVTX_EXT_NUM_ARGS(struct_id, __VA_ARGS__))(struct_id, __VA_ARGS__) + +/*** END: Helper for `NVTX_PAYLOAD_STATIC_SCHEMA_{INIT,CREATE}` */ + +#endif /* NVTX_EXT_PAYLOAD_HELPER_INTERNAL_H */ \ No newline at end of file diff --git a/src/include/nvtx3/nvtxExtDetail/nvtxExtPayloadTypeInfo.h b/src/include/nvtx3/nvtxDetail/nvtxExtPayloadTypeInfo.h similarity index 90% rename from src/include/nvtx3/nvtxExtDetail/nvtxExtPayloadTypeInfo.h rename to src/include/nvtx3/nvtxDetail/nvtxExtPayloadTypeInfo.h index 7c166bd34..6a30e6633 100644 --- a/src/include/nvtx3/nvtxExtDetail/nvtxExtPayloadTypeInfo.h +++ b/src/include/nvtx3/nvtxDetail/nvtxExtPayloadTypeInfo.h @@ -10,14 +10,14 @@ #error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined). #endif -typedef void* pointer_type; +typedef void* nvtx_payload_pointer_type; #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) #include #include #endif -/* `alignof` is available as of C11 or C++11 */ +/* `alignof` is available as of C11 or C++11. */ #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || (defined(__cplusplus) && __cplusplus >= 201103L) #define nvtx_alignof(type) alignof(type) @@ -54,7 +54,7 @@ MKTYPEDEF(double); MKTYPEDEF2(long double, longdouble); MKTYPEDEF(size_t); -MKTYPEDEF(pointer_type); +MKTYPEDEF(nvtx_payload_pointer_type); MKTYPEDEF(wchar_t); @@ -85,8 +85,16 @@ MKTYPEDEF(wchar_t); /* * Helper array to get the alignment for each predefined C/C++ language type. 
* The order of entries must match the values in`enum nvtxPayloadSchemaEntryType`. + * + * In C++, `const` variables use internal linkage by default, but we need it to + * be public (extern) since weak declarations must be public. */ -const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE] = +NVTX_LINKONCE_DEFINE_GLOBAL +#ifdef __cplusplus +extern +#endif +const nvtxPayloadEntryTypeInfo_t +NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadTypeInfo)[NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE] = { /* The first entry contains this array's length and the size of each entry in this array. */ {NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE, sizeof(nvtxPayloadEntryTypeInfo_t)}, @@ -119,7 +127,7 @@ const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_ /* NVTX_PAYLOAD_ENTRY_TYPE_LONGDOUBLE */ {sizeof(long double), nvtx_alignof2(long double, longdouble)}, /* NVTX_PAYLOAD_ENTRY_TYPE_SIZE */ {sizeof(size_t), nvtx_alignof(size_t)}, - /* NVTX_PAYLOAD_ENTRY_TYPE_ADDRESS */ {sizeof(pointer_type), nvtx_alignof(pointer_type)}, + /* NVTX_PAYLOAD_ENTRY_TYPE_ADDRESS */ {sizeof(nvtx_payload_pointer_type), nvtx_alignof(nvtx_payload_pointer_type)}, /*** Special character types ***/ /* NVTX_PAYLOAD_ENTRY_TYPE_WCHAR */ {sizeof(wchar_t), nvtx_alignof(wchar_t)}, @@ -140,4 +148,4 @@ const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_ }; #undef nvtx_alignof -#undef nvtx_alignof2 +#undef nvtx_alignof2 \ No newline at end of file diff --git a/src/include/nvtx3/nvtxExtDetail/nvtxExtTypes.h b/src/include/nvtx3/nvtxDetail/nvtxExtTypes.h similarity index 100% rename from src/include/nvtx3/nvtxExtDetail/nvtxExtTypes.h rename to src/include/nvtx3/nvtxDetail/nvtxExtTypes.h diff --git a/src/include/nvtx3/nvtxDetail/nvtxImpl.h b/src/include/nvtx3/nvtxDetail/nvtxImpl.h index 590ce9024..5ffc4abb4 100644 --- a/src/include/nvtx3/nvtxDetail/nvtxImpl.h +++ b/src/include/nvtx3/nvtxDetail/nvtxImpl.h @@ -10,37 +10,34 @@ #error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined). 
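The type-info table updated above records one {size, alignment} pair per predefined payload entry type, so a tool decoding a binary payload does not need to know the producer's compiler settings. A minimal, purely illustrative C11 snippet (not part of the patch) showing what one such pair encodes:

/* Prints the values an entry such as NVTX_PAYLOAD_ENTRY_TYPE_DOUBLE records. */
#include <stdalign.h>
#include <stdio.h>

int main(void) {
    printf("double: size=%zu, align=%zu\n", sizeof(double), alignof(double));
    return 0;
}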
#endif +#include +#include +#include +#include + /* ---- Include required platform headers ---- */ -#if defined(_WIN32) +#if defined(_WIN32) -#include +#include #else #include #if defined(__ANDROID__) -#include +#include #endif #if defined(__linux__) || defined(__CYGWIN__) #include #endif +#include #include #include #include -#include -#include -#include -#include #include - -#include -#include #include -#include -#include #endif diff --git a/src/include/nvtx3/nvtxDetail/nvtxInit.h b/src/include/nvtx3/nvtxDetail/nvtxInit.h index 43cad7010..03568f149 100644 --- a/src/include/nvtx3/nvtxDetail/nvtxInit.h +++ b/src/include/nvtx3/nvtxDetail/nvtxInit.h @@ -14,11 +14,11 @@ /* Prefer macros over inline functions to reduce symbol resolution at link time */ -#if defined(_WIN32) +#if defined(_WIN32) #define NVTX_PATHCHAR wchar_t #define NVTX_STR(x) L##x #define NVTX_GETENV _wgetenv -#define NVTX_BUFSIZE MAX_PATH +#define NVTX_BUFSIZE 16384 #define NVTX_DLLHANDLE HMODULE #define NVTX_DLLOPEN(x) LoadLibraryW(x) #define NVTX_DLLFUNC GetProcAddress @@ -31,7 +31,7 @@ #define NVTX_PATHCHAR char #define NVTX_STR(x) x #define NVTX_GETENV getenv -#define NVTX_BUFSIZE PATH_MAX +#define NVTX_BUFSIZE 16384 #define NVTX_DLLHANDLE void* #define NVTX_DLLOPEN(x) dlopen(x, RTLD_LAZY) #define NVTX_DLLFUNC dlsym diff --git a/src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h b/src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h index 57661c754..28e765581 100644 --- a/src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h +++ b/src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h @@ -23,7 +23,7 @@ * In some situations it is desirable to declare a variable without initializing * it, refer to it in code or other variables' initializers, and then initialize * it later. Similarly, functions can be prototyped, have their address taken, - * and then have their body defined later. In such cases, use the FWDDECL macros + * and then have their body defined later. In such cases, use the FWDDECL macros * when forward-declaring LINKONCE global variables without initializers and * function prototypes, and then use the DEFINE macros when later defining them. * Although in many cases the FWDDECL macro is equivalent to the DEFINE macro, diff --git a/src/include/nvtx3/nvtxExtDetail/nvtxExtImplPayload_v1.h b/src/include/nvtx3/nvtxExtDetail/nvtxExtImplPayload_v1.h deleted file mode 100644 index 4663fda82..000000000 --- a/src/include/nvtx3/nvtxExtDetail/nvtxExtImplPayload_v1.h +++ /dev/null @@ -1,86 +0,0 @@ -/* -* Copyright 2021 NVIDIA Corporation. All rights reserved. -* -* Licensed under the Apache License v2.0 with LLVM Exceptions. -* See https://llvm.org/LICENSE.txt for license information. -* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -*/ - -#ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD -#error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined). -#endif - -#define NVTX_EXT_IMPL_GUARD -#include "nvtxExtImpl.h" -#undef NVTX_EXT_IMPL_GUARD - -#ifdef __cplusplus -extern "C" { -#endif /* __cplusplus */ - -#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) \ - NAME##_v##VERSION##_mem##COMPATID -#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) \ - NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) -#define NVTX_EXT_PAYLOAD_VERSIONED_ID(NAME) \ - NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_COMPATID_PAYLOAD) - -/* - * Function slots for the binary payload extension. 
First entry is the module - * state, initialized to `0` (`NVTX_EXTENSION_FRESH`). - */ -NVTX_LINKONCE_DEFINE_GLOBAL intptr_t -NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_PAYLOAD_FN_NUM + 1] - = {0}; - -NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)() -{ - intptr_t* fnSlots = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots) + 1; - nvtxExtModuleSegment_t segment = { - 0, // unused (only one segment) - NVTX3EXT_CBID_PAYLOAD_FN_NUM, - fnSlots - }; - - nvtxExtModuleInfo_t module = { - NVTX_VERSION, sizeof(nvtxExtModuleInfo_t), - NVTX_EXT_MODULEID_PAYLOAD, NVTX_EXT_COMPATID_PAYLOAD, - 1, &segment, // number of segments, segments - NULL, // no export function needed - // bake type sizes and alignment information into program binary - &nvtxExtPayloadTypeInfo - }; - - NVTX_INFO( "%s\n", __FUNCTION__ ); - - NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&module, - NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)); -} - -#define NVTX_EXT_FN_IMPL(ret_val, fn_name, signature, arg_names) \ -typedef ret_val ( * fn_name##_impl_fntype )signature; \ -NVTX_LINKONCE_DEFINE_FUNCTION ret_val fn_name signature { \ - intptr_t slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ - if (slot != NVTX_EXTENSION_DISABLED) { \ - if (slot) { \ - return (*(fn_name##_impl_fntype)slot) arg_names; \ - } else { \ - NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(); \ - slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ - if (slot != NVTX_EXTENSION_DISABLED && slot) { \ - return (*(fn_name##_impl_fntype)slot) arg_names; \ - } \ - } \ - } \ - return ((ret_val)(intptr_t)-1); \ -} - -NVTX_EXT_FN_IMPL(uint64_t, nvtxPayloadSchemaRegister, (nvtxDomainHandle_t domain, const nvtxPayloadSchemaAttr_t* attr), (domain, attr)) - -NVTX_EXT_FN_IMPL(uint64_t, nvtxPayloadEnumRegister, (nvtxDomainHandle_t domain, const nvtxPayloadEnumAttr_t* attr), (domain, attr)) - -#undef NVTX_EXT_FN_IMPL - -#ifdef __cplusplus -} /* extern "C" */ -#endif /* __cplusplus */ \ No newline at end of file diff --git a/src/include/p2p.h b/src/include/p2p.h index 9a3dbdb3b..5c73a6cf1 100644 --- a/src/include/p2p.h +++ b/src/include/p2p.h @@ -10,6 +10,9 @@ #define NCCL_P2P_H_ #include +#include + +#include "core.h" #if CUDART_VERSION < 12030 // MNNVL: FABRIC handle support lifted from CUDA 12.3 diff --git a/src/include/proxy.h b/src/include/proxy.h index cb1c3b200..eab6930fe 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -16,13 +16,29 @@ #include "shm.h" #include "p2p.h" +typedef enum : uint8_t { + ncclPatternRing, + ncclPatternRingTwice, + ncclPatternPipelineFrom, + ncclPatternPipelineTo, + ncclPatternTreeUp, + ncclPatternTreeDown, + ncclPatternTreeUpDown, + ncclPatternCollnetChain, + ncclPatternCollnetDirect, + ncclPatternNvls, + ncclPatternNvlsTree, + ncclPatternSend, + ncclPatternRecv +} ncclPattern_t; + enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress }; struct ncclProxyArgs; typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyState*, struct ncclProxyArgs*); #define NCCL_PROXY_MAX_SUBS MAXCHANNELS -static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements"); +static_assert(2*NCCL_MAX_DEV_WORK_P2P_PER_BATCH <= MAXCHANNELS, "Not enough sub space for max work elements"); union ncclProxyOpSpecifics { struct { @@ -124,8 +140,9 @@ struct ncclProxyArgs { // ProxyOps are used to communicate between main thread and service thread // Make 
sure we have enough to store two full rounds of operations on all channels. -// Otherwise we'd be unable to post half of them to free new elements. -#define MAX_OPS_PER_PEER (2*MAXCHANNELS*NCCL_MAX_WORK_ELEMENTS_P2P) +// Otherwise we'd be unable to post half of them to free new elements. Each +// p2p work contains a send and recv proxy op hence the 2x before it. +#define MAX_OPS_PER_PEER (2*MAXCHANNELS*2*NCCL_MAX_DEV_WORK_P2P_PER_BATCH) struct ncclProxyOpsPool { struct ncclProxyOp ops[MAX_OPS_PER_PEER*NCCL_MAX_LOCAL_RANKS]; @@ -243,7 +260,7 @@ struct ncclProxyState { bool dmaBufSupport; ncclNet_t* ncclNet; ncclCollNet_t* ncclCollNet; - volatile uint32_t* abortFlag; + uint32_t* abortFlag; // Service threads pthread_t thread; pthread_t threadUDS; @@ -301,7 +318,6 @@ enum proxyMode { }; ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* proxyOp, bool *justInquire); -ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* proxyOp, int reg); ncclResult_t ncclProxyStart(struct ncclComm* comm); ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses, uint64_t *peerAddressesUDS); ncclResult_t ncclProxyCreate(struct ncclComm* comm); diff --git a/src/include/register.h b/src/include/register.h index d23e0da3e..9f7c83faa 100644 --- a/src/include/register.h +++ b/src/include/register.h @@ -1,6 +1,11 @@ #ifndef NCCL_REGISTER_H_ #define NCCL_REGISTER_H_ +#include "device.h" + +#include +#include + enum { NET_REG_COMPLETE = 0x01, NVLS_REG_COMPLETE = 0x02, diff --git a/src/include/transport.h b/src/include/transport.h index 1671db0e2..07fbb3ec4 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -13,12 +13,14 @@ #include "core.h" #define NTRANSPORTS 4 +#define TRANSPORT_UNDEFINED -1 #define TRANSPORT_P2P 0 #define TRANSPORT_SHM 1 #define TRANSPORT_NET 2 #define TRANSPORT_COLLNET 3 #include "proxy.h" +#include "comm.h" extern struct ncclTransport p2pTransport; extern struct ncclTransport shmTransport; @@ -45,6 +47,7 @@ struct ncclPeerInfo { int cudaCompCap; // MNNVL support nvmlGpuFabricInfoV_t fabricInfo; + int cuMemSupport; }; #define CONNECT_SIZE 128 @@ -57,17 +60,21 @@ struct ncclConnect { #define NVLS_HANDLE_SIZE 64 struct ncclNvlsSharedRes { int refCount; - CUmulticastObjectProp properties; + bool inited; + CUmulticastObjectProp bufProp; + CUmulticastObjectProp signalProp; CUmemAccessDesc accessDesc; int dev; - size_t size; - size_t granularity; - CUmemGenericAllocationHandle mcHandle; // Multicast handle for NVLS buffer + size_t buffSize; + size_t creditSize; + CUmemGenericAllocationHandle mcBuffHandle; // Multicast handle for NVLS buffer + CUmemGenericAllocationHandle mcCreditHandle; // Multicast handle for NVLS credit buffer char* mcBuff; // Multicast NVLS buffer address - CUmemGenericAllocationHandle ucHandle; // Unicast Handle for NVLS buffer + char* mcCredit; // Multicast NVLS credit address + CUmemGenericAllocationHandle ucBuffHandle; // Unicast Handle for NVLS buffer + CUmemGenericAllocationHandle ucCreditHandle; // Unicast Handle for NVLS credit buffer char* ucBuff; // Unicast NVLS buffer address - char shareableHandle[NVLS_HANDLE_SIZE]; - size_t ucGran; + char* ucCredit; // Unicast NVLS credit address int nChannels; struct ncclShmemCollBuff nvlsShmem; void *nvlsShmemHandle; @@ -84,6 +91,7 @@ struct ncclCollNetSharedRes { void* resources; int nChannels; size_t buffSize; + int intraHighestTransportType; }; struct ncclTransportComm { @@ -111,7 +119,9 @@ ncclResult_t 
ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* ncclResult_t ncclNvlsInit(struct ncclComm* comm); ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent); -ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv); +ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm); +ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm); +ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueElts); ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv); ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size); ncclResult_t ncclNvlsFree(struct ncclComm* comm); @@ -121,6 +131,14 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail); ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm); ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufUsed, void** outHandle); -ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, struct ncclKernelPlan *plan, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle); +ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle, struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueElts); ncclResult_t ncclCollnetDeregBuffer(struct ncclComm* comm, struct ncclProxyConnector* proxyconn, void* handle); + +ncclResult_t ncclTransportRingConnect(struct ncclComm* comm); +ncclResult_t ncclTransportTreeConnect(struct ncclComm* comm); + +ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTopoGraph* graphs[]); +ncclResult_t ncclCollNetChainBufferSetup(ncclComm_t comm); +ncclResult_t ncclCollNetDirectBufferSetup(ncclComm_t comm); + #endif diff --git a/src/include/tuner.h b/src/include/tuner.h index df8f5f21c..285f87e81 100644 --- a/src/include/tuner.h +++ b/src/include/tuner.h @@ -9,14 +9,15 @@ #define NCCL_INT_TUNER_H_ #include "nccl_tuner.h" +#include "comm.h" // Tuning plugin to override NCCL's default algorithm/protocol tuning. // Attempts to load NCCL tuner from environmental variable. // Returns ncclSuccess if the correct tuner symbol has been found and // successully loaded. Otherwise returns an error and also logs the error. -ncclResult_t ncclTunerPluginLoad(ncclTuner_t** tuner); +ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm); // Cleans up NCCL tuner plugin. 
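With the change above, the tuner plugin is loaded and unloaded per communicator instead of through a standalone ncclTuner_t** handle. A hedged sketch of call sites under the new signatures; the wrapper names are invented for illustration, and only ncclTunerPluginLoad, ncclTunerPluginUnload and NCCLCHECK come from this patch:

/* Assumes the tuner.h and comm.h headers shown above are included. */
static ncclResult_t exampleTunerAttach(struct ncclComm* comm) {
  /* Per the comment above, an error is returned and logged if loading fails. */
  NCCLCHECK(ncclTunerPluginLoad(comm));
  return ncclSuccess;
}

static ncclResult_t exampleTunerDetach(struct ncclComm* comm) {
  /* Cleans up the tuner plugin state attached to this communicator. */
  NCCLCHECK(ncclTunerPluginUnload(comm));
  return ncclSuccess;
}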
-ncclResult_t ncclTunerPluginUnload(ncclTuner_t** tuner); +ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm); #endif diff --git a/src/include/utils.h b/src/include/utils.h index cfc009861..abecf2257 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -9,12 +9,14 @@ #include "nccl.h" #include "alloc.h" +#include "bitops.h" #include "checks.h" #include #include #include #include #include +#include int ncclCudaCompCap(); @@ -30,11 +32,6 @@ uint64_t getHostHash(); uint64_t getPidHash(); ncclResult_t getRandomData(void* buffer, size_t bytes); -const char* ncclOpToString(ncclRedOp_t op); -const char* ncclDatatypeToString(ncclDataType_t type); -const char* ncclAlgoToString(int algo); -const char* ncclProtoToString(int proto); - struct netIf { char prefix[64]; int port; @@ -44,9 +41,7 @@ int parseStringList(const char* string, struct netIf* ifList, int maxList); bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact); static long log2i(long n) { - long l = 0; - while (n>>=1) l++; - return l; + return log2Down(n); } inline uint64_t clockNano() { @@ -96,8 +91,11 @@ void ncclMemoryStackConstruct(struct ncclMemoryStack* me); void ncclMemoryStackDestruct(struct ncclMemoryStack* me); void ncclMemoryStackPush(struct ncclMemoryStack* me); void ncclMemoryStackPop(struct ncclMemoryStack* me); +void* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t size, size_t align); template T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n=1); +template +inline Header* ncclMemoryStackAllocInlineArray(struct ncclMemoryStack* me, size_t nElt); //////////////////////////////////////////////////////////////////////////////// /* ncclMemoryPool: A free-list of same-sized allocations. It is an invalid for @@ -140,11 +138,14 @@ T* ncclIntruQueueHead(ncclIntruQueue *me); template void ncclIntruQueueEnqueue(ncclIntruQueue *me, T *x); template +void ncclIntruQueueEnqueueFront(ncclIntruQueue *me, T *x); +template T* ncclIntruQueueDequeue(ncclIntruQueue *me); template T* ncclIntruQueueTryDequeue(ncclIntruQueue *me); template -void ncclIntruQueueFreeAll(ncclIntruQueue *me, ncclMemoryPool *memPool); +void ncclIntruQueueTransfer(ncclIntruQueue *dst, ncclIntruQueue *src); + //////////////////////////////////////////////////////////////////////////////// /* ncclThreadSignal: Couples a pthread mutex and cond together. The "mutex" @@ -233,6 +234,12 @@ inline void* ncclMemoryStack::allocate(struct ncclMemoryStack* me, size_t size, return obj; } +inline void* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t size, size_t align) { + void *obj = ncclMemoryStack::allocate(me, size, align); + memset(obj, 0, size); + return obj; +} + template inline T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n) { void *obj = ncclMemoryStack::allocate(me, n*sizeof(T), alignof(T)); @@ -240,6 +247,17 @@ inline T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n) { return (T*)obj; } +template +inline Header* ncclMemoryStackAllocInlineArray(struct ncclMemoryStack* me, size_t nElt) { + size_t size = sizeof(Header); + size = (size + alignof(Element)-1) & -alignof(Element); + size += nElt*sizeof(Element); + size_t align = alignof(Header) < alignof(Element) ? 
alignof(Element) : alignof(Header); + void *obj = ncclMemoryStack::allocate(me, size, align); + memset(obj, 0, size); + return (Header*)obj; +} + inline void ncclMemoryStackPush(struct ncclMemoryStack* me) { using Frame = ncclMemoryStack::Frame; Frame tmp = me->topFrame; @@ -343,6 +361,13 @@ inline void ncclIntruQueueEnqueue(ncclIntruQueue *me, T *x) { me->tail = x; } +template +inline void ncclIntruQueueEnqueueFront(ncclIntruQueue *me, T *x) { + if (me->head == nullptr) me->tail = x; + x->*next = me->head; + me->head = x; +} + template inline T* ncclIntruQueueDequeue(ncclIntruQueue *me) { T *ans = me->head; @@ -388,45 +413,11 @@ inline T* ncclIntruQueueTryDequeue(ncclIntruQueue *me) { } template -void ncclIntruQueueFreeAll(ncclIntruQueue *me, ncclMemoryPool *pool) { - T *head = me->head; - me->head = nullptr; - me->tail = nullptr; - while (head != nullptr) { - T *tmp = head->*next; - ncclMemoryPoolFree(pool, tmp); - head = tmp; - } -} - -/* cmp function determines the sequence of objects in the queue. If cmp returns value >= 0, it means a > b, - * and we should put a before b; otherwise, b should be put ahead of a. */ -template -inline void ncclIntruQueueSortEnqueue(ncclIntruQueue *me, T *x, int (*cmp)(T *a, T *b)) { - T *cur = me->head; - T *prev = NULL; - - if (cur == NULL) { - x->*next = nullptr; - me->tail = me->head = x; - } else { - while (cur) { - if (cmp(cur, x) > 0) { - prev = cur; - cur = cur->next; - } else { - break; - } - } - - x->*next = cur; - if (prev) { - prev->*next = x; - if (cur == NULL) me->tail = x; - } else { - me->head = x; - } - } +void ncclIntruQueueTransfer(ncclIntruQueue *dst, ncclIntruQueue *src) { + (dst->tail ? dst->tail->next : dst->head) = src->head; + if (src->tail) dst->tail = src->tail; + src->head = nullptr; + src->tail = nullptr; } //////////////////////////////////////////////////////////////////////////////// diff --git a/src/init.cc b/src/init.cc index cecb9bc05..16e02d49c 100644 --- a/src/init.cc +++ b/src/init.cc @@ -44,6 +44,7 @@ NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM); NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0); NCCL_PARAM(CommBlocking, "COMM_BLOCKING", NCCL_CONFIG_UNDEF_INT); +NCCL_PARAM(RuntimeConnect, "RUNTIME_CONNECT", 1); static ncclResult_t commReclaim(ncclComm_t comm); @@ -71,24 +72,22 @@ ncclResult_t initGdrCopy() { return ncclSuccess; } -pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER; -static bool initialized = false; +static ncclResult_t initResult = ncclSuccess; +static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; + +static void initOnceFunc() { + initEnv(); + initGdrCopy(); + // Always initialize bootstrap network + NCCLCHECKGOTO(bootstrapNetInit(), initResult, exit); + + initNvtxRegisteredEnums(); +exit:; +} static ncclResult_t ncclInit() { - if (__atomic_load_n(&initialized, __ATOMIC_ACQUIRE)) return ncclSuccess; - pthread_mutex_lock(&initLock); - if (!initialized) { - initEnv(); - initGdrCopy(); - // Always initialize bootstrap network - NCCLCHECK(bootstrapNetInit()); - NCCLCHECK(ncclNetPluginInit()); - - initNvtxRegisteredEnums(); - __atomic_store_n(&initialized, true, __ATOMIC_RELEASE); - } - pthread_mutex_unlock(&initLock); - return ncclSuccess; + pthread_once(&initOnceControl, initOnceFunc); + return initResult; } NCCL_API(ncclResult_t, ncclGetVersion, int* version); @@ -172,6 +171,7 @@ void ncclCommPushCudaGdrFree(struct ncclComm* comm, void* handle) { } static ncclResult_t commFree(ncclComm_t comm) { + int abort = 0; /* commFree() should not involve any sync among ranks. 
*/ if (comm == NULL) return ncclSuccess; @@ -234,8 +234,10 @@ static ncclResult_t commFree(ncclComm_t comm) { ncclMemoryStackDestruct(&comm->memScoped); ncclMemoryStackDestruct(&comm->memPermanent); + abort = *comm->abortFlag; if (ncclAtomicRefCountDecrement(comm->abortFlagRefCount) == 0) { - NCCLCHECK(ncclCudaHostFree((void *)comm->abortFlag)); + free(comm->abortFlag); + NCCLCHECK(ncclCudaHostFree((void*)comm->abortFlagDev)); free(comm->abortFlagRefCount); } free((void*)comm->config.netName); @@ -245,7 +247,11 @@ static ncclResult_t commFree(ncclComm_t comm) { NCCLCHECK(ncclRegCleanup(comm)); + INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - %s COMPLETE", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId, abort ? "Abort" : "Destroy"); + commPoison(comm); // poison comm before free to avoid comm reuse. + NCCLCHECK(ncclNetFinalize(comm)); + NCCLCHECK(ncclNetPluginUnload(comm)); free(comm); return ncclSuccess; @@ -254,7 +260,9 @@ static ncclResult_t commFree(ncclComm_t comm) { NCCL_PARAM(DisableGraphHelper, "GRAPH_HELPER_DISABLE", 0); // GDRCOPY support: FIFO_ENABLE when enabled locates a workFifo in CUDA memory NCCL_PARAM(GdrCopyFifoEnable, "GDRCOPY_FIFO_ENABLE", 1); -NCCL_PARAM(WorkFifoDepth, "WORK_FIFO_DEPTH", 64<<10); +#define NCCL_WORK_FIFO_BYTES_DEFAULT (1<<20) +NCCL_PARAM(WorkFifoBytes, "WORK_FIFO_BYTES", NCCL_WORK_FIFO_BYTES_DEFAULT); +NCCL_PARAM(WorkArgsBytes, "WORK_ARGS_BYTES", INT64_MAX); enum ncclLaunchMode ncclParamLaunchMode; NCCL_PARAM(DmaBufEnable, "DMABUF_ENABLE", 1); @@ -281,7 +289,7 @@ static ncclResult_t dmaBufSupported(struct ncclComm* comm) { ncclResult_t ncclCommEnsureReady(ncclComm_t comm) { /* comm must be ready, or error will be reported */ ncclResult_t ret = ncclSuccess; - if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED)) { + if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE)) { ncclGroupJobAbort(comm->groupJob); } else { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)); @@ -318,6 +326,7 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in comm->rank = rank; comm->nRanks = ndev; + NCCLCHECK(ncclNetPluginLoad(comm)); NCCLCHECK(ncclNetInit(comm)); INFO(NCCL_INIT, "Using network %s", comm->ncclNet->name); @@ -349,9 +358,6 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in ncclMemoryPoolConstruct(&comm->memPool_ncclKernelPlan); ncclMemoryPoolConstruct(&comm->memPool_ncclProxyOp); - ncclMemoryPoolConstruct(&comm->memPool_ncclPointerList); - ncclMemoryPoolConstruct(&comm->memPool_ncclNvlsHandleList); - ncclMemoryPoolConstruct(&comm->memPool_ncclCollnetHandleList); comm->groupNext = reinterpret_cast(0x1); comm->preconnectNext = reinterpret_cast(0x1); @@ -397,6 +403,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { int nRanks = comm->nRanks; struct ncclDevCommAndChannels tmpCommAndChans; struct ncclDevCommAndChannels *devCommAndChans = NULL; + struct ncclNvmlCCStatus ccStatus; NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), ret, fail); NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail); @@ -406,37 +413,51 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { tmpCommAndChans.comm.nRanks = nRanks; tmpCommAndChans.comm.node = comm->node; tmpCommAndChans.comm.nNodes = comm->nNodes; - tmpCommAndChans.comm.abortFlag = comm->abortFlag; + tmpCommAndChans.comm.abortFlag = comm->abortFlagDev; for (int p=0; p < NCCL_NUM_PROTOCOLS; p++) { tmpCommAndChans.comm.buffSizes[p] = comm->buffSizes[p]; 
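The check in ncclCommEnsureReady above was strengthened from a relaxed to an acquire load of the abort flag. A small, self-contained C11 illustration of the pairing this relies on; the names and the release-side store are assumptions for the example, not NCCL code:

/* An acquire load of a flag pairs with a release store by the thread that
 * raises it, so state written before the abort is visible to the observer. */
#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint32_t exampleAbortFlag;

void exampleRaiseAbort(void) {
  /* Writes made before this store are published together with the flag. */
  atomic_store_explicit(&exampleAbortFlag, 1u, memory_order_release);
}

int exampleAborted(void) {
  /* Sees the flag and everything published before the matching store. */
  return atomic_load_explicit(&exampleAbortFlag, memory_order_acquire) != 0;
}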
} tmpCommAndChans.comm.p2pChunkSize = comm->p2pChunkSize; tmpCommAndChans.comm.channels = &devCommAndChans->channels[0]; - comm->workFifoDepth = ncclParamWorkFifoDepth(); - if (0 != (comm->workFifoDepth & (comm->workFifoDepth-1))) { - WARN("NCCL_WORK_FIFO_DEPTH=%d is being ignored because it is not a power of 2.", comm->workFifoDepth); - comm->workFifoDepth = 64<<10; + comm->workArgsBytes = std::min(ncclParamWorkArgsBytes(), ncclMaxKernelArgsSize(comm->cudaArch)); + + memset(&ccStatus, 0, sizeof(ccStatus)); + if (ncclNvmlGetCCStatus(&ccStatus) == ncclSuccess && ccStatus.CCEnabled) { + comm->workFifoBytes = 0; + if (ccStatus.multiGpuCCEnabled == false && comm->rank == 0) { + WARN("CC On, Multi-GPU CC Off (No inter-GPU communication protection)"); + } + } else { + comm->workFifoBytes = ncclParamWorkFifoBytes(); + if (0 != (comm->workFifoBytes & (comm->workFifoBytes-1))) { + WARN("NCCL_WORK_FIFO_BYTES=%d is being ignored because it is not a power of 2.", comm->workFifoBytes); + comm->workFifoBytes = NCCL_WORK_FIFO_BYTES_DEFAULT; + } + comm->workFifoBytes = std::min(comm->workFifoBytes, 1u<<30); + } + + if (comm->rank == 0) { + INFO(NCCL_INIT, "CC %s, Multi-GPU CC %s, workFifoBytes %d", ccStatus.CCEnabled ? "On" : "Off", ccStatus.multiGpuCCEnabled ? "On" : "Off", comm->workFifoBytes); } - tmpCommAndChans.comm.workFifoDepth = comm->workFifoDepth; if (ncclGdrCopy != NULL && ncclParamGdrCopyFifoEnable() == 1) { - // The workFifoHeap lives in GDR mapped CUDA memory. - NCCLCHECKGOTO(ncclGdrCudaCalloc(&comm->workFifoHeap, &comm->devWorkFifoHeap, comm->workFifoDepth, &comm->workFifoHeapGdrHandle), ret, fail); - ncclCommPushCudaGdrFree(comm, comm->workFifoHeapGdrHandle); + // The workFifoBuf lives in GDR mapped CUDA memory. + NCCLCHECKGOTO(ncclGdrCudaCalloc(&comm->workFifoBuf, &comm->workFifoBufDev, comm->workFifoBytes, &comm->workFifoBufGdrHandle), ret, fail); + ncclCommPushCudaGdrFree(comm, comm->workFifoBufGdrHandle); } else { - // The workFifoHeap lives in cudaHost memory. - comm->workFifoHeapGdrHandle = nullptr; - NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->workFifoHeap, comm->workFifoDepth), ret, fail); - ncclCommPushCudaHostFree(comm, comm->workFifoHeap); - comm->devWorkFifoHeap = comm->workFifoHeap; + // The workFifoBuf lives in cudaHost memory. 
+ comm->workFifoBufGdrHandle = nullptr; + NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->workFifoBuf, comm->workFifoBytes), ret, fail); + ncclCommPushCudaHostFree(comm, comm->workFifoBuf); + comm->workFifoBufDev = comm->workFifoBuf; } - tmpCommAndChans.comm.workFifoHeap = comm->devWorkFifoHeap; - NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->workFifoDone, MAXCHANNELS), ret, fail); - ncclCommPushCudaHostFree(comm, comm->workFifoDone); - comm->workFifoSent = 0; - comm->workFifoAckdMin = 0; + NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->workFifoConsumed, MAXCHANNELS), ret, fail); + ncclCommPushCudaHostFree(comm, comm->workFifoConsumed); + comm->workFifoProduced = 0; + comm->workFifoConsumedLeast = 0; + tmpCommAndChans.comm.workConsumed = comm->workFifoConsumed; if (comm->collNetDenseToUserRank != nullptr) { NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.collNetDenseToUserRank, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); @@ -452,7 +473,6 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { tmpCommAndChans.channels[c].collnetChain = comm->channels[c].collnetChain; tmpCommAndChans.channels[c].collnetDirect = comm->channels[c].collnetDirect; tmpCommAndChans.channels[c].nvls = comm->channels[c].nvls; - tmpCommAndChans.channels[c].workFifoDone = &comm->workFifoDone[c]; if (comm->channels[c].ring.userRanks != nullptr) { NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.channels[c].ring.userRanks, comm->channels[c].ring.userRanks, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); @@ -471,13 +491,10 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { // Pre-process the string so that running "strings" on the lib can quickly reveal the version. #define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." 
STR(CUDA_MINOR) static void showVersion() { - static int shown = 0; - if (shown == 0 && ncclDebugLevel >= NCCL_LOG_VERSION) { - printf("%s\n", VERSION_STRING); - fflush(stdout); - if (ncclDebugFile != stdout) - INFO(NCCL_ALL,"%s", VERSION_STRING); // Also log NCCL version in one of the files - shown = 1; + if (ncclDebugLevel == NCCL_LOG_VERSION || ncclDebugLevel == NCCL_LOG_WARN) { + VERSION("%s", VERSION_STRING); + } else { + INFO(NCCL_ALL,"%s", VERSION_STRING); } } @@ -487,6 +504,7 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u info->nvmlDev = comm->nvmlDev; info->hostHash=getHostHash()+commHash; info->pidHash=getPidHash()+commHash; + info->cuMemSupport = ncclCuMemEnable(); // Get the device MAJOR:MINOR of /dev/shm so we can use that // information to decide whether we can use SHM for inter-process @@ -584,244 +602,6 @@ NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2); NCCL_PARAM(NvbPreconnect, "NVB_PRECONNECT", 1); NCCL_PARAM(AllocP2pNetLLBuffers, "ALLOC_P2P_NET_LL_BUFFERS", 0); -static ncclResult_t collNetInitRailRankMap(ncclComm_t comm) { - int rank = comm->rank; - uint64_t nonHeadMask = (1ull << comm->localRanks) - 1; - - comm->collNetDenseToUserRank = ncclMemoryStackAlloc(&comm->memPermanent, comm->nRanks); - comm->collNetUserToDenseRank = ncclMemoryStackAlloc(&comm->memPermanent, comm->nRanks); - // initialize collNetUserToDenseRank[rank] - comm->collNetUserToDenseRank[rank] = -1; - for (int h = 0; h < comm->collNetHeadsNum; h++) { - nonHeadMask ^= 1ull << comm->rankToLocalRank[comm->collNetHeads[h]]; - if (comm->collNetHeads[h] == rank) { comm->collNetUserToDenseRank[rank] = h; break; } - } - if (comm->collNetUserToDenseRank[rank] == -1) { - comm->collNetUserToDenseRank[rank] = __builtin_popcountll(nonHeadMask & ((1ull << comm->localRank) - 1)); - } - comm->collNetUserToDenseRank[rank] += comm->node * comm->localRanks; - - NCCLCHECK(bootstrapAllGather(comm->bootstrap, comm->collNetUserToDenseRank, sizeof(int))); - for (int r = 0; r < comm->nRanks; r++) { - comm->collNetDenseToUserRank[comm->collNetUserToDenseRank[r]] = r; - } - return ncclSuccess; -} - -static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct ncclTopoGraph* collNetGraph) { - ncclResult_t ret = ncclSuccess; - int rank = comm->rank; - int collNetSetupFail = 0; - int highestTypes[NCCL_MAX_LOCAL_RANKS] = { TRANSPORT_P2P }; - // Find all head ranks - int nHeadsUnique = 0; - int* headsUnique = NULL; - int highestTransportType0, highestTransportType1; - char line[1024]; - bool share; - - struct collnetShareInfo { - int headPosition; - int isMaster; - }; - struct collnetShareInfo* infos = NULL; - - NCCLCHECKGOTO(ncclCalloc(&headsUnique, collNetGraph->nChannels), ret, fail); - { uint64_t mask = 0; - // Head GPU index is always 0 - for (int c = 0; c < collNetGraph->nChannels; c++) { - int head = collNetGraph->intra[c * comm->localRanks + 0]; - assert(comm->rankToNode[head] == comm->node); - uint64_t mask0 = mask; - mask |= 1ull<rankToLocalRank[head]; - if (mask != mask0) headsUnique[nHeadsUnique++] = head; - } - } - - comm->collNetHeads = headsUnique; - comm->collNetHeadsNum = nHeadsUnique; - if (parent && parent->collNetSupport && parent->config.splitShare && parent->nNodes == comm->nNodes) { - NCCLCHECKGOTO(ncclCalloc(&infos, comm->nRanks), ret, fail); - /* check whether child can share collnet resources of parent. 
Since parent builds each collnet communicator - * based on heads with the same head position in each node, as long as the collnet heads of child comm - * can match parent's heads, we can let child communicator share parent's collnet resources. */ - for (int h = 0; h < nHeadsUnique; ++h) { - int prev = INT_MIN; - struct collnetShareInfo* myinfo; - - share = true; - myinfo = infos + comm->rank; - memset(myinfo, 0, sizeof(struct collnetShareInfo)); - /* find the child head position in parent collnet heads. */ - if (headsUnique[h] == comm->rank) { - myinfo->headPosition = -1; - myinfo->isMaster = 1; - for (int th = 0; th < parent->collNetHeadsNum; ++th) - if (parent->topParentRanks[parent->collNetHeads[th]] == comm->topParentRanks[comm->rank]) { - myinfo->headPosition = th; - break; - } - } - - NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, infos, sizeof(struct collnetShareInfo)), ret, fail); - for (int i = 0; i < comm->nRanks; ++i) { - if (infos[i].isMaster) { - if (prev == INT_MIN) - prev = infos[i].headPosition; - - if (infos[i].headPosition == -1 || prev != infos[i].headPosition) { - share = false; - break; - } - } - } - - if (share) { - if (myinfo->isMaster) { - comm->collNetSharedRes = parent->collNetSharedRes; - for (int c = 0; c < comm->nChannels; ++c) - NCCLCHECKGOTO(initCollnetChannel(comm, c, parent, true), ret, fail); - } - - NCCLCHECKGOTO(collNetInitRailRankMap(comm), ret, fail); - } else { - /* TODO: CX-6 and CX-7 both do not support multiple sharp resources per process, if child comm cannot - * share the sharp resource from parent, we cannot use sharp in this case. This restriction might be - * lifted by sharp plugin/IB hardware in the future. */ - collNetSetupFail = 1; - if (comm->rank == 0) { - WARN("Child comms (nRanks %d) fails to share parent comms (nRanks %d) sharp resources", comm->nRanks, parent->nRanks); - } - goto fail; - } - } - share = true; - } else { - /* this allocated buffer will be freed on proxy side */ - NCCLCHECK(ncclCalloc(&comm->collNetSharedRes, 1)); - comm->collNetSharedRes->nChannels = comm->nChannels; - comm->collNetSharedRes->buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE]; - - NCCLCHECKGOTO(collNetInitRailRankMap(comm), ret, fail); - - for (int c = 0; c < comm->nChannels; c++) { - struct ncclChannel* channel = comm->channels + c; - NCCLCHECKGOTO(initCollnetChannel(comm, c, parent, false), ret, fail); - for (int h = 0; h < nHeadsUnique; h++) { - const int head = headsUnique[h]; - ncclConnect connect; - collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetRecv, &connect); - if (!collNetSetupFail) collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetSend, &connect); - } - // Verify CollNet setup across ranks after trying the first channel - if (c == 0) { - NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, fail); - } - } - share = false; - } - - if (share) { - memcpy(comm->collNetSupportMatrix, parent->collNetSupportMatrix, sizeof(comm->collNetSupportMatrix)); - } else { - do { - /* Initialize all entries in collNetSupportMatrix[redop][type]. Since some - ranks don't connect to sharp we enable a (redop,type) if any rank claims - support. 
*/ - const ncclRedOp_t redops[] = {ncclSum, ncclProd, ncclMin, ncclMax}; - uint8_t(*matrix)[4][ncclNumTypes]; - bool isHead = false; - matrix = nullptr; - NCCLCHECKGOTO(ncclCalloc(&matrix, comm->nRanks), ret, matrix_end); - for (int h = 0; h < nHeadsUnique; h++) isHead |= (headsUnique[h] == comm->rank); - if (isHead) { - for (int ty=0; ty < ncclNumTypes; ty++) { - for (int i=0; i < 4; i++) { - int support = 0; - NCCLCHECKGOTO(collNetReduceSupport(comm, (ncclDataType_t)ty, redops[i], &support), ret, matrix_end); - // bit 0 = not supported, bit 1 = supported - matrix[rank][redops[i]][ty] = 1<<(support ? 1 : 0); - } - } - } - NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, matrix, sizeof(*matrix)), ret, matrix_end); - for (int ty=0; ty < ncclNumTypes; ty++) { - for (int i=0; i < 4; i++) { - int op = redops[i]; - uint8_t accum = 0; - for (int r=0; r < comm->nRanks; r++) accum |= matrix[r][op][ty]; - // We support (redop, type) if some rank supports it and no rank doesn't support it - comm->collNetSupportMatrix[op][ty] = (accum == (1<<1)); - } - } - matrix_end: - free(matrix); - if (ret != ncclSuccess) goto fail; - } while (0); - } - - // Verify CollNet setup across ranks after trying all channels - NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, fail); - TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank); - - line[0] = '\0'; - for (int c = 0; c < comm->nChannels; c++) { - struct ncclTree* chain = &comm->channels[c].collnetChain; - snprintf(line + strlen(line), 1023 - strlen(line), " [%d] %d->%d->%d", - c, chain->down[0], rank, chain->up); - } - line[1023] = '\0'; - - INFO(NCCL_INIT, "Collnet Chains %s", line); - // Connect Collnet + chain - for (int c = 0; c < comm->nChannels; c++) { - struct ncclChannel* channel = comm->channels + c; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->collnetChain.up, 1, channel->collnetChain.down, 0), ret, fail); - } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, collNetGraph, 0), ret, fail); - for (int c = 0; c < comm->nChannels; c++) { - struct ncclChannel* channel = comm->channels + c; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, channel->collnetChain.down, 1, &channel->collnetChain.up, 1), ret, fail); - } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, collNetGraph, 1), ret, fail); - INFO(NCCL_INIT, "Connected collnet + chain"); - - // Connect intra-node CollNet + Direct - for (int c = 0; c < comm->nChannels; c++) { - struct ncclChannel* channelRecv = comm->channels + c; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.down, 0), ret, fail); - } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, collNetGraph, 0, &highestTransportType0), ret, fail); - - for (int c = 0; c < comm->nChannels; c++) { - struct ncclChannel* channelSend = comm->channels + c; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.down, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.up, 1), ret, fail); - } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, collNetGraph, 1, &highestTransportType1), ret, fail); - - // Exchange highest intra-node transport type among ranks - // because we need to know whether all ranks can p2p each other to determine whether we can directly read/write registered user buffer - comm->intraHighestTransportType = highestTypes[comm->localRank] = highestTransportType0 > highestTransportType1 ? 
highestTransportType0 : highestTransportType1; - if (share) { - comm->intraHighestTransportType = std::max(comm->intraHighestTransportType, parent->intraHighestTransportType); - } - NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, highestTypes, sizeof(int)), ret, fail); - for (int i = 0; i < comm->localRanks; i++) { - if (highestTypes[i] > comm->intraHighestTransportType) - comm->intraHighestTransportType = highestTypes[i]; - } - - INFO(NCCL_INIT, "rank %d Connected CollNet", rank); - -exit: - free(infos); - return ret; -fail: - ncclTransportCollNetFree(comm); - comm->collNetSupport = 0; - goto exit; -} - // MNNVL: Flag to indicate whether to enable Multi-Node NVLink NCCL_PARAM(MNNVLEnable, "MNNVL_ENABLE", 2); @@ -890,7 +670,16 @@ static int checkMNNVL(struct ncclComm* comm) { } #endif -static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent = NULL) { +#define TIMER_INIT_TOTAL 0 +#define TIMER_INIT_KERNELS 1 +#define TIMER_INIT_BOOTSTRAP 2 +#define TIMER_INIT_ALLGATHER 3 +#define TIMER_INIT_TOPO 4 +#define TIMER_INIT_GRAPHS 5 +#define TIMER_INIT_CONNECT 6 +#define TIMERS_INIT_COUNT 7 + +static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent, uint64_t timers[TIMERS_INIT_COUNT]) { // We use 2 AllGathers // 1. { peerInfo, comm, compCap} // 2. { nChannels, graphInfo, topoRanks } @@ -899,11 +688,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p int nranks = comm->nRanks; int nNodes = 1; cpu_set_t affinitySave; - struct ncclTopoGraph ringGraph; - struct ncclTopoGraph treeGraph; - struct ncclTopoGraph collNetGraph; - struct ncclTopoGraph nvlsGraph; - struct ncclTopoGraph* graphs[] = { &treeGraph, &ringGraph, &collNetGraph, &collNetGraph, &nvlsGraph, &nvlsGraph }; + struct ncclTopoGraph* ringGraph = &comm->graphs[NCCL_ALGO_RING]; + struct ncclTopoGraph* treeGraph = &comm->graphs[NCCL_ALGO_TREE]; + struct ncclTopoGraph* collNetChainGraph = &comm->graphs[NCCL_ALGO_COLLNET_CHAIN]; + struct ncclTopoGraph* collNetDirectGraph = &comm->graphs[NCCL_ALGO_COLLNET_DIRECT]; + struct ncclTopoGraph* nvlsGraph = &comm->graphs[NCCL_ALGO_NVLS]; + struct ncclTopoGraph* graphs[] = { treeGraph, ringGraph, collNetDirectGraph, collNetChainGraph, nvlsGraph, nvlsGraph }; struct graphInfo { int pattern; @@ -919,6 +709,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p struct allGatherInfo { struct graphInfo graphInfo[NCCL_NUM_ALGORITHMS]; struct ncclTopoRanks topoRanks; + int cpuArch; + int cpuVendor; }; int nChannelsOrig; @@ -932,13 +724,16 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p int *topParentLocalRanks = NULL; int tpProxyRank; + timers[TIMER_INIT_ALLGATHER] = clockNano(); // AllGather1 - begin NCCLCHECKGOTO(ncclCalloc(&comm->peerInfo, nranks+1), ret, fail); // Extra rank to represent CollNet root NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo+rank, comm->commHash), ret, fail); NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)), ret, fail); + comm->cuMemSupport = 1; for (int i = 0; i < nranks; i++) { if (comm->peerInfo[i].hostHash != comm->peerInfo[rank].hostHash) nNodes++; + if (!comm->peerInfo[i].cuMemSupport) comm->cuMemSupport = 0; if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) { WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA 
device %lx", rank, i, comm->peerInfo[rank].busId); ret = ncclInvalidUsage; @@ -946,6 +741,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p } } // AllGather1 - end + timers[TIMER_INIT_ALLGATHER] = clockNano() - timers[TIMER_INIT_ALLGATHER]; // MNNVL support if (nNodes > 1 && !checkMNNVL(comm) && ncclParamMNNVLEnable() == 1) { @@ -1008,6 +804,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p comm->intraBarrierGate = 0; } while(0); + timers[TIMER_INIT_TOPO] = clockNano(); // Topo detection / System graph creation NCCLCHECKGOTO(ncclTopoGetSystem(comm, &comm->topo), ret, fail); // Compute paths between GPUs and NICs @@ -1018,8 +815,11 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p NCCLCHECKGOTO(ncclTopoComputePaths(comm->topo, comm), ret, fail); // Init search NCCLCHECKGOTO(ncclTopoSearchInit(comm->topo), ret, fail); + // Decide on comm's CPU architecture. + NCCLCHECKGOTO(ncclTopoComputeCommCPU(comm), ret, fail); // Print final topology NCCLCHECKGOTO(ncclTopoPrint(comm->topo), ret, fail); + timers[TIMER_INIT_TOPO] = clockNano() - timers[TIMER_INIT_TOPO]; // Set Affinity to a CPU local the our GPU, so that all memory we allocate // on the host is local. @@ -1043,51 +843,66 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p // Determine local Nvls support NCCLCHECK(ncclNvlsInit(comm)); + timers[TIMER_INIT_GRAPHS] = clockNano(); // Get rings and trees - memset(&ringGraph, 0, sizeof(struct ncclTopoGraph)); - ringGraph.id = 0; - ringGraph.pattern = NCCL_TOPO_PATTERN_RING; - ringGraph.minChannels = 1; - ringGraph.maxChannels = MAXCHANNELS/2; - NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &ringGraph), ret, fail); - NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &ringGraph), ret, fail); - - memset(&treeGraph, 0, sizeof(struct ncclTopoGraph)); - treeGraph.id = 1; - treeGraph.pattern = NCCL_TOPO_PATTERN_BALANCED_TREE; - treeGraph.minChannels = ringGraph.nChannels; - treeGraph.maxChannels = ringGraph.nChannels; - NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &treeGraph), ret, fail); - NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &treeGraph), ret, fail); - - memset(&collNetGraph, 0, sizeof(struct ncclTopoGraph)); - collNetGraph.id = 2; - collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE; - collNetGraph.collNet = 1; - collNetGraph.minChannels = collNetGraph.maxChannels = ringGraph.nChannels; + memset(ringGraph, 0, sizeof(struct ncclTopoGraph)); + ringGraph->id = 0; + ringGraph->pattern = NCCL_TOPO_PATTERN_RING; + ringGraph->minChannels = 1; + ringGraph->maxChannels = MAXCHANNELS/2; + NCCLCHECKGOTO(ncclTopoCompute(comm->topo, ringGraph), ret, fail); + NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, ringGraph), ret, fail); + + memset(treeGraph, 0, sizeof(struct ncclTopoGraph)); + treeGraph->id = 1; + treeGraph->pattern = NCCL_TOPO_PATTERN_BALANCED_TREE; + treeGraph->minChannels = ringGraph->nChannels; + treeGraph->maxChannels = ringGraph->nChannels; + NCCLCHECKGOTO(ncclTopoCompute(comm->topo, treeGraph), ret, fail); + NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, treeGraph), ret, fail); + + memset(collNetChainGraph, 0, sizeof(struct ncclTopoGraph)); + collNetChainGraph->id = 2; + collNetChainGraph->pattern = NCCL_TOPO_PATTERN_TREE; + collNetChainGraph->collNet = 1; + collNetChainGraph->minChannels = ringGraph->nChannels; + collNetChainGraph->maxChannels = ringGraph->nChannels; + + memset(collNetDirectGraph, 0, sizeof(struct ncclTopoGraph)); + collNetDirectGraph->id = 2; + 
collNetDirectGraph->pattern = NCCL_TOPO_PATTERN_COLLNET_DIRECT; + collNetDirectGraph->collNet = 1; + collNetDirectGraph->minChannels = 1; + collNetDirectGraph->maxChannels = MAXCHANNELS; if (comm->collNetSupport) { - NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &collNetGraph), ret, fail); - NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &collNetGraph), ret, fail); + NCCLCHECKGOTO(ncclTopoCompute(comm->topo, collNetChainGraph), ret, fail); + NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, collNetChainGraph), ret, fail); + NCCLCHECKGOTO(ncclTopoCompute(comm->topo, collNetDirectGraph), ret, fail); + NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, collNetDirectGraph), ret, fail); } - memset(&nvlsGraph, 0, sizeof(struct ncclTopoGraph)); - nvlsGraph.id = 3; - nvlsGraph.pattern = NCCL_TOPO_PATTERN_NVLS; - nvlsGraph.minChannels = 1; - nvlsGraph.maxChannels = MAXCHANNELS; + memset(nvlsGraph, 0, sizeof(struct ncclTopoGraph)); + nvlsGraph->id = 3; + nvlsGraph->pattern = NCCL_TOPO_PATTERN_NVLS; + nvlsGraph->minChannels = 1; + nvlsGraph->maxChannels = MAXCHANNELS; if (comm->nvlsSupport) { - NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &nvlsGraph), ret, fail); - NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &nvlsGraph), ret, fail); + NCCLCHECKGOTO(ncclTopoCompute(comm->topo, nvlsGraph), ret, fail); + NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, nvlsGraph), ret, fail); } + timers[TIMER_INIT_GRAPHS] = clockNano() - timers[TIMER_INIT_GRAPHS]; // Initialize num P2P LL buffers for this communicator comm->allocP2pNetLLBuffers = ncclParamAllocP2pNetLLBuffers() == 1; if (comm->rank == ncclParamGraphDumpFileRank()) { - struct ncclTopoGraph* dumpGraphs[4] = { &ringGraph, &treeGraph, &collNetGraph, &nvlsGraph }; - NCCLCHECKGOTO(ncclTopoDumpGraphs(comm->topo, 4, dumpGraphs), ret, fail); + struct ncclTopoGraph* dumpGraphs[5] = { ringGraph, treeGraph, collNetDirectGraph, collNetChainGraph, nvlsGraph }; + NCCLCHECKGOTO(ncclTopoDumpGraphs(comm->topo, 5, dumpGraphs), ret, fail); } + // Because timers[[TIMER_INIT_ALLGATHER] already contains the timing of the first allgather, + // we temporarily store the start time of the subsequent one in an as-of-yet unused CONNECT timer. + timers[TIMER_INIT_CONNECT] = clockNano(); // AllGather3 - begin NCCLCHECKGOTO(ncclCalloc(&allGather3Data, nranks), ret, fail); @@ -1102,7 +917,10 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p allGather3Data[rank].graphInfo[a].crossNic = graphs[a]->crossNic; } - comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels); + allGather3Data[rank].cpuArch = comm->cpuArch; + allGather3Data[rank].cpuVendor = comm->cpuVendor; + + comm->nChannels = std::min(treeGraph->nChannels, ringGraph->nChannels); NCCLCHECKGOTO(ncclTopoPreset(comm, graphs, &allGather3Data[rank].topoRanks), ret, fail); NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)), ret, fail); @@ -1122,7 +940,28 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p nodesTreePatterns[node] = allGather3Data[r].graphInfo[NCCL_ALGO_TREE].pattern; } comm->rankToNode[r] = node; + + if (comm->cpuArch != allGather3Data[r].cpuArch && + comm->cpuArch != NCCL_TOPO_CPU_ARCH_MIXED) { + comm->cpuArch = NCCL_TOPO_CPU_ARCH_MIXED; + } + if (comm->cpuVendor != allGather3Data[r].cpuVendor && + comm->cpuVendor != NCCL_TOPO_CPU_VENDOR_MIXED) { + comm->cpuVendor = NCCL_TOPO_CPU_VENDOR_MIXED; + } + } + + // Alert the user to the presence of mixed CPUs. In the past this has caused + // locks in some collective routines. 
This may help debug issues in the future. + if (rank==0) { + if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_MIXED) { + INFO(NCCL_GRAPH, "CPUs with mixed architecture were detected."); + } + if (comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_MIXED) { + INFO(NCCL_GRAPH, "CPUs with mixed vendors were detected."); + } } + // Now that we know nNodes, alloc nodeRanks and compute localRanks for each node NCCLCHECKGOTO(ncclCalloc(&comm->nodeRanks, comm->nNodes), ret, fail); NCCLCHECKGOTO(ncclCalloc(&comm->rankToLocalRank, comm->nRanks), ret, fail); @@ -1178,7 +1017,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->collNetSupport = 0; if (graphs[NCCL_ALGO_NVLS]->nChannels == 0) comm->nvlsSupport = comm->nvlsChannels = 0; - comm->nChannels = treeGraph.nChannels = ringGraph.nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels); + comm->nChannels = treeGraph->nChannels = ringGraph->nChannels = std::min(treeGraph->nChannels, ringGraph->nChannels); if (comm->nChannels < nChannelsOrig) { // We started duplicating channels during Preset(), so we need to move the // duplicated channels since we have removed some. @@ -1209,6 +1048,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail); NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs, parent), ret, fail); // AllGather3 - end + timers[TIMER_INIT_ALLGATHER] += clockNano() - timers[TIMER_INIT_CONNECT]; TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels); @@ -1252,133 +1092,146 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p } else { NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail); } + + timers[TIMER_INIT_CONNECT] = clockNano(); + do { // Build p2p schedule + int node = comm->node; + int nNodes = comm->nNodes; + int nRanks = comm->nRanks; + int local = comm->localRank; + int nLocals = comm->maxLocalRanks; + struct ncclNodeRanks* nodeRanks = comm->nodeRanks; + bool flat = false; + for (int node = 0; node < nNodes; node++) { + if (nodeRanks[node].localRanks != nLocals) { + flat = true; + nNodes = 1; node = 0; + nLocals = nRanks; local = rank; + break; + } + } + int nNodesPow2 = pow2Up(nNodes); + int nLocalsPow2 = pow2Up(nLocals); + comm->p2pSchedule = ncclMemoryStackAlloc(&comm->memPermanent, nRanks); + comm->planner.peers = ncclMemoryStackAlloc(&comm->memPermanent, nRanks); + uint32_t nodeRound = 0; + uint32_t nodeDelta = 0; + int round = 0; + // When enumerating peer deltas we use the quadratic formula (x*x+x)/2 mod N. + // Since that formula only produces valid permutations when N is a pow of 2, + // we let N = pow2Up(n) and filter out results greater-eq to n. + // Example sequence for 16 ranks: 0, 1, 3, 6, 10, 15, 5, 12, 4, 13, 7, 2, 14, 11, 9, 8 + do { + if (nodeDelta < nNodes) { // Filter nonsensical node deltas + int sendNode = (node + nodeDelta) % nNodes; + int recvNode = (node - nodeDelta + nNodes) % nNodes; + uint32_t localRound = 0; + uint32_t localDelta = 0; + do { + if (localDelta < nLocals) { // Filter nonsensical node-local deltas + int sendLocal = (local + localDelta) % nLocals; + int recvLocal = (local - localDelta + nLocals) % nLocals; + comm->p2pSchedule[round].sendRank = flat ? sendLocal : nodeRanks[sendNode].localRankToRank[sendLocal]; + comm->p2pSchedule[round].recvRank = flat ? 
recvLocal : nodeRanks[recvNode].localRankToRank[recvLocal]; + round += 1; + } + localRound += 1; + localDelta = (localDelta + localRound) & (nLocalsPow2 - 1); // Quadratic update + } while (localRound != nLocalsPow2); + } + nodeRound += 1; + nodeDelta = (nodeDelta + nodeRound) & (nNodesPow2 - 1); // Quadratic update + } while (nodeRound != nNodesPow2); - // Connect with prev/next for each ring - for (int c=0; cnChannels; c++) { - struct ncclChannel* channel = comm->channels+c; - NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail); - if (comm->nRanks == 1) continue; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, fail); - } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0), ret, fail); - INFO(NCCL_INIT, "Connected all rings"); + if (round != nRanks) { + WARN("P2p schedule creation has bugs."); + ret = ncclInternalError; + goto fail; + } + } while (0); - // Connect Trees - for (int c=0; cnChannels; c++) { - struct ncclChannel* channel = comm->channels+c; - if (comm->nRanks == 1) continue; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, fail); - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, fail); - } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0), ret, fail); - INFO(NCCL_INIT, "Connected all trees"); - - // Setup NVLS - NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail); - // And NVLS trees if needed - if (comm->nvlsSupport && comm->nNodes > 1) { + comm->runtimeConn = comm->cuMemSupport && ncclParamRuntimeConnect(); + if (comm->runtimeConn) { for (int c=0; cnChannels; c++) { - struct ncclChannel* channel = comm->channels+c; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 1, &channel->nvls.treeUp, 0), ret, fail); - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->nvls.treeUp, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 0), ret, fail); + NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail); } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &nvlsGraph, 0), ret, fail); - INFO(NCCL_INIT, "Connected NVLS tree"); - } + // Setup NVLS + NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail); + // Check if we can setup CollNet + if (comm->collNetSupport > 0) ncclCollNetSetup(comm, parent, graphs); + } else { + for (int c=0; cnChannels; c++) { + NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail); + } + NCCLCHECKGOTO(ncclTransportRingConnect(comm), ret, fail); - // Check if we can setup CollNet - if (comm->collNetSupport > 0) collNetTrySetup(comm, parent, &collNetGraph); + // Connect Trees + NCCLCHECKGOTO(ncclTransportTreeConnect(comm), ret, fail); - TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels); + // Setup NVLS + NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail); + NCCLCHECKGOTO(ncclNvlsBufferSetup(comm), ret, fail); - // Compute time models for algorithm and protocol combinations - NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail); + // And NVLS trees if needed + NCCLCHECKGOTO(ncclNvlsTreeConnect(comm), ret, fail); - INFO(NCCL_INIT, "%d coll channels, %d collnet channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer); + // 
Check if we can setup CollNet + if (comm->collNetSupport > 0) { + ncclCollNetSetup(comm, parent, graphs); + NCCLCHECKGOTO(ncclCollNetChainBufferSetup(comm), ret, fail); + NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail); + } - do { // Setup p2p structures in comm->tasks - struct ncclTasks* tasks = &comm->tasks; - int node = comm->node; - int nNodes = comm->nNodes; - struct ncclNodeRanks *nodeRanks = comm->nodeRanks; - int localRank = comm->localRank; - // We want to fuse along node boundaries. Make sure nsteps is a multiple or divides 8. - int steps = ALIGN_POWER(comm->maxLocalRanks, NCCL_MAX_WORK_ELEMENTS_P2P/2); - tasks->p2pOrderSteps = comm->nNodes * steps; - tasks->peers = ncclMemoryStackAlloc(&comm->memPermanent, tasks->p2pOrderSteps); - tasks->p2pSendOrder = ncclMemoryStackAlloc(&comm->memPermanent, tasks->p2pOrderSteps); - tasks->p2pRecvOrder = ncclMemoryStackAlloc(&comm->memPermanent, tasks->p2pOrderSteps); - int i=0; - // schedule delta 0, +1, -1, +2, -2, ... - // also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even. - for (int d=0; d <= nNodes/4; d++) { - int deltas[4] = { d, (nNodes-d)%nNodes, nNodes/2-d, (nNodes-(nNodes/2-d))%nNodes }; - int index = 0; - int delta = deltas[index]; - sched_delta: - int recvNode = (node+nNodes-delta)%nNodes; - int sendNode = (node+delta)%nNodes; - for (int step=0; step < steps; step++) { - int recvIndex = (localRank-step+steps)%steps; - int recvRank = recvIndex < nodeRanks[recvNode].localRanks ? nodeRanks[recvNode].localRankToRank[recvIndex] : -1; - tasks->p2pRecvOrder[i] = recvRank; - int sendIndex = (localRank+step)%steps; - int sendRank = sendIndex < nodeRanks[sendNode].localRanks ? nodeRanks[sendNode].localRankToRank[sendIndex] : -1; - tasks->p2pSendOrder[i] = sendRank; - i++; - } - index++; - if (index == 1 && deltas[1] == deltas[0]) index++; - if (index == 2 && deltas[2] == deltas[0]) index++; - if (index == 3 && deltas[3] == deltas[2]) index++; - if (index == 3 && deltas[3] == deltas[1]) index++; - if (index < 4) { - delta = deltas[index]; - goto sched_delta; + // Connect to local net proxy + tpProxyRank = comm->topParentRanks[comm->rank]; + NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail); + NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail); + + // Then to remote ones when using PXN + if (ncclPxnDisable(comm) == 0) { + int nranks; + NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail); + for (int r=0; rtopParentRanks[pxnPeers[r]]; + NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail); + NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail); } } - assert(i == tasks->p2pOrderSteps); - } while (0); - if (ncclParamNvbPreconnect()) { - // Connect p2p when using NVB path - int nvbNpeers; - NCCLCHECKGOTO(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers), ret, fail); - for (int r=0; rp2pnChannelsPerPeer; c++) { - NCCLCHECKGOTO(ncclChannelCompute(comm, peer, c, ncclFuncSend, &channelId), ret, fail); - if (comm->channels[channelId].peers[peer]->send[1].connected == 0) { - comm->connectSend[peer] |= (1UL<p2pnChannelsPerPeer; c++) { - NCCLCHECKGOTO(ncclChannelCompute(comm, peer, c, ncclFuncRecv, &channelId), ret, fail); - if (comm->channels[channelId].peers[peer]->recv[1].connected == 0) { - comm->connectRecv[peer] |= (1UL<topo, comm->rank, 
&nvbNpeers, &nvbPeers), ret, fail); + for (int r=0; rp2pSchedule[sendRound].sendRank != peer) sendRound++; + while (comm->p2pSchedule[recvRound].recvRank != peer) recvRound++; + uint8_t sendBase = ncclP2pChannelBaseForRound(comm, sendRound); + uint8_t recvBase = ncclP2pChannelBaseForRound(comm, recvRound); + for (int c=0; cp2pnChannelsPerPeer; c++) { + int channelId; + channelId = ncclP2pChannelForPart(comm->p2pnChannels, sendBase, c); + if (comm->channels[channelId].peers[peer]->send[1].connected == 0) { + comm->connectSend[peer] |= (1UL<p2pnChannels, recvBase, c); + if (comm->channels[channelId].peers[peer]->recv[1].connected == 0) { + comm->connectRecv[peer] |= (1UL<topParentRanks[comm->rank]; - NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail); - NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail); + TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels); - // Then to remote ones when using PXN - if (ncclPxnDisable(comm) == 0) { - int nranks; - NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail); - for (int r=0; rtopParentRanks[pxnPeers[r]]; - NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail); - NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail); - } - } + // Compute time models for algorithm and protocol combinations + NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail); + + INFO(NCCL_INIT, "%d coll channels, %d collnet channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer); if (comm->intraRank == 0) { // Load ncclParamLaunchMode const char* str = ncclGetEnv("NCCL_LAUNCH_MODE"); @@ -1399,6 +1252,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p // Call devCommSetup before the last barrier, making sure we don't have a thread running in front and starting to // launch NCCL kernels before all cuda mem allocation is complete. That could cause a deadlock. NCCLCHECKGOTO(devCommSetup(comm), ret, fail); + timers[TIMER_INIT_CONNECT] = clockNano() - timers[TIMER_INIT_CONNECT]; /* Local intra-node barrier */ NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail); @@ -1412,7 +1266,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p /* If split resource is shared, we are not able to unlink the proxy ops pool here since the child comm can * attach the proxy ops pool of parent at any time; otherwise, unlink it here to make sure the pool will be * properly cleaned up. 
*/ - if (comm->sharedRes->owner == comm && !comm->config.splitShare && ret == ncclSuccess) ncclProxyShmUnlink(comm); + if (comm->sharedRes->owner == comm && !comm->config.splitShare && ret == ncclSuccess && !ncclCuMemEnable()) ncclProxyShmUnlink(comm); free(allTopoRanks); free(nodesTreePatterns); free(nodesFirstRank); @@ -1507,20 +1361,25 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { int cudaDev = job->cudaDev; int* parentRanks = NULL; int cudaArch; + uint64_t timers[TIMERS_INIT_COUNT]; + timers[TIMER_INIT_TOTAL] = clockNano(); CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail); CUDACHECKGOTO(cudaDeviceGetAttribute(&archMajor, cudaDevAttrComputeCapabilityMajor, cudaDev), res, fail); CUDACHECKGOTO(cudaDeviceGetAttribute(&archMinor, cudaDevAttrComputeCapabilityMinor, cudaDev), res, fail); cudaArch = 100*archMajor + 10*archMinor; + timers[TIMER_INIT_KERNELS] = clockNano(); NCCLCHECK(ncclInitKernelsForDevice(cudaArch, &maxLocalSizeBytes)); // Set the maximum kernel stack size of all kernels to avoid // a CUDA memory reconfig on load (c.f. NVSHMEM issue) if (maxLocalSizeBytes > 0 && ncclParamSetStackSize() == 1) { - TRACE(NCCL_INIT, "Setting cudaLimitStackSize to %zi", maxLocalSizeBytes); + TRACE(NCCL_INIT, "Setting cudaLimitStackSize to %zu", maxLocalSizeBytes); CUDACHECKIGNORE(cudaDeviceSetLimit(cudaLimitStackSize, maxLocalSizeBytes)); } + timers[TIMER_INIT_KERNELS] = clockNano() - timers[TIMER_INIT_KERNELS]; + timers[TIMER_INIT_BOOTSTRAP] = clockNano(); if (job->parent) { NCCLCHECKGOTO(ncclCalloc(&parentRanks, job->parent->nRanks), res, fail); NCCLCHECKGOTO(commGetSplitInfo(comm, job->parent, job->color, job->key, &job->nranks, &job->myrank, parentRanks), res, fail); @@ -1533,6 +1392,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { NCCLCHECKGOTO(commAlloc(comm, NULL, job->nranks, job->myrank), res, fail); NCCLCHECKGOTO(bootstrapInit((struct ncclBootstrapHandle*)&job->commId, comm), res, fail); } + timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP]; comm->cudaArch = cudaArch; comm->commHash = getHash(job->commId.internal, NCCL_UNIQUE_ID_BYTES); @@ -1545,15 +1405,16 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId)); } - NCCLCHECKGOTO(initTransportsRank(comm, job->parent), res, fail); + NCCLCHECKGOTO(initTransportsRank(comm, job->parent, timers), res, fail); - NCCLCHECKGOTO(ncclTunerPluginLoad(&comm->tuner), res, fail); + NCCLCHECKGOTO(ncclTunerPluginLoad(comm), res, fail); if (comm->tuner) { NCCLCHECK(comm->tuner->init(comm->nRanks, comm->nNodes, ncclDebugLog, &comm->tunerContext)); } // update communicator state comm->initState = ncclSuccess; + timers[TIMER_INIT_TOTAL] = clockNano() - timers[TIMER_INIT_TOTAL]; // Trace this call for replay tool if (job->parent) { @@ -1573,6 +1434,9 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { INFO(NCCL_INIT,"ncclCommInitRank comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init COMPLETE", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId)); } + INFO(NCCL_INIT|NCCL_PROFILE,"Init timings: rank %d nranks %d total %.2f (kernels %.2f, bootstrap %.2f, allgathers %.2f, topo %.2f, graphs %.2f, connections %.2f, rest %.2f)", comm->rank, comm->nRanks, timers[TIMER_INIT_TOTAL]/1e9, + timers[TIMER_INIT_KERNELS]/1e9, 
timers[TIMER_INIT_BOOTSTRAP]/1e9, timers[TIMER_INIT_ALLGATHER]/1e9, timers[TIMER_INIT_TOPO]/1e9, timers[TIMER_INIT_GRAPHS]/1e9, timers[TIMER_INIT_CONNECT]/1e9, + (timers[TIMER_INIT_TOTAL]-timers[TIMER_INIT_KERNELS]-timers[TIMER_INIT_BOOTSTRAP]-timers[TIMER_INIT_ALLGATHER]-timers[TIMER_INIT_TOPO]-timers[TIMER_INIT_GRAPHS]-timers[TIMER_INIT_CONNECT])/1e9); exit: if (job->newcomm) { /* assign it to user pointer. */ @@ -1658,7 +1522,7 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { } if (comm->config.splitShare != 1 && comm->config.splitShare != 0) { - WARN("splitShare %d is not a valid value 0/1, set it to 0\n", comm->config.splitShare); + WARN("splitShare %d is not a valid value 0/1, set it to 0", comm->config.splitShare); comm->config.splitShare = 0; } @@ -1679,6 +1543,7 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { ncclConfig_t *internalConfigPtr; size_t realSize; + internalConfig.magic = 0; internalConfigPtr = &internalConfig; if (config) { memcpy((void*)&realSize, (void*)config, sizeof(size_t)); @@ -1767,8 +1632,10 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni } NCCLCHECKGOTO(ncclInit(), res, fail); - if (myrank == 0) showVersion(); - + if (ncclDebugLevel > NCCL_LOG_WARN || (ncclDebugLevel != NCCL_LOG_NONE && myrank == 0)) { + static pthread_once_t once = PTHREAD_ONCE_INIT; + pthread_once(&once, showVersion); + } // Make sure the CUDA runtime is initialized. CUDACHECKGOTO(cudaFree(NULL), res, fail); @@ -1781,9 +1648,10 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni } NCCLCHECKGOTO(ncclCalloc(&comm, 1), res, fail); + NCCLCHECKGOTO(ncclCalloc(&comm->abortFlag, 1), res, fail); + NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->abortFlagDev, 1), res, fail); + NCCLCHECKGOTO(ncclCalloc(&comm->abortFlagRefCount, 1), res, fail); comm->startMagic = comm->endMagic = NCCL_MAGIC; // Used to detect comm corruption. - NCCLCHECKGOTO(ncclCudaHostCalloc((uint32_t**)&comm->abortFlag, 1), res, fail); - NCCLCHECKGOTO(ncclCalloc((uint32_t**)&comm->abortFlagRefCount, 1), res, fail); *comm->abortFlagRefCount = 1; NCCLCHECKGOTO(parseCommConfig(comm, config), res, fail); /* start with ncclInternalError and will be changed to ncclSuccess if init succeeds. */ @@ -1802,8 +1670,9 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni return ncclGroupErrCheck(res); fail: if (comm) { - if (comm->abortFlag) ncclCudaHostFree((void *)comm->abortFlag); - if (comm->abortFlagRefCount) free(comm->abortFlagRefCount); + free(comm->abortFlag); + if (comm->abortFlagDev) ncclCudaHostFree((void*)comm->abortFlagDev); + free(comm->abortFlagRefCount); free(comm); } if (newcomm) *newcomm = NULL; @@ -1951,18 +1820,21 @@ static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) { if (comm->initState == ncclSuccess) { NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->sharedRes->hostStream), ret, fail); NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream), ret, fail); + NCCLCHECKGOTO(ncclCommPollCallbacks(comm, false), ret, fail); + // And keep polling until all graphs referencing us die. + while (comm->persistentRefs != 0) { + NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/true), ret, fail); + } } - NCCLCHECKGOTO(ncclCommPollCallbacks(comm, false), ret, fail); - // And keep polling until all graphs referencing us die. 
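// Illustrative sketch (not part of the patch): the abort-flag layout used in this file,
// shown standalone. A plain host-heap flag is paired with a pinned, device-visible flag
// and a shared reference count, so several communicators (for example after a split) can
// point at the same trio and only the last owner releases the memory. The abort is set
// with release stores and checked with acquire loads. All names here are hypothetical.
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdlib>
struct AbortFlags {
  uint32_t* host;      // read by host code with __atomic_load_n(..., __ATOMIC_ACQUIRE)
  uint32_t* dev;       // pinned mapped memory, polled by device code
  int* refCount;       // number of communicators sharing this pair
};
static bool abortFlagsCreate(AbortFlags* f) {
  f->host = (uint32_t*)calloc(1, sizeof(uint32_t));
  f->refCount = (int*)calloc(1, sizeof(int));
  if (!f->host || !f->refCount) return false;
  if (cudaHostAlloc((void**)&f->dev, sizeof(uint32_t), cudaHostAllocMapped) != cudaSuccess) return false;
  *f->dev = 0;
  *f->refCount = 1;
  return true;
}
static void abortFlagsTrigger(AbortFlags* f) {   // request an abort: set both sides
  __atomic_store_n(f->host, 1u, __ATOMIC_RELEASE);
  __atomic_store_n(f->dev, 1u, __ATOMIC_RELEASE);
}
static void abortFlagsRelease(AbortFlags* f) {   // drop one owner; the last one frees
  if (__atomic_sub_fetch(f->refCount, 1, __ATOMIC_ACQ_REL) == 0) {
    free(f->host);
    cudaFreeHost(f->dev);
    free(f->refCount);
  }
}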
- while (comm->persistentRefs != 0) { - NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/true), ret, fail); + + if ((ret = ncclProxyStop(comm)) != ncclSuccess) { + WARN("ncclProxyStop: comm %p (rank = %d) destroys proxy resource error %d", comm, comm->rank, ret); } if (savedDevice != commDevice) { CUDACHECKGOTO(cudaSetDevice(savedDevice), ret, fail); } - comm->finalizeCalled = true; exit: return ret; fail: @@ -1980,7 +1852,7 @@ static ncclResult_t commCleanup(ncclComm_t comm) { if (comm->tuner != NULL) { NCCLCHECK(comm->tuner->destroy(comm->tunerContext)); - NCCLCHECK(ncclTunerPluginUnload(&comm->tuner)); + NCCLCHECK(ncclTunerPluginUnload(comm)); } NCCLCHECK(commFree(comm)); @@ -1992,31 +1864,11 @@ static ncclResult_t commCleanup(ncclComm_t comm) { return ncclSuccess; } -static ncclResult_t commFinalize(ncclComm_t comm, bool userCalled) { - ncclResult_t ret = ncclSuccess; - struct ncclCommFinalizeAsyncJob *job = NULL; - - /* launch async thread to finalize comm. */ - NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); - job->comm = comm; - - if (userCalled) { - NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, commDestroySync, NULL, free, comm), ret, fail); - } else { - NCCLCHECKGOTO(commDestroySync(&job->base), ret, fail); - free(job); - } - -exit: - return ncclGroupErrCheck(ret); -fail: - goto exit; -} - NCCL_API(ncclResult_t, ncclCommFinalize, ncclComm_t comm); ncclResult_t ncclCommFinalize(ncclComm_t comm) { NVTX3_FUNC_RANGE_IN(nccl_domain); ncclResult_t ret = ncclSuccess; + struct ncclCommFinalizeAsyncJob *job = NULL; NCCLCHECK(ncclGroupStartInternal()); if (comm == NULL) goto exit; @@ -2030,8 +1882,11 @@ ncclResult_t ncclCommFinalize(ncclComm_t comm) { goto fail; } - /* finalize comm. */ - ret = commFinalize(comm, true); + comm->finalizeCalled = true; + /* launch async thread to finalize comm. */ + NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); + job->comm = comm; + NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, commDestroySync, NULL, free, comm), ret, fail); exit: ncclGroupErrCheck(ret); @@ -2043,21 +1898,14 @@ ncclResult_t ncclCommFinalize(ncclComm_t comm) { goto exit; } -static ncclResult_t commReclaim(ncclComm_t comm) { +static ncclResult_t commReclaim(struct ncclAsyncJob* job_) { + struct ncclCommFinalizeAsyncJob* job = (struct ncclCommFinalizeAsyncJob*) job_; + ncclComm_t comm = job->comm; ncclResult_t ret = ncclSuccess; - ncclResult_t state; - int curRank; /* Debug info */ - - NCCLCHECKGOTO(ncclCommGetAsyncError(comm, &state), ret, fail); - TRACE(NCCL_INIT, "commReclaim: reclaim comm %p rank %d state %d", comm, comm->rank, state); - if (state == ncclSuccess && __atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED) == 0 && comm->finalizeCalled == false) { - /* user does not call ncclCommFinalize and this is a normal comm destroy. ncclCommDestroy - * should be nonblocking until last call of ncclCommDestroy. */ - NCCLCHECKGOTO(commFinalize(comm, false), ret, fail); - } if (comm->intraComm0 != NULL) { int curRankCnt; + int curRank; /* Debug info */ int intraRanks = comm->intraRanks; ncclComm_t intracomm0 = comm->intraComm0; int *finalizeRankCnt = &intracomm0->finalizeRankCnt; @@ -2080,30 +1928,7 @@ static ncclResult_t commReclaim(ncclComm_t comm) { job.comm = curIntraComm; /* every comm aborts, commDestroySync should not be blocked. */ if ((ret = commDestroySync((struct ncclAsyncJob*) &job)) != ncclSuccess) - WARN("commReclaim: comm %p (rank = %d) in abort, error %d", curIntraComm, curRank, ret); - } - } - - /* ncclProxyStop() loop must be put after commDestroySync() loop. 
Namely, you cannot do: - * while(...) { - * commDestroySync(...); - * ncclProxyStop(...); - * } - * Considering one process multi-gpu case, we must guarantee all kernels are complete before - * we free proxy resources; otherwise, we will face invalid memory issues where proxy connection - * and related intermediate memory from one rank are freed but other ranks are still using it. - * This is not a problem for multi-process case, since intermediate memory is opened by CUDA IPC - * or mmap where memory free is guarded by CUDA driver and operating system, so we will not have - * invalid memory access issue. */ - nextIntraComm = intracomm0; - while (nextIntraComm) { - curIntraComm = nextIntraComm; - curRank = curIntraComm->rank; - nextIntraComm = nextIntraComm->intraNext; - - /* free intraprocess proxy resources. */ - if ((ret = ncclProxyStop(curIntraComm)) != ncclSuccess) { - WARN("commReclaim: comm %p (rank = %d) destroys proxy resource error %d", curIntraComm, curRank, ret); + WARN("commReclaim: comm %p (rank = %d) in commDestroySync, error %d", curIntraComm, curRank, ret); } } @@ -2121,10 +1946,7 @@ static ncclResult_t commReclaim(ncclComm_t comm) { } } -exit: return ret; -fail: - goto exit; } NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm); @@ -2135,25 +1957,31 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { } int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev; + struct ncclCommFinalizeAsyncJob *job = NULL; + ncclResult_t res = ncclSuccess; NvtxParamsCommInitRank payload{rank, nranks, cudaDev}; NVTX3_FUNC_WITH_PARAMS(CommDestroy, CommInitRankSchema, payload) - int64_t busId = comm->busId; - TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, busId); + TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId); // Try and prevent a double free of the comm struct (user error) if (comm->rank == -1 || comm->nRanks == -1 || comm->cudaDev == -1 || comm->busId == -1) { WARN("comm %p has already been destroyed", comm); return ncclInvalidArgument; } + comm->destroyFlag = 1; /* init thread must be joined before we destroy the comm. 
*/ NCCLCHECK(ncclCommEnsureReady(comm)); + NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail); + job->comm = comm; + NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, commReclaim, NULL, free, comm), res, fail); - NCCLCHECK(commReclaim(comm)); - INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Destroy COMPLETE", comm, rank, nranks, cudaDev, busId); - - return ncclSuccess; +exit: + return res; +fail: + free(job); + goto exit; } NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm); @@ -2163,29 +1991,36 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { return ncclSuccess; } - volatile uint32_t* childAbortFlag; int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev; + struct ncclCommFinalizeAsyncJob *job = NULL; + ncclResult_t res = ncclSuccess; NvtxParamsCommInitRank payload{rank, nranks, cudaDev}; NVTX3_FUNC_WITH_PARAMS(CommAbort, CommInitRankSchema, payload) - int64_t busId = comm->busId; - TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, busId); + TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId); // Ask anything that might still be running on the device to quit - childAbortFlag = __atomic_load_n(&comm->childAbortFlag, __ATOMIC_ACQUIRE); - if (childAbortFlag != NULL) { - __atomic_store_n(childAbortFlag, 1, __ATOMIC_RELAXED); + if (comm->childAbortFlag != nullptr) { + __atomic_store_n(comm->childAbortFlag, 1, __ATOMIC_RELEASE); + __atomic_store_n(comm->childAbortFlagDev, 1, __ATOMIC_RELEASE); } - __atomic_store_n(comm->abortFlag, 1, __ATOMIC_RELAXED); + __atomic_store_n(comm->abortFlag, 1, __ATOMIC_RELEASE); + __atomic_store_n(comm->abortFlagDev, 1, __ATOMIC_RELEASE); + comm->destroyFlag = 1; /* init thread must be joined before we destroy the comm, * and we should ignore the init error here. */ ncclCommEnsureReady(comm); - (void) commReclaim(comm); - INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Abort COMPLETE", comm, rank, nranks, cudaDev, busId); + NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail); + job->comm = comm; + NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, commReclaim, NULL, free, comm), res, fail); +exit: return ncclSuccess; +fail: + free(job); + goto exit; } NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config); @@ -2208,14 +2043,17 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc childComm->startMagic = childComm->endMagic = NCCL_MAGIC; if (comm->config.splitShare) { childComm->abortFlag = comm->abortFlag; + childComm->abortFlagDev = comm->abortFlagDev; childComm->abortFlagRefCount = comm->abortFlagRefCount; comm->childAbortFlag = NULL; ncclAtomicRefCountIncrement(comm->abortFlagRefCount); } else { - NCCLCHECKGOTO(ncclCudaHostCalloc((uint32_t**)&childComm->abortFlag, 1), res, fail); - NCCLCHECKGOTO(ncclCalloc((uint32_t**)&childComm->abortFlagRefCount, 1), res, fail); + NCCLCHECKGOTO(ncclCalloc(&childComm->abortFlag, 1), res, fail); + NCCLCHECKGOTO(ncclCudaHostCalloc(&childComm->abortFlagDev, 1), res, fail); + NCCLCHECKGOTO(ncclCalloc(&childComm->abortFlagRefCount, 1), res, fail); /* temporarily used to abort everything during child comm init. 
*/ comm->childAbortFlag = childComm->abortFlag; + comm->childAbortFlagDev = childComm->abortFlagDev; *childComm->abortFlagRefCount = 1; } if (config == NULL) { @@ -2244,8 +2082,9 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc fail: if (childComm) { if (comm && !comm->config.splitShare) { - if (childComm->abortFlag) ncclCudaHostFree((void*)childComm->abortFlag); - if (childComm->abortFlagRefCount) free(childComm->abortFlagRefCount); + free(childComm->abortFlag); + if (childComm->abortFlagDev) ncclCudaHostFree(childComm->abortFlagDev); + free(childComm->abortFlagRefCount); } free(childComm); } diff --git a/src/init_nvtx.cc b/src/init_nvtx.cc index 44face681..1cb1277d2 100644 --- a/src/init_nvtx.cc +++ b/src/init_nvtx.cc @@ -2,11 +2,11 @@ #include "nvtx.h" static constexpr const nvtxPayloadEnum_t NvtxEnumRedSchema[] = { - {"Sum", ncclSum}, - {"Product", ncclProd}, - {"Max", ncclMax}, - {"Min", ncclMin}, - {"Avg", ncclAvg} + {"Sum", ncclSum, 0}, + {"Product", ncclProd, 0}, + {"Max", ncclMax, 0}, + {"Min", ncclMin, 0}, + {"Avg", ncclAvg, 0} }; // Must be called before the first call to any reduction operation. @@ -19,7 +19,8 @@ void initNvtxRegisteredEnums() { .entries = NvtxEnumRedSchema, .numEntries = std::extent::value, .sizeOfEnum = sizeof(ncclRedOp_t), - .schemaId = NVTX_PAYLOAD_ENTRY_NCCL_REDOP + .schemaId = NVTX_PAYLOAD_ENTRY_NCCL_REDOP, + .extension = nullptr }; nvtxPayloadEnumRegister(nvtx3::domain::get(), &eAttr); diff --git a/src/misc/argcheck.cc b/src/misc/argcheck.cc index 59023ae79..6ed5db27a 100644 --- a/src/misc/argcheck.cc +++ b/src/misc/argcheck.cc @@ -52,8 +52,6 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) { WARN("%s : invalid type %d", info->opName, info->datatype); return ncclInvalidArgument; } - // Type is OK, compute nbytes. Convert Allgather/Broadcast/P2P calls to chars. 
- NCCLCHECK(ncclInfoSetDerived(info, info->comm->nRanks)); if (info->op < 0 || ncclMaxRedOp < info->op) { WARN("%s : invalid reduction operation %d", info->opName, info->op); diff --git a/src/misc/cudawrap.cc b/src/misc/cudawrap.cc index 6f5badfd8..d44c06355 100644 --- a/src/misc/cudawrap.cc +++ b/src/misc/cudawrap.cc @@ -59,6 +59,10 @@ DECLARE_CUDA_PFN(cuGetErrorString); DECLARE_CUDA_PFN(cuGetErrorName); /* enqueue.cc */ DECLARE_CUDA_PFN(cuMemGetAddressRange); +DECLARE_CUDA_PFN(cuLaunchKernel); +#if CUDA_VERSION >= 11080 +DECLARE_CUDA_PFN(cuLaunchKernelEx); +#endif /* proxy.cc */ DECLARE_CUDA_PFN(cuCtxCreate); DECLARE_CUDA_PFN(cuCtxDestroy); @@ -137,6 +141,10 @@ static ncclResult_t cudaPfnFuncLoader(void) { LOAD_SYM(cuCtxGetCurrent, 1); LOAD_SYM(cuCtxSetCurrent, 1); LOAD_SYM(cuCtxGetDevice, 1); + LOAD_SYM(cuLaunchKernel, 1); +#if CUDA_VERSION >= 11080 + LOAD_SYM(cuLaunchKernelEx, 1); +#endif /* cuMem API support */ LOAD_SYM(cuMemAddressReserve, 1); LOAD_SYM(cuMemAddressFree, 1); diff --git a/src/misc/gdrwrap.cc b/src/misc/gdrwrap.cc index 4729efe2e..3b46759c6 100644 --- a/src/misc/gdrwrap.cc +++ b/src/misc/gdrwrap.cc @@ -130,7 +130,7 @@ ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint6 int ret; GDRLOCKCALL(gdr_internal_pin_buffer(g, addr, size, p2p_token, va_space, handle), ret); if (ret != 0) { - WARN("gdr_pin_buffer(addr %lx, size %zi) failed: %d", addr, size, ret); + WARN("gdr_pin_buffer(addr %lx, size %zu) failed: %d", addr, size, ret); return ncclSystemError; } return ncclSuccess; @@ -172,7 +172,7 @@ ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size) { int ret; GDRLOCKCALL(gdr_internal_map(g, handle, va, size), ret); if (ret != 0) { - WARN("gdr_map(handle %lx, size %zi) failed: %d", handle.h, size, ret); + WARN("gdr_map(handle %lx, size %zu) failed: %d", handle.h, size, ret); return ncclSystemError; } return ncclSuccess; @@ -186,7 +186,7 @@ ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size) { int ret; GDRLOCKCALL(gdr_internal_unmap(g, handle, va, size), ret); if (ret != 0) { - WARN("gdr_unmap(handle %lx, va %p, size %zi) failed: %d", handle.h, va, size, ret); + WARN("gdr_unmap(handle %lx, va %p, size %zu) failed: %d", handle.h, va, size, ret); return ncclSystemError; } return ncclSuccess; @@ -218,7 +218,7 @@ ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const vo int ret; GDRLOCKCALL(gdr_internal_copy_to_mapping(handle, map_d_ptr, h_ptr, size), ret); if (ret != 0) { - WARN("gdr_copy_to_mapping(handle %lx, map_d_ptr %p, h_ptr %p, size %zi) failed: %d", handle.h, map_d_ptr, h_ptr, size, ret); + WARN("gdr_copy_to_mapping(handle %lx, map_d_ptr %p, h_ptr %p, size %zu) failed: %d", handle.h, map_d_ptr, h_ptr, size, ret); return ncclSystemError; } return ncclSuccess; @@ -232,7 +232,7 @@ ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void int ret; GDRLOCKCALL(gdr_internal_copy_from_mapping(handle, h_ptr, map_d_ptr, size), ret); if (ret != 0) { - WARN("gdr_copy_from_mapping(handle %lx, h_ptr %p, map_d_ptr %p, size %zi) failed: %d", handle.h, h_ptr, map_d_ptr, size, ret); + WARN("gdr_copy_from_mapping(handle %lx, h_ptr %p, map_d_ptr %p, size %zu) failed: %d", handle.h, h_ptr, map_d_ptr, size, ret); return ncclSystemError; } return ncclSuccess; diff --git a/src/misc/ipcsocket.cc b/src/misc/ipcsocket.cc index fc7fd4b66..db61b3149 100644 --- a/src/misc/ipcsocket.cc +++ b/src/misc/ipcsocket.cc @@ -132,7 +132,7 @@ ncclResult_t ncclIpcSocketRecvMsg(ncclIpcSocket 
*handle, void *hdr, int hdrLen, WARN("UDS: Receiving data over socket failed : %d", errno); return ncclSystemError; } - if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_RELAXED)) return ncclInternalError; + if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_ACQUIRE)) return ncclInternalError; } if (recvFd != NULL) { @@ -221,7 +221,7 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, WARN("UDS: Sending data over socket %s failed : %s (%d)", temp, strerror(errno), errno); return ncclSystemError; } - if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_RELAXED)) return ncclInternalError; + if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_ACQUIRE)) return ncclInternalError; } return ncclSuccess; diff --git a/src/misc/nvmlwrap.cc b/src/misc/nvmlwrap.cc index 76c989e76..a2b0be0df 100644 --- a/src/misc/nvmlwrap.cc +++ b/src/misc/nvmlwrap.cc @@ -41,11 +41,19 @@ namespace { NCCL_NVML_FN(nvmlDeviceGetFieldValues, nvmlReturn_t, (nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values)) // MNNVL support NCCL_NVML_FN(nvmlDeviceGetGpuFabricInfoV, nvmlReturn_t, (nvmlDevice_t device, nvmlGpuFabricInfoV_t *gpuFabricInfo)) + // CC support + NCCL_NVML_FN(nvmlSystemGetConfComputeState, nvmlReturn_t, (nvmlConfComputeSystemState_t *state)); + NCCL_NVML_FN(nvmlSystemGetConfComputeSettings, nvmlReturn_t, (nvmlSystemConfComputeSettings_t *setting)); std::mutex lock; // NVML has had some thread safety bugs bool initialized = false; thread_local bool threadInitialized = false; ncclResult_t initResult; + + union nvmlCCInfoInternal { + nvmlConfComputeSystemState_t settingV12020; + nvmlSystemConfComputeSettings_t settingV12040; + }; } ncclResult_t ncclNvmlEnsureInitialized() { @@ -87,6 +95,9 @@ ncclResult_t ncclNvmlEnsureInitialized() { {(void**)&pfn_nvmlDeviceGetFieldValues, "nvmlDeviceGetFieldValues"}, // MNNVL support {(void**)&pfn_nvmlDeviceGetGpuFabricInfoV, "nvmlDeviceGetGpuFabricInfoV"}, + // CC support + {(void**)&pfn_nvmlSystemGetConfComputeState, "nvmlSystemGetConfComputeState"}, + {(void**)&pfn_nvmlSystemGetConfComputeSettings, "nvmlSystemGetConfComputeSettings"} }; for(Symbol sym: symbols) { *sym.ppfn = dlsym(libhandle, sym.name); @@ -282,3 +293,33 @@ ncclResult_t ncclNvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricI NVMLTRY(nvmlDeviceGetGpuFabricInfoV, device, gpuFabricInfo); return ncclSuccess; } + +ncclResult_t ncclNvmlGetCCStatus(struct ncclNvmlCCStatus *status) { + NCCLCHECK(ncclNvmlEnsureInitialized()); + std::lock_guard locked(lock); + nvmlCCInfoInternal ccInfo; + if (pfn_nvmlSystemGetConfComputeSettings != NULL) { + ccInfo.settingV12040.version = nvmlSystemConfComputeSettings_v1; + NVMLTRY(nvmlSystemGetConfComputeSettings, &ccInfo.settingV12040); + if (ccInfo.settingV12040.ccFeature == NVML_CC_SYSTEM_FEATURE_ENABLED) + status->CCEnabled = true; + else + status->CCEnabled = false; + + if (ccInfo.settingV12040.multiGpuMode == NVML_CC_SYSTEM_MULTIGPU_PROTECTED_PCIE) + status->multiGpuCCEnabled = true; + else + status->multiGpuCCEnabled = false; + } else if (pfn_nvmlSystemGetConfComputeState != NULL) { + NVMLTRY(nvmlSystemGetConfComputeState, &ccInfo.settingV12020); + if (ccInfo.settingV12020.ccFeature == NVML_CC_SYSTEM_FEATURE_ENABLED) + status->CCEnabled = true; + else + status->CCEnabled = false; + status->multiGpuCCEnabled = false; + } else { + status->CCEnabled = false; + status->multiGpuCCEnabled = false; + } + return ncclSuccess; +} diff --git a/src/misc/param.cc 
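A caller-side sketch of the new Confidential Computing query added above (only ncclNvmlGetCCStatus() and the two fields it fills are taken from the code; the surrounding usage is hypothetical):

    struct ncclNvmlCCStatus ccStatus = {};
    if (ncclNvmlGetCCStatus(&ccStatus) == ncclSuccess) {
      // CCEnabled: system-wide Confidential Computing is enabled.
      // multiGpuCCEnabled: protected multi-GPU PCIe mode is also enabled.
      if (ccStatus.CCEnabled)        { /* take the CC-aware path */ }
      if (ccStatus.multiGpuCCEnabled) { /* multi-GPU protected PCIe */ }
    }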
b/src/misc/param.cc index e0b6ab821..2248be980 100644 --- a/src/misc/param.cc +++ b/src/misc/param.cc @@ -84,4 +84,4 @@ const char *ncclGetEnv(const char *name) { static pthread_once_t once = PTHREAD_ONCE_INIT; pthread_once(&once, initEnv); return getenv(name); -} \ No newline at end of file +} diff --git a/src/misc/shmutils.cc b/src/misc/shmutils.cc index 04f7c10be..a48164337 100644 --- a/src/misc/shmutils.cc +++ b/src/misc/shmutils.cc @@ -63,13 +63,28 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de * goes down to 0, unlink should be called in order to delete shared memory file. */ if (shmPath[0] == '\0') { sprintf(shmPath, "/dev/shm/nccl-XXXXXX"); + retry_mkstemp: fd = mkstemp(shmPath); + if (fd < 0) { + if (errno == EINTR) { + INFO(NCCL_ALL, "mkstemp: Failed to create %s, error: %s (%d) - retrying", shmPath, strerror(errno), errno); + goto retry_mkstemp; + } + WARN("Error: failed to create shared memory file %p, error %s (%d)", shmPath, strerror(errno), errno); + ret = ncclSystemError; + goto fail; + } } else { SYSCHECKGOTO(fd = open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), ret, fail); } + retry_fallocate: if (fallocate(fd, 0, 0, realShmSize) != 0) { - WARN("Error: failed to extend %s to %ld bytes", shmPath, realShmSize); + if (errno == EINTR) { + INFO(NCCL_ALL, "fallocate: Failed to extend %s to %ld bytes, error: %s (%d) - retrying", shmPath, realShmSize, strerror(errno), errno); + goto retry_fallocate; + } + WARN("Error: failed to extend %s to %ld bytes, error: %s (%d)", shmPath, realShmSize, strerror(errno), errno); ret = ncclSystemError; goto fail; } @@ -80,7 +95,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de hptr = (char*)mmap(NULL, realShmSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if (hptr == MAP_FAILED) { - WARN("Could not map %s size %zi, error: %s", shmPath, realShmSize, strerror(errno)); + WARN("Error: Could not map %s size %zu, error: %s (%d)", shmPath, realShmSize, strerror(errno), errno); ret = ncclSystemError; hptr = NULL; goto fail; @@ -93,7 +108,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de if (remref == 0) { /* the last peer has completed attachment, it should unlink the shm mem file. */ if (unlink(shmPath) != 0) { - WARN("unlink shared memory %s failed, error: %s", shmPath, strerror(errno)); + INFO(NCCL_ALLOC, "unlink shared memory %s failed, error: %s (%d)", shmPath, strerror(errno), errno); } } } @@ -110,7 +125,8 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de *handle = (ncclShmHandle_t)tmphandle; return ret; fail: - WARN("Error while %s shared memory segment %s (size %ld)", create ? "creating" : "attaching to", shmPath, shmSize); + WARN("Error while %s shared memory segment %s (size %ld), error: %s (%d)", create ? 
"creating" : "attaching to", + shmPath, shmSize, strerror(errno), errno); if (tmphandle) { shmHandleInit(fd, shmPath, shmSize, realShmSize, hptr, dptr, create, tmphandle); ncclShmClose((ncclShmHandle_t)tmphandle); @@ -129,7 +145,7 @@ ncclResult_t ncclShmClose(ncclShmHandle_t handle) { close(tmphandle->fd); if (tmphandle->shmPath != NULL && tmphandle->refcount != NULL && *tmphandle->refcount > 0) { if (unlink(tmphandle->shmPath) != 0) { - WARN("unlink shared memory %s failed, error: %s", tmphandle->shmPath, strerror(errno)); + WARN("unlink shared memory %s failed, error: %s (%d)", tmphandle->shmPath, strerror(errno), errno); ret = ncclSystemError; } } @@ -139,7 +155,7 @@ ncclResult_t ncclShmClose(ncclShmHandle_t handle) { if (tmphandle->shmPtr) { if (tmphandle->devShmPtr) CUDACHECK(cudaHostUnregister(tmphandle->shmPtr)); if (munmap(tmphandle->shmPtr, tmphandle->realShmSize) != 0) { - WARN("munmap of shared memory %p size %ld failed, error: %s", tmphandle->shmPtr, tmphandle->realShmSize, strerror(errno)); + WARN("munmap of shared memory %p size %ld failed, error: %s (%d)", tmphandle->shmPtr, tmphandle->realShmSize, strerror(errno), errno); ret = ncclSystemError; } } @@ -152,9 +168,9 @@ ncclResult_t ncclShmUnlink(ncclShmHandle_t handle) { ncclResult_t ret = ncclSuccess; struct shmHandleInternal* tmphandle = (struct shmHandleInternal*)handle; if (tmphandle) { - if (tmphandle->shmPath != NULL) { + if (tmphandle->shmPath != NULL && tmphandle->refcount != NULL && *tmphandle->refcount > 0) { if (unlink(tmphandle->shmPath) != 0) { - WARN("unlink shared memory %s failed, error: %s", tmphandle->shmPath, strerror(errno)); + WARN("unlink shared memory %s failed, error: %s (%d)", tmphandle->shmPath, strerror(errno), errno); ret = ncclSystemError; } free(tmphandle->shmPath); @@ -184,7 +200,7 @@ ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff uint64_t t0 = clockNano(); while(__atomic_load_n(shmem->cnt[curRound], __ATOMIC_ACQUIRE) != comm->localRanks + 1) { if (clockNano() - t0 >= 5 * 1000) sched_yield(); - if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED) == 1) { + if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE) == 1) { ret = ncclInternalError; goto exit; } diff --git a/src/misc/socket.cc b/src/misc/socket.cc index 6e9fb0790..9ade0e41d 100644 --- a/src/misc/socket.cc +++ b/src/misc/socket.cc @@ -34,7 +34,7 @@ static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr } } (*offset) += bytes; - if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_RELAXED)) { + if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE)) { INFO(NCCL_NET, "socketProgressOpt: abort called"); return ncclInternalError; } @@ -620,12 +620,12 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock) { do { NCCLCHECK(socketProgressState(sock)); } while (sock->asyncFlag == 0 && - (sock->abortFlag == NULL || __atomic_load_n(sock->abortFlag, __ATOMIC_RELAXED) == 0) && + (sock->abortFlag == NULL || __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE) == 0) && (sock->state == ncclSocketStateConnecting || sock->state == ncclSocketStateConnectPolling || sock->state == ncclSocketStateConnected)); - if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_RELAXED)) return ncclInternalError; + if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE)) return ncclInternalError; switch (sock->state) { case ncclSocketStateConnecting: @@ -667,11 +667,11 @@ ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* 
listen do { NCCLCHECKGOTO(socketProgressState(sock), ret, exit); } while (sock->asyncFlag == 0 && - (sock->abortFlag == NULL || __atomic_load_n(sock->abortFlag, __ATOMIC_RELAXED) == 0) && + (sock->abortFlag == NULL || __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE) == 0) && (sock->state == ncclSocketStateAccepting || sock->state == ncclSocketStateAccepted)); - if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_RELAXED)) return ncclInternalError; + if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_ACQUIRE)) return ncclInternalError; switch (sock->state) { case ncclSocketStateAccepting: diff --git a/src/misc/tuner.cc b/src/misc/tuner.cc index ae6ade32f..608062bcc 100644 --- a/src/misc/tuner.cc +++ b/src/misc/tuner.cc @@ -9,117 +9,150 @@ #include #include +#include "checks.h" #include "debug.h" -#include "nccl_tuner.h" +#include "tuner.h" pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER; static int tunerPluginRefCount; static void* tunerPluginLib = nullptr; -ncclTuner_t* tunerSymbol = nullptr; +static ncclTuner_v3_t* tunerSymbol = nullptr; +static ncclTuner_v2_t* ncclTuner_v2 = nullptr; +static ncclTuner_v3_t ncclTuner_v2_as_v3; + +static int hasNvlsSupport(float** collCostTable) { + // Requirements for support of different algorithms: + // + // - NVLS intra-node: nvlsSupport + // - NVLS intra+inter-node: collNetSupport + // - NVLSTree intra-node: always disabled + // - NVLSTree inter-node: nvlsSupport + // - Collnet* inter-node: collNetSupport + // + // nvlsSupport = 1 if either NVLS or NVLS_TREE entries in the cost table are not -1 + float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; + return (table[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE || table[NCCL_ALGO_NVLS_TREE][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) ? 1 : 0; +} + +static int hasCollNetSupport(float** collCostTable) { + float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; + return (table[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] == NCCL_ALGO_PROTO_IGNORE) ? 
0 : 1; +} + +static ncclResult_t ncclTuner_v2_as_v3_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo __attribute__((unused)), int numProto __attribute__((unused)), int* nChannels) { + int algorithm = NCCL_ALGO_UNDEF; + int protocol = NCCL_PROTO_UNDEF; + int nvlsSupport = hasNvlsSupport(collCostTable); + int collNetSupport = hasCollNetSupport(collCostTable); + NCCLCHECK(ncclTuner_v2->getCollInfo(context, collType, nBytes, collNetSupport, nvlsSupport, numPipeOps, &algorithm, &protocol, nChannels)); + // set time to 0 below to make sure this algorithm/protocol is selected later on + if (algorithm >= 0 && algorithm < NCCL_NUM_ALGORITHMS && protocol >= 0 && protocol < NCCL_NUM_PROTOCOLS) { + float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; + if (table[algorithm][protocol] != NCCL_ALGO_PROTO_IGNORE) table[algorithm][protocol] = 0.0; + } + return ncclSuccess; +} -static void* tryOpenDynamicLib(const char* name) { +static ncclResult_t ncclTuner_v2_as_v3_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) { + NCCLCHECK(ncclTuner_v2->init(nRanks, nNodes, logFunction, context)); + ncclTuner_v2_as_v3.name = ncclTuner_v2->name; + ncclTuner_v2_as_v3.getCollInfo = ncclTuner_v2_as_v3_getCollInfo; + ncclTuner_v2_as_v3.destroy = ncclTuner_v2->destroy; + return ncclSuccess; +} + +#define MAX_STR_LEN 255 + +static void* tryOpenLib(const char* name, int* err, char* errStr) { + *err = 0; if (nullptr == name || strlen(name) == 0) { return nullptr; } + + if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) { + name = nullptr; + } + void *handle = dlopen(name, RTLD_LAZY | RTLD_LOCAL); if (nullptr == handle) { - if (ENOENT == errno) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: No plugin found (%s)", name); + strncpy(errStr, dlerror(), MAX_STR_LEN); + errStr[MAX_STR_LEN] = '\0'; + if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) { + *err = ENOENT; } } return handle; } -static void summarizeOpenTunerPluginLibErrors(char* pluginNames) { - const char *separator = " "; - int len = strlen(pluginNames); - // remove tail separator - pluginNames[len - 1] = '\0'; - - // remove last plugin name - while (len > 0 && pluginNames[--len] != *separator); - if (len > 0) { - pluginNames[len] = '\0'; - } - - // distinguish between one load attempt and multiple attempts - if (strstr(pluginNames, separator)) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Most recent plugin load returned %d : %s. 
All attempts to load '%s' also failed.", errno, dlerror(), pluginNames); - } else { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin load returned %d : %s : when loading %s", errno, dlerror(), pluginNames); +static char* tryOpenLibCheck(int openErr, char* openErrStr, char* nameList, int *nameListLen, char* name) { + if (openErr == ENOENT) { + snprintf(nameList, *nameListLen, " %s", name); + nameList += strlen(name) + 1; + *nameListLen -= strlen(name) + 1; + return nameList; } + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: %s", openErrStr); + return nameList; } -static void* openTunerPluginLib(void) { +static void* openTunerPluginLib(char* couldNotFindNames, int len) { + int openErr; void *pluginLib; - -#define MAX_PLUGIN_LOAD 4 - - int len; - char tunerPluginLibNameTried[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 }; - char *ptr = tunerPluginLibNameTried; char tunerPluginLibName[PATH_MAX]; + char openErrStr[MAX_STR_LEN + 1] = { 0 }; const char *envTunerPluginName = getenv("NCCL_TUNER_PLUGIN"); if (envTunerPluginName && strlen(envTunerPluginName)) { INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: NCCL_TUNER_PLUGIN set to %s", envTunerPluginName); snprintf(tunerPluginLibName, PATH_MAX, "%s", envTunerPluginName); - pluginLib = tryOpenDynamicLib(tunerPluginLibName); + pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); if (pluginLib) { INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); return pluginLib; } - len = PATH_MAX - strlen(ptr); - snprintf(ptr + strlen(ptr), len + 1, "%s ", tunerPluginLibName); + couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); snprintf(tunerPluginLibName, PATH_MAX, "libnccl-tuner-%s.so", envTunerPluginName); - pluginLib = tryOpenDynamicLib(tunerPluginLibName); + pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); if (pluginLib) { INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); return pluginLib; } - len = PATH_MAX - strlen(ptr); - snprintf(ptr + strlen(ptr), len + 1, "%s ", tunerPluginLibName); + couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); } else { snprintf(tunerPluginLibName, PATH_MAX, "libnccl-tuner.so"); - pluginLib = tryOpenDynamicLib(tunerPluginLibName); + pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); if (pluginLib) { return pluginLib; } - len = PATH_MAX - strlen(ptr); - snprintf(ptr + strlen(ptr), len + 1, "%s ", tunerPluginLibName); + couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); } const char *envNetPluginName = getenv("NCCL_NET_PLUGIN"); if (envNetPluginName && strlen(envNetPluginName)) { // Users are allowed to pack tuner into the net plugin snprintf(tunerPluginLibName, PATH_MAX, "%s", envNetPluginName); - pluginLib = tryOpenDynamicLib(tunerPluginLibName); + pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); if (pluginLib) { INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); return pluginLib; } - len = PATH_MAX - strlen(ptr); - snprintf(ptr + strlen(ptr), len + 1, "%s ", tunerPluginLibName); + couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); snprintf(tunerPluginLibName, PATH_MAX, "libnccl-net-%s.so", envNetPluginName); - pluginLib = tryOpenDynamicLib(tunerPluginLibName); + pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); if (pluginLib) { 
INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); return pluginLib; } - len = PATH_MAX - strlen(ptr); - snprintf(ptr + strlen(ptr), len + 1, "%s ", tunerPluginLibName); + couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); } else { snprintf(tunerPluginLibName, PATH_MAX, "libnccl-net.so"); - pluginLib = tryOpenDynamicLib(tunerPluginLibName); + pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); if (pluginLib) { return pluginLib; } - len = PATH_MAX - strlen(ptr); - snprintf(ptr + strlen(ptr), len + 1, "%s ", tunerPluginLibName); + couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); } - summarizeOpenTunerPluginLibErrors(ptr); - tunerPluginLibName[0] = '\0'; return nullptr; } @@ -130,10 +163,14 @@ enum { tunerPluginLoadSuccess = 1, }; -ncclResult_t ncclTunerPluginLoad(ncclTuner_t** tuner) { +#define MAX_PLUGIN_LOAD 4 + +static int status = tunerPluginLoadReady; + +ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) { // Initialize to nullptr by default if plugin tuner cannot be loaded. - *tuner = nullptr; - static int status = tunerPluginLoadReady; + char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 }; + comm->tuner = nullptr; if (tunerPluginLoadFailed == status) { return ncclSuccess; } @@ -144,28 +181,41 @@ ncclResult_t ncclTunerPluginLoad(ncclTuner_t** tuner) { } if (tunerPluginLoadSuccess == status) { - *tuner = tunerSymbol; + comm->tuner = tunerSymbol; ++tunerPluginRefCount; goto exit; } - tunerPluginLib = openTunerPluginLib(); + tunerPluginLib = openTunerPluginLib(couldNotFindNames, MAX_PLUGIN_LOAD * PATH_MAX); if (nullptr == tunerPluginLib) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using internal tuner plugin."); + if (strlen(couldNotFindNames)) { + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Could not find:%s. 
Using internal tuner plugin.", couldNotFindNames); + } else { + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using internal tuner plugin."); + } goto fail; } - tunerSymbol = (ncclTuner_t*)dlsym(tunerPluginLib, NCCL_TUNER_PLUGIN_SYMBOL); + tunerSymbol = (ncclTuner_v3_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v3"); if (tunerSymbol == nullptr) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find " NCCL_TUNER_PLUGIN_SYMBOL ", using internal tuner instead."); - dlclose(tunerPluginLib); - goto fail; + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol."); + ncclTuner_v2 = (ncclTuner_v2_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v2"); + if (ncclTuner_v2 == nullptr) { + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead."); + dlclose(tunerPluginLib); + goto fail; + } else { + ncclTuner_v2_as_v3.init = ncclTuner_v2_as_v3_init; + ncclTuner_v2_as_v3.name = ncclTuner_v2->name; + tunerSymbol = &ncclTuner_v2_as_v3; + } } INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", tunerSymbol->name); - *tuner = tunerSymbol; + comm->tuner = tunerSymbol; ++tunerPluginRefCount; status = tunerPluginLoadSuccess; + comm->tunerPluginLoaded = 1; exit: pthread_mutex_unlock(&tunerPluginLock); @@ -176,15 +226,16 @@ ncclResult_t ncclTunerPluginLoad(ncclTuner_t** tuner) { goto exit; } -ncclResult_t ncclTunerPluginUnload(ncclTuner_t** tuner) { - if (*tuner == nullptr) return ncclSuccess; +ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm) { pthread_mutex_lock(&tunerPluginLock); - if (0 == (--tunerPluginRefCount)) { + if (comm->tunerPluginLoaded && 0 == (--tunerPluginRefCount)) { INFO(NCCL_TUNING, "TUNER/Plugin: Closing tuner: '%s'", tunerSymbol->name); dlclose(tunerPluginLib); tunerPluginLib = nullptr; tunerSymbol = nullptr; - *tuner = nullptr; + comm->tuner = nullptr; + status = tunerPluginLoadReady; + comm->tunerPluginLoaded = 0; } pthread_mutex_unlock(&tunerPluginLock); return ncclSuccess; diff --git a/src/misc/utils.cc b/src/misc/utils.cc index 74d5b6d24..12504bc99 100644 --- a/src/misc/utils.cc +++ b/src/misc/utils.cc @@ -93,7 +93,8 @@ uint64_t getHostHash(void) { if ((hostId = ncclGetEnv("NCCL_HOSTID")) != NULL) { INFO(NCCL_ENV, "NCCL_HOSTID set by environment to %s", hostId); - strncpy(hostHash, hostId, sizeof(hostHash)); + strncpy(hostHash, hostId, sizeof(hostHash)-1); + hostHash[sizeof(hostHash)-1] = '\0'; } else { FILE *file = fopen(HOSTID_FILE, "r"); if (file != NULL) { @@ -291,79 +292,3 @@ void ncclMemoryStackDestruct(struct ncclMemoryStack* me) { h = h1; } } - -const char* ncclOpToString(ncclRedOp_t op) { - switch (op) { - case ncclSum: - return "ncclSum"; - case ncclProd: - return "ncclProd"; - case ncclMax: - return "ncclMax"; - case ncclMin: - return "ncclMin"; - case ncclAvg: - return "ncclAvg"; - default: - return "Unknown"; - } -} - -const char* ncclDatatypeToString(ncclDataType_t type) { - switch (type) { - case ncclInt8: // ncclChar - return "ncclInt8"; - case ncclInt32: // ncclInt - return "ncclInt32"; - case ncclUint32: - return "ncclUint32"; - case ncclInt64: - return "ncclInt64"; - case ncclUint64: - return "ncclUint64"; - case ncclFloat16: // ncclHalf - return "ncclFloat16"; - case ncclFloat32: // ncclFloat - return "ncclFloat32"; - case ncclFloat64: // ncclDouble - return "ncclFloat64"; -#if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: - return "ncclBfloat16"; -#endif - default: - return "Unknown"; - } -} - -const char* ncclAlgoToString(int algo) { - switch (algo) { - 
case NCCL_ALGO_TREE: - return "TREE"; - case NCCL_ALGO_RING: - return "RING"; - case NCCL_ALGO_COLLNET_DIRECT: - return "COLLNET_DIRECT"; - case NCCL_ALGO_COLLNET_CHAIN: - return "COLLNET_CHAIN"; - case NCCL_ALGO_NVLS: - return "NVLS"; - case NCCL_ALGO_NVLS_TREE: - return "NVLS_TREE"; - default: - return "Unknown"; - } -} - -const char* ncclProtoToString(int proto) { - switch (proto) { - case NCCL_PROTO_LL: - return "LL"; - case NCCL_PROTO_LL128: - return "LL128"; - case NCCL_PROTO_SIMPLE: - return "SIMPLE"; - default: - return "Unknown"; - } -} diff --git a/src/nccl.h.in b/src/nccl.h.in index 3cf619dcf..9efdf9fc1 100644 --- a/src/nccl.h.in +++ b/src/nccl.h.in @@ -47,6 +47,7 @@ typedef enum { ncclSuccess = 0, #define NCCL_CONFIG_UNDEF_INT INT_MIN #define NCCL_CONFIG_UNDEF_PTR NULL #define NCCL_SPLIT_NOCOLOR -1 +#define NCCL_UNDEF_FLOAT -1.0f /* Communicator configuration. Users can assign value to attributes to specify the * behavior of a communicator. */ @@ -78,6 +79,23 @@ typedef struct ncclConfig_v21700 { NCCL_CONFIG_UNDEF_INT /* splitShare */ \ } +/* This struct will be used by ncclGroupSimulateEnd() API to query information about simulation. */ +typedef struct ncclSimInfo_v22200 { + size_t size; + unsigned int magic; + unsigned int version; + float estimatedTime; +} ncclSimInfo_t; + +/* NCCL_SIM_INFO_INITIALIZER must be assigned to initialize simInfo structure when it is created. + * Not initialized simInfo will result in NCCL error. */ +#define NCCL_SIM_INFO_INITIALIZER { \ + sizeof(ncclSimInfo_t), /* size */ \ + 0x74685283, /* magic */ \ + NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \ + NCCL_UNDEF_FLOAT /* estimated time */ \ +} + /* NCCL malloc and free function for all types of NCCL optimizations * (e.g. user buffer registration). The actual allocated size might * be larger than requested due to granularity requirement. */ @@ -432,6 +450,14 @@ ncclResult_t pncclGroupStart(); ncclResult_t ncclGroupEnd(); ncclResult_t pncclGroupEnd(); +/* + * Group Simulate End + * + * Simulate a ncclGroupEnd() call and return NCCL's simulation info in a struct. 
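+ *
+ * Minimal usage sketch (comm, sendbuff, recvbuff, count and stream are assumed
+ * to already exist; ncclGroupSimulateEnd() is called in place of ncclGroupEnd()):
+ *
+ *   ncclSimInfo_t simInfo = NCCL_SIM_INFO_INITIALIZER;
+ *   ncclGroupStart();
+ *   ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, ncclSum, comm, stream);
+ *   ncclGroupSimulateEnd(&simInfo);  // simInfo.estimatedTime now holds NCCL's estimate
+ *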
+ */ +ncclResult_t ncclGroupSimulateEnd(ncclSimInfo_t* simInfo); +ncclResult_t pncclGroupSimulateEnd(ncclSimInfo_t* simInfo); + #ifdef __cplusplus } // end extern "C" #endif diff --git a/src/net.cc b/src/net.cc index e978a1854..0f5d336ea 100644 --- a/src/net.cc +++ b/src/net.cc @@ -48,7 +48,7 @@ static ncclResult_t ncclNet_v7_as_v8_getProperties(int dev, ncclNetProperties_v8 } static ncclResult_t ncclNet_v7_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1<<31) return ncclInternalError; + if (size >= 1UL<<31) return ncclInternalError; return ncclNet_v7->regMr(comm, data, (int) size, type, mhandle); } @@ -95,7 +95,7 @@ static ncclResult_t ncclNet_v6_as_v8_getProperties(int dev, ncclNetProperties_v8 } static ncclResult_t ncclNet_v6_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1<<31) return ncclInternalError; + if (size >= 1UL<<31) return ncclInternalError; return ncclNet_v6->regMr(comm, data, (int) size, type, mhandle); } @@ -150,7 +150,7 @@ static ncclResult_t ncclNet_v5_as_v8_getProperties(int dev, ncclNetProperties_v8 } static ncclResult_t ncclNet_v5_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1<<31) return ncclInternalError; + if (size >= 1UL<<31) return ncclInternalError; return ncclNet_v5->regMr(comm, data, (int) size, type, mhandle); } @@ -207,7 +207,7 @@ static ncclResult_t ncclCollNet_v5_as_v8_getProperties(int dev, ncclNetPropertie } static ncclResult_t ncclCollNet_v5_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1<<31) return ncclInternalError; + if (size >= 1UL<<31) return ncclInternalError; return ncclCollNet_v5->regMr(comm, data, (int) size, type, mhandle); } @@ -254,7 +254,7 @@ static ncclResult_t ncclCollNet_v6_as_v8_getProperties(int dev, ncclNetPropertie } static ncclResult_t ncclCollNet_v6_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1<<31) return ncclInternalError; + if (size >= 1UL<<31) return ncclInternalError; return ncclCollNet_v6->regMr(comm, data, (int) size, type, mhandle); } @@ -301,7 +301,7 @@ static ncclResult_t ncclCollNet_v7_as_v8_getProperties(int dev, ncclNetPropertie } static ncclResult_t ncclCollNet_v7_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1<<31) return ncclInternalError; + if (size >= 1UL<<31) return ncclInternalError; return ncclCollNet_v7->regMr(comm, data, (int) size, type, mhandle); } @@ -339,90 +339,109 @@ enum ncclNetState { enum ncclNetState ncclNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; enum ncclNetState ncclCollNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; -static void* tryOpenDynamicLib(char* name) { +#define MAX_STR_LEN 255 + +static void* tryOpenLib(char* name, int* err, char* errStr) { + *err = 0; if (nullptr == name || strlen(name) == 0) { return nullptr; } + + if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) { + name = nullptr; + } + void *handle = dlopen(name, RTLD_NOW | RTLD_LOCAL); if (nullptr == handle) { - if (ENOENT == errno) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: No plugin found (%s)", name); - } else { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Plugin load returned %d : %s when loading %s", errno, dlerror(), name); + strncpy(errStr, dlerror(), MAX_STR_LEN); + errStr[MAX_STR_LEN] = '\0'; + if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) { + *err = ENOENT; } } return 
handle; } -static void summarizeOpenNetPluginErrors(char* pluginNames) { - const char *separator = " "; - int len = strlen(pluginNames); - // remove tail separator - pluginNames[len - 1] = '\0'; - - // remove last plugin name - while (len > 0 && pluginNames[--len] != *separator); - if (len > 0) { - pluginNames[len] = '\0'; - } - - // distinguish between one load attempt and multiple attempts - if (strstr(pluginNames, separator)) { - INFO(NCCL_ENV|NCCL_TUNING, "NET/Plugin: Most recent plugin load returned %d : %s. All attempts to load '%s' also failed.", errno, dlerror(), pluginNames); - } else { - INFO(NCCL_ENV|NCCL_TUNING, "NET/Plugin: Plugin load returned %d : %s : when loading %s", errno, dlerror(), pluginNames); +static char* tryOpenLibCheck(int openErr, char* openErrStr, char* nameList, int *nameListLen, char* name) { + if (openErr == ENOENT) { + snprintf(nameList, *nameListLen, " %s", name); + nameList += strlen(name) + 1; + *nameListLen -= strlen(name) + 1; + return nameList; } + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: %s", openErrStr); + return nameList; } -static void* openNetPluginLib(void) { +static void* openNetPluginLib(char* couldNotFindNames, int len) { + int openErr; void *pluginLib; - -#define MAX_PLUGIN_LOAD 2 - - int len; - char netPluginLibNameTried[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 }; - char *ptr = netPluginLibNameTried; char netPluginLibName[PATH_MAX]; + char openErrStr[MAX_STR_LEN + 1] = { 0 }; const char *envNetPluginName = getenv("NCCL_NET_PLUGIN"); if (envNetPluginName && strlen(envNetPluginName)) { snprintf(netPluginLibName, PATH_MAX, "%s", envNetPluginName); - pluginLib = tryOpenDynamicLib(netPluginLibName); + pluginLib = tryOpenLib(netPluginLibName, &openErr, openErrStr); if (pluginLib) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Plugin name set by env to %s", netPluginLibName); return pluginLib; } - len = PATH_MAX - strlen(ptr); - snprintf(ptr + strlen(ptr), len + 1, "%s ", netPluginLibName); + couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, netPluginLibName); snprintf(netPluginLibName, PATH_MAX, "libnccl-net-%s.so", envNetPluginName); - pluginLib = tryOpenDynamicLib(netPluginLibName); + pluginLib = tryOpenLib(netPluginLibName, &openErr, openErrStr); if (pluginLib) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Plugin name set by env to %s", netPluginLibName); return pluginLib; } - len = PATH_MAX - strlen(ptr); - snprintf(ptr + strlen(ptr), len + 1, "%s ", netPluginLibName); + couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, netPluginLibName); } else { snprintf(netPluginLibName, PATH_MAX, "libnccl-net.so"); - pluginLib = tryOpenDynamicLib(netPluginLibName); + pluginLib = tryOpenLib(netPluginLibName, &openErr, openErrStr); if (pluginLib) { return pluginLib; } - len = PATH_MAX - strlen(ptr); - snprintf(ptr + strlen(ptr), len + 1, "%s ", netPluginLibName); + couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, netPluginLibName); } - summarizeOpenNetPluginErrors(ptr); - return nullptr; } -ncclResult_t ncclNetPluginInit() { - void* netPluginLib = openNetPluginLib(); - if (netPluginLib == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Using internal network plugin."); +static pthread_mutex_t netPluginLock = PTHREAD_MUTEX_INITIALIZER; +static int netPluginRefCount; +static void* netPluginLib; + +enum { + netPluginLoadFailed = -1, + netPluginLoadReady = 0, + netPluginLoadSuccess = 1, +}; + +static int netPluginStatus = netPluginLoadReady; + +#define MAX_PLUGIN_LOAD 2 + +ncclResult_t 
ncclNetPluginLoad(struct ncclComm* comm) { + char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 }; + if (netPluginLoadFailed == netPluginStatus) { return ncclSuccess; } + pthread_mutex_lock(&netPluginLock); + if (netPluginLoadSuccess == netPluginStatus) { + ++netPluginRefCount; + goto exit; + } + + netPluginLib = openNetPluginLib(couldNotFindNames, MAX_PLUGIN_LOAD * PATH_MAX); + if (netPluginLib == nullptr) { + if (strlen(couldNotFindNames)) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Could not find:%s. Using internal network plugin.", couldNotFindNames); + } else { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Using internal network plugin."); + } + goto fail; + } + ncclNets[0] = (ncclNet_v8_t*)dlsym(netPluginLib, "ncclNetPlugin_v8"); if (ncclNets[0] == nullptr) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v8 symbol."); @@ -436,8 +455,7 @@ ncclResult_t ncclNetPluginInit() { ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5"); if (ncclNet_v5 == nullptr) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (>= v5). ncclNetPlugin symbols v4 and lower are not supported."); - if (netPluginLib != nullptr) dlclose(netPluginLib); - return ncclSuccess; + goto fail; } else { ncclNets[0] = &ncclNet_v5_as_v8; ncclNet_v5_as_v8.init = ncclNet_v5_as_v8_init; @@ -476,21 +494,52 @@ ncclResult_t ncclNetPluginInit() { ncclCollNets[0] = &ncclCollNet_v5_as_v8; ncclCollNet_v5_as_v8.init = ncclCollNet_v5_as_v8_init; ncclCollNet_v5_as_v8.name = ncclCollNet_v5->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v5)", ncclCollNets[0]->name); + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v5)", ncclCollNets[0]->name); } } else { ncclCollNets[0] = &ncclCollNet_v6_as_v8; ncclCollNet_v6_as_v8.init = ncclCollNet_v6_as_v8_init; ncclCollNet_v6_as_v8.name = ncclCollNet_v6->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v6)", ncclCollNets[0]->name); + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v6)", ncclCollNets[0]->name); } } else { ncclCollNets[0] = &ncclCollNet_v7_as_v8; ncclCollNet_v7_as_v8.init = ncclCollNet_v7_as_v8_init; ncclCollNet_v7_as_v8.name = ncclCollNet_v7->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v7)", ncclCollNets[0]->name); + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v7)", ncclCollNets[0]->name); } } + + ++netPluginRefCount; + netPluginStatus = netPluginLoadSuccess; + comm->netPluginLoaded = 1; + +exit: + pthread_mutex_unlock(&netPluginLock); + return ncclSuccess; +fail: + if (netPluginLib) dlclose(netPluginLib); + netPluginStatus = netPluginLoadFailed; + goto exit; +} + +ncclResult_t ncclNetPluginUnload(struct ncclComm* comm) { + pthread_mutex_lock(&netPluginLock); + if (comm->netPluginLoaded && 0 == (--netPluginRefCount)) { + if (ncclNets[0]) { + INFO(NCCL_NET, "NET/Plugin: Closing net plugin '%s'", ncclNets[0]->name); + } + if (ncclCollNets[0]) { + INFO(NCCL_NET, "NET/Plugin: Closing collnet plugin '%s'", ncclCollNets[0]->name); + } + dlclose(netPluginLib); + netPluginLib = nullptr; + ncclNets[0] = nullptr; + ncclCollNets[0] = nullptr; + netPluginStatus = netPluginLoadReady; + comm->netPluginLoaded = 0; + } + pthread_mutex_unlock(&netPluginLock); return ncclSuccess; } @@ -515,8 +564,6 @@ ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, in return ncclInternalError; } - INFO(NCCL_INIT, "Using non-device net plugin version %d", - props.netDeviceVersion); return ncclSuccess; } @@ -582,6 
+629,12 @@ ncclResult_t ncclNetInit(struct ncclComm* comm) { return ncclSuccess; } +ncclResult_t ncclNetFinalize(struct ncclComm* comm) { + comm->ncclNet = nullptr; + comm->ncclCollNet = nullptr; + return ncclSuccess; +} + ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) { constexpr int GPU_BUF_SIZE = 2*1024*1024; #if CUDART_VERSION >= 11030 @@ -623,7 +676,7 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) { while (!connected) { // If we're aborting now, skip to cleanup - if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED)) { + if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE)) { goto cleanup2; } diff --git a/src/proxy.cc b/src/proxy.cc index 955c415ec..eef71a565 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -12,6 +12,7 @@ #include "profiler.h" #define ENABLE_TIMER 0 #include "timer.h" +#include "transport.h" #include #include @@ -596,67 +597,6 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool return ncclSuccess; } -NCCL_PARAM(ChunkSize, "CHUNK_SIZE", 0); - -ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op, int reg) { - memset(op, 0, sizeof(struct ncclProxyOp)); - int channelId = info->channelId; - struct ncclChannel* channel = info->comm->channels+channelId; - op->channelId = channelId; - op->sliceSteps = 1; - op->chunkSteps = 1; - op->dtype = info->datatype; - op->protocol = info->protocol; - - int stepSize = info->comm->buffSizes[op->protocol]/NCCL_STEPS; - - if (op->protocol == NCCL_PROTO_SIMPLE) stepSize = info->comm->p2pChunkSize; - info->chunkSize = stepSize; - op->root = info->root; - - struct ncclChannelPeer* peer = channel->peers[op->root]; - if (info->coll == ncclFuncSend) { - op->pattern = ncclPatternSend; - if (op->root != info->comm->rank && peer->send[1].transportComm == &netTransport.send) { - // Tune chunk size for the network - if (info->protocol == NCCL_PROTO_SIMPLE && info->count < stepSize) info->chunkSize /= 4; - else if (info->count < 8*stepSize) info->chunkSize /= 2; - if (info->protocol == NCCL_PROTO_SIMPLE && peer->send[1].proxyConn.sameProcess) op->reg = reg; - } - } else if (info->coll == ncclFuncRecv) { - op->pattern = ncclPatternRecv; - if (op->root != info->comm->rank && peer->recv[1].transportComm == &netTransport.recv) { - // Tune chunk size for the network - if (info->protocol == NCCL_PROTO_SIMPLE && info->count < stepSize) info->chunkSize /= 4; - else if (info->count < 8*stepSize) info->chunkSize /= 2; - if (info->protocol == NCCL_PROTO_SIMPLE && peer->recv[1].proxyConn.sameProcess) op->reg = reg; - } - } else { - WARN("P2p operation is neither send or recv"); - return ncclInternalError; - } - if (ncclParamChunkSize() != 0) { - info->chunkSize = ncclParamChunkSize(); - } - op->recvbuff = op->reg ? 
(uint8_t*)info->recvbuff : NULL; - op->chunkSize = info->chunkSize; - op->nbytes = info->count; - - // Compute nSteps for proxies - int chunkEffectiveSize = op->chunkSize; - if (op->protocol == NCCL_PROTO_LL) { - chunkEffectiveSize /= 2; - op->nbytes *= 2; - op->nbytes = DIVUP(op->nbytes, sizeof(union ncclLLFifoLine)) * sizeof(union ncclLLFifoLine); - } - - if (!op->reg) op->nbytes = std::min(op->nbytes, (ssize_t)info->chunkSize); - op->nsteps = DIVUP(info->count, chunkEffectiveSize); - if (op->nsteps == 0 || op->reg) op->nsteps = 1; - - return ncclSuccess; -} - static ncclResult_t removeOp(struct ncclProxyProgressState* state, struct ncclProxyArgs** opPtr, struct ncclProxyArgs** prevOpPtr) { struct ncclProxyArgs* freeOp = *opPtr; struct ncclProxyArgs* next = freeOp->next; @@ -870,7 +810,8 @@ void* ncclProxyProgress(void *proxyState_) { * frequency of calling ncclProxyGetPostedOps() and reduce the perf impact. */ int proxyOpAppendCounter = 0; struct ncclProxyArgs profArgs; // Only used for profiling purposes - while ((state->stop == 0 || (state->stop == 1 && state->active)) && *proxyState->abortFlag == 0) { + while ((state->stop == 0 || (state->stop == 1 && state->active)) && + __atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE) == 0) { int idle = 1; ncclResult_t ret = progressOps(proxyState, state, state->active, &idle); if (ret != ncclSuccess) { @@ -1075,7 +1016,7 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in strncpy(poolPath+sizeof("/dev/shm/nccl-")-1, resp.devShmPath, sizeof("XXXXXX")-1); struct ncclProxyOps* proxyOps = sharedProxyState->proxyOps + proxyConn->tpLocalRank; if (proxyOps->pool == NULL) { - NCCLCHECK(ncclShmOpen(poolPath, sizeof(struct ncclProxyOpsPool), (void**)(&proxyOps->pool), NULL, 0, &proxyOps->handle)); + NCCLCHECK(ncclShmOpen(poolPath, sizeof(struct ncclProxyOpsPool), (void**)(&proxyOps->pool), NULL, -1, &proxyOps->handle)); proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1; } } @@ -1172,7 +1113,7 @@ ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId) { struct ncclProxyState* sharedProxyState = comm->proxyState; // Receive the connection pointer from the Proxy - if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED)) { + if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE)) { WARN("Comm %p is in abort state", comm); return ncclInternalError; } @@ -1254,7 +1195,7 @@ static ncclResult_t proxyProgressInit(struct ncclProxyState* proxyState) { char shmPath[sizeof("/dev/shm/nccl-XXXXXX")]; shmPath[0] = '\0'; - NCCLCHECK(ncclShmOpen(shmPath, size, (void**)&pool, NULL, proxyState->tpLocalnRanks + 1, &state->handle)); + NCCLCHECK(ncclShmOpen(shmPath, size, (void**)&pool, NULL, proxyState->tpLocalnRanks, &state->handle)); // Init pool pool->nextOps = -1; @@ -1403,7 +1344,7 @@ static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclP (*asyncOpCount)--; return ncclSuccess; - } else if (__atomic_load_n(proxyState->abortFlag, __ATOMIC_RELAXED) != 0) { + } else if (__atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE) != 0) { return ncclInternalError; } @@ -1491,7 +1432,7 @@ void* ncclProxyService(void* _args) { /* Even if local comm aborts, we cannot let proxy thread exit if we still have peer * connections. Need to wait until all other related comms call abort and safely exit * together, or we could face segmentation fault. 
*/ - if (__atomic_load_n(proxyState->abortFlag, __ATOMIC_RELAXED) != 0) stop = 1; + if (__atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE) != 0) stop = 1; /* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */ int ret; do { @@ -1721,14 +1662,13 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) { } if (sharedProxyState->peerAddresses) { - if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED) == 0) { - struct ncclSocket sock; - int type = ncclProxyMsgStop; - NCCLCHECK(ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag)); - NCCLCHECK(ncclSocketConnect(&sock)); - NCCLCHECK(ncclSocketSend(&sock, &type, sizeof(int))); - NCCLCHECK(ncclSocketClose(&sock)); + struct ncclSocket sock; + int type = ncclProxyMsgStop; + ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag); + if (ncclSocketConnect(&sock) == ncclSuccess) { + ncclSocketSend(&sock, &type, sizeof(int)); } + ncclSocketClose(&sock); } if (sharedProxyState->peerSocks) { @@ -1746,7 +1686,7 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) { } } int type = ncclProxyMsgClose; - if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED) == 0) NCCLCHECK(ncclSocketSend(sharedProxyState->peerSocks + i, &type, sizeof(int))); + ncclSocketSend(sharedProxyState->peerSocks + i, &type, sizeof(int)); NCCLCHECK(ncclSocketClose(sharedProxyState->peerSocks + i)); } } diff --git a/src/register.cc b/src/register.cc index c46899294..90d429fe4 100644 --- a/src/register.cc +++ b/src/register.cc @@ -9,6 +9,7 @@ #include "comm.h" #include "net.h" #include "register.h" +#include "transport.h" ncclResult_t ncclNetDeregister(struct ncclComm* comm, struct ncclReg* reg) { struct ncclRegCache* cache = &comm->regCache; @@ -79,6 +80,7 @@ ncclResult_t ncclNetRegister(struct ncclComm* comm, void* addr, size_t size, str } } end: + INFO(NCCL_INIT, "Register ptr %p size %ld on %d net devices", addr, size, reg->nDevs); ncclDebugNoWarn = 0; if (ret != ncclSuccess) NCCLCHECK(ncclNetDeregister(comm, reg)); return ret; diff --git a/src/transport.cc b/src/transport.cc index 710285680..5df47065b 100644 --- a/src/transport.cc +++ b/src/transport.cc @@ -9,6 +9,7 @@ #include "bootstrap.h" #define ENABLE_TIMER 0 #include "timer.h" +#include "transport.h" struct ncclTransport* ncclTransports[NTRANSPORTS] = { &p2pTransport, @@ -72,7 +73,7 @@ NCCL_PARAM(ReportConnectProgress, "REPORT_CONNECT_PROGRESS", 0); ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) { // Stream used during transport setup; need for P2P pre-connect + CUDA Graph ncclResult_t ret = ncclSuccess; - int highestType = TRANSPORT_P2P; // track highest transport type + int highestType = TRANSPORT_UNDEFINED; // track highest transport type struct ncclConnect** data; // Store intermediate send/recvData structs for connect struct ncclConnect** recvData; // Points to entries inside data for given recv connection within a channel struct ncclConnect** sendData; // Points to entries inside data for given send connection within a channel @@ -215,13 +216,16 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* } } - if (timeReported) { + { struct timeval now; gettimeofday(&now, NULL); float elapsed = (now.tv_sec - timeStart.tv_sec)*1.0 + (now.tv_usec-timeStart.tv_usec)*1e-6; - printf("\rP2p 
connect done in %d:%02d \n", - ((int)elapsed)/60, ((int)elapsed)%60); - fflush(stdout); + if (elapsed > 1.0) INFO(NCCL_PROFILE, "timings: rank %d nranks %d P2p connect done in %.2f", comm->rank, comm->nRanks, elapsed); + if (timeReported) { + printf("\rP2p connect done in %d:%02d \n", + ((int)elapsed)/60, ((int)elapsed)%60); + fflush(stdout); + } } /* We need to sync ranks here since some ranks might run too fast after connection setup diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc index abe051822..ae1fe0fb5 100644 --- a/src/transport/coll_net.cc +++ b/src/transport/coll_net.cc @@ -9,7 +9,10 @@ #include "graph.h" #include "proxy.h" #include "gdrwrap.h" +#include "transport.h" #include "assert.h" +#include "bootstrap.h" +#include "channel.h" int64_t ncclParamGdrCopySyncEnable(); int64_t ncclParamGdrCopyFlushEnable(); @@ -1052,7 +1055,23 @@ ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* u goto exit; } -ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, struct ncclKernelPlan *plan, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle) { +struct ncclCollnetCleanupCallback { + struct ncclCommCallback base; + struct ncclProxyConnector* proxyConn; + void* buffer; + size_t size; + void* mhandle; +}; + +static ncclResult_t cleanupCollnet(struct ncclComm* comm, struct ncclCommCallback* cb) { + struct ncclCollnetCleanupCallback* obj = (struct ncclCollnetCleanupCallback*)cb; + NCCLCHECK(ncclCollnetDeregBuffer(comm, obj->proxyConn, obj->mhandle)); + INFO(NCCL_REG, "rank %d - deregistered collnet buffer handle %p, size %ld, buff %p", comm->rank, obj->mhandle, obj->size, obj->buffer); + free(obj); + return ncclSuccess; +} + +ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle, struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueElts) { ncclResult_t ret = ncclSuccess; void* handle = NULL; struct ncclRegCache* cache = &comm->regCache; @@ -1060,18 +1079,20 @@ ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, struct ncclKe uintptr_t addr = (uintptr_t)userbuff & -pageSize; size_t size = DIVUP((uintptr_t)userbuff - addr + buffSize, pageSize) * pageSize; collnetRegInfo info = {addr, size}; - struct ncclCollnetHandleList* record = NULL; + struct ncclCollnetCleanupCallback* record = NULL; struct ncclProxyConnector* proxyConn = (type == collNetRecv) ? 
&comm->channels[0].peers[comm->nRanks]->recv[type].proxyConn : &comm->channels[0].peers[comm->nRanks]->send[type].proxyConn; *outRegBufFlag = 0; NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &info, sizeof(struct collnetRegInfo), &handle, sizeof(void*)), ret, fail); - record = ncclMemoryPoolAlloc(&comm->memPool_ncclCollnetHandleList, &comm->memPermanent); - record->proxyconn = proxyConn; - record->buffer = userbuff; + record = (struct ncclCollnetCleanupCallback*)malloc(sizeof(struct ncclCollnetCleanupCallback)); + record->base.fn = cleanupCollnet; + record->proxyConn = proxyConn; + record->buffer = (void*)userbuff; record->size = buffSize; - *outHandle = record->collnetHandle = handle; + *outHandle = record->mhandle = handle; *outRegBufFlag = 1; - ncclIntruQueueEnqueue(&plan->collnetHandleQueue, record); + ncclIntruQueueEnqueue(cleanupQueue, &record->base); + *nCleanupQueueElts += 1; exit: return ret; @@ -1140,3 +1161,269 @@ struct ncclTransport collNetTransport = { { sendSetup, sendConnect, sendFree, NULL, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress, sendProxyRegBuffer, sendProxyDeregBuffer }, { recvSetup, recvConnect, recvFree, NULL, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress, recvProxyRegBuffer, recvProxyDeregBuffer } }; + +ncclResult_t ncclCollNetChainBufferSetup(ncclComm_t comm) { + ncclResult_t ret = ncclSuccess; + char line[1024]; + + if (comm->collNetSupport == 0) goto exit; + // Connect Collnet + chain + for (int c = 0; c < comm->nChannels; c++) { + struct ncclChannel* channel = comm->channels + c; + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->collnetChain.up, 1, channel->collnetChain.down, 0), ret, fail); + } + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_CHAIN], 0), ret, fail); + for (int c = 0; c < comm->nChannels; c++) { + struct ncclChannel* channel = comm->channels + c; + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, channel->collnetChain.down, 1, &channel->collnetChain.up, 1), ret, fail); + } + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_CHAIN], 1), ret, fail); + + line[0] = '\0'; + for (int c = 0; c < comm->nChannels; c++) { + struct ncclTree* chain = &comm->channels[c].collnetChain; + snprintf(line + strlen(line), 1023 - strlen(line), " [%d] %d->%d->%d", + c, chain->down[0], comm->rank, chain->up); + } + line[1023] = '\0'; + + INFO(NCCL_INIT, "Connected Collnet Chains %s", line); + +exit: + return ret; +fail: + goto exit; +} + +ncclResult_t ncclCollNetDirectBufferSetup(ncclComm_t comm) { + ncclResult_t ret = ncclSuccess; + int highestTransportType0 = TRANSPORT_UNDEFINED, highestTransportType1 = TRANSPORT_UNDEFINED; + + if (comm->collNetSupport == 0) goto exit; + + // Connect intra-node CollNet + Direct + for (int c = 0; c < comm->nChannels; c++) { + struct ncclChannel* channelRecv = comm->channels + c; + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.down, 0), ret, fail); + } + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], 0, &highestTransportType0), ret, fail); + + for (int c = 0; c < comm->nChannels; c++) { + struct ncclChannel* channelSend = comm->channels + c; + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.down, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.up, 1), ret, fail); + } + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, 
&comm->graphs[NCCL_ALGO_COLLNET_DIRECT], 1, &highestTransportType1), ret, fail); + + // Exchange highest intra-node transport type among ranks + // because we need to know whether all ranks can p2p each other to determine whether we can directly read/write registered user buffer + if (highestTransportType0 != TRANSPORT_UNDEFINED && highestTransportType1 != TRANSPORT_UNDEFINED) { + int highestTypes[NCCL_MAX_LOCAL_RANKS] = { TRANSPORT_UNDEFINED }; + + comm->intraHighestTransportType = highestTypes[comm->localRank] = highestTransportType0 > highestTransportType1 ? highestTransportType0 : highestTransportType1; + NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, highestTypes, sizeof(int)), ret, fail); + for (int i = 0; i < comm->localRanks; i++) { + if (highestTypes[i] > comm->intraHighestTransportType) + comm->intraHighestTransportType = highestTypes[i]; + } + if (comm->collNetSharedRes->intraHighestTransportType < comm->intraHighestTransportType) + comm->collNetSharedRes->intraHighestTransportType = comm->intraHighestTransportType; + } else if (comm->intraHighestTransportType == TRANSPORT_UNDEFINED) { + // reuse previous shared intraHighestTransportType + comm->intraHighestTransportType = comm->collNetSharedRes->intraHighestTransportType; + } + INFO(NCCL_INIT, "rank %d Connected CollNet", comm->rank); + +exit: + return ret; +fail: + goto exit; +} + +static ncclResult_t collNetInitRailRankMap(ncclComm_t comm) { + int rank = comm->rank; + uint64_t nonHeadMask = (1ull << comm->localRanks) - 1; + + comm->collNetDenseToUserRank = ncclMemoryStackAlloc(&comm->memPermanent, comm->nRanks); + comm->collNetUserToDenseRank = ncclMemoryStackAlloc(&comm->memPermanent, comm->nRanks); + // initialize collNetUserToDenseRank[rank] + comm->collNetUserToDenseRank[rank] = -1; + for (int h = 0; h < comm->collNetHeadsNum; h++) { + nonHeadMask ^= 1ull << comm->rankToLocalRank[comm->collNetHeads[h]]; + if (comm->collNetHeads[h] == rank) { comm->collNetUserToDenseRank[rank] = h; break; } + } + if (comm->collNetUserToDenseRank[rank] == -1) { + comm->collNetUserToDenseRank[rank] = __builtin_popcountll(nonHeadMask & ((1ull << comm->localRank) - 1)); + } + comm->collNetUserToDenseRank[rank] += comm->node * comm->localRanks; + + NCCLCHECK(bootstrapAllGather(comm->bootstrap, comm->collNetUserToDenseRank, sizeof(int))); + for (int r = 0; r < comm->nRanks; r++) { + comm->collNetDenseToUserRank[comm->collNetUserToDenseRank[r]] = r; + } + return ncclSuccess; +} + +ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTopoGraph* graphs[]) { + ncclResult_t ret = ncclSuccess; + int rank = comm->rank; + int collNetSetupFail = 0; + // Find all head ranks + int nHeadsUnique = 0; + int* headsUnique = NULL; + bool share; + struct ncclTopoGraph* directGraph = graphs[NCCL_ALGO_COLLNET_DIRECT]; + + struct collnetShareInfo { + int headPosition; + int isMaster; + }; + struct collnetShareInfo* infos = NULL; + + NCCLCHECKGOTO(ncclCalloc(&headsUnique, directGraph->nChannels), ret, fail); + { uint64_t mask = 0; + // Head GPU index is always 0 + for (int c = 0; c < directGraph->nChannels; c++) { + int head = directGraph->intra[c * comm->localRanks + 0]; + assert(comm->rankToNode[head] == comm->node); + uint64_t mask0 = mask; + mask |= 1ull<rankToLocalRank[head]; + if (mask != mask0) headsUnique[nHeadsUnique++] = head; + } + } + + comm->collNetHeads = headsUnique; + comm->collNetHeadsNum = nHeadsUnique; + if (parent && parent->collNetSupport && 
parent->nNodes == comm->nNodes) { + if (!parent->config.splitShare) { + collNetSetupFail = 1; + goto fail; + } + NCCLCHECKGOTO(ncclCalloc(&infos, comm->nRanks), ret, fail); + /* check whether child can share collnet resources of parent. Since parent builds each collnet communicator + * based on heads with the same head position in each node, as long as the collnet heads of child comm + * can match parent's heads, we can let child communicator share parent's collnet resources. */ + for (int h = 0; h < nHeadsUnique; ++h) { + int prev = INT_MIN; + struct collnetShareInfo* myinfo; + + share = true; + myinfo = infos + comm->rank; + memset(myinfo, 0, sizeof(struct collnetShareInfo)); + /* find the child head position in parent collnet heads. */ + if (headsUnique[h] == comm->rank) { + myinfo->headPosition = -1; + myinfo->isMaster = 1; + for (int th = 0; th < parent->collNetHeadsNum; ++th) + if (parent->topParentRanks[parent->collNetHeads[th]] == comm->topParentRanks[comm->rank]) { + myinfo->headPosition = th; + break; + } + } + + NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, infos, sizeof(struct collnetShareInfo)), ret, fail); + for (int i = 0; i < comm->nRanks; ++i) { + if (infos[i].isMaster) { + if (prev == INT_MIN) + prev = infos[i].headPosition; + + if (infos[i].headPosition == -1 || prev != infos[i].headPosition) { + share = false; + break; + } + } + } + + if (share) { + if (myinfo->isMaster) { + comm->collNetSharedRes = parent->collNetSharedRes; + for (int c = 0; c < comm->nChannels; ++c) + NCCLCHECKGOTO(initCollnetChannel(comm, c, parent, true), ret, fail); + } + + NCCLCHECKGOTO(collNetInitRailRankMap(comm), ret, fail); + } else { + /* TODO: CX-6 and CX-7 both do not support multiple sharp resources per process, if child comm cannot + * share the sharp resource from parent, we cannot use sharp in this case. This restriction might be + * lifted by sharp plugin/IB hardware in the future. */ + collNetSetupFail = 1; + if (comm->rank == 0) { + WARN("Child comms (nRanks %d) fails to share parent comms (nRanks %d) sharp resources", comm->nRanks, parent->nRanks); + } + goto fail; + } + } + share = true; + } else { + /* this allocated buffer will be freed on proxy side */ + NCCLCHECK(ncclCalloc(&comm->collNetSharedRes, 1)); + comm->collNetSharedRes->nChannels = comm->nChannels; + comm->collNetSharedRes->buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE]; + + NCCLCHECKGOTO(collNetInitRailRankMap(comm), ret, fail); + + for (int c = 0; c < comm->nChannels; c++) { + struct ncclChannel* channel = comm->channels + c; + NCCLCHECKGOTO(initCollnetChannel(comm, c, parent, false), ret, fail); + for (int h = 0; h < nHeadsUnique; h++) { + const int head = headsUnique[h]; + ncclConnect connect; + collNetSetupFail |= ncclTransportCollNetSetup(comm, directGraph, channel, head, head, h, collNetRecv, &connect); + if (!collNetSetupFail) collNetSetupFail |= ncclTransportCollNetSetup(comm, directGraph, channel, head, head, h, collNetSend, &connect); + } + // Verify CollNet setup across ranks after trying the first channel + if (c == 0) { + NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, fail); + } + } + share = false; + } + + if (share) { + memcpy(comm->collNetSupportMatrix, parent->collNetSupportMatrix, sizeof(comm->collNetSupportMatrix)); + } else { + do { + /* Initialize all entries in collNetSupportMatrix[redop][type]. Since some + ranks don't connect to sharp we enable a (redop,type) if any rank claims + support. 
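+         Concretely: each head rank publishes 1<<1 for a (redop,type) it supports
+         and 1<<0 for one it does not, while non-head ranks contribute 0. After
+         OR-ing the bytes gathered from all ranks, accum == (1<<1) exactly when at
+         least one head reported support and no head reported lack of support,
+         which is the condition used to fill collNetSupportMatrix below.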
*/ + uint8_t(*matrix)[4][ncclNumTypes]; + bool isHead = false; + matrix = nullptr; + NCCLCHECKGOTO(ncclCalloc(&matrix, comm->nRanks), ret, matrix_end); + for (int h = 0; h < nHeadsUnique; h++) isHead |= (headsUnique[h] == comm->rank); + if (isHead) { + for (int ty=0; ty < ncclNumTypes; ty++) { + for (int op=0; op < 4; op++) { + int support = 0; + NCCLCHECKGOTO(collNetReduceSupport(comm, (ncclDataType_t)ty, (ncclRedOp_t)op, &support), ret, matrix_end); + // bit 0 = not supported, bit 1 = supported + matrix[rank][op][ty] = 1<<(support ? 1 : 0); + } + } + } + NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, matrix, sizeof(*matrix)), ret, matrix_end); + for (int ty=0; ty < ncclNumTypes; ty++) { + for (int op=0; op < 4; op++) { + uint8_t accum = 0; + for (int r=0; r < comm->nRanks; r++) accum |= matrix[r][op][ty]; + // We support (redop, type) if some rank supports it and no rank doesn't support it + comm->collNetSupportMatrix[op][ty] = (accum == (1<<1)); + } + } + matrix_end: + free(matrix); + if (ret != ncclSuccess) goto fail; + } while (0); + } + + // Verify CollNet setup across ranks after trying all channels + NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, fail); + TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank); + +exit: + free(infos); + return ret; +fail: + ncclTransportCollNetFree(comm); + comm->collNetSupport = 0; + goto exit; +} diff --git a/src/transport/generic.cc b/src/transport/generic.cc new file mode 100644 index 000000000..a0efaab5c --- /dev/null +++ b/src/transport/generic.cc @@ -0,0 +1,36 @@ +#include "comm.h" +#include "transport.h" + +ncclResult_t ncclTransportRingConnect(struct ncclComm* comm) { + ncclResult_t ret = ncclSuccess; + if (comm && comm->nRanks > 1) { + for (int c = 0; c < comm->nChannels; c++) { + struct ncclChannel* channel = comm->channels + c; + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, fail); + } + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_RING], 0), ret, fail); + INFO(NCCL_INIT, "Connected all rings"); + } +exit: + return ret; +fail: + goto exit; +} + +ncclResult_t ncclTransportTreeConnect(struct ncclComm* comm) { + ncclResult_t ret = ncclSuccess; + if (comm && comm->nRanks > 1) { + // Connect Trees + for (int c = 0; c < comm->nChannels; c++) { + struct ncclChannel* channel = comm->channels + c; + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, fail); + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, fail); + } + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_TREE], 0), ret, fail); + INFO(NCCL_INIT, "Connected all trees"); + } +exit: + return ret; +fail: + goto exit; +} diff --git a/src/transport/net.cc b/src/transport/net.cc index cc388211c..d5a585d42 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -13,6 +13,7 @@ #include "shm.h" #include "p2p.h" #include "profiler.h" +#include "transport.h" static_assert(sizeof(ncclNetHandle_t) <= CONNECT_SIZE, "NET Connect info is too large"); @@ -238,6 +239,8 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph } static ncclResult_t netMapShm(struct connectMapMem* mem) { + mem->cpuPtr = NULL; + mem->gpuPtr = NULL; NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, -1, &mem->attachHandle)); return ncclSuccess; } @@ -303,8 +306,12 @@ static ncclResult_t 
sendConnect(struct ncclComm* comm, struct ncclConnect* conne } ncclResult_t ret; - NCCLCHECK(ret = ncclPollProxyResponse(comm, &send->proxyConn, map, opId)); - if (ret == ncclInProgress) { + ret = ncclPollProxyResponse(comm, &send->proxyConn, map, opId); + if (ret != ncclSuccess) { + if (ret != ncclInProgress) { + free(map); + send->transportResources = NULL; + } return ret; } INFO(NCCL_PROXY, "sendConnect ncclPollProxyResponse opId=%p", opId); @@ -323,6 +330,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne } else if (!(map->sameProcess && map->cudaDev == comm->cudaDev)) { if (!map->sameProcess) NCCLCHECK(netMapShm(map->mems+NCCL_NET_MAP_HOSTMEM)); if (map->mems[NCCL_NET_MAP_DEVMEM].size) { + map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr = NULL; NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.tpRank, map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc, @@ -332,6 +340,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne if (map->mems[NCCL_NET_MAP_SHARED_DEVMEM].size) { void** sharedDevMemPtr = comm->proxyState->sharedDevMems + send->proxyConn.tpLocalRank; if (*sharedDevMemPtr == NULL) { + map->mems[NCCL_NET_MAP_SHARED_DEVMEM].gpuPtr = NULL; NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.tpRank, map->mems[NCCL_NET_MAP_SHARED_DEVMEM].size, &map->mems[NCCL_NET_MAP_SHARED_DEVMEM].ipcDesc, @@ -403,7 +412,11 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne ncclResult_t ret; NCCLCHECK(ret = ncclPollProxyResponse(comm, &recv->proxyConn, map, opId)); - if (ret == ncclInProgress) { + if (ret != ncclSuccess) { + if (ret != ncclInProgress) { + free(map); + recv->transportResources = NULL; + } return ret; } INFO(NCCL_PROXY, "recvConnect ncclPollProxyResponse opId=%p", opId); @@ -1264,7 +1277,6 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct ptrs[subCount] = localBuff+buffSlot*stepSize; sizes[subCount] = stepSize*args->sliceSteps; } - sizes[subCount] = stepSize*args->sliceSteps; if (sub->nbytes < sizes[subCount]) sizes[subCount] = sub->nbytes; tags[subCount] = resources->tpRemoteRank; mhandles[subCount] = sub->mhandle; diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index 551ca61fd..be8a8a37b 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -78,6 +78,7 @@ pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER; static int ncclIbRelaxedOrderingEnabled = 0; NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", -1); +NCCL_PARAM(IbRoutableFlidIbGidIndex, "IB_ROUTABLE_FLID_GID_INDEX", 1); NCCL_PARAM(IbRoceVersionNum, "IB_ROCE_VERSION_NUM", 2); NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 18); NCCL_PARAM(IbRetryCnt, "IB_RETRY_CNT", 7); @@ -88,6 +89,7 @@ NCCL_PARAM(IbTc, "IB_TC", 0); NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192); NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2); NCCL_PARAM(IbAdaptiveRouting, "IB_ADAPTIVE_ROUTING", -2); +NCCL_PARAM(IbFifoTc, "IB_FIFO_TC", 0); pthread_t ncclIbAsyncThread; static void* ncclIbAsyncThreadMain(void* args) { @@ -289,7 +291,38 @@ static ncclResult_t ncclUpdateGidIndex(struct ibv_context* context, uint8_t port return ncclSuccess; } -static ncclResult_t ncclIbGetGidIndex(struct ibv_context *context, uint8_t portNum, int gidTblLen, int *gidIndex) { +// GID Format +// global: | 64b - subnet-prefix | 64b - EUI | +// raw : | 10b fixed | 22b 0 | 16b FLID | 16b subnet-prefix | 64b - EUI | +static uint16_t ncclIbExtractLocalSubnetPrefix(uint64_t subnet_prefix) +{ + 
return (be64toh(subnet_prefix) & 0xffff); +} + +static int ncclIbExtractFlid (union ibv_gid *gid) +{ + return ntohs(*((uint16_t*)((uintptr_t)(gid->raw) + 4))); +} + +static ncclResult_t ncclIbGetGidIndex(struct ibv_context *context, uint8_t portNum, struct ibv_port_attr* portAttr, int *gidIndex) { + int gidTblLen = portAttr->gid_tbl_len; + + //for IB, choose GID Index that will have routable FLID if present + if (portAttr->link_layer == IBV_LINK_LAYER_INFINIBAND) { + union ibv_gid gid; + int routableGidIndex = ncclParamIbRoutableFlidIbGidIndex(); + if (routableGidIndex < gidTblLen) { + NCCLCHECK(wrap_ibv_query_gid(context, portNum, routableGidIndex, &gid)); + if (ncclIbExtractFlid(&gid) != 0) { + *gidIndex = routableGidIndex; + return ncclSuccess; + } + } + *gidIndex = 0; + return ncclSuccess; + } + + //for ROCE *gidIndex = ncclParamIbGidIndex(); if (*gidIndex >= 0) { return ncclSuccess; @@ -420,6 +453,10 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) { ret = ncclInternalError; goto fail; } + // Should NCCL merge multi-port devices into one? + int mergeNics; + mergeNics = ncclParamIbMergeNics(); +build_ib_list: for (int d=0; dlink_layer == IBV_LINK_LAYER_ETHERNET) { qpAttr.ah_attr.is_global = 1; - qpAttr.ah_attr.grh.dgid.global.subnet_prefix = info->spn; - qpAttr.ah_attr.grh.dgid.global.interface_id = info->iid; + qpAttr.ah_attr.grh.dgid.global.subnet_prefix = info->gid.global.subnet_prefix; + qpAttr.ah_attr.grh.dgid.global.interface_id = info->gid.global.interface_id; qpAttr.ah_attr.grh.flow_label = 0; - qpAttr.ah_attr.grh.sgid_index = sGidIndex; + qpAttr.ah_attr.grh.sgid_index = sGidInfo->localGidIndex; qpAttr.ah_attr.grh.hop_limit = 255; - qpAttr.ah_attr.grh.traffic_class = ncclParamIbTc(); + if(ncclParamIbFifoTc() && override_tc) { + qpAttr.ah_attr.grh.traffic_class = ncclParamIbFifoTc(); + } else { + qpAttr.ah_attr.grh.traffic_class = ncclParamIbTc(); + } } else { - qpAttr.ah_attr.is_global = 0; - qpAttr.ah_attr.dlid = info->lid; + //pick lid if subnet prefixs are same, FLID if they are not + if (ncclIbExtractLocalSubnetPrefix(sGidInfo->localGid.global.subnet_prefix) == + ncclIbExtractLocalSubnetPrefix(info->gid.global.subnet_prefix)) { + qpAttr.ah_attr.is_global = 0; + qpAttr.ah_attr.dlid = info->lid; + } else { + uint16_t flid = ncclIbExtractFlid(&info->gid); + if (flid == 0) { + WARN("Warning: remote FLID configured as zero even when endpoints are on different subnets, using dlid as fallback"); + qpAttr.ah_attr.dlid = info->lid; + } else { + qpAttr.ah_attr.dlid = ncclIbExtractFlid(&info->gid); + } + qpAttr.ah_attr.is_global = 1; + qpAttr.ah_attr.grh.dgid.global.subnet_prefix = info->gid.global.subnet_prefix; + qpAttr.ah_attr.grh.dgid.global.interface_id = info->gid.global.interface_id; + qpAttr.ah_attr.grh.sgid_index = sGidInfo->localGidIndex; + qpAttr.ah_attr.grh.hop_limit = 255; + } } qpAttr.ah_attr.sl = ncclParamIbSl(); qpAttr.ah_attr.src_path_bits = 0; @@ -1041,22 +1115,22 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet NCCLCHECK(wrap_ibv_reg_mr(&commDev->fifoMr, commDev->base.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)); devInfo->fifoRkey = commDev->fifoMr->rkey; - // RoCE support + // Pack local GID info devInfo->link_layer = commDev->base.gidInfo.link_layer = ibDev->portAttr.link_layer; - if (devInfo->link_layer == IBV_LINK_LAYER_ETHERNET) { - 
NCCLCHECK(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, ibDev->portAttr.gid_tbl_len, &commDev->base.gidInfo.localGidIndex)); - NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, commDev->base.gidInfo.localGidIndex, &commDev->base.gidInfo.localGid)); - devInfo->spn = commDev->base.gidInfo.localGid.global.subnet_prefix; - devInfo->iid = commDev->base.gidInfo.localGid.global.interface_id; - } + NCCLCHECK(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &commDev->base.gidInfo.localGidIndex)); + NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, commDev->base.gidInfo.localGidIndex, &commDev->base.gidInfo.localGid)); + devInfo->gid.global.subnet_prefix = commDev->base.gidInfo.localGid.global.subnet_prefix; + devInfo->gid.global.interface_id = commDev->base.gidInfo.localGid.global.interface_id; + // info logging if (devInfo->link_layer == IBV_LINK_LAYER_INFINIBAND) { // IB for (int q = 0; q < comm->base.nqps; q++) { // Print just the QPs for this dev if (comm->base.qps[q].devIndex == i) - INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d LID %d fifoRkey=0x%x fifoLkey=0x%x", + INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d LID %d subnet-prefix %lu FLID %d fifoRkey=0x%x fifoLkey=0x%x", comm->base.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", - dev, commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, devInfo->lid, devInfo->fifoRkey, commDev->fifoMr->lkey); + dev, commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, devInfo->lid, + devInfo->gid.global.subnet_prefix, ncclIbExtractFlid(&devInfo->gid), devInfo->fifoRkey, commDev->fifoMr->lkey); } } else { // RoCE for (int q = 0; q < comm->base.nqps; q++) { @@ -1065,7 +1139,7 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d query_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x} GID %ld (%lX/%lX) fifoRkey=0x%x fifoLkey=0x%x", comm->base.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", dev, commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, meta.qpInfo[q].ece_supported, meta.qpInfo[q].ece.vendor_id, meta.qpInfo[q].ece.options, meta.qpInfo[q].ece.comp_mask, (int64_t)commDev->base.gidInfo.localGidIndex, - devInfo->spn, devInfo->iid, devInfo->fifoRkey, commDev->fifoMr->lkey); + devInfo->gid.global.subnet_prefix, devInfo->gid.global.interface_id, devInfo->fifoRkey, commDev->fifoMr->lkey); } } } @@ -1114,8 +1188,8 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet // Copy remDevInfo for things like remGidInfo, remFifoAddr, etc. 
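/* Editor's note: with the spn/iid pair replaced by a full GID in ncclIbDevInfo,
 * the loop below copies the remote GID verbatim, and ncclIbRtrQp (earlier in this
 * file) can derive both the destination GRH fields and, when the two endpoints sit
 * on different subnets, the remote FLID from that same structure. */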
for (int i = 0; i < remMeta.ndevs; i++) { comm->base.remDevs[i] = remMeta.devs[i]; - comm->base.remDevs[i].remoteGid.global.interface_id = comm->base.remDevs[i].iid; - comm->base.remDevs[i].remoteGid.global.subnet_prefix = comm->base.remDevs[i].spn; + comm->base.remDevs[i].remoteGid.global.interface_id = comm->base.remDevs[i].gid.global.interface_id; + comm->base.remDevs[i].remoteGid.global.subnet_prefix = comm->base.remDevs[i].gid.global.subnet_prefix; // Retain remote sizes fifo info and prepare RDMA ops comm->remSizesFifo.rkeys[i] = remMeta.devs[i].fifoRkey; @@ -1135,13 +1209,12 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet comm->base.qps[q].remDevIdx = remQpInfo->devIndex; int devIndex = comm->base.qps[q].devIndex; ncclIbSendCommDev* commDev = comm->devs + devIndex; - uint8_t gidIndex = commDev->base.gidInfo.localGidIndex; struct ibv_qp* qp = comm->base.qps[q].qp; - if (remQpInfo->ece_supported && remQpInfo->ece_supported) + if (remQpInfo->ece_supported) NCCLCHECK(wrap_ibv_set_ece(qp, &remQpInfo->ece, &remQpInfo->ece_supported)); - NCCLCHECK(ncclIbRtrQp(qp, gidIndex, remQpInfo->qpn, remDevInfo)); + NCCLCHECK(ncclIbRtrQp(qp, &commDev->base.gidInfo, remQpInfo->qpn, remDevInfo, false)); NCCLCHECK(ncclIbRtsQp(qp)); } @@ -1237,15 +1310,15 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle ibDevN = mergedDev->devs[i]; NCCLCHECK(ncclIbInitCommDevBase(ibDevN, &rCommDev->base)); ibDev = ncclIbDevs + ibDevN; - NCCLCHECK(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, ibDev->portAttr.gid_tbl_len, &rCommDev->base.gidInfo.localGidIndex)); + NCCLCHECK(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &rCommDev->base.gidInfo.localGidIndex)); NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, rCommDev->base.gidInfo.localGidIndex, &rCommDev->base.gidInfo.localGid)); } // Copy remDevInfo for things like remGidInfo, remFifoAddr, etc. for (int i = 0; i < remMeta.ndevs; i++) { rComm->base.remDevs[i] = remMeta.devs[i]; - rComm->base.remDevs[i].remoteGid.global.interface_id = rComm->base.remDevs[i].iid; - rComm->base.remDevs[i].remoteGid.global.subnet_prefix = rComm->base.remDevs[i].spn; + rComm->base.remDevs[i].remoteGid.global.interface_id = rComm->base.remDevs[i].gid.global.interface_id; + rComm->base.remDevs[i].remoteGid.global.subnet_prefix = rComm->base.remDevs[i].gid.global.subnet_prefix; } // Stripe QP creation across merged devs @@ -1270,14 +1343,15 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle // Set the ece (enhanced connection establishment) on this QP before RTR if (remMeta.qpInfo[q].ece_supported) { NCCLCHECK(wrap_ibv_set_ece(qp->qp, &remMeta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported)); - + // Query the reduced ece for this QP (matching enhancements between the requestor and the responder) // Store this in our own qpInfo for returning to the requestor if (meta.qpInfo[q].ece_supported) NCCLCHECK(wrap_ibv_query_ece(qp->qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported)); } - NCCLCHECK(ncclIbRtrQp(qp->qp, rCommDev->base.gidInfo.localGidIndex, remMeta.qpInfo[q].qpn, remDevInfo)); + bool override_tc = (q == 0) ? 
true : false; + NCCLCHECK(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, override_tc)); NCCLCHECK(ncclIbRtsQp(qp->qp)); } @@ -1307,10 +1381,10 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle devInfo.lid = ibDev->portAttr.lid; devInfo.link_layer = ibDev->portAttr.link_layer; devInfo.ib_port = ibDev->portNum; - devInfo.spn = rCommDev->base.gidInfo.localGid.global.subnet_prefix; - devInfo.iid = rCommDev->base.gidInfo.localGid.global.interface_id; + devInfo.gid.global.subnet_prefix = rCommDev->base.gidInfo.localGid.global.subnet_prefix; + devInfo.gid.global.interface_id = rCommDev->base.gidInfo.localGid.global.interface_id; devInfo.mtu = ibDev->portAttr.active_mtu; - NCCLCHECK(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, rCommDev->base.gidInfo.localGidIndex, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo)); + NCCLCHECK(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, &rCommDev->base.gidInfo, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo, false)); NCCLCHECK(ncclIbRtsQp(rCommDev->gpuFlush.qp.qp)); } @@ -1318,8 +1392,8 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle meta.devs[i].lid = ibDev->portAttr.lid; meta.devs[i].link_layer = rCommDev->base.gidInfo.link_layer = ibDev->portAttr.link_layer; meta.devs[i].ib_port = ibDev->portNum; - meta.devs[i].spn = rCommDev->base.gidInfo.localGid.global.subnet_prefix; - meta.devs[i].iid = rCommDev->base.gidInfo.localGid.global.interface_id; + meta.devs[i].gid.global.subnet_prefix = rCommDev->base.gidInfo.localGid.global.subnet_prefix; + meta.devs[i].gid.global.interface_id = rCommDev->base.gidInfo.localGid.global.interface_id; // Adjust the MTU remMeta.devs[i].mtu = (enum ibv_mtu) std::min(remMeta.devs[i].mtu, ibDev->portAttr.active_mtu); @@ -1906,9 +1980,10 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { } char line[SOCKET_NAME_MAXLEN+1]; - WARN("NET/IB : Got completion from peer %s with status=%d opcode=%d len=%d vendor err %d (%s)%s%s%s%s", + char *hcaName = r->devBases[i]->pd->context->device->name; + WARN("NET/IB: Got completion from peer %s with status=%d opcode=%d len=%d vendor err %d (%s)%s%s%s%s hca %s", ncclSocketToString(&addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err, reqTypeStr[r->type], - localGidStr ? " localGid ":"", localGidString, remoteGidStr ? " remoteGids":"", remoteGidString); + localGidStr ? " localGid ":"", localGidString, remoteGidStr ? 
" remoteGids":"", remoteGidString, hcaName); return ncclRemoteError; } @@ -1918,7 +1993,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { #ifdef ENABLE_TRACE char line[SOCKET_NAME_MAXLEN+1]; - TRACE(NCCL_NET, "Got completion from peer %s with status=%d opcode=%d len=%d wr_id=%d r=%p type=%d events={%d,%d}, i=%d", + TRACE(NCCL_NET, "Got completion from peer %s with status=%d opcode=%d len=%d wr_id=%ld r=%p type=%d events={%d,%d}, i=%d", ncclSocketToString(&addr, line), wc->status, wc->opcode,wc->byte_len, wc->wr_id, req, req->type, req->events[0], req->events[1], i); #endif if (req->type == NCCL_NET_IB_REQ_SEND) { diff --git a/src/transport/nvls.cc b/src/transport/nvls.cc index 0dd7c52ff..61d5946c4 100644 --- a/src/transport/nvls.cc +++ b/src/transport/nvls.cc @@ -12,6 +12,7 @@ #include "proxy.h" #include "enqueue.h" #include "register.h" +#include "transport.h" #if CUDART_VERSION >= 12010 @@ -46,36 +47,13 @@ struct ncclTransport nvlsTransport = { { NULL, NULL, nvlsRecvFree, NULL, NULL, NULL, NULL, NULL } }; -ncclResult_t nvlsGetProperties(struct ncclComm *comm, struct ncclNvlsSharedRes* resources, int dev, size_t size) { - CUmulticastObjectProp* prop = &resources->properties; - memset(prop, 0, sizeof(*prop)); - prop->size = size; - prop->numDevices = comm->MNNVL ? comm->clique.size : comm->localRanks; - prop->handleTypes = ncclCuMemHandleType; - prop->flags = 0; - - // Could be changed to CU_MULTICAST_GRANULARITY_MINIMUM when 3418538 resolved - CUCHECK(cuMulticastGetGranularity(&resources->granularity, prop, CU_MULTICAST_GRANULARITY_RECOMMENDED)); - - ALIGN_SIZE(size, resources->granularity); - prop->size = resources->size = size; - - memset(&resources->accessDesc, 0, sizeof(resources->accessDesc)); - resources->accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - resources->accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - resources->accessDesc.location.id = dev; - resources->dev = dev; - - return ncclSuccess; -} - ncclResult_t nvlsGroupCreate(struct ncclComm *comm, CUmulticastObjectProp *prop, int rank, unsigned int nranks, CUmemGenericAllocationHandle *mcHandle, char *shareableHandle) { CUmemAllocationHandleType type = ncclCuMemHandleType; size_t size = prop->size; // Create a Multicast group - INFO(NCCL_NVLS, "NVLS Creating Multicast group nranks %d size %zi on rank %d", nranks, size, rank); + INFO(NCCL_NVLS, "NVLS Creating Multicast group nranks %d size %zu on rank %d", nranks, size, rank); CUCHECK(cuMulticastCreate(mcHandle, prop)); if (type == CU_MEM_HANDLE_TYPE_FABRIC) { @@ -86,14 +64,8 @@ ncclResult_t nvlsGroupCreate(struct ncclComm *comm, CUmulticastObjectProp *prop, memcpy(shareableHandle, mcHandle, sizeof(CUmemGenericAllocationHandle)); } - INFO(NCCL_NVLS, "NVLS Created Multicast group %llx nranks %d size %zi on rank %d", *mcHandle, nranks, size, rank); - - return ncclSuccess; -} + INFO(NCCL_NVLS, "NVLS Created Multicast group %llx nranks %d size %zu on rank %d", *mcHandle, nranks, size, rank); -ncclResult_t nvlsGroupAddDevice(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { - INFO(NCCL_NVLS, "NVLS group %llx adding dev %d", resources->mcHandle, resources->dev); - CUCHECK(cuMulticastAddDevice(resources->mcHandle, resources->dev)); return ncclSuccess; } @@ -123,53 +95,12 @@ ncclResult_t nvlsGroupConnect(struct ncclComm *comm, char *shareableHandle, int return ncclSuccess; } -ncclResult_t nvlsGroupDisconnect(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { - return ncclSuccess; -} - -ncclResult_t nvlsGroupBindMem(struct 
ncclComm *comm, struct ncclNvlsSharedRes* resources) { - size_t size = resources->size; - size_t granularity; - CUdeviceptr ptr = 0; - CUmemAllocationProp prop; - - memset(&prop, 0, sizeof(prop)); - prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.location.id = resources->dev; - prop.requestedHandleTypes = ncclCuMemHandleType; - CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); - resources->ucGran = granularity; - - // Map a VA for UC memory - CUCHECK(cuMemAddressReserve(&ptr, size, granularity, 0U, 0)); - - // Alloc local physical mem for this NVLS group - CUCHECK(cuMemCreate(&resources->ucHandle, size, &prop, 0)); - CUCHECK(cuMemMap(ptr, size, 0, resources->ucHandle, 0)); - CUCHECK(cuMemSetAccess(ptr, size, &resources->accessDesc, 1)); - CUDACHECK(cudaMemset((void*)ptr, 0, size)); - resources->ucBuff = (char*)ptr; - INFO(NCCL_NVLS, "NVLS Mapped UC at %p size %zi", resources->ucBuff, size); - - // Bind physical memory to the Multicast group - // NB: It will block until all ranks have been added to the Group - INFO(NCCL_NVLS, "NVLS Bind mem %p UC handle 0x%llx MC handle 0x%llx size %zi", (void*)ptr, resources->ucHandle, resources->mcHandle, size); - CUCHECK(cuMulticastBindMem(resources->mcHandle, 0/*mcOffset*/, resources->ucHandle, 0/*memOffset*/, size, 0/*flags*/)); - - return ncclSuccess; -} - -ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { - int dev = resources->dev; - size_t size = resources->size; - INFO(NCCL_NVLS, "NVLS Unbind MC handle %llx size %zi dev %d", resources->mcHandle, size, dev); +ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, size_t size, CUmemGenericAllocationHandle* mcHandle) { + int dev = comm->cudaDev; + INFO(NCCL_NVLS, "NVLS Unbind MC handle %llx size %zu dev %d", *mcHandle, size, dev); // Unbind physical memory from group for the given device - CUCHECK(cuMulticastUnbind(resources->mcHandle, dev, 0/*mcOffset*/, size)); - - // Release the MC group resources - NCCLCHECK(nvlsGroupDisconnect(comm, resources)); + CUCHECK(cuMulticastUnbind(*mcHandle, dev, 0/*mcOffset*/, size)); return ncclSuccess; } @@ -182,43 +113,18 @@ ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdevi return ncclSuccess; } -ncclResult_t nvlsGroupMapMem(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { - size_t size = resources->size; - CUdeviceptr ptr = 0; - - // Create a VA for the NVLS - CUCHECK(cuMemAddressReserve(&ptr, size, resources->granularity, 0U, 0)); - // Map the VA locally - CUCHECK(cuMemMap(ptr, size, 0, resources->mcHandle, 0)); - resources->mcBuff = (char*)ptr; - INFO(NCCL_NVLS, "NVLS Mapped MC buffer at %p size %zi", resources->mcBuff, size); - - // Having completed the BindMem we can now call SetAccess - // NB: It will block until all ranks have bound to the Group - CUCHECK(cuMemSetAccess((CUdeviceptr)resources->mcBuff, size, &resources->accessDesc, 1)); - - return ncclSuccess; -} - -ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { - size_t size; - CUdeviceptr ptr; - INFO(NCCL_NVLS, "NVLS Unmap mem UC handle 0x%llx(%p) MC handle 0x%llx(%p)", - resources->ucHandle, resources->ucBuff, resources->mcHandle, resources->mcBuff); +ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, size_t size, void* ucptr, CUmemGenericAllocationHandle* ucHandle, void* mcptr, CUmemGenericAllocationHandle* mcHandle) { + INFO(NCCL_NVLS, "NVLS Unmap mem UC handle 0x%llx(%p) MC 
handle 0x%llx(%p)", *ucHandle, ucptr, *mcHandle, mcptr); // Release the UC memory and mapping - ptr = (CUdeviceptr)resources->ucBuff; - size = resources->size; - CUCHECK(cuMemUnmap(ptr, size)); - CUCHECK(cuMemAddressFree(ptr, size)); - CUCHECK(cuMemRelease(resources->ucHandle)); + CUCHECK(cuMemUnmap((CUdeviceptr)ucptr, size)); + CUCHECK(cuMemAddressFree((CUdeviceptr)ucptr, size)); + CUCHECK(cuMemRelease(*ucHandle)); // Release the MC memory and mapping - ptr = (CUdeviceptr)resources->mcBuff; - size = resources->size; - CUCHECK(cuMemUnmap(ptr, size)); - CUCHECK(cuMemAddressFree(ptr, size)); - CUCHECK(cuMemRelease(resources->mcHandle)); + CUCHECK(cuMemUnmap((CUdeviceptr)mcptr, size)); + CUCHECK(cuMemAddressFree((CUdeviceptr)mcptr, size)); + CUCHECK(cuMemRelease(*mcHandle)); return ncclSuccess; } @@ -260,84 +166,222 @@ ncclResult_t ncclNvlsInit(struct ncclComm* comm) { return ncclSuccess; } -ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { - if (comm->nvlsSupport == 0 || comm->nvlsChannels == 0) return ncclSuccess; +ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm) { + ncclResult_t ret = ncclSuccess; + if (comm && comm->nvlsSupport && comm->nNodes > 1) { + for (int c = 0; c < comm->nChannels; c++) { + struct ncclChannel* channel = comm->channels + c; + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 1, &channel->nvls.treeUp, 0), ret, fail); + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->nvls.treeUp, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 0), ret, fail); + } + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_NVLS], 0), ret, fail); + INFO(NCCL_INIT, "Connected NVLS tree"); + } +exit: + return ret; +fail: + goto exit; +} - int nHeads = comm->channels[0].nvls.nHeads; - int headRank = comm->channels[0].nvls.headRank; +static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularity_flags mcOption, const CUmemAccessDesc* desc, size_t* sizePtr, CUmemGenericAllocationHandle* ucHandle, CUmemGenericAllocationHandle* mcHandle, void** ucptr, void** mcptr) { + char shareableHandle[NVLS_HANDLE_SIZE]; + CUmulticastObjectProp mcprop; + CUmemAllocationProp ucprop; + ncclResult_t ret = ncclSuccess; + size_t size = *sizePtr; + size_t originSize = size; + size_t ucgran, mcgran; + + memset(&mcprop, 0, sizeof(CUmulticastObjectProp)); + mcprop.numDevices = comm->localRanks; + mcprop.handleTypes = ncclCuMemHandleType; + mcprop.flags = 0; + mcprop.size = size; + CUCHECKGOTO(cuMulticastGetGranularity(&mcgran, &mcprop, mcOption), ret, fail); + ALIGN_SIZE(size, mcgran); + *sizePtr = mcprop.size = size; + + if (comm->localRank == 0) { + NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, mcHandle, shareableHandle), ret, fail); + NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); + } else { + NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); + NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], mcHandle), ret, fail); + } + + CUCHECKGOTO(cuMulticastAddDevice(*mcHandle, comm->cudaDev), ret, fail); + + memset(&ucprop, 0, sizeof(CUmemAllocationProp)); + ucprop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + ucprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + ucprop.location.id = comm->cudaDev; + 
ucprop.requestedHandleTypes = ncclCuMemHandleType; + CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail); + // Map a VA for UC memory + CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)ucptr, size, ucgran, 0U, 0), ret, fail); + + // Alloc local physical mem for this NVLS group + CUCHECKGOTO(cuMemCreate(ucHandle, size, &ucprop, 0), ret, fail); + CUCHECKGOTO(cuMemMap((CUdeviceptr)*ucptr, size, 0, *ucHandle, 0), ret, fail); + CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*ucptr, size, desc, 1), ret, fail); + CUDACHECKGOTO(cudaMemset(*ucptr, 0, size), ret, fail); + + // Bind physical memory to the Multicast group + // NB: It will block until all ranks have been added to the Group + CUCHECKGOTO(cuMulticastBindMem(*mcHandle, 0/*mcOffset*/, *ucHandle, 0/*memOffset*/, size, 0/*flags*/), ret, fail); + + // Map mc virtual address + CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)mcptr, size, mcgran, 0U, 0), ret, fail); + CUCHECKGOTO(cuMemMap((CUdeviceptr)*mcptr, size, 0, *mcHandle, 0), ret, fail); + CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*mcptr, size, desc, 1), ret, fail); + INFO(NCCL_NVLS, "NVLS rank %d (dev %d) alloc done, ucptr %p ucgran %ld mcptr %p mcgran %ld size %ld (%ld)", comm->rank, comm->cudaDev, *ucptr, ucgran, *mcptr, mcgran, size, originSize); + +exit: + return ret; +fail: + goto exit; +} + +ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm) { + int nHeads = -1; + int headRank = -1; + ncclResult_t res = ncclSuccess; + int nvlsStepSize = -1; + size_t buffSize = 0; + size_t nvlsPerRankSize = 0; + size_t nvlsTotalSize = 0; + struct ncclNvlsSharedRes* resources = NULL; + int nChannels = -1; + + if (comm->nvlsSupport == 0 || comm->nvlsResources->inited) return ncclSuccess; + // initialize after checking comm->nvlsSupport + nHeads = comm->channels[0].nvls.nHeads; + headRank = comm->channels[0].nvls.headRank; + resources = comm->nvlsResources; + nChannels = comm->nvlsResources->nChannels; + nvlsStepSize = comm->nvlsChunkSize; + buffSize = nvlsStepSize * NCCL_STEPS; + nvlsPerRankSize = nChannels * 2 * buffSize; + nvlsTotalSize = nvlsPerRankSize * nHeads; + + INFO(NCCL_INIT | NCCL_NVLS, "NVLS comm %p headRank %d nHeads %d buffSize %zu nvlsPerRankSize %zu nvlsTotalSize %zu", + comm, headRank, nHeads, buffSize, nvlsPerRankSize, nvlsTotalSize); + + NCCLCHECKGOTO(nvlsAllocateMem(comm, CU_MULTICAST_GRANULARITY_RECOMMENDED, &resources->accessDesc, &nvlsTotalSize, &resources->ucBuffHandle, &resources->mcBuffHandle, (void**)&resources->ucBuff, (void**)&resources->mcBuff), res, fail); + resources->buffSize = nvlsTotalSize; + + NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), res, fail); + for (int h = 0; h < nHeads; h++) { + int nvlsPeer = comm->nRanks + 1 + h; + for (int c = 0; c < nChannels; c++) { + struct ncclChannel* channel = comm->channels + c; + struct ncclChannelPeer* peer = channel->peers[nvlsPeer]; + + // Reduce UC -> MC + peer->send[1].conn.buffs[NCCL_PROTO_SIMPLE] = resources->ucBuff + (h * 2 * nChannels + c) * buffSize; + peer->recv[0].conn.buffs[NCCL_PROTO_SIMPLE] = resources->mcBuff + (h * 2 * nChannels + c) * buffSize; + + // Broadcast MC -> UC + peer->recv[1].conn.buffs[NCCL_PROTO_SIMPLE] = resources->ucBuff + ((h * 2 + 1) * nChannels + c) * buffSize; + peer->send[0].conn.buffs[NCCL_PROTO_SIMPLE] = resources->mcBuff + ((h * 2 + 1) * nChannels + c) * buffSize; + + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), 
cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); + } + } + + NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), res, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), res, fail); + // For now, the barrier is a must that guarantees all buffers are mc-mapped before accessing peer's buffer + NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), res, fail); + comm->nvlsResources->inited = true; + +exit: + return res; +fail: + comm->nvlsResources->inited = false; + goto exit; +} + +ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { + ncclResult_t res = ncclSuccess; + size_t typeSize; char shmPath[sizeof("/dev/shm/nccl-XXXXXX")]; uintptr_t *nvlsShmem = NULL; - size_t typeSize; + bool nvlsShare = parent && parent->nvlsSupport && parent->config.splitShare; + int nHeads = comm->channels[0].nvls.nHeads; - CUdevice dev; - CUCHECK(cuCtxGetDevice(&dev)); + if (comm->nvlsSupport == 0 || comm->nvlsChannels == 0) return ncclSuccess; - ncclResult_t res = ncclSuccess; - bool nvlsShare = true; - if (parent && parent->nvlsSupport && parent->config.splitShare && parent->localRanks == comm->localRanks) + if (nvlsShare && parent->channels[0].nvls.nHeads == nHeads) { + for (int ch = 0; ch < nHeads; ++ch) { + bool find = false; + for (int h = 0; h < parent->channels[0].nvls.nHeads; ++h) { + if (comm->nvlsHeads[ch] == parent->nvlsHeads[h]) { + // find the head + find = true; + break; + } + } + if (find == false) { + nvlsShare = false; + goto setup; + } + } nvlsShare = true; - else + } else { nvlsShare = false; + } +setup: + comm->nvlsChunkSize = ncclParamNvlsChunkSize(); if (nvlsShare) { /* reuse NVLS resources */ comm->nvlsChannels = std::min(comm->nvlsChannels, parent->nvlsResources->nChannels); for (int c = 0; c < comm->nChannels; c++) { - NCCLCHECKGOTO(initNvlsChannel(comm, c, parent, true), res, cleanup); + NCCLCHECKGOTO(initNvlsChannel(comm, c, parent, true), res, fail); } comm->nvlsResources = parent->nvlsResources; ncclAtomicRefCountIncrement(&parent->nvlsResources->refCount); } else { + struct ncclNvlsSharedRes* resources = NULL; + int nHeads = comm->channels[0].nvls.nHeads; int nChannels = comm->nChannels; - struct ncclNvlsSharedRes* resources; - - NCCLCHECK(ncclCalloc(&resources, 1)); - comm->nvlsResources = resources; - resources->refCount = 1; - - if (parent && parent->config.splitShare) { + size_t memSize = 16; + size_t creditSize = nChannels * 2 * memSize * nHeads; + int nvlsStepSize = comm->nvlsChunkSize; + + NCCLCHECKGOTO(ncclCalloc(&comm->nvlsResources, 1), res, fail); + comm->nvlsResources->inited = false; + comm->nvlsResources->refCount = 1; + comm->nvlsResources->nChannels = comm->nvlsChannels; + 
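/* Editor's note: only a small "credit" region is allocated at setup time:
 * creditSize = nChannels * 2 * memSize * nHeads, with memSize = 16 bytes per
 * head/channel/direction slot holding the head counter at offset 0 and the tail
 * counter at memSize/2. The full NVLS data buffers (buffSize = nvlsChunkSize *
 * NCCL_STEPS per slot) are not allocated here; they are created on demand by
 * ncclNvlsBufferSetup above, which also sets nvlsResources->inited. */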
resources = comm->nvlsResources; + + if (parent && parent->nvlsSupport && parent->config.splitShare) { /* ranks on other nodes might share the NVLS resources, we need to cap nvlsChannels * to make sure nvlsChannels match for each rank. */ comm->nvlsChannels = std::min(comm->nvlsChannels, parent->nvlsResources->nChannels); } + comm->nvlsResources->nChannels = comm->nvlsChannels; - resources->nChannels = comm->nvlsChannels; - for (int c = 0; c < nChannels; c++) { - NCCLCHECK(initNvlsChannel(comm, c, parent, false)); + for (int c = 0; c < comm->nChannels; c++) { + NCCLCHECKGOTO(initNvlsChannel(comm, c, NULL, false), res, fail); } - int nvlsStepSize = comm->nvlsChunkSize = ncclParamNvlsChunkSize(); - size_t buffSize = nvlsStepSize * NCCL_STEPS; - size_t memSize = NVLS_MEM_ALIGN_SIZE; - size_t nvlsPerRankSize = nChannels * 2 * (buffSize + memSize); - size_t nvlsTotalSize = nvlsPerRankSize * nHeads; - - INFO(NCCL_INIT | NCCL_NVLS, "NVLS comm %p headRank %d nHeads %d buffSize %zi memSize %zi nvlsPerRankSize %zi nvlsTotalSize %zi", - comm, headRank, nHeads, buffSize, memSize, nvlsPerRankSize, nvlsTotalSize); - - char* shareableHandle = resources->shareableHandle; - NCCLCHECKGOTO(nvlsGetProperties(comm, resources, dev, nvlsTotalSize), res, cleanup); - if (comm->localRank == 0) { - NCCLCHECKGOTO(nvlsGroupCreate(comm, &resources->properties, comm->localRank, comm->localRanks, &resources->mcHandle, shareableHandle), res, cleanup); - NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), res, cleanup); - } else { - NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), res, cleanup); - NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], &resources->mcHandle), res, cleanup); - } + memset(&resources->accessDesc, 0, sizeof(resources->accessDesc)); + resources->accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + resources->accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + resources->accessDesc.location.id = comm->cudaDev; + resources->dev = comm->cudaDev; - NCCLCHECKGOTO(nvlsGroupAddDevice(comm, resources), res, cleanup); - NCCLCHECKGOTO(nvlsGroupBindMem(comm, resources), res, cleanup); - if (comm->localRanks > 1) { - // Local intra-node barrier to ensure everyone has bound their memory to the group - NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), res, cleanup); - } - if (comm->MNNVL) { - // MNNVL: Clique wide barrier to ensure everyone has bound their memory to the group - NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->clique.ranks, comm->cliqueRank, comm->clique.size, comm->clique.ranks[0]), res, cleanup); - } - NCCLCHECKGOTO(nvlsGroupMapMem(comm, resources), res, cleanup); + NCCLCHECKGOTO(nvlsAllocateMem(comm, CU_MULTICAST_GRANULARITY_MINIMUM, &resources->accessDesc, &creditSize, &resources->ucCreditHandle, &resources->mcCreditHandle, (void**)&resources->ucCredit, (void**)&resources->mcCredit), res, fail); + resources->creditSize = creditSize; + // Set up head and tail only for now + NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), res, fail); for (int h = 0; h < nHeads; h++) { int nvlsPeer = comm->nRanks + 1 + h; for (int c = 0; c < nChannels; c++) { @@ -346,77 +390,72 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, 
struct ncclComm* parent) { struct ncclChannelPeer* peer = channel->peers[nvlsPeer]; // Reduce UC -> MC - mem = resources->ucBuff + (h * 2 * nChannels + c) * (buffSize + memSize); + mem = resources->ucCredit + (h * 2 * nChannels + c) * memSize; peer->send[1].transportComm = &nvlsTransport.send; - peer->send[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem; - peer->send[1].conn.head = (uint64_t*)(mem + buffSize); - peer->send[1].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2); + peer->send[1].conn.buffs[NCCL_PROTO_SIMPLE] = NULL; + peer->send[1].conn.head = (uint64_t*)mem; + peer->send[1].conn.tail = (uint64_t*)(mem + memSize / 2); peer->send[1].conn.stepSize = nvlsStepSize; - mem = resources->mcBuff + (h * 2 * nChannels + c) * (buffSize + memSize); + mem = resources->mcCredit + (h * 2 * nChannels + c) * memSize; peer->recv[0].transportComm = &nvlsTransport.recv; - peer->recv[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem; - peer->recv[0].conn.head = (uint64_t*)(mem + buffSize); - peer->recv[0].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2); + peer->recv[0].conn.buffs[NCCL_PROTO_SIMPLE] = NULL; + peer->recv[0].conn.head = (uint64_t*)mem; + peer->recv[0].conn.tail = (uint64_t*)(mem + memSize / 2); peer->recv[0].conn.stepSize = nvlsStepSize; peer->recv[0].conn.flags |= NCCL_NVLS_MIN_POLL; // Broadcast MC -> UC - mem = resources->ucBuff + ((h * 2 + 1) * nChannels + c) * (buffSize + memSize); + mem = resources->ucCredit + ((h * 2 + 1) * nChannels + c) * memSize; peer->recv[1].transportComm = &nvlsTransport.recv; - peer->recv[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem; - peer->recv[1].conn.head = (uint64_t*)(mem + buffSize); - peer->recv[1].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2); + peer->recv[1].conn.buffs[NCCL_PROTO_SIMPLE] = NULL; + peer->recv[1].conn.head = (uint64_t*)mem; + peer->recv[1].conn.tail = (uint64_t*)(mem + memSize / 2); peer->recv[1].conn.stepSize = nvlsStepSize; - mem = resources->mcBuff + ((h * 2 + 1) * nChannels + c) * (buffSize + memSize); + mem = resources->mcCredit + ((h * 2 + 1) * nChannels + c) * memSize; peer->send[0].transportComm = &nvlsTransport.send; - peer->send[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem; - peer->send[0].conn.head = (uint64_t*)(mem + buffSize); - peer->send[0].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2); + peer->send[0].conn.buffs[NCCL_PROTO_SIMPLE] = NULL; + peer->send[0].conn.head = (uint64_t*)mem; + peer->send[0].conn.tail = (uint64_t*)(mem + memSize / 2); peer->send[0].conn.stepSize = nvlsStepSize; peer->send[0].conn.flags |= NCCL_NVLS_MIN_POLL; - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup); - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup); - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup); - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup); - - /*INFO(NCCL_INIT|NCCL_NVLS, "Peer %d Channel %d MC buff %p/%p UC Buff %p/%p", - nvlsPeer, c, - resources->mcBuff + 
(h*2*nChannels+c)*(buffSize+memSize), - resources->mcBuff + ((h*2+1)*nChannels+c)*(buffSize+memSize), - resources->ucBuff + (h*2*nChannels+c)*(buffSize+memSize), - resources->ucBuff + ((h*2+1)*nChannels+c)*(buffSize+memSize));*/ + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); } } + NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), res, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), res, fail); } // MNNVL does not support NVLS buffer registration - if (comm->MNNVL) return res; + if (!comm->MNNVL && comm->nvlsResources->nvlsShmemHandle == NULL) { + /* create shared memory for fast NVLS buffer registration */ + typeSize = sizeof(struct localRegData) << 1; - /* create shared memory for fast NVLS buffer registration */ - typeSize = sizeof(struct localRegData) << 1; - - if (comm->localRank == 0) { - shmPath[0] = '\0'; - NCCLCHECKGOTO(ncclShmOpen(shmPath, (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, comm->localRanks - 1, &comm->nvlsResources->nvlsShmemHandle), res, cleanup); - NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shmPath, sizeof(shmPath)), res, cleanup); - } else { - NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shmPath, sizeof(shmPath)), res, cleanup); - NCCLCHECKGOTO(ncclShmOpen(shmPath, (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, -1, &comm->nvlsResources->nvlsShmemHandle), res, cleanup); + if (comm->localRank == 0) { + shmPath[0] = '\0'; + NCCLCHECKGOTO(ncclShmOpen(shmPath, (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, comm->localRanks - 1, &comm->nvlsResources->nvlsShmemHandle), res, fail); + NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shmPath, sizeof(shmPath)), res, fail); + } else { + NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shmPath, sizeof(shmPath)), res, fail); + NCCLCHECKGOTO(ncclShmOpen(shmPath, (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, -1, &comm->nvlsResources->nvlsShmemHandle), res, fail); + } + /* need 2 pools and a shared counter for shmem-based collectives */ + comm->nvlsResources->nvlsShmem.cnt[0] = (size_t*)nvlsShmem; + comm->nvlsResources->nvlsShmem.ptr[0] = (void*)((char*)comm->nvlsResources->nvlsShmem.cnt[0] + sizeof(size_t)); + comm->nvlsResources->nvlsShmem.cnt[1] = (size_t*)((char*)comm->nvlsResources->nvlsShmem.ptr[0] 
+ typeSize * comm->localRanks); + comm->nvlsResources->nvlsShmem.ptr[1] = (void*)((char*)comm->nvlsResources->nvlsShmem.cnt[1] + sizeof(size_t)); + comm->nvlsResources->nvlsShmem.round = 0; + comm->nvlsResources->nvlsShmem.maxTypeSize = typeSize; } - /* need 2 pools and a shared counter for shmem-based collectives */ - comm->nvlsResources->nvlsShmem.cnt[0] = (size_t*)nvlsShmem; - comm->nvlsResources->nvlsShmem.ptr[0] = (void*)((char*)comm->nvlsResources->nvlsShmem.cnt[0] + sizeof(size_t)); - comm->nvlsResources->nvlsShmem.cnt[1] = (size_t*)((char*)comm->nvlsResources->nvlsShmem.ptr[0] + typeSize * comm->localRanks); - comm->nvlsResources->nvlsShmem.ptr[1] = (void*)((char*)comm->nvlsResources->nvlsShmem.cnt[1] + sizeof(size_t)); - comm->nvlsResources->nvlsShmem.round = 0; - comm->nvlsResources->nvlsShmem.maxTypeSize = typeSize; +exit: return res; - -cleanup: +fail: comm->nvlsSupport = 0; - return res; + goto exit; } ncclResult_t ncclNvlsFree(struct ncclComm* comm) { @@ -424,9 +463,18 @@ ncclResult_t ncclNvlsFree(struct ncclComm* comm) { if (resources == NULL) return ncclSuccess; if (ncclAtomicRefCountDecrement(&resources->refCount) == 0) { - NCCLCHECK(ncclShmClose(resources->nvlsShmemHandle)); - NCCLCHECK(nvlsGroupUnbind(comm, resources)); - NCCLCHECK(nvlsGroupUnmapMem(comm, resources)); + if (!comm->MNNVL && resources->nvlsShmemHandle) + NCCLCHECK(ncclShmClose(resources->nvlsShmemHandle)); + + if (resources->ucCredit && resources->mcCredit) { + NCCLCHECK(nvlsGroupUnbind(comm, resources->creditSize, &resources->mcCreditHandle)); + NCCLCHECK(nvlsGroupUnmapMem(comm, resources->creditSize, resources->ucCredit, &resources->ucCreditHandle, resources->mcCredit, &resources->mcCreditHandle)); + } + + if (comm->nvlsResources->inited) { + NCCLCHECK(nvlsGroupUnbind(comm, resources->buffSize, &resources->mcBuffHandle)); + NCCLCHECK(nvlsGroupUnmapMem(comm, resources->buffSize, resources->ucBuff, &resources->ucBuffHandle, resources->mcBuff, &resources->mcBuffHandle)); + } free(resources); comm->nvlsResources = NULL; } @@ -437,14 +485,15 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t ncclResult_t ret = ncclSuccess; struct ncclReg *regRecord = NULL; CUdeviceptr regPtr = 0; - CUmulticastObjectProp prop; + CUmulticastObjectProp mcprop; + CUmemAllocationProp ucprop; char shareableHandle[NVLS_HANDLE_SIZE]; CUmemGenericAllocationHandle mcHandle; - size_t granularity; size_t minSize = SIZE_MAX; bool localRegBufUsed = false; struct localRegData* regData = NULL; cudaPointerAttributes attr; + size_t ucgran, mcgran; NCCLCHECKGOTO(ncclCalloc(®Data, comm->localRanks), ret, fail); @@ -454,17 +503,28 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t CUDACHECK(cudaPointerGetAttributes(&attr, (void*)regRecord->addr)); if (attr.type == cudaMemoryTypeDevice) { size_t regSize = regRecord->pages * comm->regCache.pageSize; - prop = comm->nvlsResources->properties; - prop.size = regSize; - CUCHECK(cuMulticastGetGranularity(&granularity, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED)); + memset(&mcprop, 0, sizeof(CUmulticastObjectProp)); + mcprop.numDevices = comm->localRanks; + mcprop.handleTypes = ncclCuMemHandleType; + mcprop.flags = 0; + mcprop.size = regSize; + CUCHECK(cuMulticastGetGranularity(&mcgran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED)); + + memset(&ucprop, 0, sizeof(CUmemAllocationProp)); + ucprop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + ucprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + ucprop.location.id = comm->cudaDev; + 
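/* Editor's note: user-buffer registration is only attempted when the buffer lines
 * up with both granularities queried here: the base address must be a multiple of
 * the UC allocation granularity (ucgran) and the registered size a multiple of the
 * multicast granularity (mcgran); see the NVLS_REG_POSSIBLE check below. Buffers
 * that do not meet these alignment requirements are simply not NVLS-registered. */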
ucprop.requestedHandleTypes = ncclCuMemHandleType; + CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail); + CUCHECK(cuMemGetAddressRange((CUdeviceptr*)®Record->baseAddr, ®Record->baseSize, (CUdeviceptr)regRecord->addr)); - if (regSize % granularity == 0) { + if (regSize % mcgran == 0) { regRecord->regSize = regSize; } else { regRecord->regSize = regRecord->baseSize - (regRecord->addr - regRecord->baseAddr); } - if (regRecord->addr % comm->nvlsResources->ucGran == 0 && regRecord->regSize % granularity == 0) { + if (regRecord->addr % ucgran == 0 && regRecord->regSize % mcgran == 0) { regRecord->state |= NVLS_REG_POSSIBLE; memcpy(®Data[comm->localRank].reg, regRecord, sizeof(struct ncclReg)); regData[comm->localRank].offset = userBuff - regRecord->addr; @@ -489,11 +549,10 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t } /* start registration */ - memcpy(&prop, &comm->nvlsResources->properties, sizeof(CUmulticastObjectProp)); - prop.size = minSize; - CUCHECKGOTO(cuMulticastGetGranularity(&granularity, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail); + mcprop.size = minSize; + CUCHECKGOTO(cuMulticastGetGranularity(&mcgran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail); if (comm->localRank == 0) { - NCCLCHECKGOTO(nvlsGroupCreate(comm, &prop, comm->localRank, comm->localRanks, &mcHandle, shareableHandle), ret, fail); + NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &mcHandle, shareableHandle), ret, fail); NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); } else { NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); @@ -504,7 +563,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)regRecord->addr, minSize, 0), ret, fail); // Create a VA for the NVLS - CUCHECKGOTO(cuMemAddressReserve(®Ptr, minSize, granularity, 0U, 0), ret, fail); + CUCHECKGOTO(cuMemAddressReserve(®Ptr, minSize, mcgran, 0U, 0), ret, fail); // Map the VA locally CUCHECKGOTO(cuMemMap(regPtr, minSize, 0, mcHandle, 0), ret, fail); CUCHECKGOTO(cuMemSetAccess(regPtr, minSize, &comm->nvlsResources->accessDesc, 1), ret, fail); @@ -639,14 +698,35 @@ ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *send goto exit; } -ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { +struct ncclNvlsCleanupCallback { + struct ncclCommCallback base; + CUmemGenericAllocationHandle mcHandle; + CUdeviceptr ptr; + int dev; + size_t size; +}; + +static ncclResult_t cleanupNvls(struct ncclComm* comm, struct ncclCommCallback* cb) { + struct ncclNvlsCleanupCallback* obj = (struct ncclNvlsCleanupCallback*)cb; + NCCLCHECK(ncclNvlsDeregBuffer(&obj->mcHandle, obj->ptr, obj->dev, obj->size)); + INFO(NCCL_NVLS, "rank %d - deregistered buffer %p on device %d, size %ld", comm->rank, (void*)obj->ptr, obj->dev, obj->size); + free(obj); + return ncclSuccess; +} + +ncclResult_t ncclNvlsGraphRegisterBuffer( + struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t 
recvbuffSize,
+  bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv,
+  struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueEltsAdded
+  ) {
   ncclResult_t ret = ncclSuccess;
   bool localRegBufUsed = false;
-  struct ncclNvlsMcHandleList* sendRecord = NULL;
-  struct ncclNvlsMcHandleList* recvRecord = NULL;
+  struct ncclNvlsCleanupCallback* sendRecord = NULL;
+  struct ncclNvlsCleanupCallback* recvRecord = NULL;
   CUdeviceptr regSendPtr = 0;
   CUdeviceptr regRecvPtr = 0;
-  CUmulticastObjectProp prop;
+  CUmulticastObjectProp mcprop;
+  CUmemAllocationProp ucprop;
   char shareableHandle[NVLS_HANDLE_SIZE];
   CUmemGenericAllocationHandle sendMcHandle, recvMcHandle;
   size_t sendGran = 0, recvGran = 0;
@@ -656,6 +736,7 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKerne
   const void *baseRecv = NULL;
   size_t baseSendSize = 1;
   size_t baseRecvSize = 1;
+  size_t ucgran;
 
   *outRegBufUsed = false;
   NCCLCHECKGOTO(ncclCalloc(&regBufFlags, comm->localRanks), ret, fail);
@@ -669,16 +750,27 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKerne
   if (recvbuff != NULL)
     CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &baseRecvSize, (CUdeviceptr)recvbuff), ret, fail);
 
-  localRegBufUsed = ((uint64_t)baseSend % comm->nvlsResources->ucGran != 0 || (uint64_t)baseRecv % comm->nvlsResources->ucGran != 0) ? false : true;
+  memset(&ucprop, 0, sizeof(CUmemAllocationProp));
+  ucprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+  ucprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+  ucprop.location.id = comm->cudaDev;
+  ucprop.requestedHandleTypes = ncclCuMemHandleType;
+  CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail);
+
+  localRegBufUsed = ((uint64_t)baseSend % ucgran != 0 || (uint64_t)baseRecv % ucgran != 0) ? false : true;
   regBufFlags[comm->localRank] = localRegBufUsed;
   NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, regBufFlags, sizeof(bool)), ret, fail);
   for (int i = 0; i < comm->localRanks; ++i)
     if (regBufFlags[i] == false) goto fail;
 
-  memcpy(&prop, &comm->nvlsResources->properties, sizeof(CUmulticastObjectProp));
+  memset(&mcprop, 0, sizeof(CUmulticastObjectProp));
+  mcprop.numDevices = comm->localRanks;
+  mcprop.handleTypes = ncclCuMemHandleType;
+  mcprop.flags = 0;
+
   if (sendbuff != NULL) {
-    prop.size = baseSendSize;
-    CUCHECKGOTO(cuMulticastGetGranularity(&sendGran, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
+    mcprop.size = baseSendSize;
+    CUCHECKGOTO(cuMulticastGetGranularity(&sendGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
 
     /* check send buffer offset and size */
     rdata[comm->localRank].offset = (uintptr_t)sendbuff - (uintptr_t)baseSend;
@@ -691,11 +783,11 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKerne
     }
 
     if (baseSendSize % sendGran != 0) goto fail;
-    prop.size = baseSendSize;
+    mcprop.size = baseSendSize;
 
     /* register sendbuff */
     if (comm->localRank == 0) {
-      NCCLCHECKGOTO(nvlsGroupCreate(comm, &prop, comm->localRank, comm->localRanks, &sendMcHandle, shareableHandle), ret, fail);
+      NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &sendMcHandle, shareableHandle), ret, fail);
       NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
     } else {
       NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
@@ -711,7 +803,8 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKerne
       CUCHECKGOTO(cuMemMap(regSendPtr, baseSendSize, 0, sendMcHandle, 0), ret, fail);
       CUCHECKGOTO(cuMemSetAccess(regSendPtr, baseSendSize, &comm->nvlsResources->accessDesc, 1), ret, fail);
 
-      sendRecord = ncclMemoryPoolAlloc(&comm->memPool_ncclNvlsHandleList, &comm->memPermanent);
+      sendRecord = (struct ncclNvlsCleanupCallback*)malloc(sizeof(struct ncclNvlsCleanupCallback));
+      sendRecord->base.fn = cleanupNvls;
       sendRecord->mcHandle = sendMcHandle;
       sendRecord->ptr = regSendPtr;
       sendRecord->dev = comm->nvlsResources->dev;
@@ -719,8 +812,8 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKerne
   }
 
   if (recvbuff != NULL) {
-    prop.size = baseRecvSize;
-    CUCHECKGOTO(cuMulticastGetGranularity(&recvGran, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
+    mcprop.size = baseRecvSize;
+    CUCHECKGOTO(cuMulticastGetGranularity(&recvGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
 
     rdata[comm->localRank].offset = (uintptr_t)recvbuff - (uintptr_t)baseRecv;
     rdata[comm->localRank].size = baseRecvSize;
@@ -732,9 +825,9 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKerne
     }
 
     if (baseRecvSize % recvGran != 0) goto fail;
-    prop.size = baseRecvSize;
+    mcprop.size = baseRecvSize;
     if (comm->localRank == 0) {
-      NCCLCHECKGOTO(nvlsGroupCreate(comm, &prop, comm->localRank, comm->localRanks, &recvMcHandle, shareableHandle), ret, fail);
+      NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &recvMcHandle, shareableHandle), ret, fail);
       NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
     } else {
       NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
@@ -750,7 +843,8 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKerne
       CUCHECKGOTO(cuMemMap(regRecvPtr, baseRecvSize, 0, recvMcHandle, 0), ret, fail);
       CUCHECKGOTO(cuMemSetAccess(regRecvPtr, baseRecvSize, &comm->nvlsResources->accessDesc, 1), ret, fail);
 
-      recvRecord = ncclMemoryPoolAlloc(&comm->memPool_ncclNvlsHandleList, &comm->memPermanent);
+      recvRecord = (struct ncclNvlsCleanupCallback*)malloc(sizeof(struct ncclNvlsCleanupCallback));
+      recvRecord->base.fn = cleanupNvls;
       recvRecord->mcHandle = recvMcHandle;
       recvRecord->ptr = regRecvPtr;
       recvRecord->dev = comm->nvlsResources->dev;
@@ -764,22 +858,24 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKerne
   if (localRegBufUsed == false) {
     if (sendRecord) {
       ncclNvlsDeregBuffer(&sendRecord->mcHandle, sendRecord->ptr, sendRecord->dev, sendRecord->size);
-      ncclMemoryPoolFree(&comm->memPool_ncclNvlsHandleList, sendRecord);
+      free(sendRecord);
     }
 
     if (recvRecord) {
       ncclNvlsDeregBuffer(&recvRecord->mcHandle, recvRecord->ptr, recvRecord->dev, recvRecord->size);
-      ncclMemoryPoolFree(&comm->memPool_ncclNvlsHandleList, recvRecord);
+      free(recvRecord);
    }
   } else {
     if (sendRecord) {
       *outRegBufSend = (void*)((uintptr_t)regSendPtr + (uintptr_t)sendbuff - (uintptr_t)baseSend);
-      ncclIntruQueueEnqueue(&plan->nvlsMcHandleQueue, sendRecord);
+      ncclIntruQueueEnqueue(cleanupQueue, &sendRecord->base);
+      *nCleanupQueueEltsAdded += 1;
     }
 
     if (recvRecord) {
       *outRegBufRecv = (void*)((uintptr_t)regRecvPtr + (uintptr_t)recvbuff - (uintptr_t)baseRecv);
-      ncclIntruQueueEnqueue(&plan->nvlsMcHandleQueue, recvRecord);
+      ncclIntruQueueEnqueue(cleanupQueue, &recvRecord->base);
+      *nCleanupQueueEltsAdded += 1;
     }
 
     INFO(NCCL_NVLS, "rank %d successfully graph-registered sendbuff %p, recvbuff %p, sendbuff size %ld (register size %ld, sendGran %ld), recvbuff size %ld (register size %ld, recvGran %ld), reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, baseSendSize, sendGran, recvbuffSize, baseRecvSize, recvGran, (void*)regSendPtr, (void*)regRecvPtr);
@@ -806,6 +902,10 @@ ncclResult_t ncclNvlsInit(struct ncclComm* comm) {
   return ncclSuccess;
 }
 
+ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm) {
+  return ncclSuccess;
+}
+
 ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
   return ncclSuccess;
 }
@@ -814,7 +914,15 @@ ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
   return ncclSuccess;
 }
 
-ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) {
+ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm) {
+  return ncclSuccess;
+}
+
+ncclResult_t ncclNvlsGraphRegisterBuffer(
+  struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize,
+  bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv,
+  struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueEltsAdded
+  ) {
   *outRegBufUsed = false;
   return ncclSuccess;
 }
diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc
index 17a5d69ee..90a714b40 100644
--- a/src/transport/p2p.cc
+++ b/src/transport/p2p.cc
@@ -9,6 +9,7 @@
 #include "utils.h"
 #include "shm.h"
#include "p2p.h" +#include "transport.h" enum p2pType { P2P_DIRECT, P2P_INTERMEDIATE, P2P_IPC, P2P_CUMEM }; @@ -223,7 +224,7 @@ ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, v CUDACHECK(res); } } - INFO(NCCL_P2P|NCCL_ALLOC, "Allocated shareable buffer %p size %zi ipcDesc %p", *ptr, size, ipcDesc); + INFO(NCCL_P2P|NCCL_ALLOC, "Allocated shareable buffer %p size %zu ipcDesc %p", *ptr, size, ipcDesc); return ncclSuccess; } @@ -256,7 +257,7 @@ ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, siz CUCHECK(cuMemAddressReserve(&dptr, size, /* alignment */ 0, /* addr */ 0, /* flags */ 0)); CUCHECK(cuMemMap(dptr, size, /* offset */ 0, handle, /* flags */ 0)); - TRACE(NCCL_P2P, "Imported shareable buffer size %zi handle 0x%llx dptr %p", size, handle, (void*)dptr); + TRACE(NCCL_P2P, "Imported shareable buffer size %zu handle 0x%llx dptr %p", size, handle, (void*)dptr); // Allow access by the local GPU CUmemAccessDesc accessDesc = {}; @@ -264,7 +265,7 @@ ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, siz accessDesc.location.id = comm->cudaDev; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; CUCHECK(cuMemSetAccess(dptr, size, &accessDesc, 1)); - TRACE(NCCL_P2P, "Set Access for %p size %zi on dev %d", (void*)dptr, size, accessDesc.location.id); + TRACE(NCCL_P2P, "Set Access for %p size %zu on dev %d", (void*)dptr, size, accessDesc.location.id); *devMemPtr = (void *)dptr; #else @@ -275,7 +276,7 @@ ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, siz CUDACHECK(cudaIpcOpenMemHandle(devMemPtr, ipcDesc->devIpc, cudaIpcMemLazyEnablePeerAccess)); } - INFO(NCCL_P2P, "Imported shareable buffer device %d size %zi ptr %p", comm->cudaDev, size, *devMemPtr); + INFO(NCCL_P2P, "Imported shareable buffer device %d size %zu ptr %p", comm->cudaDev, size, *devMemPtr); return ncclSuccess; } @@ -318,7 +319,7 @@ static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* pro accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; accessDesc.location.id = myInfo->cudaDev; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - INFO(NCCL_P2P, "Set Access for buffer %p size %zi on dev %d", p2pBuff->directPtr, p2pBuff->size, peerInfo->cudaDev); + INFO(NCCL_P2P, "Set Access for buffer %p size %zu on dev %d", p2pBuff->directPtr, p2pBuff->size, peerInfo->cudaDev); CUCHECK(cuMemSetAccess((CUdeviceptr) p2pBuff->directPtr, p2pBuff->size, &accessDesc, 1)); } #endif diff --git a/src/transport/shm.cc b/src/transport/shm.cc index 9f2f2fc26..7fc6251b6 100644 --- a/src/transport/shm.cc +++ b/src/transport/shm.cc @@ -6,6 +6,7 @@ #include "comm.h" #include "shm.h" +#include "transport.h" struct shmConnectInfo { char shmName[7];