2.22.3-1

Rework core for NVIDIA Trusted Computing * Compress work structs so that they are shared between channels * Utilize the full amount of kernel argument space permitted (4k) before resorting to work fifo. * Rework the task preprocessing phase. * Use a separate abortDevFlag which is kept in sync with abortFlag using cudaMemcpy operations. * Rename src/include/align.h to src/include/bitops.h Add lazy connection establishment for collective operations * Move buffer allocation and connection establishment to the first collective operation using that algorithm. * Accelerate init time and reduce memory usage. * Avoid allocating NVLS buffers if all calls are registered. * Compute algo/proto in ncclLaunchCollTasksInfo early on. * Connect peers in ncclCollPreconnectFunc if not connected already. * Also move shared buffer creation to the first send/recv call. Accelerate intra-node NVLink detection * Make each rank only detect NVLinks attached to its GPU. * Fuse XMLs to reconstruct the full NVLink topology Add init profiling to report time spent in different init phases. * Report timings of bootstrap, allgather, search, connect, etc. * Add new "PROFILE" category for NCCL_DEBUG_SUBSYS. Add support for PCI p2p on split PCI switches * Detect split PCI switches through a kernel module exposing switch information. * Update the topology XML and graph to add those inter-switch connections. Add cost estimation API * Add a new ncclGroupEndSimulate primitive to return the estimated time a group would take. Net/IB: Add separate traffic class for fifo messages * Add NCCL_IB_FIFO_TC to control the traffic class of fifo messages independently from NCCL_IB_TC. Merges PR #1194 Net/IB: Add support for IB router * Use flid instead of lid if subnets do not match * Warn if flid is 0 Optimizations and fixes for device network offload (unpack) * Double the default number of channels * Cache netDeviceType * Fix save/increment head logic to enable Tree support. 
Support ncclGroupStart/End for ncclCommAbort/Destroy * Allow Abort/Destroy to be called within a group when managing multiple GPUs with a single process. Improve Tuner API * Provide to the plugin the original cost table so that the plugin can leave unknown or disabled algo/proto combinations untouched. * Remove nvlsSupport and collnetSupport. Do not print version to stdout when using a debug file * Also print version from all processes with INFO debug level. Fixes issue #1271 Fix clang warnings in NVTX headers * Update NVTX headers to the latest version Fixes issue #1270 Disable port fusion in heterogeneous systems * Do not fuse ports if a mix of multi-port and single port are detected. Fix NVLS graphs search for dual NICs. * Fix NVLS graph search when we have more than one NIC per GPU. Fix crash with collnetDirect * Add separate graph search for collnetDirect, testing alltoall paths and working similarly to the NVLS search. Fix hang when nodes have different CPU types * Add the CPU type to the rank peer info. * Align all ranks on the CPU type after the first allgather. * Only use the aligned CPU type for all tuning operations. Fixes issue #1136 Fixes issue #1184 Fix performance of registered send/recv operations * Allow for single full size operations * Add INFO to confirm the registration of send/recv buffers. Move all sync ops to finalize stage * Ensure ncclCommDestroy is non-blocking if ncclCommFinalize has been called. Improve error reporting during SHM segment creation Improve support of various compilers Merges PR #1177 Merges PR #1228 Allow net and tuner plugins to be statically linked * Search for ncclNet or ncclTuner symbols in the main binary. Merges PR #979 Plugin examples includes cleanup * Harmonize err.h and common.h usage. * Add mixed plugin with both net and tuner.
NVIDIA · Jun 19, 2024 · 178b6b7 · 178b6b7
1 parent 529ee69
commit 178b6b7
Show file tree

Hide file tree

Showing 115 changed files with 8,595 additions and 4,326 deletions.
diff --git a/ext-net/example/nccl/common.h b/ext-net/example/nccl/common.h
@@ -0,0 +1,15 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef COMMON_H_
+#define COMMON_H_
+
+// Log levels understood by the logger callback; numeric values are fixed.
+typedef enum {
+  NCCL_LOG_NONE    = 0,
+  NCCL_LOG_VERSION = 1,
+  NCCL_LOG_WARN    = 2,
+  NCCL_LOG_INFO    = 3,
+  NCCL_LOG_ABORT   = 4,
+  NCCL_LOG_TRACE   = 5
+} ncclDebugLogLevel;
+// Debug subsystem identifiers. Every named subsystem occupies its own bit,
+// and NCCL_ALL (~0) has every bit set.
+typedef enum {
+  NCCL_INIT      = 1,
+  NCCL_COLL      = 2,
+  NCCL_P2P       = 4,
+  NCCL_SHM       = 8,
+  NCCL_NET       = 16,
+  NCCL_GRAPH     = 32,
+  NCCL_TUNING    = 64,
+  NCCL_ENV       = 128,
+  NCCL_ALLOC     = 256,
+  NCCL_CALL      = 512,
+  NCCL_PROXY     = 1024,
+  NCCL_NVLS      = 2048,
+  NCCL_BOOTSTRAP = 4096,
+  NCCL_REG       = 8192,
+  NCCL_ALL       = ~0
+} ncclDebugLogSubSys;
+
+// Logger callback type: takes a log level, a flags word, the file name and line
+// number to report, and a format string followed by variadic arguments.
+// NOTE(review): flags is presumably a mask of ncclDebugLogSubSys bits and fmt a
+// printf-style format - confirm against the logger implementation.
+typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
+
+#endif
diff --git a/ext-net/example/nccl/err.h b/ext-net/example/nccl/err.h
@@ -11,6 +11,7 @@ typedef enum { ncclSuccess = 0,
  ncclSystemError = 2,
  ncclInternalError = 3,
  ncclInvalidArgument = 4,
+ ncclInvalidUsage = 5,
  ncclRemoteError = 6 } ncclResult_t;
 
 #endif
diff --git a/ext-net/example/nccl/net.h b/ext-net/example/nccl/net.h
@@ -8,6 +8,7 @@
 #include <stdint.h>
 #include <stdlib.h>
 
+#include "common.h"
 #include "err.h"
 
 #define NCCL_NET_HANDLE_MAXSIZE 128
@@ -19,11 +20,6 @@
 // Maximum number of requests per comm object
 #define NCCL_NET_MAX_REQUESTS 32
 
-typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
-typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys;
-
-typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
-
 #include "net_v8.h"
 #include "net_v7.h"
 #include "net_v6.h"

diff --git a/ext-net/example/nccl/types.h b/ext-net/example/nccl/types.h
@@ -2,8 +2,8 @@
  * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
  */
 
-#ifndef NCCL_ERR_H_
-#define NCCL_ERR_H_
+#ifndef NCCL_TYPES_H_
+#define NCCL_TYPES_H_
 
 /* Data types */
 typedef enum { ncclInt8 = 0, ncclChar = 0,

diff --git a/ext-tuner/example/nccl/common.h b/ext-tuner/example/nccl/common.h
@@ -0,0 +1,15 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef COMMON_H_
+#define COMMON_H_
+
+// Log levels understood by the logger callback; numeric values are fixed.
+typedef enum {
+  NCCL_LOG_NONE    = 0,
+  NCCL_LOG_VERSION = 1,
+  NCCL_LOG_WARN    = 2,
+  NCCL_LOG_INFO    = 3,
+  NCCL_LOG_ABORT   = 4,
+  NCCL_LOG_TRACE   = 5
+} ncclDebugLogLevel;
+// Debug subsystem identifiers. Every named subsystem occupies its own bit,
+// and NCCL_ALL (~0) has every bit set.
+typedef enum {
+  NCCL_INIT      = 1,
+  NCCL_COLL      = 2,
+  NCCL_P2P       = 4,
+  NCCL_SHM       = 8,
+  NCCL_NET       = 16,
+  NCCL_GRAPH     = 32,
+  NCCL_TUNING    = 64,
+  NCCL_ENV       = 128,
+  NCCL_ALLOC     = 256,
+  NCCL_CALL      = 512,
+  NCCL_PROXY     = 1024,
+  NCCL_NVLS      = 2048,
+  NCCL_BOOTSTRAP = 4096,
+  NCCL_REG       = 8192,
+  NCCL_ALL       = ~0
+} ncclDebugLogSubSys;
+
+// Logger callback type: takes a log level, a flags word, the file name and line
+// number to report, and a format string followed by variadic arguments.
+// NOTE(review): flags is presumably a mask of ncclDebugLogSubSys bits and fmt a
+// printf-style format - confirm against the logger implementation.
+typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
+
+#endif
diff --git a/ext-tuner/example/nccl/err.h b/ext-tuner/example/nccl/err.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NCCL_ERR_H_
+#define NCCL_ERR_H_
+
+/* Error type for plugins */
+typedef enum { ncclSuccess = 0,
+ ncclUnhandledCudaError = 1,
+ ncclSystemError = 2,
+ ncclInternalError = 3,
+ ncclInvalidArgument = 4,
+ ncclInvalidUsage = 5,
+ ncclRemoteError = 6 } ncclResult_t;
+
+#endif
diff --git a/ext-tuner/example/nccl/tuner.h b/ext-tuner/example/nccl/tuner.h
@@ -8,15 +8,24 @@
 #ifndef NCCL_TUNER_H_
 #define NCCL_TUNER_H_
 
-#include "nccl.h"
+#include <stdint.h>
+#include <stdlib.h>
 
-typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
-typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys;
-
-typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
+#include "common.h"
+#include "err.h"
 
 #define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
-typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t;
+// Function identifiers passed to the tuner (see the collType argument of getCollInfo).
+// Each value is written out explicitly. Entries 0-4 are the five collectives counted
+// by NCCL_NUM_FUNCTIONS; the send/recv entries follow, and ncclNumFuncs (8) equals
+// the number of entries that precede it.
+typedef enum {
+ ncclFuncBroadcast = 0,
+ ncclFuncReduce = 1,
+ ncclFuncAllGather = 2,
+ ncclFuncReduceScatter = 3,
+ ncclFuncAllReduce = 4,
+ ncclFuncSendRecv = 5,
+ ncclFuncSend = 6,
+ ncclFuncRecv = 7,
+ ncclNumFuncs = 8
+} ncclFunc_t;
 
 #define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
 #define NCCL_ALGO_UNDEF -1
@@ -33,6 +42,8 @@ typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncRed
 #define NCCL_PROTO_LL128 1
 #define NCCL_PROTO_SIMPLE 2
 
+#define NCCL_ALGO_PROTO_IGNORE -1.0
+
 // API to be implemented by external tuner
 typedef struct {
  // Name of the tuner
@@ -52,31 +63,33 @@ typedef struct {
  // - context: tuner context object
  // - collType: collective type , e.g., allreduce, allgather…
  // - nBytes: collective size in bytes
- // - collNetSupport: whether collnet supports this type
- // - nvlsSupport: whether nvlink sharp supports this time
  // - numPipeOps: number of operations in the group
+ // - numAlgo: number of algorithms in collCostTable
+ // - numProto: number of protocols in collCostTable
  //
  // Outputs:
- // - algorithm: selected algorithm to be used for the given collective
- // - protocol: selected protocol to be used for the given collective
  // - nChannels: number of channels (hence SMs) to be used.
  //
+ // InOut:
+ // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
+ // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
+ //
  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
  // default tuning for the given collective.
  // Also, the plugin is allowed to not set any output, or set only the
  // algorithm and protocol, but not only the algorithm or only the protocol.
  // Unset fields will be set automatically by NCCL.
  ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
- int collNetSupport, int nvlsSupport, int numPipeOps,
- int *algorithm, int *protocol, int* nChannels);
+ int numPipeOps, float** collCostTable, int numAlgo, int numProto,
+ int* nChannels);
 
  // Terminates the plugin and cleans up any resources that the plugin allocated.
  // context: tuner context object
  ncclResult_t (*destroy)(void* context);
-} ncclTuner_v2_t;
+} ncclTuner_v3_t;
 
-typedef ncclTuner_v2_t ncclTuner_t;
+typedef ncclTuner_v3_t ncclTuner_t;
 
-#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v2"
+#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3"
 
 #endif
diff --git a/ext-tuner/example/plugin.c b/ext-tuner/example/plugin.c
@@ -11,14 +11,21 @@
 __hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { return ncclSuccess; }
 
 __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
- int collNetSupport, int nvlsSupport, int numPipeOps,
- int *algorithm, int *protocol, int* nChannels) { *algorithm = NCCL_ALGO_RING; *protocol = NCCL_PROTO_SIMPLE; return ncclSuccess; }
+ int numPipeOps, float** collCostTable, int numAlgo, int numProto,
+ int* nChannels) {
+  // Example tuning policy: steer NCCL towards RING/SIMPLE by giving that entry a
+  // zero cost in the core-generated cost table, then request a single channel.
+  // The updated table is evaluated by NCCL to pick the best algo/proto combo.
+  //
+  // numAlgo and numProto describe the dimensions of collCostTable, so the entry is
+  // only touched when RING/SIMPLE lies inside the table. Entries the core marked
+  // with NCCL_ALGO_PROTO_IGNORE are left untouched.
+  // NOTE(review): the double subscript assumes collCostTable is an array of row
+  // pointers - confirm against how NCCL core passes the table to the plugin.
+  if (NCCL_ALGO_RING < numAlgo && NCCL_PROTO_SIMPLE < numProto &&
+      collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) {
+    collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0;
+  }
+  *nChannels = 1;
+  return ncclSuccess;
+}
 
 __hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; }
 
 #define PLUGIN_NAME "Example"
 
-const ncclTuner_v2_t ncclTunerPlugin_v2 = {
+const ncclTuner_v3_t ncclTunerPlugin_v3 = {
  .name = PLUGIN_NAME,
  .init = pluginInit,
  .getCollInfo = pluginGetCollInfo,

diff --git a/makefiles/version.mk b/makefiles/version.mk
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR := 2
-NCCL_MINOR := 21
-NCCL_PATCH := 5
+NCCL_MINOR := 22
+NCCL_PATCH := 3
 NCCL_SUFFIX :=
 PKG_REVISION := 1
diff --git a/src/bootstrap.cc b/src/bootstrap.cc
@@ -201,7 +201,6 @@ ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFrom
 
 ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) {
  memset(handle, 0, sizeof(ncclBootstrapHandle));
- NCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic)));
 
  const char* env = ncclGetEnv("NCCL_COMM_ID");
  if (env) {
@@ -210,7 +209,9 @@ ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) {
  WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
  return ncclInvalidArgument;
  }
+ handle->magic = NCCL_MAGIC;
  } else {
+ NCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic)));
  memcpy(&handle->addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
  NCCLCHECK(bootstrapCreateRoot(handle, false));
  }
@@ -626,7 +627,7 @@ ncclResult_t bootstrapClose(void* commState) {
  struct bootstrapState* state = (struct bootstrapState*)commState;
  if (state->unexpectedConnections != NULL) {
  unexpectedFree(state);
- if (__atomic_load_n(state->abortFlag, __ATOMIC_RELAXED) == 0) {
+ if (__atomic_load_n(state->abortFlag, __ATOMIC_ACQUIRE) == 0) {
  WARN("Unexpected connections are not empty");
  return ncclInternalError;
  }

diff --git a/src/channel.cc b/src/channel.cc
@@ -7,16 +7,17 @@
 #include "channel.h"
 #include "param.h"
 #include "gdrwrap.h"
+#include "transport.h"
 
 ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
  struct ncclChannel* channel = &comm->channels[channelId];
  if (channel->id != -1) return ncclSuccess;
 
  int nRanks = comm->nRanks;
- int nvlsRanks = comm->MNNVL ? comm->clique.size : comm->localRanks;
+ int nvlsRanks = comm->localRanks;
  int nPeers = nRanks + 1 /* Collnet */ + nvlsRanks /* NVLS */;
  channel->id = channelId;
- channel->workFifoSent = 0;
+ channel->workFifoProduced = 0;
 
  struct ncclSharedResources* sharedRes = comm->sharedRes;
 
@@ -74,7 +75,8 @@ ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclCo
 
  NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream));
 
- int nvlsRanks = comm->MNNVL ? comm->clique.size : comm->localRanks;
+ int nvlsRanks = comm->localRanks;
+
  if (share) {
  channel->nvlsPeers = parent->channels[channelId].nvlsPeers;
  channel->nvlsDevPeers = parent->channels[channelId].nvlsDevPeers;

diff --git a/src/collectives.cc b/src/collectives.cc
@@ -9,6 +9,69 @@
 #include "enqueue.h"
 #include "nccl.h"
 
+// Maps a function identifier to its display name.
+// Identifiers without a dedicated name yield "Invalid".
+const char* ncclFuncToString(ncclFunc_t fn) {
+  const char* label = "Invalid";
+  switch (fn) {
+    case ncclFuncBroadcast:     label = "Broadcast";     break;
+    case ncclFuncReduce:        label = "Reduce";        break;
+    case ncclFuncAllGather:     label = "AllGather";     break;
+    case ncclFuncReduceScatter: label = "ReduceScatter"; break;
+    case ncclFuncAllReduce:     label = "AllReduce";     break;
+    case ncclFuncSendRecv:      label = "SendRecv";      break;
+    case ncclFuncSend:          label = "Send";          break;
+    case ncclFuncRecv:          label = "Recv";          break;
+    default:                                             break;
+  }
+  return label;
+}
+
+// Maps a device reduction operator to its display name.
+// Operators without a dedicated name yield "Unknown".
+const char* ncclDevRedOpToString(ncclDevRedOp_t op) {
+  if (op == ncclDevSum) return "Sum";
+  if (op == ncclDevProd) return "Prod";
+  if (op == ncclDevMinMax) return "MinMax";
+  if (op == ncclDevPreMulSum) return "PreMulSum";
+  if (op == ncclDevSumPostDiv) return "SumPostDiv";
+  return "Unknown";
+}
+
+// Maps a data type to its display name.
+// Alias enumerators that share a value with a listed type (e.g. ncclChar, which
+// equals ncclInt8) report the name listed here. Values without a case yield
+// "Unknown". ncclBfloat16 is only handled when __CUDA_BF16_TYPES_EXIST__ is defined.
+const char* ncclDatatypeToString(ncclDataType_t type) {
+  switch (type) {
+  case ncclInt8: return "ncclInt8";
+  // ncclUint8 previously had no case and was reported as "Unknown".
+  case ncclUint8: return "ncclUint8";
+  case ncclInt32: return "ncclInt32";
+  case ncclUint32: return "ncclUint32";
+  case ncclInt64: return "ncclInt64";
+  case ncclUint64: return "ncclUint64";
+  case ncclFloat16: return "ncclFloat16";
+  case ncclFloat32: return "ncclFloat32";
+  case ncclFloat64: return "ncclFloat64";
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+  case ncclBfloat16: return "ncclBfloat16";
+#endif
+  default: return "Unknown";
+  }
+}
+
+// Maps an algorithm index (NCCL_ALGO_*) to its display name.
+// Indices without a dedicated name yield "Unknown".
+const char* ncclAlgoToString(int algo) {
+  const char* label;
+  switch (algo) {
+    case NCCL_ALGO_TREE:           label = "TREE";           break;
+    case NCCL_ALGO_RING:           label = "RING";           break;
+    case NCCL_ALGO_COLLNET_DIRECT: label = "COLLNET_DIRECT"; break;
+    case NCCL_ALGO_COLLNET_CHAIN:  label = "COLLNET_CHAIN";  break;
+    case NCCL_ALGO_NVLS:           label = "NVLS";           break;
+    case NCCL_ALGO_NVLS_TREE:      label = "NVLS_TREE";      break;
+    default:                       label = "Unknown";        break;
+  }
+  return label;
+}
+
+// Maps a protocol index (NCCL_PROTO_*) to its display name.
+// Indices without a dedicated name yield "Unknown".
+const char* ncclProtoToString(int proto) {
+  return proto == NCCL_PROTO_LL     ? "LL"
+       : proto == NCCL_PROTO_LL128  ? "LL128"
+       : proto == NCCL_PROTO_SIMPLE ? "SIMPLE"
+       :                              "Unknown";
+}
+
 NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
  ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
 ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,