Skip to content

Commit

Permalink
2.22.3-1
Browse files Browse the repository at this point in the history
Rework core for NVIDIA Trusted Computing
 * Compress work structs so that they are shared between channels
 * Utilize the full amount of kernel argument space permitted (4k)
   before resorting to work fifo.
 * Rework the task preprocessing phase.
 * Use a separate abortDevFlag which is kept in sync with abortFlag
   using cudaMemcpy operations.
 * Rename src/include/align.h to src/include/bitops.h

Add lazy connection establishment for collective operations
 * Move buffer allocation and connection establishment to the first
   collective operation using that algorithm.
 * Accelerate init time and reduce memory usage.
 * Avoid allocating NVLS buffers if all calls are registered.
 * Compute algo/proto in ncclLaunchCollTasksInfo early on.
 * Connect peers in ncclCollPreconnectFunc if not connected already.
 * Also move shared buffer creation to the first send/recv call.

Accelerate intra-node NVLink detection
 * Make each rank only detect NVLinks attached to its GPU.
 * Fuse XMLs to reconstruct the full NVLink topology

Add init profiling to report time spend in different init phases.
 * Report timings of bootstrap, allgather, search, connect, etc.
 * Add new "PROFILE" category for NCCL_DEBUG_SUBSYS.

Add support for PCI p2p on split PCI switches
 * Detect split PCI switches through a kernel module exposing
   switch information.
 * Update the topology XML and graph to add those inter-switch
   connections.

Add cost estimation API
 * Add a new ncclGroupEndSimulate primitive to return the estimated
   time a group would take.

Net/IB: Add separate traffic class for fifo messages
 * Add NCCL_IB_FIFO_TC to control the traffic class of fifo messages
   independently from NCCL_IB_TC.
   Merges PR #1194

Net/IB: Add support for IB router
 * Use flid instead of lid if subnets do not match
 * Warn if flid is 0

Optimizations and fixes for device network offload (unpack)
 * Double the default number of channels
 * Cache netDeviceType
 * Fix save/increment head logic to enable Tree support.

Support ncclGroupStart/End for ncclCommAbort/Destroy
 * Allow Abort/Destroy to be called within a group when managing
   multiple GPUs with a single process.

Improve Tuner API
 * Provide to the plugin the original cost table so that the plugin
   can leave unknown or disabled algo/proto combinations untouched.
 * Remove nvlsSupport and collnetSupport.

Do not print version to stdout when using a debug file
 * Also print version from all processes with INFO debug level.
   Fixes issue #1271

Fix clang warnings in NVTX headers
 * Update NVTX headers to the latest version
   Fixes issue #1270

Disable port fusion in heterogeneous systems
 * Do not fuse ports if a mix of multi-port and single port are detected.

Fix NVLS graphs search for dual NICs.
 * Fix NVLS graph search when we have more than one NIC per GPU.

Fix crash with collnetDirect
 * Add separate graph search for collnetDirect, testing alltoall paths
   and working similarly to the NVLS search.

Fix hang when nodes have different CPU types
 * Add the CPU type to the rank peer info.
 * Align all ranks on the CPU type after the first allgather.
 * Only use the aligned CPU type for all tuning operations.
   Fixes issue #1136
   Fixes issue #1184

Fix performance of registered send/recv operations
 * Allow for single full size operations
 * Add INFO to confirm the registration of send/recv buffers.

Move all sync ops to finalize stage
 * Ensure ncclCommDestroy is non-blocking if ncclCommFinalize has
   been called.

Improve error reporting during SHM segment creation

Improve support of various compilers
   Merges PR #1177
   Merges PR #1228

Allow net and tuner plugins to be statically linked
 * Search for ncclNet or ncclTuner symbols in the main binary.
   Merges PR #979

Plugin examples includes cleanup
 * Harmonize err.h and common.h usage.
 * Add mixed plugin with both net and tuner.
  • Loading branch information
sjeaugey committed Jun 19, 2024
1 parent 529ee69 commit 178b6b7
Show file tree
Hide file tree
Showing 115 changed files with 8,595 additions and 4,326 deletions.
15 changes: 15 additions & 0 deletions ext-net/example/nccl/common.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

#ifndef COMMON_H_
#define COMMON_H_

typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;

typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);

#endif
1 change: 1 addition & 0 deletions ext-net/example/nccl/err.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ typedef enum { ncclSuccess = 0,
ncclSystemError = 2,
ncclInternalError = 3,
ncclInvalidArgument = 4,
ncclInvalidUsage = 5,
ncclRemoteError = 6 } ncclResult_t;

#endif
6 changes: 1 addition & 5 deletions ext-net/example/nccl/net.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <stdint.h>
#include <stdlib.h>

#include "common.h"
#include "err.h"

#define NCCL_NET_HANDLE_MAXSIZE 128
Expand All @@ -19,11 +20,6 @@
// Maximum number of requests per comm object
#define NCCL_NET_MAX_REQUESTS 32

typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys;

typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);

#include "net_v8.h"
#include "net_v7.h"
#include "net_v6.h"
Expand Down
4 changes: 2 additions & 2 deletions ext-net/example/nccl/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/

#ifndef NCCL_ERR_H_
#define NCCL_ERR_H_
#ifndef NCCL_TYPES_H_
#define NCCL_TYPES_H_

/* Data types */
typedef enum { ncclInt8 = 0, ncclChar = 0,
Expand Down
15 changes: 15 additions & 0 deletions ext-tuner/example/nccl/common.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

#ifndef COMMON_H_
#define COMMON_H_

typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;

typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);

#endif
17 changes: 17 additions & 0 deletions ext-tuner/example/nccl/err.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/

#ifndef NCCL_ERR_H_
#define NCCL_ERR_H_

/* Error type for plugins */
typedef enum { ncclSuccess = 0,
ncclUnhandledCudaError = 1,
ncclSystemError = 2,
ncclInternalError = 3,
ncclInvalidArgument = 4,
ncclInvalidUsage = 5,
ncclRemoteError = 6 } ncclResult_t;

#endif
43 changes: 28 additions & 15 deletions ext-tuner/example/nccl/tuner.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,24 @@
#ifndef NCCL_TUNER_H_
#define NCCL_TUNER_H_

#include "nccl.h"
#include <stdint.h>
#include <stdlib.h>

typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys;

typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
#include "common.h"
#include "err.h"

#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t;
typedef enum {
ncclFuncBroadcast = 0,
ncclFuncReduce = 1,
ncclFuncAllGather = 2,
ncclFuncReduceScatter = 3,
ncclFuncAllReduce = 4,
ncclFuncSendRecv = 5,
ncclFuncSend = 6,
ncclFuncRecv = 7,
ncclNumFuncs = 8
} ncclFunc_t;

#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
#define NCCL_ALGO_UNDEF -1
Expand All @@ -33,6 +42,8 @@ typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncRed
#define NCCL_PROTO_LL128 1
#define NCCL_PROTO_SIMPLE 2

#define NCCL_ALGO_PROTO_IGNORE -1.0

// API to be implemented by external tuner
typedef struct {
// Name of the tuner
Expand All @@ -52,31 +63,33 @@ typedef struct {
// - context: tuner context object
// - collType: collective type , e.g., allreduce, allgather…
// - nBytes: collective size in bytes
// - collNetSupport: whether collnet supports this type
// - nvlsSupport: whether nvlink sharp supports this time
// - numPipeOps: number of operations in the group
// - numAlgo: number of algorithms in collCostTable
// - numProto: number of protocols in collCostTable
//
// Outputs:
// - algorithm: selected algorithm to be used for the given collective
// - protocol: selected protocol to be used for the given collective
// - nChannels: number of channels (hence SMs) to be used.
//
// InOut:
// - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
// NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
//
// If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
// default tuning for the given collective.
// Also, the plugin is allowed to not set any output, or set only the
// algorithm and protocol, but not only the algorithm or only the protocol.
// Unset fields will be set automatically by NCCL.
ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
int collNetSupport, int nvlsSupport, int numPipeOps,
int *algorithm, int *protocol, int* nChannels);
int numPipeOps, float** collCostTable, int numAlgo, int numProto,
int* nChannels);

// Terminates the plugin and cleans up any resources that the plugin allocated.
// context: tuner context object
ncclResult_t (*destroy)(void* context);
} ncclTuner_v2_t;
} ncclTuner_v3_t;

typedef ncclTuner_v2_t ncclTuner_t;
typedef ncclTuner_v3_t ncclTuner_t;

#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v2"
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3"

#endif
13 changes: 10 additions & 3 deletions ext-tuner/example/plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,21 @@
__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { return ncclSuccess; }

__hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
int collNetSupport, int nvlsSupport, int numPipeOps,
int *algorithm, int *protocol, int* nChannels) { *algorithm = NCCL_ALGO_RING; *protocol = NCCL_PROTO_SIMPLE; return ncclSuccess; }
int numPipeOps, float** collCostTable, int numAlgo, int numProto,
int* nChannels) {
// Update NCCL core generated cost table. Updated table will be evaluated by NCCL to pick the best algo/proto combo
if (collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) {
collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0;
}
*nChannels = 1;
return ncclSuccess;
}

__hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; }

#define PLUGIN_NAME "Example"

const ncclTuner_v2_t ncclTunerPlugin_v2 = {
const ncclTuner_v3_t ncclTunerPlugin_v3 = {
.name = PLUGIN_NAME,
.init = pluginInit,
.getCollInfo = pluginGetCollInfo,
Expand Down
4 changes: 2 additions & 2 deletions makefiles/version.mk
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 21
NCCL_PATCH := 5
NCCL_MINOR := 22
NCCL_PATCH := 3
NCCL_SUFFIX :=
PKG_REVISION := 1
5 changes: 3 additions & 2 deletions src/bootstrap.cc
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,6 @@ ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFrom

ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) {
memset(handle, 0, sizeof(ncclBootstrapHandle));
NCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic)));

const char* env = ncclGetEnv("NCCL_COMM_ID");
if (env) {
Expand All @@ -210,7 +209,9 @@ ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) {
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
return ncclInvalidArgument;
}
handle->magic = NCCL_MAGIC;
} else {
NCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic)));
memcpy(&handle->addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
NCCLCHECK(bootstrapCreateRoot(handle, false));
}
Expand Down Expand Up @@ -626,7 +627,7 @@ ncclResult_t bootstrapClose(void* commState) {
struct bootstrapState* state = (struct bootstrapState*)commState;
if (state->unexpectedConnections != NULL) {
unexpectedFree(state);
if (__atomic_load_n(state->abortFlag, __ATOMIC_RELAXED) == 0) {
if (__atomic_load_n(state->abortFlag, __ATOMIC_ACQUIRE) == 0) {
WARN("Unexpected connections are not empty");
return ncclInternalError;
}
Expand Down
8 changes: 5 additions & 3 deletions src/channel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,17 @@
#include "channel.h"
#include "param.h"
#include "gdrwrap.h"
#include "transport.h"

ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
struct ncclChannel* channel = &comm->channels[channelId];
if (channel->id != -1) return ncclSuccess;

int nRanks = comm->nRanks;
int nvlsRanks = comm->MNNVL ? comm->clique.size : comm->localRanks;
int nvlsRanks = comm->localRanks;
int nPeers = nRanks + 1 /* Collnet */ + nvlsRanks /* NVLS */;
channel->id = channelId;
channel->workFifoSent = 0;
channel->workFifoProduced = 0;

struct ncclSharedResources* sharedRes = comm->sharedRes;

Expand Down Expand Up @@ -74,7 +75,8 @@ ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclCo

NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream));

int nvlsRanks = comm->MNNVL ? comm->clique.size : comm->localRanks;
int nvlsRanks = comm->localRanks;

if (share) {
channel->nvlsPeers = parent->channels[channelId].nvlsPeers;
channel->nvlsDevPeers = parent->channels[channelId].nvlsDevPeers;
Expand Down
63 changes: 63 additions & 0 deletions src/collectives.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,69 @@
#include "enqueue.h"
#include "nccl.h"

const char* ncclFuncToString(ncclFunc_t fn) {
switch (fn) {
case ncclFuncAllGather: return "AllGather";
case ncclFuncAllReduce: return "AllReduce";
case ncclFuncBroadcast: return "Broadcast";
case ncclFuncRecv: return "Recv";
case ncclFuncReduce: return "Reduce";
case ncclFuncReduceScatter: return "ReduceScatter";
case ncclFuncSendRecv: return "SendRecv";
case ncclFuncSend: return "Send";
default: return "Invalid";
}
}

const char* ncclDevRedOpToString(ncclDevRedOp_t op) {
switch (op) {
case ncclDevSum: return "Sum";
case ncclDevProd: return "Prod";
case ncclDevMinMax: return "MinMax";
case ncclDevPreMulSum: return "PreMulSum";
case ncclDevSumPostDiv: return "SumPostDiv";
default: return "Unknown";
}
}

const char* ncclDatatypeToString(ncclDataType_t type) {
switch (type) {
case ncclInt8: return "ncclInt8";
case ncclInt32: return "ncclInt32";
case ncclUint32: return "ncclUint32";
case ncclInt64: return "ncclInt64";
case ncclUint64: return "ncclUint64";
case ncclFloat16: return "ncclFloat16";
case ncclFloat32: return "ncclFloat32";
case ncclFloat64: return "ncclFloat64";
#if defined(__CUDA_BF16_TYPES_EXIST__)
case ncclBfloat16: return "ncclBfloat16";
#endif
default: return "Unknown";
}
}

const char* ncclAlgoToString(int algo) {
switch (algo) {
case NCCL_ALGO_TREE: return "TREE";
case NCCL_ALGO_RING: return "RING";
case NCCL_ALGO_COLLNET_DIRECT: return "COLLNET_DIRECT";
case NCCL_ALGO_COLLNET_CHAIN: return "COLLNET_CHAIN";
case NCCL_ALGO_NVLS: return "NVLS";
case NCCL_ALGO_NVLS_TREE: return "NVLS_TREE";
default: return "Unknown";
}
}

const char* ncclProtoToString(int proto) {
switch (proto) {
case NCCL_PROTO_LL: return "LL";
case NCCL_PROTO_LL128: return "LL128";
case NCCL_PROTO_SIMPLE: return "SIMPLE";
default: return "Unknown";
}
}

NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
Expand Down
Loading

0 comments on commit 178b6b7

Please sign in to comment.